In [1]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu
import rouge
from argparse import ArgumentParser
import yaml
import re
import glob
import os
import numpy as np
from bert_score import score as bscore
import torch
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from sentence_transformers import SentenceTransformer


In [2]:
# Experiment configuration file; it also carries the dataset split sizes.
config_path = '../configs/dpng_transformer.yaml'

with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)
print(config)

# Split sizes, kept as notebook-level names for the cells that slice the corpus.
train_size, val_size, test_size = (
    config[key] for key in ('train_size', 'val_size', 'test_size')
)

{'save_model_path': '../models/DNPG_base_transformer.pth', 'log_file': '../logs/DNPG_base_transformer_training.txt', 'test_output_file': '../outputs/test_DNPG_base_transformer.txt', 'val_output_file': '../outputs/val_DNPG_base_transformer.txt', 'dataset': 'quora_dataset', 'num_epochs': 50, 'batch_size': 128, 'd_model': 450, 'd_inner_hid': 512, 'd_k': 50, 'd_v': 50, 'n_head': 9, 'n_layers': 3, 'n_warmup_steps': 12000, 'dropout': 0.1, 'embs_share_weight': True, 'proj_share_weight': True, 'label_smoothing': False, 'train_size': 100000, 'val_size': 4000, 'test_size': 20000, 'is_bow': False, 'lr': '1e-3'}


In [9]:
# Seeds to evaluate. Each seed has its own directory of prediction files and
# is also used to seed NumPy before the reference corpus is shuffled.
# Full set of seeds (currently only the last one is evaluated):
# seeds = [0, 777, 33333]
seeds = [33333]

# Tab-separated corpus from which the reference sentences are read.
ref_path = '../data/quora_train.txt'

In [10]:
# Map every seed to the list of .txt prediction files found in
# <seed_root>/seed<seed>/.
seed_root = './fixseed'
seed_dir_dic = {
    seed: glob.glob(os.path.join(seed_root, 'seed{}'.format(seed), '*.txt'))
    for seed in seeds
}

In [11]:
# Sentence-embedding model used by evaluate() to compute the "BERT CS"
# (cosine similarity between prediction and reference embeddings) score.
bertcs_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')

In [12]:
def read_ref(ref_path):
    """Load the test-split reference sentences from a tab-separated corpus.

    The file is read line by line, shuffled with NumPy's *global* RNG (call
    ``np.random.seed`` beforehand for a reproducible split), and the slice
    ``[train_size + val_size : train_size + val_size + test_size]`` is kept.
    ``train_size``, ``val_size`` and ``test_size`` are read from the
    notebook-level variables of the same names.

    Args:
        ref_path: Path to a text file whose lines hold at least two
            tab-separated fields; the second field is the reference sentence.

    Returns:
        A tuple ``(reference_text, reference_corpus)`` where
        ``reference_text`` is a list of normalised, lower-cased sentences and
        ``reference_corpus`` is a list with one single-reference token list
        per sentence (``[[tokens]]``), the layout ``corpus_bleu`` expects.

    Raises:
        IndexError: If a selected line has no tab-separated second field.
    """
    # Use a context manager so the file handle is always closed.
    with open(ref_path, 'r') as ref_file:
        lines = ref_file.readlines()

    np.random.shuffle(lines)
    start = train_size + val_size
    lines = lines[start:start + test_size]

    # Keep only the second tab-separated column of every line.
    reference_text = [line.strip().split('\t')[1] for line in lines]

    # Normalise: put a space before '.', '!' and '?' so they become separate
    # tokens, then lower-case.
    reference_text = [re.sub(r"([.!?])", r" \1", seq).lower() for seq in reference_text]

    # One list of references (here always exactly one) per sentence.
    reference_corpus = [[text.split()] for text in reference_text]
    return reference_text, reference_corpus



In [13]:
def evaluate(reference_corpus, prediction_corpus, reference_text=None, prediction_text=None):
    """Score predictions against references with BLEU, ROUGE, BERTScore and BERT CS.

    Args:
        reference_corpus: One entry per sentence, each a list of tokenised
            references (``[[tokens], ...]``), as passed to ``corpus_bleu``.
        prediction_corpus: One token list per predicted sentence.
        reference_text: Optional list of reference sentences as strings.
            When omitted, it is rebuilt by space-joining the first reference
            of every entry in ``reference_corpus``.
        prediction_text: Optional list of predicted sentences as strings.
            When omitted, it is rebuilt by space-joining ``prediction_corpus``.

    Returns:
        ``(bleu1, bleu2, bleu3, bleu4, rouge_scores, bert_f1, bert_cs)`` where
        ``rouge_scores`` is the value returned by ``Rouge.get_scores`` and
        ``bert_cs`` is the mean pairwise cosine similarity of the
        ``bertcs_model`` sentence embeddings.
    """
    # Build the string inputs from the arguments rather than reading
    # same-named notebook globals, so the function only depends on what it
    # is given.
    if reference_text is None:
        reference_text = [' '.join(refs[0]) for refs in reference_corpus]
    if prediction_text is None:
        prediction_text = [' '.join(tokens) for tokens in prediction_corpus]

    # n-gram weights for BLEU-1 .. BLEU-4 (values kept exactly as before).
    bleu_weights = [
        (1, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (0.33, 0.33, 0.34, 0),
        (0.25, 0.25, 0.25, 0.25),
    ]
    bleu_scores = []
    for order, weights in enumerate(bleu_weights, start=1):
        print("[Info] Calculating BLEU {}...".format(order))
        bleu_scores.append(corpus_bleu(reference_corpus, prediction_corpus, weights=weights))
    print("[Info] Done")

    for order, value in enumerate(bleu_scores, start=1):
        print("[Info] BLEU{} Score: {}".format(order, value))
    bleu1, bleu2, bleu3, bleu4 = bleu_scores

    rouge_evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    rouge_scores = rouge_evaluator.get_scores(prediction_text, reference_text)

    print("[Info] Rouge 1 score: {}".format(rouge_scores["rouge-1"]["f"]))
    print("[Info] Rouge 2 score: {}".format(rouge_scores["rouge-2"]["f"]))
    print("[Info] Rouge l score: {}".format(rouge_scores["rouge-l"]["f"]))

    # Only the F1 component of BERTScore is reported.
    _, _, F1 = bscore(prediction_text, reference_text, lang="en", model_type="bert-base-uncased", verbose=True)
    f1 = F1.mean().item()
    print("[Info] BERT F1 score: {}".format(f1))

    print("[Info] Calculating BERT CS")
    pred_embeddings = bertcs_model.encode(prediction_text)
    ref_embeddings = bertcs_model.encode(reference_text)
    # Cosine similarity of each prediction with its own reference.
    bertcs = [
        cos_sim(pred_emb.reshape(1, -1), ref_emb.reshape(1, -1))[0][0]
        for pred_emb, ref_emb in zip(pred_embeddings, ref_embeddings)
    ]
    bertcs_mean = np.mean(bertcs)

    print("[Info] BERT CS score: {}".format(bertcs_mean))
    return bleu1, bleu2, bleu3, bleu4, rouge_scores, f1, bertcs_mean

In [14]:
# Restrict this process to a single GPU (device index "2" in PCI bus order).
# NOTE(review): these variables are set after torch has been imported and
# after the SentenceTransformer model was loaded in an earlier cell; if CUDA
# was already initialised by then they may have no effect — confirm, and
# consider moving this cell to the top of the notebook.
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [15]:
# For every seed: rebuild the reference split, score each augmented prediction
# file against it and write the scores to ../scores/fixseed/seed<seed>/.
for seed in seeds:
    print("### Evaluating seed {} ###".format(seed))
    # Seed NumPy before read_ref so its shuffle is reproducible per seed.
    np.random.seed(seed)
    # NOTE: reference_text / prediction_text are deliberately kept as
    # notebook-level names; other cells may read them.
    reference_text, reference_corpus = read_ref(ref_path)
    print(reference_corpus[0])

    # Make sure the output directory exists before any score file is written.
    score_dir = '../scores/fixseed/seed{}'.format(seed)
    os.makedirs(score_dir, exist_ok=True)

    for file in seed_dir_dic[seed]:
        # Only score files whose path contains 'aug' and not 'uncased'.
        if 'aug' not in file or 'uncased' in file:
            continue
        print("Calculating file: ", file)

        # Keep only the "Predict: ..." lines, stripped of the prefix and
        # lower-cased; the context manager closes the file afterwards.
        with open(file, 'r') as pred_file:
            prediction_text = [
                line.replace('Predict: ', '').strip().lower()
                for line in pred_file
                if 'Predict: ' in line
            ]
        prediction_corpus = [text.split() for text in prediction_text]
        assert len(reference_text) == len(prediction_text), "len mismatch, ref: {}, pred: {}".format(len(reference_text), len(prediction_text))

        # evaluate score
        bleu1, bleu2, bleu3, bleu4, rouge_scores, bert_f1, bertcs = evaluate(reference_corpus, prediction_corpus)

        # write to output; basename is portable across path separators
        score_path = os.path.join(score_dir, os.path.basename(file))
        print("Writing to file:", score_path)

        with open(score_path, 'w') as f:
            f.write("[Info] BLEU1 Score: {}\n".format(bleu1))
            f.write("[Info] BLEU2 Score: {}\n".format(bleu2))
            f.write("[Info] BLEU3 Score: {}\n".format(bleu3))
            f.write("[Info] BLEU4 Score: {}\n".format(bleu4))

            f.write("\n\n[Info] Rouge 1 score: {}\n".format(rouge_scores["rouge-1"]["f"]))
            f.write("[Info] Rouge 2 score: {}\n".format(rouge_scores["rouge-2"]["f"]))
            f.write("[Info] Rouge l score: {}\n".format(rouge_scores["rouge-l"]["f"]))
            f.write("[Info] BERT F1 score: {}\n".format(bert_f1))
            f.write("[Info] BERT CS score: {}\n".format(bertcs))

        print("==============================")

### Evaluating seed 33333 ###
[['what', 'is', 'the', 'pace', 'of', 'walking', 'in', 'google', 'maps', '?']]
Calculating file:  ./fixseed/seed33333/test_transformer_bert_enc_attention_wordnet_aug.txt
[Info] Calculating BLEU 1...
[Info] Calculating BLEU 2...
[Info] Calculating BLEU 3...
[Info] Calculating BLEU 4...
[Info] Done
[Info] BLEU1 Score: 0.5704743609541874
[Info] BLEU2 Score: 0.4324606837140314
[Info] BLEU3 Score: 0.3371044924156258
[Info] BLEU4 Score: 0.2692524162672311
[Info] Rouge 1 score: 0.6000191696302655
[Info] Rouge 2 score: 0.36273784472992326
[Info] Rouge l score: 0.5763139926579922


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=481.0), HTML(value='')))


computing greedy matching.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=313.0), HTML(value='')))


done in 80.47 seconds, 248.53 sentences/sec
[Info] BERT F1 score: 0.7708830237388611
[Info] Calculating BERT CS
[Info] BERT CS score: 0.8451414108276367
Writing to file: ../scores/fixseed/seed33333/test_transformer_bert_enc_attention_wordnet_aug.txt
