In [1]:
import yaml
import re
import glob
import os
import numpy as np

In [2]:
# Load the experiment configuration; it provides the train/val/test split
# sizes that build_ref uses to slice the shuffled corpus.
config_path = '../configs/dpng_transformer.yaml'

with open(config_path) as f:
    # NOTE(review): yaml.load with FullLoader is fine for a trusted local
    # config; prefer yaml.safe_load if the file could come from elsewhere.
    config = yaml.load(f, Loader=yaml.FullLoader)
print(config)

# Split sizes, in number of corpus lines.
# Gotcha: in the printed config 'lr' is the *string* '1e-3', not a float.
train_size, val_size, test_size = (
    config[key] for key in ('train_size', 'val_size', 'test_size')
)

{'save_model_path': '../models/DNPG_base_transformer.pth', 'log_file': '../logs/DNPG_base_transformer_training.txt', 'test_output_file': '../outputs/test_DNPG_base_transformer.txt', 'val_output_file': '../outputs/val_DNPG_base_transformer.txt', 'dataset': 'quora_dataset', 'num_epochs': 50, 'batch_size': 128, 'd_model': 450, 'd_inner_hid': 512, 'd_k': 50, 'd_v': 50, 'n_head': 9, 'n_layers': 3, 'n_warmup_steps': 12000, 'dropout': 0.1, 'embs_share_weight': True, 'proj_share_weight': True, 'label_smoothing': False, 'train_size': 100000, 'val_size': 4000, 'test_size': 20000, 'is_bow': False, 'lr': '1e-3'}


In [3]:
# Corpus of tab-separated (source, reference) sentence pairs, one pair per line.
ref_path = '../data/quora_train.txt'
# Random seeds whose prediction outputs are compared in this notebook.
seeds = [0, 777, 33333]

In [9]:
# Map every seed to the prediction files stored under ./fixseed/seed<seed>/.
seed_root = './fixseed'
# File names expected inside each seed directory (order is preserved in the lists).
output_names = [
    'test_DNPG_base_transformer.txt',
    'transformer_key_enc_bert_val_attention_alpha0.5.txt',
]

seed_dir_dic = {}
for seed in seeds:
    seed_dir = os.path.join(seed_root, 'seed{}'.format(seed))
    seed_dir_dic[seed] = [os.path.join(seed_dir, name) for name in output_names]

In [39]:
def build_ref(ref_path, start=None, count=None):
    """Read, shuffle and normalise a slice of a tab-separated parallel corpus.

    Every line of ``ref_path`` is expected to look like
    ``<source>\\t<reference>``. All lines are shuffled in place with NumPy's
    *global* RNG, so call ``np.random.seed(...)`` beforehand to get a
    reproducible slice.

    Args:
        ref_path: path of the corpus file.
        start: index of the first shuffled line to keep. Defaults to the
            notebook globals ``train_size + val_size``.
        count: number of lines to keep. Defaults to the notebook global
            ``test_size``.

    Returns:
        ``(source_text, reference_text)``: two aligned lists of lower-cased
        strings in which each of ``.``, ``!`` and ``?`` is preceded by a space.

    Raises:
        IndexError: if a selected line contains no tab separator.
    """
    if start is None:
        start = train_size + val_size
    if count is None:
        count = test_size

    # Context manager so the file handle is closed instead of leaked.
    with open(ref_path, 'r') as corpus_file:
        lines = corpus_file.readlines()

    # Uses the global NumPy RNG state; the resulting order depends on the seed
    # set by the caller.
    np.random.shuffle(lines)
    selected = lines[start:start + count]

    def normalize(sentence):
        # Detach sentence punctuation (". ! ?") from the preceding token and
        # lower-case. Nothing else (e.g. parentheses) is removed here.
        return re.sub(r"([.!?])", r" \1", sentence).lower()

    # Split each line once; column 0 is the source, column 1 the reference.
    pairs = [line.strip().split('\t') for line in selected]
    source_text = [normalize(fields[0]) for fields in pairs]
    reference_text = [normalize(fields[1]) for fields in pairs]
    return source_text, reference_text

In [60]:
def build_dic(texts):
    """Count occurrences of each lower-cased, whitespace-delimited token.

    Args:
        texts: iterable of strings.

    Returns:
        dict mapping token -> number of times it appears across ``texts``,
        with keys in order of first appearance.
    """
    counts = {}
    for sentence in texts:
        for token in sentence.lower().split():
            counts[token] = counts.get(token, 0) + 1
    return counts

In [61]:
# Normalised source / reference sentences and their vocabularies.
# NOTE(review): no seed is set in this cell, so the slice returned by
# build_ref depends on the current state of NumPy's global RNG.
st, rt = build_ref(ref_path)
sdic, rdic = build_dic(st), build_dic(rt)

# Sanity check: first sentence pair, then both vocabulary sizes.
print(st[0], rt[0])
print(len(sdic), len(rdic))

what is the best way to become rich ? let's cut out the crap . how can somebody really become rich ?
13419 13545


In [10]:
# Display the seed -> prediction-file-paths mapping for a visual check.
seed_dir_dic

{0: ['./fixseed/seed0/test_DNPG_base_transformer.txt',
  './fixseed/seed0/transformer_key_enc_bert_val_attention_alpha0.5.txt'],
 777: ['./fixseed/seed777/test_DNPG_base_transformer.txt',
  './fixseed/seed777/transformer_key_enc_bert_val_attention_alpha0.5.txt'],
 33333: ['./fixseed/seed33333/test_DNPG_base_transformer.txt',
  './fixseed/seed33333/transformer_key_enc_bert_val_attention_alpha0.5.txt']}

In [62]:
# For every seed: reload the references under that seed, then report the
# vocabulary size of each prediction file belonging to the seed.
for seed in seeds:
    print("### Evaluating seed {} ###".format(seed))
    # Re-seed NumPy's global RNG before the references are loaded.
    np.random.seed(seed)
    # NOTE(review): `read_ref` is not defined in the visible cells of this
    # notebook (only `build_ref` is); this fails on a fresh kernel unless it
    # is defined elsewhere — confirm and restore the definition.
    reference_text, reference_corpus = read_ref(ref_path)
    print(reference_corpus[0])

    files = seed_dir_dic[seed]
    for file in files:
        print("Calculating file: ", file)
        # Keep only the "Predict: " lines, strip the prefix and lower-case.
        # The context manager closes the handle instead of leaking it.
        with open(file, 'r') as prediction_file:
            prediction_text = [
                line.replace('Predict: ', '').strip().lower()
                for line in prediction_file
                if 'Predict: ' in line
            ]
        # `pdic` is overwritten on every iteration; after the loop it holds
        # the vocabulary of the last file of the last seed, which a later
        # cell reads.
        pdic = build_dic(prediction_text)
        print("number of words in dictionary: ", len(pdic))

        print("==============================")

### Evaluating seed 0 ###
[['what', 'are', 'the', 'benefits', 'of', 'using', 'digital', 'signage', 'for', 'your', 'business', '?']]
Calculating file:  ./fixseed/seed0/test_DNPG_base_transformer.txt
number of words in dictionary:  6268
Calculating file:  ./fixseed/seed0/transformer_key_enc_bert_val_attention_alpha0.5.txt
number of words in dictionary:  8513
### Evaluating seed 777 ###
[['what', 'was', 'the', 'first', 'thing', 'you', 'did', 'when', 'you', 'found', 'out', 'donald', 'trump', 'won', 'the', 'election', '?']]
Calculating file:  ./fixseed/seed777/test_DNPG_base_transformer.txt
number of words in dictionary:  6111
Calculating file:  ./fixseed/seed777/transformer_key_enc_bert_val_attention_alpha0.5.txt
number of words in dictionary:  8691
### Evaluating seed 33333 ###
[['what', 'is', 'the', 'assumed', 'walking', 'speed', 'in', 'google', "maps's", 'time', 'estimates', '?']]
Calculating file:  ./fixseed/seed33333/test_DNPG_base_transformer.txt
number of words in dictionary:  6283


In [63]:
# Vocabulary of the base-transformer predictions for seed 33333.
file = './fixseed/seed33333/test_DNPG_base_transformer.txt'
# Keep only the "Predict: " lines, strip the prefix and lower-case; the
# context manager closes the handle instead of leaking it.
with open(file, 'r') as prediction_file:
    prediction_text = [
        line.replace('Predict: ', '').strip().lower()
        for line in prediction_file
        if 'Predict: ' in line
    ]
bdic = build_dic(prediction_text)

In [64]:
# Vocabularies as sets so they can be compared with set difference.
# NOTE(review): `pdic` is whatever the evaluation loop assigned last.
source_ws, ref_ws = set(sdic), set(rdic)
pred_ws, base_ws = set(pdic), set(bdic)

In [65]:
# Number of words only in the base vocabulary, then only in the prediction vocabulary.
print(len(base_ws.difference(pred_ws)))
print(len(pred_ws.difference(base_ws)))

773
2920


In [70]:
# Words present in the prediction vocabulary but absent from the base vocabulary.
print(pred_ws.difference(base_ws))

{'tries', 'sharps', 'carta', 'censor', 'gma', '51', 'm606x', 'jammer', 'sfc', 'gulf', '45k', 'jew', 'ferulos', 'frame', 'visitors', 'commuative', 'squats', 'epigenetic', 'bet', 'impairment', 'perceptron', '40000', 'girth', 'melo', 'braking', 'ass', 'tutorial', 'enlightr', 'earthing', 'fgfa', 'timeings', 'astrophysics', 'scratches', 'vjp', 'taxa', '2014', 'sur19', 'singleton', 'f1', 'inmortal', 'du', 'akbar', 'programing', 'rubibi', 'langely', 'accentedstststst', 'si', '₹1000', 'goldman', 'elenoge', 'nukeed', 'omnintary', 'grudges', 'manrogate', 'trivanrgy', 'trafficda', 'kuan', 'spoke', 'mountaineering', 'bre', 'minora', 'slavs', 'redheads', 'colonize', 'geope', 'cuthvicing', 'landry', 'bandung', 'crest3', 'rim', 'arbitrage', 'kakatiya', 'fears', 'kitten', 'z3', 'goviem', 'flooring', 'biroma', 'trustus', 'bantu', '400', 'ganege', 'documentary', 'reboot', 'homeschooler', 'indo', 'temperament', 'piercing', 'hpc', 'rudisha', 'serveries', 'explained', 'alpha', 'costsochondritis', 'componen

In [69]:
# Vocabulary differences: prediction-only vs. source, source-only vs.
# prediction, and prediction-only vs. reference.
print(len(pred_ws.difference(source_ws)))
print(len(source_ws.difference(pred_ws)))
print(len(pred_ws.difference(ref_ws)))

2539
7528
2459


In [67]:
# Words the model produced that never occur in the reference sentences.
pred_ws.difference(ref_ws)

{'bubble',
 'sharps',
 'lochte',
 'gma',
 'censor',
 'm606x',
 'sfc',
 'subsurface',
 'jew',
 'dth',
 'selfless',
 'ferulos',
 'commuative',
 'squats',
 'impairment',
 'perceptron',
 'proteins',
 'melo',
 'klout',
 'braking',
 'enlightr',
 'norse',
 'icelandic',
 'fgfa',
 'timeings',
 'lewis',
 'productivity',
 'spleen',
 'vjp',
 'taxa',
 'sur19',
 'singleton',
 'programing',
 'rubibi',
 'langely',
 'accentedstststst',
 'mandarin',
 'si',
 'elenoge',
 'nukeed',
 'omnintary',
 'grudges',
 'uncertainty',
 'manrogate',
 'trivanrgy',
 'trafficda',
 'kuan',
 'bre',
 'minora',
 'redheads',
 'expressions',
 'cuthvicing',
 'geope',
 'landry',
 'hop',
 'bandung',
 'crest3',
 'rim',
 'kakatiya',
 'pnb',
 'z3',
 'sized',
 'goviem',
 'flooring',
 'biroma',
 'trustus',
 'bantu',
 '400',
 'ganege',
 'documentary',
 'tune',
 'reboot',
 'homeschooler',
 'indo',
 'hpc',
 'piercing',
 'serveries',
 'alpha',
 'wizards',
 'costsochondritis',
 'componentsing',
 'thc',
 'mammals',
 'construct',
 'hotspot',
