In [1]:
import os
import pickle
import argparse
import numpy as np
from model import Options, Seq2SeqAttn
import tensorflow as tf

In [2]:
from beautifultable import BeautifulTable
import pandas as pd
pd.set_option('display.max_colwidth', -1)

In [3]:
# Parse the command line arguments.
save_dir = '/Users/yan/Documents/document/EPFL/MA2/semesterprj/code/seq2seq_attn/affect-rich/input/'
output_dir = '/Users/yan/Documents/document/EPFL/MA2/semesterprj/code/seq2seq_attn/affect-rich/output/'

parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type = str, default = save_dir,
                    help = 'the directory to the data')

parser.add_argument('--word_embeddings_path', type = str, default = save_dir+'word_embeddings.npy',
                    help = 'the directory to the pre-trained word embeddings')
parser.add_argument('--VAD_path', type = str, default = save_dir+'VAD.npy',
                    help = 'the directory to VAD')
parser.add_argument('--tf_path', type = str, default = save_dir+'tf.npy',
                    help = 'the directory to term frequency')
parser.add_argument('--VAD_loss_path', type = str, default = save_dir+'VAD_loss.npy',
                    help = 'the directory to VAD loss for each word')
parser.add_argument('--ti_path', type = str, default = save_dir+'mu_li.npy',
                    help = 'the directory to term importance')

parser.add_argument('--num_epochs', type = int, default = 3,
                    help = 'the number of epochs to train the data')
parser.add_argument('--batch_size', type = int, default = 64,
                    help = 'the batch size')
parser.add_argument('--learning_rate', type = float, default = 0.001,
                    help = 'the learning rate')
parser.add_argument('--beam_width', type = int, default = 32,
                    help = 'the beam width when decoding')
parser.add_argument('--word_embed_size', type = int, default = 300,
                    help = 'the size of word embeddings')
parser.add_argument('--n_hidden_units_enc', type = int, default = 256,
                    help = 'the number of hidden units of encoder')
parser.add_argument('--n_hidden_units_dec', type = int, default = 256,
                    help = 'the number of hidden units of decoder')
parser.add_argument('--attn_depth', type = int, default = 128,
                    help = 'attention depth')

parser.add_argument('--restore_path_TS', type = str, default = output_dir+'model_dailydialog_rf/model_TS',
                    help = 'the path to restore the trained model')
parser.add_argument('--save_path_TS', type = str, default = output_dir+'/model_dailydialog_rf/model_TS',
                    help = 'the path to save the trained model to')

parser.add_argument('--restore_path_ST', type = str, default = output_dir+'model_dailydialog_rf/model_ST',
                    help = 'the path to restore the trained model')
parser.add_argument('--save_path_ST', type = str, default = output_dir+'/model_dailydialog_rf/model_ST',
                    help = 'the path to save the trained model to')

parser.add_argument('--restore_epoch', type = int, default = 3,
                    help = 'the epoch to restore')

# args = parser.parse_args()
args, unknown = parser.parse_known_args()


In [4]:
def read_data(data_path):
    def load_np_files(path):
        my_set = {}
        my_set['enc_input'] = np.load(os.path.join(path, 'enc_input.npy'))
        my_set['dec_input'] = np.load(os.path.join(path, 'dec_input.npy'))
        my_set['target'] = np.load(os.path.join(path, 'target.npy'))
        my_set['enc_input_len'] = np.load(os.path.join(path, 'enc_input_len.npy'))
        my_set['dec_input_len'] = np.load(os.path.join(path, 'dec_input_len.npy'))
        
        # to check if or not to complete the last batch
        idx = np.arange(my_set['dec_input'].shape[0])
        left_samples = idx[-1]%args.batch_size
        if left_samples:
            last_batch_idx = np.random.randint(0,idx[-1]-left_samples,size = args.batch_size - left_samples - 1)
            idx = np.concatenate([idx,last_batch_idx])
            
            my_set['enc_input'] = my_set['enc_input'][idx]
            my_set['dec_input'] = my_set['dec_input'][idx]
            my_set['target'] = my_set['target'][idx]
            my_set['enc_input_len'] = my_set['enc_input_len'][idx]
            my_set['dec_input_len'] = my_set['dec_input_len'][idx]
        return my_set
    test_set = load_np_files(os.path.join(data_path, 'test'))
    # dictionary index of words
    with open(os.path.join(data_path, 'token2id.pickle'), 'rb') as file:
        token2id = pickle.load(file)
    with open(os.path.join(data_path, 'id2token.pickle'), 'rb') as file:
        id2token = pickle.load(file)
    return test_set, token2id, id2token


In [5]:
def ids_to_sentence(ids, uttr_len, id2token): # ?
    tokens = []
    if uttr_len is not None:
        for i in range(uttr_len):
            if id2token[ids[i]] != '<eos>' and id2token[ids[i]] != '<go>':
                tokens.append(id2token[ids[i]])
    else:
        i = 0
        while i < len(ids) and id2token[ids[i]] != '<eos>':
            tokens.append(id2token[ids[i]])
            i += 1
    return ' '.join(tokens)


In [6]:
def revert(myset):
    enc_input = myset['dec_input'][:,1:]
    dec_input = np.insert(myset['enc_input'], 0, token2id['<go>'], axis=1) # add <go> in the beginning of encoder

    target = np.insert(myset['enc_input'], -1, 0, axis=1) 
    tmp_idx = [np.where(s==0)[0][0] for s in target] 
    target[np.arange(target.shape[0]),tmp_idx] = token2id['<eos>'] # add <eos> at the end of encoder
    
    newset = {}
    
    newset['enc_input'] = enc_input
    newset['dec_input'] = dec_input
    newset['target'] = target
    newset['enc_input_len'] = myset['dec_input_len']
    newset['dec_input_len'] = myset['enc_input_len']
    return newset

given enc_input predict prediction P(T|S)

In [7]:
test_set, token2id, id2token = read_data(args.data_path)

In [8]:
len(test_set['enc_input'])

9984

In [9]:
if __name__ == '__main__':
#     test_set, token2id, id2token = read_data(args.data_path)
    max_uttr_len_enc = test_set['enc_input'].shape[1]
    max_uttr_len_dec = test_set['dec_input'].shape[1]
    
    test_set['enc_input'] = test_set['enc_input'][:5*args.batch_size]

    word_embeddings = np.load(args.word_embeddings_path)
    VAD = np.load(args.VAD_path)
    termfreq = np.load(args.ti_path) # term importance
    termfreq = termfreq.reshape(-1,1)
    VAD_loss = np.load(args.VAD_loss_path)
    VAD_loss = VAD_loss.reshape(-1,1)

    options = Options(mode = 'PREDICT',
                      VAD_mode = 'FALSE',
                      num_epochs = args.num_epochs,
                      batch_size = args.batch_size,
                      learning_rate = args.learning_rate,
                      beam_width = args.beam_width,
                      corpus_size = len(token2id),
                      max_uttr_len_enc = max_uttr_len_enc,
                      max_uttr_len_dec = max_uttr_len_dec,
                      go_index = token2id['<go>'],
                      eos_index = token2id['<eos>'],
                      word_embed_size = args.word_embed_size,
                      n_hidden_units_enc = args.n_hidden_units_enc,
                      n_hidden_units_dec = args.n_hidden_units_dec,
                      attn_depth = args.attn_depth,
                      word_embeddings = word_embeddings)
    model_TS = Seq2SeqAttn(options)

    for var in model_TS.tvars:
        print(var.name)

    model_TS.restore(os.path.join(args.restore_path_TS, 'model_epoch_{:03d}.ckpt'.format(args.restore_epoch)))
    prediction_TS,probability_TS = model_TS.predict(test_set['enc_input'], test_set['enc_input_len'],VAD,termfreq)

Building the TensorFlow graph...
embedding/embedding:0
encoding/rnn/gru_cell/gates/kernel:0
encoding/rnn/gru_cell/gates/bias:0
encoding/rnn/gru_cell/candidate/kernel:0
encoding/rnn/gru_cell/candidate/bias:0
decoding/memory_layer/kernel:0
decoding/attention_v:0
decoding/my_bahdanau_attention/query_layer/kernel:0
decoding/my_bahdanau_attention/attention_Wb/kernel:0
decoding/attention_wrapper/gru_cell/gates/kernel:0
decoding/attention_wrapper/gru_cell/gates/bias:0
decoding/attention_wrapper/gru_cell/candidate/kernel:0
decoding/attention_wrapper/gru_cell/candidate/bias:0
decoding/dense/kernel:0
decoding/dense/bias:0
Restoring a pre-trained model from /Users/yan/Documents/document/EPFL/MA2/semesterprj/code/seq2seq_attn/affect-rich/output/model_dailydialog_rf/model_TS/model_epoch_003.ckpt...
INFO:tensorflow:Restoring parameters from /Users/yan/Documents/document/EPFL/MA2/semesterprj/code/seq2seq_attn/affect-rich/output/model_dailydialog_rf/model_TS/model_epoch_003.ckpt


In [10]:
prediction_TS.shape

(320, 21, 32)

In [11]:
probability_TS.shape

(320, 32)

In [13]:
probability_TS

array([[ -6.4579206,  -6.605519 ,  -6.68667  , ..., -18.483994 ,
        -18.982075 , -25.699299 ],
       [ -5.67196  ,  -7.569267 ,  -7.5700526, ..., -21.53505  ,
        -21.718372 , -22.024775 ],
       [ -6.3648877,  -7.661137 ,  -7.7334   , ..., -18.038395 ,
        -18.908665 , -28.184574 ],
       ...,
       [ -6.554825 ,  -7.3182216,  -7.3226833, ..., -19.175753 ,
        -23.30402  , -27.547388 ],
       [ -6.267722 ,  -7.088844 ,  -7.1873302, ..., -26.72459  ,
        -29.60926  , -31.220829 ],
       [ -6.704087 ,  -6.955397 ,  -7.663645 , ..., -18.191755 ,
        -18.73783  , -24.330801 ]], dtype=float32)

In [16]:
with open('/Users/yan/Documents/document/EPFL/MA2/semesterprj/code/seq2seq_attn/affect-rich/output/prediction/prediction_TS.pickle', 'wb') as f:
    pickle.dump(prediction_TS, f)
    
with open('/Users/yan/Documents/document/EPFL/MA2/semesterprj/code/seq2seq_attn/affect-rich/output/prediction/probability_TS.pickle', 'wb') as f:
    pickle.dump(probability_TS, f)

---
given prediction predict enc_input P(T|S)

In [8]:
with open('../pre-data/test/prediction_TS_noVAD.pickle', 'rb') as file:
    prediction_TS = pickle.load(file)
    
with open('../pre-data/test/probability_TS_noVAD.pickle', 'rb') as file:
    probability_TS = pickle.load(file)

In [14]:
# for every i in range(prediction_TS.shape[0])
# enc_input[i:i+args.beam_width] is the prediction of top args.beam_width of one given source from model_TS
enc_input = prediction_TS[0,:,:].T
for i in range(prediction_TS.shape[0]-1):
    enc_input = np.concatenate((enc_input,prediction_TS[i+1,:,:].T))

In [15]:
print(prediction_TS.shape,enc_input.shape)

(320, 21, 32) (10240, 21)


In [16]:
new_test_set = {}
new_test_set['enc_input'] = enc_input

multi_idx = np.tile(np.arange(test_set['enc_input'].shape[0]).T,(args.beam_width,1)).T.ravel()

new_test_set['dec_input'] = np.insert(test_set['enc_input'][multi_idx], 0, token2id['<go>'], axis=1) 

In [17]:
target = np.insert(test_set['enc_input'], -1, 0, axis=1) 
tmp_idx = [np.where(s==0)[0][0] for s in target] 
target[np.arange(target.shape[0]),tmp_idx] = token2id['<eos>'] # add <eos> at the end of encoder

new_test_set['target'] = target[multi_idx]

In [18]:
new_test_set['enc_input_len'] = (enc_input.shape[1]*np.ones(enc_input.shape[0])).astype(int)
_,idx = np.unique(np.argwhere(enc_input==token2id['<eos>'])[:,0],return_index=True)
# multi <eos> in one sentence, so find the first <eos> in each row
# for those predictions without <eos> the length is max_len
new_test_set['enc_input_len'][np.argwhere(enc_input==token2id['<eos>'])[idx,0]] = np.argwhere(enc_input==token2id['<eos>'])[idx,1]

new_test_set['dec_input_len'] = np.tile(test_set['enc_input_len'],(args.beam_width,1)).T.ravel()


In [20]:
if __name__ == '__main__':
#     test_set, token2id, id2token = read_data(args.data_path)
    word_embeddings = np.load(args.word_embeddings_path)
    VAD = np.load(args.VAD_path)
    termfreq = np.load(args.ti_path) # term importance
    termfreq = termfreq.reshape(-1,1)

    max_uttr_len_enc = new_test_set['enc_input'].shape[1]
    max_uttr_len_dec = new_test_set['dec_input'].shape[1]
    
    options = Options(mode = 'POST_PREDICT',
                      VAD_mode = 'FALSE',
                      num_epochs = 1,
                      batch_size = 1,
                      learning_rate = args.learning_rate,
                      beam_width = args.beam_width,
                      corpus_size = len(token2id),
                      max_uttr_len_enc = max_uttr_len_enc,
                      max_uttr_len_dec = max_uttr_len_dec,
                      go_index = token2id['<go>'],
                      eos_index = token2id['<eos>'],
                      word_embed_size = args.word_embed_size,
                      n_hidden_units_enc = args.n_hidden_units_enc,
                      n_hidden_units_dec = args.n_hidden_units_dec,
                      attn_depth = args.attn_depth,
                      word_embeddings = word_embeddings)
    model_ST = Seq2SeqAttn(options)

    for var in model_ST.tvars:
        print(var.name)

    model_ST.restore(os.path.join(args.restore_path_ST, 'model_epoch_{:03d}.ckpt'.format(args.restore_epoch)))
    probability_ST = model_ST.post_predict(new_test_set, VAD,termfreq)
    probability_ST = probability_ST.reshape(-1,args.beam_width)

Building the TensorFlow graph...
embedding/embedding:0
encoding/rnn/gru_cell/gates/kernel:0
encoding/rnn/gru_cell/gates/bias:0
encoding/rnn/gru_cell/candidate/kernel:0
encoding/rnn/gru_cell/candidate/bias:0
decoding/memory_layer/kernel:0
decoding/attention_v:0
decoding/my_bahdanau_attention/query_layer/kernel:0
decoding/my_bahdanau_attention/attention_Wb/kernel:0
decoding/attention_wrapper/gru_cell/gates/kernel:0
decoding/attention_wrapper/gru_cell/gates/bias:0
decoding/attention_wrapper/gru_cell/candidate/kernel:0
decoding/attention_wrapper/gru_cell/candidate/bias:0
decoding/dense/kernel:0
decoding/dense/bias:0
Restoring a pre-trained model from /Users/yan/Documents/document/EPFL/MA2/semesterprj/code/seq2seq_attn/affect-rich/output/model_dailydialog_rf/model_ST/model_epoch_003.ckpt...
INFO:tensorflow:Restoring parameters from /Users/yan/Documents/document/EPFL/MA2/semesterprj/code/seq2seq_attn/affect-rich/output/model_dailydialog_rf/model_ST/model_epoch_003.ckpt
Start to train the mod

In [21]:
with open('/Users/yan/Documents/document/EPFL/MA2/semesterprj/code/seq2seq_attn/affect-rich/output/prediction/probability_ST.pickle', 'wb') as f:
    pickle.dump(probability_ST, f)

---
MMI

In [22]:
def sentence_VAD(sentence,VAD,s_len):
    vad = 0
    for i in range(s_len):
        vad += sum(abs(VAD[sentence[i]]))
    return vad/len(sentence)

In [186]:
def MMI_bidi(pred_TS,prob_TS,prob_ST,VAD,id2token):
    """
    pred_TS: [num_sentence, max_uttr_len_dec, beam_width]
    prob_TS, prob_ST: [num_sentence, beam_width]
    """
    df = pd.DataFrame(columns=['label_num','label','target',"prediction", "prob_TS", "prob_ST",'ABS_VAD'])
    
    for sentence_num in range(pred_TS.shape[0]):
        bias = sentence_num*args.beam_width
        target = ids_to_sentence(test_set['dec_input'][sentence_num], test_set['dec_input_len'][sentence_num], id2token)
        for i in range(args.beam_width):
            label = ids_to_sentence(new_test_set['dec_input'][bias+i], new_test_set['dec_input_len'][bias+i]+1, id2token)
            pred_s = ids_to_sentence(pred_TS[sentence_num,:,i], new_test_set['enc_input_len'][bias+i], id2token)
            vad = sentence_VAD(pred_TS[sentence_num,:,i], VAD, new_test_set['enc_input_len'][bias+i])
            df.loc[bias+i] = list([sentence_num,label,target,pred_s,prob_TS[sentence_num,i],
                              prob_ST[sentence_num,i],vad])    
#     print(table)
    return df

In [188]:
df_VAD = MMI_bidi(prediction_TS,probability_TS,probability_ST,VAD,id2token)

In [25]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [189]:
df_VAD.to_csv('MMI_VAD.csv',index=False)
df = pd.read_csv('MMI.csv')
df_VAD = pd.read_csv('MMI_VAD.csv')

In [193]:
inputs = []
targets = []
outputs1 = []
outputs2 = []

for i in range(int(len(df)/32)):
    input_ = df_VAD.loc[32*i:32*i+31].sort_values(by='prob_TS',ascending=False).head(10).iloc[0].label
    target = df_VAD.loc[32*i:32*i+31].sort_values(by='prob_TS',ascending=False).head(10).iloc[0].target
    TS_response = df_VAD.loc[32*i:32*i+31].sort_values(by='prob_TS',ascending=False).head(10).iloc[0].prediction
    ST_response = df_VAD.loc[32*i:32*i+31].sort_values(by='prob_ST',ascending=False).head(10).sort_values(by="ABS_VAD",ascending=False).iloc[0].prediction
    
    inputs.append(input_)
    targets.append(target)
    outputs1.append(TS_response)
    outputs2.append(ST_response)

In [199]:
pred_df = pd.DataFrame([inputs,targets,outputs1,outputs2]).T
pred_df.columns = ['Input','Target','Basic','MMI_VAD']
pred_df

Unnamed: 0,Input,Target,Basic,MMI_VAD
0,what 's your name ?,"elena , but my friends call me yo-yo .",my name .,i do n't know . i was just looking for my friend .
1,why did i agree to this ?,"it 's like you said , there 's no roads in or out of here .",i do n't know .,"i do n't know , but i did n't know why i was talking about it ."
2,done . what do you call yourself ?,winston will have to do .,i do n't know .,i do n't know . i do n't know . i do n't understand .
3,care to help ?,"uh , why do n't i go with you , since i know the world and all ?",of course .,"i do n't know . i do n't know , sweetheart ."
4,do you want to see your daughter alive ?,do n't hurt her .,i do n't know .,i do n't think .
5,someone other than fitz ?,"oh , well , of course , fitz , but he has been rather distracted lately .","yeah , of course .","well , of course he does , but he does n't know ."
6,how far is the renaissance hotel ?,it 's too far to walk .,i do n't know .,i do n't know . i do n't know . it 's all .
7,so you think he 's doing it again ?,"uh , well , uh , uh ...",i do n't know .,"no , i do n't know , but i do n't know , but i do n't know , but i"
8,he caught it for me ?,"you know , in that moment , it all seemed possible .",he stabs it .,i do n't know . i do n't know what you 're talking about .
9,turn around ? what are you gonna fuck me first ?,"no , you 're the fuck up , you fucked up royally .",i 'm sorry .,i do n't know . i do n't know . i do n't want you .


In [198]:
pred_df.to_csv('predictions.csv',index=False)

In [6]:
pred_S2S = []
f = open('../pre-data/test/pred_S2S.txt', 'w', encoding = 'utf-8')
N = (test_set['enc_input'].shape[0] // args.batch_size) * args.batch_size
for i in range(N):
    f.write('HISTORY:\n')
    uttr = ids_to_sentence(test_set['enc_input'][i,:], test_set['enc_input_len'][i], id2token)
    f.write('- {}\n'.format(uttr))
    f.write('LABEL:\n')
    label = ids_to_sentence(test_set['target'][i,:], test_set['dec_input_len'][i], id2token)
    f.write('- {}\n'.format(label))
    f.write('PREDICTION:\n')
    pred = ids_to_sentence(prediction[i//args.batch_size][i%args.batch_size,:,0], None, id2token)
    f.write('- {}\n\n'.format(pred))
    pred_S2S.append(pred)
f.close()
with open('../pre-data/test/pred_S2S.pickle', 'wb') as f:
    pickle.dump(pred_S2S, f)


In [21]:
with open('../pre-data/test/prediction.pickle', 'wb') as f:
    pickle.dump(prediction, f)