In [1]:
import random
import time

import numpy as np
import pandas as pd
from pandas import DataFrame

from keras import backend as K
from keras.layers import Dense, Flatten, Input, Embedding, TimeDistributed, Conv1D, concatenate, Lambda, Dropout
from keras.layers import LSTM
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the tab-separated FiQA training tables: the question<->document pairing,
# the questions themselves, and the candidate answer documents.
train_doc_question = pd.read_csv('../data/FiQA/FiQA_train_question_doc_final.tsv', delimiter='\t')
train_question = pd.read_csv('../data/FiQA/FiQA_train_question_final.tsv', delimiter='\t')
train_doc = pd.read_csv('../data/FiQA/FiQA_train_doc_final.tsv', delimiter='\t')
vocabulary = pd.read_csv('../data/FiQA/vocabulary.csv')

# Every question/answer sequence is padded or truncated to this many tokens.
max_length = 100
# Number of rows in the vocabulary file; used as the hashing range for
# `one_hot` and as the embedding layer's input dimension.
vocab_size = vocabulary.shape[0]

In [3]:
def get_similarity(similarity):
    """Return a function that scores two batches of vectors against each other.

    Args:
        similarity: Name of the similarity measure. Only ``'cosine'`` is
            implemented.

    Returns:
        A callable taking a two-element sequence ``[a, b]`` of backend tensors
        and returning their per-sample cosine similarity. It is meant to be
        wrapped in a ``Lambda`` layer.

    Raises:
        ValueError: If ``similarity`` is not a supported name.
    """
    if similarity != 'cosine':
        # This used to fall through and return None, which only failed later
        # (and far less clearly) when the result was handed to a Lambda layer.
        raise ValueError("Unsupported similarity %r; expected 'cosine'" % (similarity,))

    def dot(a, b):
        # Per-sample dot product along the feature axis.
        return K.batch_dot(a, b, axes=1)

    def cosine(x):
        # Clamp the norm product away from zero so all-zero vectors do not
        # trigger a division by zero.
        norm = K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1]))
        return dot(x[0], x[1]) / K.maximum(norm, K.epsilon())

    return cosine

In [4]:
def CNN_LSTM_model(config=None, embedding_dim=200):
    """Build a question/answer scorer: shared embedding -> two LSTMs -> CNN bank -> cosine.

    The previous version read ``self.config`` although this is a plain function
    (there is no ``self``), so calling it raised ``NameError``. The settings are
    now taken from an optional ``config`` mapping with fallbacks to the
    notebook-level ``vocab_size``.

    Args:
        config: Optional mapping. Recognised keys:
            ``'n_words'`` - embedding input dimension (defaults to ``vocab_size``);
            ``'initial_embed_weights'`` - path to a ``.npy`` matrix used to
            initialise the embedding (its second dimension becomes the
            embedding size). When absent the embedding is randomly initialised.
        embedding_dim: Embedding size used only when no pretrained weights are
            supplied.

    Returns:
        A Keras ``Model`` named ``'qa_model'`` mapping ``[question, answer]``
        integer sequences of length ``max_length`` to one similarity value per
        pair.
    """
    config = config or {}
    n_words = config.get('n_words', vocab_size)
    weights_path = config.get('initial_embed_weights')

    question = Input(shape=(max_length,), dtype='int32', name='question_base')
    answer = Input(shape=(max_length,), dtype='int32', name='answer_base')

    # Embedding shared by question and answer; optionally seeded from disk.
    if weights_path is not None:
        weights = np.load(weights_path)
        embedding = Embedding(input_dim=n_words,
                              output_dim=weights.shape[1],
                              weights=[weights])
    else:
        embedding = Embedding(input_dim=n_words, output_dim=embedding_dim)
    question_embedding = embedding(question)
    answer_embedding = embedding(answer)

    # One forward and one backward LSTM, each shared between question and answer.
    # NOTE(review): with go_backwards=True Keras returns the sequence in
    # reversed time order, so the concatenation below pairs step t of the
    # forward pass with step T-1-t of the backward pass. Confirm whether that
    # is intended or whether the backward output should be re-reversed first.
    f_rnn = LSTM(141, return_sequences=True, implementation=1)
    b_rnn = LSTM(141, return_sequences=True, implementation=1, go_backwards=True)

    qf_rnn = f_rnn(question_embedding)
    qb_rnn = b_rnn(question_embedding)
    question_pool = concatenate([qf_rnn, qb_rnn], axis=-1)

    af_rnn = f_rnn(answer_embedding)
    ab_rnn = b_rnn(answer_embedding)
    answer_pool = concatenate([af_rnn, ab_rnn], axis=-1)

    # Bank of convolutions with different kernel widths, shared between both
    # branches; their feature maps are concatenated on the channel axis.
    cnns = [Conv1D(kernel_size=kernel_size,
                   filters=500,
                   activation='tanh',
                   padding='same') for kernel_size in [1, 2, 3, 5]]
    question_cnn = concatenate([cnn(question_pool) for cnn in cnns], axis=-1)
    answer_cnn = concatenate([cnn(answer_pool) for cnn in cnns], axis=-1)

    # Max over the time axis collapses each sequence to one feature vector.
    maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
    maxpool.supports_masking = True
    question_pool = maxpool(question_cnn)
    answer_pool = maxpool(answer_cnn)

    # Dropout on both pooled vectors, then cosine similarity as the output.
    dropout = Dropout(0.2)
    similarity = get_similarity('cosine')
    qa_model = Lambda(similarity, output_shape=lambda _: (None, 1))([dropout(question_pool),
                                                                     dropout(answer_pool)])
    model = Model(inputs=[question, answer], outputs=qa_model, name='qa_model')

    return model

In [5]:
def CNN_model():
    """Build the CNN question/answer scorer used for training and prediction.

    Both inputs are integer sequences of length ``max_length``. They pass
    through the same embedding, the same per-timestep dense layer and the same
    bank of four convolutions; each branch is max-pooled over time, dropped out,
    and the two resulting vectors are compared with cosine similarity.

    Returns:
        A Keras ``Model`` named ``'qa_model'`` taking ``[question, answer]`` and
        producing one similarity value per pair.
    """
    q_in = Input(shape=(max_length,), dtype='int32', name='question_base')
    a_in = Input(shape=(max_length,), dtype='int32', name='answer_base')

    # Token embedding, shared by both branches.
    embed = Embedding(input_dim=vocab_size, output_dim=200)
    q_embedded = embed(q_in)
    a_embedded = embed(a_in)

    # Dense tanh projection applied independently at every timestep (shared).
    project = TimeDistributed(Dense(200, activation='tanh'))
    q_hidden = project(q_embedded)
    a_hidden = project(a_embedded)

    # Four parallel convolutions with different kernel widths; 'same' padding
    # keeps the time dimension so the outputs can be concatenated channel-wise.
    conv_bank = []
    for width in [2, 3, 5, 7]:
        conv_bank.append(Conv1D(kernel_size=width,
                                filters=100,
                                activation='tanh',
                                padding='same'))
    q_features = concatenate([conv(q_hidden) for conv in conv_bank], axis=-1)
    a_features = concatenate([conv(a_hidden) for conv in conv_bank], axis=-1)

    # Max over the time axis: (batch, time, channels) -> (batch, channels).
    pool_over_time = Lambda(lambda t: K.max(t, axis=1, keepdims=False),
                            output_shape=lambda s: (s[0], s[2]))
    q_pooled = pool_over_time(q_features)
    a_pooled = pool_over_time(a_features)

    # Same dropout layer on both vectors, then cosine similarity as the output.
    drop = Dropout(0.2)
    cosine = get_similarity('cosine')
    q_dropped = drop(q_pooled)
    a_dropped = drop(a_pooled)
    score = Lambda(cosine, output_shape=lambda _: (None, 1))([q_dropped, a_dropped])

    return Model(inputs=[q_in, a_in], outputs=score, name='qa_model')

In [6]:
def get_train_data(doc_question=None, question_table=None, doc_table=None, rng=None):
    """Build aligned (question, good answer, bad answer) training triples.

    For every row of the question/document pairing table whose document text
    is present, emit the question text, the paired document as the positive
    answer, and one randomly drawn document as the negative answer.

    The lookups are done through dictionaries built once, instead of scanning
    the question and document tables with a boolean mask on every row.

    Args:
        doc_question: DataFrame with ``qid`` and ``docid`` columns. Defaults to
            the notebook-level ``train_doc_question``.
        question_table: DataFrame with ``qid`` and ``question`` columns.
            Defaults to the notebook-level ``train_question``.
        doc_table: DataFrame with ``docid`` and ``doc`` columns. Defaults to
            the notebook-level ``train_doc``.
        rng: Object providing ``choice(seq)`` (e.g. ``random.Random(seed)``) for
            reproducible negative sampling. Defaults to the ``random`` module.

    Returns:
        ``[questions, good_answers, bad_answers]`` - three lists of equal length.

    Raises:
        KeyError: If a pairing row references a ``docid`` or ``qid`` that is
            missing from the corresponding table.
    """
    if doc_question is None:
        doc_question = train_doc_question
    if question_table is None:
        question_table = train_question
    if doc_table is None:
        doc_table = train_doc
    if rng is None:
        rng = random

    # First occurrence wins on duplicate ids, matching the old `.values[0]`.
    question_by_id = (question_table.drop_duplicates(subset='qid')
                      .set_index('qid')['question'].to_dict())
    doc_by_id = (doc_table.drop_duplicates(subset='docid')
                 .set_index('docid')['doc'].to_dict())

    # Negatives are drawn from documents whose row has no missing field.
    # NOTE(review): the draw can return the positive document itself; such a
    # triple contributes no useful signal. Confirm whether that is acceptable.
    negative_pool = list(doc_table.dropna(axis=0, how='any')['doc'])

    questions = []
    good_answers = []
    bad_answers = []
    # Iterate by position so a non-default index on the table cannot misalign rows.
    for qid, docid in zip(doc_question['qid'], doc_question['docid']):
        doc_value = doc_by_id[docid]
        if pd.isnull(doc_value):
            # Pairings whose document text is missing are skipped.
            continue
        questions.append(question_by_id[qid])
        good_answers.append(doc_value)
        bad_answers.append(rng.choice(negative_pool))

    return [questions, good_answers, bad_answers]

In [7]:
# Build the three aligned training lists: question text, its paired (positive)
# answer, and a randomly drawn (negative) answer.
questions, good_answers, bad_answers = get_train_data()

In [8]:
def _encode_and_pad(texts):
    """Hash-encode each text with ``one_hot`` and pad it to ``max_length``.

    Returns the list of integer sequences and the padded 2-D array built from
    it. Padding is appended at the end (``padding='post'``); over-long sequences
    are cut using ``pad_sequences``' default truncation setting.

    NOTE(review): Keras documents ``one_hot``'s default hash as not stable
    across interpreter runs, so the token ids produced here (and a model
    trained on them) may not carry over to a new kernel. Confirm before
    relying on a saved model.
    """
    encoded = [one_hot(text, vocab_size) for text in texts]
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return encoded, padded


encoded_qs, padded_qs = _encode_and_pad(questions)
encoded_good_answers, padded_good_answers = _encode_and_pad(good_answers)
encoded_bad_answers, padded_bad_answers = _encode_and_pad(bad_answers)

In [9]:
# Symbolic inputs for one training triple: a question, its correct answer and
# a sampled wrong answer, each an integer sequence of length max_length.
q_input = Input(shape=(max_length,), dtype='int32', name='question_base')
good_answers_input = Input(shape=(max_length,), dtype='int32', name='good_answers_base')
bad_answers_input = Input(shape=(max_length,), dtype='int32', name='bad_answers_base')

# A single CNN scorer is applied twice, so both similarities come from the
# same weights.
model = CNN_model()
good_similarity = model([q_input, good_answers_input])
bad_similarity = model([q_input, bad_answers_input])

# Margin ranking (hinge) loss: the similarity to the good answer has to exceed
# the similarity to the bad answer by at least 0.009, otherwise the shortfall
# is the loss.
loss = Lambda(lambda x: K.relu(0.009 - x[0] + x[1]),
              output_shape=lambda x: x[0])([good_similarity, bad_similarity])
training_model = Model(inputs=[q_input, good_answers_input, bad_answers_input], outputs=loss,
                       name='training_model')

# The network output already *is* the loss value, so the Keras loss function
# simply passes y_pred through and ignores y_true.
training_model.compile(loss=lambda y_true, y_pred: y_pred, optimizer='adam')

# Scoring model over the same layers: question + candidate answer -> similarity.
# (This model used to be constructed twice in a row; the first instance was
# discarded immediately, so only one construction is kept.)
prediction_model = Model(inputs=[q_input, good_answers_input], outputs=good_similarity,
                         name='prediction_model')
# Snapshot of the untrained weights, kept for comparison after training.
origin_weight = prediction_model.get_weights()
prediction_model.compile(loss=lambda y_true, y_pred: y_pred, optimizer='adam')

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [10]:

# fit() needs a target array with one entry per sample, but the compiled loss
# returns y_pred unchanged, so these zeros are never actually read.
y = np.zeros(len(encoded_qs))

training_model.fit(x=[padded_qs, padded_good_answers, padded_bad_answers],
                   y=y,
                   epochs=10,
                   batch_size=512)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13c97ad68>

In [13]:
# Snapshot the weights after training; compare with `origin_weight`, which was
# captured before fit() was called.
late_weight = prediction_model.get_weights()

In [148]:
# Persist the prediction model to an HDF5 file.
# NOTE(review): the model was compiled with a lambda loss, so restoring it with
# load_model() would presumably need matching custom objects - confirm.
prediction_model.save('../model_result/cnn_model.h5')

In [105]:
# Restore weights from the file written by the save cell.
# NOTE(review): the execution counts show this cell was run (In [105]) before
# the save cell (In [148]); on a fresh top-to-bottom run it just reloads what
# was saved one cell earlier. Confirm which order was intended.
prediction_model.load_weights('../model_result/cnn_model.h5')

In [16]:
# Similarity of every training question to its paired (correct) answer.
prediction_model.predict([padded_qs, padded_good_answers])

array([[0.49149403],
       [0.51208264],
       [0.5983377 ],
       ...,
       [0.6669734 ],
       [0.58721685],
       [0.5839964 ]], dtype=float32)

In [17]:
# Similarity of every training question to its randomly sampled wrong answer.
prediction_model.predict([padded_qs, padded_bad_answers])

array([[0.3205113 ],
       [0.20022056],
       [0.427616  ],
       ...,
       [0.22471167],
       [0.4220238 ],
       [0.16637047]], dtype=float32)

In [18]:
# Pair the first training question with every document so that all documents
# can be scored and ranked against it.
# NOTE: `train_doc` is rebound to its NaN-free version here; the following
# cells rely on that (they use len(train_doc) and train_doc['docid']).
train_doc = train_doc.dropna()
select_qs = [train_question['question'][0]] * len(train_doc)
# The question is identical for every row, so hash it once and replicate the
# result instead of re-encoding the same string once per document.
select_encoded_qs = [one_hot(train_question['question'][0], vocab_size)] * len(train_doc)
select_padded_qs = pad_sequences(select_encoded_qs, maxlen=max_length, padding='post')
select_answers = train_doc['doc'].values
select_encoded_answers = [one_hot(d, vocab_size) for d in select_answers]
select_padded_answers = pad_sequences(select_encoded_answers, maxlen=max_length, padding='post')

In [19]:
# Score the first question against every candidate document (one row per document).
select_result = prediction_model.predict([select_padded_qs, select_padded_answers])

In [20]:
# predict() returns one single-element row per document; flatten it so the
# 'probability' column holds plain floats (it previously held 1-element
# arrays, which printed as "[0.49149403]" and had to be sorted as objects).
result = DataFrame(data={'probability': select_result.ravel(), 'docid': train_doc['docid'].values})
# Highest score first; after the reset the index is the 0-based rank.
result = result.sort_values('probability', ascending=False)
result = result.reset_index(drop=True)

print(train_question.values[0])
print(train_question['qid'][0])
print(train_question['question'][0])
# docid 18850 is the document paired with this question in train_doc_question,
# so the printed index is the rank the model gives the correct answer.
print(result[result.docid == 18850])

[0 0 'What is considered a business expense on a business trip?'
 "Nov 8 '11 at 15:14"]
0
What is considered a business expense on a business trip?
     docid   probability
984  18850  [0.49149403]


In [23]:
# Show the ground-truth question/document pairing row(s) for the first question.
print(train_doc_question[train_doc_question.qid == train_question['qid'][0]])
# Number of candidates that were scored (one copy of the question per document).
print(len(select_qs))

   id  qid  docid
0   0    0  18850
57600


In [34]:
# Same candidate construction as for the first question, now for the second
# training question (index 1).
select_qs_02 = [train_question['question'][1]] * len(train_doc)
# Hash the (identical) question once and replicate it rather than re-encoding
# the same string once per document.
select_encoded_qs_02 = [one_hot(train_question['question'][1], vocab_size)] * len(train_doc)
select_padded_qs_02 = pad_sequences(select_encoded_qs_02, maxlen=max_length, padding='post')
select_answers_02 = train_doc['doc'].values
select_encoded_answers_02 = [one_hot(d, vocab_size) for d in select_answers_02]
select_padded_answers_02 = pad_sequences(select_encoded_answers_02, maxlen=max_length, padding='post')

In [35]:
# Score the second question against every candidate document.
select_result_02 = prediction_model.predict([select_padded_qs_02, select_padded_answers_02])

In [36]:
# Flatten the single-element prediction rows so 'probability' holds plain
# floats instead of 1-element arrays, then rank documents best-first; after
# the reset the index is the 0-based rank.
result_02 = DataFrame(data={'probability': select_result_02.ravel(), 'docid': train_doc['docid'].values})
result_02 = result_02.sort_values('probability', ascending=False)
result_02 = result_02.reset_index(drop=True)

In [38]:
# Where document 18850 ranks for the second question.
# NOTE(review): 18850 is the document paired with question 0 (qid 0), whereas
# result_02 ranks documents against question 1 - confirm that looking up this
# docid here is intentional.
print(result_02[result_02.docid == 18850])

     docid  probability
396  18850  [0.5503769]


In [185]:
def create_test_sample(padded_qs, padded_good_answers, padded_bad_answers, test_len, qs_len):
    """Build ranking test cases: one question against ``qs_len`` candidate answers.

    For each of the first ``test_len`` questions, the candidate list holds the
    question's correct answer at row 0 followed by ``qs_len - 1`` answers drawn
    without replacement from ``padded_bad_answers``.

    Args:
        padded_qs: Padded question sequences, indexable by position.
        padded_good_answers: Padded correct answers, aligned with ``padded_qs``.
        padded_bad_answers: Pool of padded answers to draw distractors from.
        test_len: Number of questions to build cases for.
        qs_len: Number of candidates per question (correct answer included).

    Returns:
        A list of dicts with keys ``'q'`` (the question repeated ``qs_len``
        times) and ``'answers'`` (the candidate rows, correct answer first).

    Raises:
        ValueError: If ``qs_len`` is smaller than 1.
    """
    if qs_len < 1:
        raise ValueError('qs_len must be at least 1, got %r' % (qs_len,))

    # Turning the pool into a list does not depend on the loop variable, so it
    # is done once instead of once per question.
    bad_pool = list(padded_bad_answers)

    result = []
    for i in range(0, test_len):
        tmp_qs = np.array([padded_qs[i]] * qs_len)
        distractors = random.sample(bad_pool, qs_len - 1)
        # Stacking a flat list of rows also works when there are no distractors
        # (qs_len == 1), which previously failed with a shape mismatch.
        tmp_answers = np.vstack([padded_good_answers[i]] + distractors)
        result.append({'q': tmp_qs, 'answers': tmp_answers})
    return result

def test_accuracy(test_sample, prediction_model, recall_len):
    accuracy = 0;
    for item in test_sample:
        result = prediction_model.predict([item['q'], item['answers']])
        if np.argmax(result) < recall_len:
            accuracy += 1
    accuracy /= len(test_sample) 
    print(accuracy)
    return accuracy

In [188]:
recall = 1
print('recall:', recall)
# Build one set of 100 test cases per candidate-list size and evaluate it
# straight away, in the same order as the original copy-pasted statements.
# NOTE: distractor sampling is not seeded, so the numbers vary between runs.
test_samples = []
for answer_length in (10, 20, 30, 40, 50, 100):
    print('answer_length: %d' % answer_length)
    sample = create_test_sample(padded_qs, padded_good_answers, padded_bad_answers, 100, answer_length)
    accuracy = test_accuracy(sample, prediction_model, recall)
    test_samples.append(sample)

# Keep the individual names: the following cells re-evaluate these same samples.
(test_sample_01, test_sample_02, test_sample_03,
 test_sample_04, test_sample_05, test_sample_06) = test_samples

recall: 1
answer_length: 10
0.76
answer_length: 20
0.69
answer_length: 30
0.61
answer_length: 40
0.6
answer_length: 50
0.48
answer_length: 100
0.39


In [189]:
recall = 2
print('recall:', recall)
# Re-evaluate the previously built samples (candidate-list size, sample) at
# this recall cut-off.
for answer_length, sample in ((10, test_sample_01), (20, test_sample_02), (30, test_sample_03),
                              (40, test_sample_04), (50, test_sample_05), (100, test_sample_06)):
    print('answer_length: %d' % answer_length)
    accuracy = test_accuracy(sample, prediction_model, recall)

recall: 2
answer_length: 10
0.8
answer_length: 20
0.69
answer_length: 30
0.63
answer_length: 40
0.61
answer_length: 50
0.48
answer_length: 100
0.39


In [190]:
recall = 3
print('recall:', recall)
# Re-evaluate the previously built samples (candidate-list size, sample) at
# this recall cut-off.
for answer_length, sample in ((10, test_sample_01), (20, test_sample_02), (30, test_sample_03),
                              (40, test_sample_04), (50, test_sample_05), (100, test_sample_06)):
    print('answer_length: %d' % answer_length)
    accuracy = test_accuracy(sample, prediction_model, recall)

recall: 3
answer_length: 10
0.82
answer_length: 20
0.71
answer_length: 30
0.65
answer_length: 40
0.65
answer_length: 50
0.49
answer_length: 100
0.4


In [191]:
recall = 4
print('recall:', recall)
# Re-evaluate the previously built samples (candidate-list size, sample) at
# this recall cut-off.
for answer_length, sample in ((10, test_sample_01), (20, test_sample_02), (30, test_sample_03),
                              (40, test_sample_04), (50, test_sample_05), (100, test_sample_06)):
    print('answer_length: %d' % answer_length)
    accuracy = test_accuracy(sample, prediction_model, recall)

recall: 4
answer_length: 10
0.85
answer_length: 20
0.71
answer_length: 30
0.66
answer_length: 40
0.65
answer_length: 50
0.5
answer_length: 100
0.4


In [193]:
recall = 5
print('recall:', recall)
# Re-evaluate the previously built samples (candidate-list size, sample) at
# this recall cut-off.
for answer_length, sample in ((10, test_sample_01), (20, test_sample_02), (30, test_sample_03),
                              (40, test_sample_04), (50, test_sample_05), (100, test_sample_06)):
    print('answer_length: %d' % answer_length)
    accuracy = test_accuracy(sample, prediction_model, recall)

recall: 5
answer_length: 10
0.87
answer_length: 20
0.72
answer_length: 30
0.67
answer_length: 40
0.67
answer_length: 50
0.5
answer_length: 100
0.4
