In [10]:
import pandas as pd
import numpy as np
import random
from pandas import DataFrame
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Flatten, Input, Embedding, TimeDistributed, Conv1D, concatenate, Lambda, Dropout
from keras import backend as K
from keras.models import Model

In [11]:
train_doc_question = pd.read_csv('../data/FiQA_train_question_doc_final.tsv', sep='\t')
train_question = pd.read_csv('../data/FiQA_train_question_final.tsv', sep='\t')
train_doc = pd.read_csv('../data/FiQA_train_doc_final.tsv', sep='\t')
vocabulary = pd.read_csv('../data/vocabulary.csv')
vocab_size = len(vocabulary)
max_length = 100

In [12]:
def get_similarity(similarity):
    dot = lambda a, b: K.batch_dot(a, b, axes=1)
    if similarity == 'cosine':
            return lambda x: dot(x[0], x[1]) / K.maximum(K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1])), K.epsilon())

In [13]:
def CNN_LSTM_model():
    question = Input(shape=(max_length,), dtype='int32', name='question_base')
    answer = Input(shape=(max_length,), dtype='int32', name='answer_base')

    # add embedding layers
    weights = np.load(self.config['initial_embed_weights'])
    embedding = Embedding(input_dim=self.config['n_words'],
                          output_dim=weights.shape[1],
                          weights=[weights])
    question_embedding = embedding(question)
    answer_embedding = embedding(answer)

    f_rnn = LSTM(141, return_sequences=True, implementation=1)
    b_rnn = LSTM(141, return_sequences=True, implementation=1, go_backwards=True)

    qf_rnn = f_rnn(question_embedding)
    qb_rnn = b_rnn(question_embedding)
    # question_pool = merge([qf_rnn, qb_rnn], mode='concat', concat_axis=-1)
    question_pool = concatenate([qf_rnn, qb_rnn], axis=-1)

    af_rnn = f_rnn(answer_embedding)
    ab_rnn = b_rnn(answer_embedding)
    # answer_pool = merge([af_rnn, ab_rnn], mode='concat', concat_axis=-1)
    answer_pool = concatenate([af_rnn, ab_rnn], axis=-1)

    # cnn
    cnns = [Conv1D(kernel_size=kernel_size,
                   filters=500,
                   activation='tanh',
                   padding='same') for kernel_size in [1, 2, 3, 5]]
    # question_cnn = merge([cnn(question_pool) for cnn in cnns], mode='concat')
    question_cnn = concatenate([cnn(question_pool) for cnn in cnns], axis=-1)
    # answer_cnn = merge([cnn(answer_pool) for cnn in cnns], mode='concat')
    answer_cnn = concatenate([cnn(answer_pool) for cnn in cnns], axis=-1)

    maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
    maxpool.supports_masking = True
    question_pool = maxpool(question_cnn)
    answer_pool = maxpool(answer_cnn)
    
    #dropout layer
    dropout = Dropout(0.2)
    similarity = get_similarity('cosine')
    qa_model = Lambda(similarity, output_shape=lambda _: (None, 1))([dropout(question_pool),
                                                                     dropout(answer_pool)])
    model = Model(inputs=[question, answer], outputs=qa_model, name='qa_model')

    return model

In [14]:
def CNN_model():
    question = Input(shape=(max_length,), dtype='int32', name='question_base')
    answer = Input(shape=(max_length,), dtype='int32', name='answer_base')
    
    #embedding layer
    embedding = Embedding(input_dim=vocab_size, output_dim=200)
    question_embedding = embedding(question)
    answer_embedding = embedding(answer)
    
    #hidden layer
    hidden_layer = TimeDistributed(Dense(200, activation='tanh'))
    question_hl = hidden_layer(question_embedding)
    answer_hl = hidden_layer(answer_embedding)
    
    #cnn layer
    cnns = [Conv1D(kernel_size=kernel_size,
                       filters=100,
                       activation='tanh',
                       padding='same') for kernel_size in [2, 3, 5, 7]]
    question_cnn = concatenate([cnn(question_hl) for cnn in cnns], axis=-1)
    answer_cnn = concatenate([cnn(answer_hl) for cnn in cnns], axis=-1)
    
    #max pooling layer
    maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
    question_pool = maxpool(question_cnn)
    answer_pool = maxpool(answer_cnn)
    
    #dropout layer
    dropout = Dropout(0.2)
    similarity = get_similarity('cosine')
    qa_model = Lambda(similarity, output_shape=lambda _: (None, 1))([dropout(question_pool),
                                                                     dropout(answer_pool)])
    model = Model(inputs=[question, answer], outputs=qa_model, name='qa_model')
        
    return model

In [15]:
def get_train_data():
#     qdic = train_question.set_index('qid').T.to_dict('list')
#     docdic = train_doc.set_index('docid').T.to_dict('list')

    #question id and the corresponding doc id
    question_id_list = train_doc_question['qid']
    doc_id_list = train_doc_question['docid']

    questions = []
    good_answers = []
    bad_answers = []
    train_doc_list = train_doc.dropna(axis=0, how='any')
    train_doc_list = list(train_doc_list['doc'])
    
    for i in range(0, len(train_doc_question)):
        doc_value = train_doc[train_doc.docid == doc_id_list[i]]['doc'].values[0]
        if doc_value == doc_value:
            question = train_question[train_question.qid == question_id_list[i]]['question'].values[0]
            questions.append(question)
            good_answers.append(doc_value)
            bad_answers.append(random.choice(train_doc_list))

    return [questions, good_answers, bad_answers]

In [16]:
questions, good_answers, bad_answers = get_train_data()

In [17]:
encoded_qs = [one_hot(d, vocab_size) for d in questions]
padded_qs = pad_sequences(encoded_qs, maxlen=max_length, padding='post')

encoded_good_answers = [one_hot(d, vocab_size) for d in good_answers]
padded_good_answers = pad_sequences(encoded_good_answers, maxlen=max_length, padding='post')

encoded_bad_answers = [one_hot(d, vocab_size) for d in bad_answers]
padded_bad_answers = pad_sequences(encoded_bad_answers, maxlen=max_length, padding='post')

In [18]:
#define the input of the model
q_input = Input(shape=(max_length,), dtype='int32', name='question_base')
good_answers_input = Input(shape=(max_length,), dtype='int32', name='good_answers_base')
bad_answers_input = Input(shape=(max_length,), dtype='int32', name='bad_answers_base')

# get the cnn model
model = CNN_model()
model.summary()
good_similarity = model([q_input, good_answers_input])
bad_similarity = model([q_input, bad_answers_input])

#define the loss function, simialrity with the good_answers 
#need to be larger while similarity with the bad_answers need to be smaller
loss = Lambda(lambda x: K.relu(0.009 - x[0] + x[1]),
                      output_shape=lambda x: x[0])([good_similarity, bad_similarity])
training_model = Model(inputs=[q_input, good_answers_input, bad_answers_input], outputs=loss,
                                name='training_model')

training_model.compile(loss=lambda y_true, y_pred: y_pred, optimizer='adam')
training_model.summary()

prediction_model = Model(inputs=[q_input, good_answers_input], outputs=good_similarity,
                                      name='prediction_model')
prediction_model.compile(loss=lambda y_true, y_pred: y_pred, optimizer='adam')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
question_base (InputLayer)      (None, 100)          0                                            
__________________________________________________________________________________________________
answer_base (InputLayer)        (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 100, 200)     11982600    question_base[0][0]              
                                                                 answer_base[0][0]                
__________________________________________________________________________________________________
time_distributed_2 (TimeDistrib (None, 100, 200)     40200       embedding_2[0][0]                
          

In [18]:

y = np.zeros(shape=(len(encoded_qs),)) # doesn't get used

training_model.fit([padded_qs, padded_good_answers, padded_bad_answers], y, batch_size=256, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x12017d710>

In [23]:
training_model.save('../model/my_model.h5')

In [22]:
prediction_model.save('../model/predict_cnn.h5')

In [19]:
prediction_model.predict([padded_qs, padded_good_answers])

array([[0.91682464],
       [0.9243847 ],
       [0.9281581 ],
       ...,
       [0.94948924],
       [0.94622576],
       [0.9294157 ]], dtype=float32)

In [20]:
prediction_model.predict([padded_qs, padded_bad_answers])

array([[0.91746026],
       [0.918111  ],
       [0.9249023 ],
       ...,
       [0.93947494],
       [0.94490683],
       [0.9210839 ]], dtype=float32)

In [75]:
train_doc = train_doc.dropna()
select_qs = [train_question['question'][0]] * len(train_doc)
select_encoded_qs = [one_hot(d, vocab_size) for d in select_qs]
select_padded_qs = pad_sequences(select_encoded_qs, maxlen=max_length, padding='post')
select_answers = train_doc['doc'].values 
select_encoded_answers = [one_hot(d, vocab_size) for d in select_answers]
select_padded_answers = pad_sequences(select_encoded_answers, maxlen=max_length, padding='post')

In [77]:
select_result = prediction_model.predict([select_padded_qs, select_padded_answers])

In [106]:
result = DataFrame(data={'probability': list(select_result), 'docid': train_doc['docid'].values})
result = result.sort_values('probability', ascending=False)
result =  result.reset_index(drop=True)

print(train_question.values[0])
print(train_question['qid'][0])
print(train_question['question'][0])
print(result[result.docid == 18850])

[0 0 'What is considered a business expense on a business trip?'
 "Nov 8 '11 at 15:14"]
0
What is considered a business expense on a business trip?
      docid  probability
4881  18850  [0.3935241]


In [105]:
print(train_doc_question[train_doc_question.qid == train_question['qid'][0]])

   id  qid  docid
0   0    0  18850
