In [2]:
import time
import pandas as pd
import numpy as np
import random
from pandas import DataFrame
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Flatten, Input, Embedding, TimeDistributed, Conv1D, concatenate, Lambda, Dropout
from keras import backend as K
from keras.models import Model, load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
train_doc_question = pd.read_csv('../data/FiQA/FiQA_train_question_doc_final.tsv', sep='\t')
train_question = pd.read_csv('../data/FiQA/FiQA_train_question_final.tsv', sep='\t')
train_doc = pd.read_csv('../data/FiQA/FiQA_train_doc_final.tsv', sep='\t')
vocabulary = pd.read_csv('../data/FiQA/vocabulary.csv')
vocab_size = len(vocabulary)
max_length = 100

In [17]:
def CNN_model():
    question = Input(shape=(max_length,), dtype='int32', name='question_base')
    answer = Input(shape=(max_length,), dtype='int32', name='answer_base')
    
    #embedding layer
    embedding = Embedding(input_dim=vocab_size, output_dim=200)
    question_embedding = embedding(question)
    answer_embedding = embedding(answer)
    
    #hidden layer
    hidden_layer = TimeDistributed(Dense(200, activation='tanh'))
    question_hl = hidden_layer(question_embedding)
    answer_hl = hidden_layer(answer_embedding)
    
    #cnn layer
    cnns = [Conv1D(kernel_size=kernel_size,
                       filters=100,
                       activation='tanh',
                       padding='same') for kernel_size in [2, 3, 5, 7]]
    question_cnn = concatenate([cnn(question_hl) for cnn in cnns], axis=-1)
    answer_cnn = concatenate([cnn(answer_hl) for cnn in cnns], axis=-1)
    
    #max pooling layer
    maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
    question_pool = maxpool(question_cnn)
    answer_pool = maxpool(answer_cnn)
    
    #dropout layer
    dropout = Dropout(0.2)
    similarity = get_similarity('cosine')
    qa_model = Lambda(similarity, output_shape=lambda _: (None, 1))([dropout(question_pool),
                                                                     dropout(answer_pool)])
    model = Model(inputs=[question, answer], outputs=qa_model, name='qa_model')
        
    return model

In [68]:
def get_similarity(similarity):
    dot = lambda a, b: K.batch_dot(a, b, axes=1)
    if similarity == 'cosine':
            return lambda x: dot(x[0], x[1]) / K.maximum(K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1])), K.epsilon())

def get_train_data():
    question_id_list = train_doc_question['qid']
    doc_id_list = train_doc_question['docid']

    questions = []
    good_answers = []
    bad_answers = []
    train_doc_list = train_doc.dropna(axis=0, how='any')
    train_doc_list = list(train_doc_list['doc'])
    
    for i in range(0, len(train_doc_question)):
        doc_value = train_doc[train_doc.docid == doc_id_list[i]]['doc'].values[0]
        if doc_value == doc_value:
            question = train_question[train_question.qid == question_id_list[i]]['question'].values[0]
            questions.append(question)
            good_answers.append(doc_value)
            bad_answers.append(random.choice(train_doc_list))

    return [questions, good_answers, bad_answers]

def create_test_sample(padded_qs, padded_good_answers, padded_bad_answers, test_len, qs_len):
    result = []
    for i in range(0, test_len):
        tmp_qs = np.array([padded_qs[i]] * qs_len)
        tmp_answers = np.vstack((padded_good_answers[i],  random.sample(list(padded_bad_answers), qs_len - 1)))
        result.append({'q': tmp_qs, 'answers': tmp_answers})
    return result

def test_accuracy(test_sample, prediction_model):
    accuracy = 0;
    for item in test_sample:
        result = prediction_model.predict([item['q'], item['answers']])
        if np.argmax(result) == 0:
            accuracy += 1
    accuracy /= len(test_sample) 
    return accuracy

In [69]:
questions, good_answers, bad_answers = get_train_data()
encoded_qs = [one_hot(d, vocab_size) for d in questions]
padded_qs = pad_sequences(encoded_qs, maxlen=max_length, padding='post')

encoded_good_answers = [one_hot(d, vocab_size) for d in good_answers]
padded_good_answers = pad_sequences(encoded_good_answers, maxlen=max_length, padding='post')

encoded_bad_answers = [one_hot(d, vocab_size) for d in bad_answers]
padded_bad_answers = pad_sequences(encoded_bad_answers, maxlen=max_length, padding='post')

In [74]:
#define the input of the model
q_input = Input(shape=(max_length,), dtype='int32', name='question_base')
good_answers_input = Input(shape=(max_length,), dtype='int32', name='good_answers_base')
bad_answers_input = Input(shape=(max_length,), dtype='int32', name='bad_answers_base')

# get the cnn model
model = CNN_model()
# model.summary()
good_similarity = model([q_input, good_answers_input])
bad_similarity = model([q_input, bad_answers_input])

#define the loss function, simialrity with the good_answers 
#need to be larger while similarity with the bad_answers need to be smaller
loss = Lambda(lambda x: K.relu(0.009 - x[0] + x[1]),
                      output_shape=lambda x: x[0])([good_similarity, bad_similarity])
prediction_model = Model(inputs=[q_input, good_answers_input], outputs=good_similarity,
                                      name='prediction_model')
origin_weight = prediction_model.get_weights()
prediction_model.compile(loss=lambda y_true, y_pred: y_pred, optimizer='adam')

In [61]:
prediction_model.save('../model_result/test_model.h5')

In [85]:
prediction_model = load_model('../model_result/my_model.h5', custom_objects={
    "backend": K
})

TypeError: 'str' object is not callable

In [82]:
print(prediction_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
question_base (InputLayer)      (None, 100)          0                                            
__________________________________________________________________________________________________
good_answers_base (InputLayer)  (None, 100)          0                                            
__________________________________________________________________________________________________
qa_model (Model)                (None, 1)            12363200    question_base[0][0]              
                                                                 good_answers_base[0][0]          
Total params: 12,363,200
Trainable params: 12,363,200
Non-trainable params: 0
__________________________________________________________________________________________________
None


In [77]:
prediction_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
question_base (InputLayer)      (None, 100)          0                                            
__________________________________________________________________________________________________
good_answers_base (InputLayer)  (None, 100)          0                                            
__________________________________________________________________________________________________
qa_model (Model)                (None, 1)            12363200    question_base[0][0]              
                                                                 good_answers_base[0][0]          
Total params: 12,363,200
Trainable params: 12,363,200
Non-trainable params: 0
__________________________________________________________________________________________________


In [54]:
start = time.clock()
test_sample = create_test_sample(padded_qs, padded_good_answers, padded_bad_answers, 100, 10)
accuracy = test_accuracy(test_sample, prediction_model)
print('accuracy:', accuracy)
elapsed = (time.clock() - start)
print(elapsed)

accuracy: 0.0
1.4650589999999966


In [57]:
start = time.clock()
test_sample = create_test_sample(padded_qs, padded_good_answers, padded_bad_answers, 100, 10)
accuracy = test_accuracy(test_sample, prediction_model)
print('accuracy:', accuracy)
elapsed = (time.clock() - start)


accuracy: 0.1


In [None]:
test_sample = create_test_sample(padded_qs, padded_good_answers, padded_bad_answers, 100, 100)
accuracy = test_accuracy(test_sample, prediction_model)
print('accuracy:', accuracy)
