<b>Build BM25 models for each document.</b>

In [67]:
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop-word list from NLTK, stored as a set; removeStopw tests
# lower-cased tokens for membership in it.
stop_words = set(stopwords.words('english'))
def tokenization(sent):
    """Split the string ``sent`` into a list of tokens using ``word_tokenize``.

    Stop words are left in place; ``removeStopw`` can be applied to the
    result when they should be dropped.
    """
    tokens = word_tokenize(sent)
    return tokens

def removeStopw(tokenized_sent):
    """Return the tokens of ``tokenized_sent`` that are not in ``stop_words``.

    Each token is lower-cased before the membership test; the tokens that
    survive keep their original casing and order.
    """
    kept = []
    for token in tokenized_sent:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

In [68]:
import json

# Load the training set. Each record provides a question, the answer text
# ('text'), the index of the answer paragraph and the id of its document.
# The `with` block closes the file even if parsing raises.
with open("./data/training.json", encoding='utf-8') as f:
    training = json.load(f)

train_qs = [tokenization(item['question']) for item in training]
train_texts = [tokenization(item['text']) for item in training]
train_aps = [item['answer_paragraph'] for item in training]
train_docids = [item['docid'] for item in training]

In [69]:
# Load the test set: only questions and document ids are read from it.
# The `with` block closes the file even if parsing raises.
with open("./data/testing.json", encoding='utf-8') as f:
    testing = json.load(f)

test_qs = [tokenization(item['question']) for item in testing]
test_docids = [item['docid'] for item in testing]

In [38]:
# Load the document collection. doc['text'] is iterated paragraph by
# paragraph, so docs[d][p] is the token list of paragraph p of document d.
# The `with` block closes the file even if parsing raises.
with open("./data/documents.json", encoding='utf-8') as f:
    documents = json.load(f)

docs = [[tokenization(para) for para in doc['text']] for doc in documents]
docids = [doc['docid'] for doc in documents]

In [26]:
from gensim.summarization import bm25
# One BM25 index per document; each index is built over that document's
# tokenized paragraphs.
bm25Model = [bm25.BM25(paragraphs) for paragraphs in docs]
# Mean IDF over each index's terms, later handed to get_scores.
average_idf = [
    sum(float(weight) for weight in model.idf.values()) / len(model.idf)
    for model in bm25Model
]

<b>Find answer paragraphs for all questions in training and testing set.</b>

In [5]:
# Training records name their answer paragraph explicitly, so the paragraph
# tokens are looked up directly by (document id, paragraph index).
train_ap_text = [
    docs[train_docids[idx]][train_aps[idx]]
    for idx in range(len(train_qs))
]

# Test records carry no paragraph index: score every paragraph of the
# question's document with BM25 and keep the highest-scoring one.
test_ap_text = []
for idx, question in enumerate(test_qs):
    docid = test_docids[idx]
    paragraph_scores = bm25Model[docid].get_scores(question, average_idf[docid])
    best = paragraph_scores.index(max(paragraph_scores))
    test_ap_text.append(docs[docid][best])

<b>Train a NN to predict the answer to a question after a paragraph is given.</b>

In [105]:
from gensim.models import Word2Vec

embedding_size = 500
# Train the embeddings on every token sequence that is embedded later on:
# document paragraphs, training questions and answers, AND test questions.
# Test questions used to be left out, so a word that occurs only in a test
# question had no vector and raised KeyError when it was looked up.
paras = [para for doc in docs for para in doc] + train_qs + train_texts + test_qs
# "\t" and "\n" are used as separator/start and end markers further down,
# so they need vectors as well.
paras.append(["\t", "\n"])
# min_count = 0 keeps every token in the vocabulary, however rare.
embedding_model = Word2Vec(paras, size = embedding_size, min_count = 0)

# Token -> integer id, numbered in the iteration order of the model's vocabulary.
vocab = {word: idx for idx, word in enumerate(embedding_model.wv.vocab)}
# Integer id -> token, the inverse of `vocab`.
reverse_vocab = {idx: word for word, idx in vocab.items()}
vocab_size = len(vocab)

# Encoder input: the answer paragraph followed by the question, separated by a "\t" token.
train_xs = [para + ["\t"] + question for para, question in zip(train_ap_text, train_qs)]
# Decoder sequence: the answer wrapped in a leading "\t" and a trailing "\n".
train_ys = [["\t"] + answer + ["\n"] for answer in train_texts]
test_xs = [para + ["\t"] + question for para, question in zip(test_ap_text, test_qs)]

In [106]:
# Number of training pairs and the longest encoder / decoder sequence (in tokens).
input_size = len(train_xs)
max_encoder_seq_length = max(len(x) for x in train_xs)
max_decoder_seq_length = max(len(y) for y in train_ys)

# Zero-initialised tensors; encoder and decoder inputs hold embedding vectors.
encoder_input_data = np.zeros((input_size, max_encoder_seq_length, embedding_size), dtype='float32')
decoder_input_data = np.zeros((input_size, max_decoder_seq_length, embedding_size), dtype='float32')
# Targets have one slot per vocabulary entry instead of an embedding vector.
decoder_target_data = np.zeros((input_size, max_decoder_seq_length, vocab_size), dtype='float32')

# Encoder inputs: one embedding vector per token of each training input.
# Slicing to input_size keeps the fill within the allocated first dimension
# (same effect as the former per-iteration `i < input_size` check).
for i, x in enumerate(train_xs[:input_size]):
    for j, word in enumerate(x):
        encoder_input_data[i, j] = embedding_model[word]

# Decoder inputs are the embedded answer tokens. Decoder targets are the same
# tokens shifted one step earlier and one-hot encoded over the vocabulary.
for i, y in enumerate(train_ys[:input_size]):
    for j, word in enumerate(y):
        decoder_input_data[i, j] = embedding_model[word]
        if j > 0:
            # Target at step j-1 is the token fed in at step j, so the
            # leading "\t" is never a target.
            decoder_target_data[i, j - 1, vocab[word]] = 1.



In [107]:
'''Train a lstm_seq2seq model.
        Code reused from https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py
'''

from keras.models import Model
from keras import layers
from keras.layers import Input, LSTM, Dense
import numpy as np

# Hyper-parameters and graph definition of the encoder-decoder training model.
# The encoder's final LSTM states initialise the decoder, and the decoder's
# per-step outputs go through a softmax layer with vocab_size units.
batch_size = 128  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.

# Define an input sequence and process it.
# Shape (None, embedding_size): the sequence length is left unspecified and
# each step is one embedding vector.
encoder_inputs = Input(shape=(None, embedding_size))
# Dropout is set on the encoder LSTM only; the decoder LSTM below has none.
encoder = LSTM(latent_dim, return_state=True, dropout=0.2)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, embedding_size))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim,
                    return_sequences=True,
                    return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Softmax over vocab_size units applied to the decoder output sequence.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training, holding out a 0.2 fraction of the samples for validation.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
# Encoder half on its own: maps an embedded input sequence to the LSTM
# states [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

# Inference decoder: reuses the decoder_lstm and decoder_dense layer objects
# from the training model, but takes its initial state from explicit inputs
# so it can be called repeatedly with the state carried over by the caller.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs,
                                                 initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [decoder input sequence, h, c]; outputs: [token distribution, new h, new c].
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

Train on 240 samples, validate on 60 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


  str(node.arguments) + '. They will not be included '


In [136]:
def answer(para_question):
    """Greedily decode an answer for one embedded paragraph+question input.

    ``para_question`` is passed to ``encoder_model.predict``; the returned
    states seed the decoder, which is then stepped one token at a time,
    starting from the tab start marker. At every step the most probable
    vocabulary entry is taken. Decoding ends when the newline end marker is
    produced or once more than ``max_decoder_seq_length`` words have been
    collected.

    Returns the decoded words as a list of strings (end marker not included).
    """
    decoder_state = encoder_model.predict(para_question)
    current_word = '\t'
    decoded_words = []
    while True:
        # Feed the previous word's embedding as a length-1 sequence (batch of 1).
        step_input = np.zeros((1, 1, embedding_size))
        step_input[0, 0] = embedding_model[current_word]
        output_tokens, h, c = decoder_model.predict([step_input] + decoder_state)
        # Carry the decoder's new states into the next step.
        decoder_state = [h, c]

        # Greedy choice: arg-max over the last time step's distribution.
        current_word = reverse_vocab[np.argmax(output_tokens[0, -1, :])]
        if current_word == '\n' or len(decoded_words) > max_decoder_seq_length:
            break
        decoded_words.append(current_word)
    return decoded_words

In [152]:
import csv

# Write one predicted answer per test question to result.csv.
# newline='' is what the csv module asks for on files it writes to (without
# it, extra blank rows can appear on some platforms); utf-8 matches the
# encoding used to read the input files.
with open('result.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["id","answer"])
    # Stand-in vector for any token that is missing from the vocabulary.
    unknown_vector = np.zeros(embedding_size, dtype='float32')
    for i, tokens in enumerate(test_xs):
        # Embed the TEST input. This loop used to index train_xs while
        # iterating over test_xs, so it answered training inputs instead.
        embedding_x = np.array([[embedding_model[word] if word in vocab else unknown_vector
                                 for word in tokens]])
        my_answer = answer(embedding_x)
        writer.writerow([str(i), " ".join(my_answer)])

TypeError: 'int' object is not iterable