In [3]:
import numpy as np
import json
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

<b>Word embedding</b>

In [4]:
embedding_size = 50
gloveFilePath = "data/glove.6B.50d.txt"
def loadGlove(filePath):
    file = open(filePath, 'r')
    
    vocab = {}
    embedding = []
    for i, line in enumerate(file.readlines()):
        line = line.strip().split(' ')
        word = line[0]
        vocab[word] = i
        embedding.append(line[1:])
    file.close()
    return vocab,embedding
vocab, embedding = loadGlove(gloveFilePath)
vocab_size = len(vocab)
print(vocab_size)

400000


<b>Load data</b>

In [5]:
f = open("./data/training.json", encoding='utf-8')
js = json.load(f)
#discard the token '?' at the end of each query
train_qs = [item['question'][:-1] for item in js]
tokenized_train_qs_all = [word_tokenize(item['question'][:-1]) for item in js]
train_texts = [item['text'] for item in js]
tokenized_train_texts_all = [word_tokenize(item['text']) for item in js]
train_aps = [item['answer_paragraph'] for item in js]
train_docids = [item['docid'] for item in js]
f.close()
print(train_qs[0])
print(tokenized_train_qs_all[0])
print(train_texts[0])
print(tokenized_train_texts_all[0])

A kilogram could be definined as having a Planck constant of what value
['A', 'kilogram', 'could', 'be', 'definined', 'as', 'having', 'a', 'Planck', 'constant', 'of', 'what', 'value']
6966662606895999999♠6.62606896×10−34 j⋅s
['6966662606895999999♠6.62606896×10−34', 'j⋅s']


In [6]:
f = open("./data/testing.json", encoding='utf-8')
js = json.load(f)
#discard the token '?' at the end of each query
test_qs = [item['question'][:-1] for item in js]
tokenized_test_qs_all = [word_tokenize(item['question'][:-1]) for item in js]
test_docids = [item['docid'] for item in js]
f.close()
print(test_qs[0])
print(tokenized_test_qs_all[0])

Modern browser support standards-based and defacto what
['Modern', 'browser', 'support', 'standards-based', 'and', 'defacto', 'what']


In [7]:
f = open("./data/documents.json", encoding='utf-8')
js = json.load(f)
docs = [ [ sent for para in item['text'] for sent in sent_tokenize(para)] for item in js]
tokenized_docs = [[word_tokenize(sent) for sent in doc] for doc in docs]
docids = [item['docid'] for item in js]
f.close()
print(docs[0])
print(tokenized_docs[0])

['First recognized in 1900 by Max Planck, it was originally the proportionality constant between the minimal increment of energy, E, of a hypothetical electrically charged oscillator in a cavity that contained black body radiation, and the frequency, f, of its associated electromagnetic wave.', 'In 1905 the value E, the minimal energy increment of a hypothetical oscillator, was theoretically associated by Einstein with a "quantum" or minimal element of the energy of the electromagnetic wave itself.', 'The light quantum behaved in some respects as an electrically neutral particle, as opposed to an electromagnetic wave.', 'It was eventually called the photon.', 'Classical statistical mechanics requires the existence of h (but does not define its value).', "Eventually, following upon Planck's discovery, it was recognized that physical action cannot take on an arbitrary value.", 'Instead, it must be some multiple of a very small quantity, the "quantum of action", now called the Planck cons

<b>Locate answer sentence with TF-IDF</b>

In [8]:
TfidfVectorizers = []
tfidf_paras_mats = []
for paras in docs:
    tfidf = TfidfVectorizer(tokenizer=word_tokenize, lowercase=True)
    tfidf_paras_mats.append(tfidf.fit_transform(paras))
    TfidfVectorizers.append(tfidf)

train_context_sents = []
train_queries = []
train_answers = []
for i,query in enumerate(train_qs):
    docid = train_docids[i]
    tfidf_query = TfidfVectorizers[docid].transform([query])
    relativities = np.dot(tfidf_query, tfidf_paras_mats[docid].T).toarray()
    indices = (-relativities).argsort()[0]
    
    answer = train_texts[i]
    tokenized_answer = word_tokenize(answer)
    for index in indices:
        tokenized_context = tokenized_docs[docid][index]
        #if answer is not in context, which means we failed to find the right context. We discard this train item
        if tokenized_answer[0] in tokenized_context and tokenized_answer[-1] in tokenized_context:
            train_context_sents.append(docs[docid][index])
            train_queries.append(query)
            train_answers.append(answer)
            break
print(train_context_sents[6])
print(train_queries[6])
print(train_answers[6])
            
test_context_sents = []
for i,query in enumerate(test_qs):
    docid = test_docids[i]
    tfidf_query = TfidfVectorizers[docid].transform([query])
    relativities = np.dot(tfidf_query, tfidf_paras_mats[docid].T)
    index = np.argmax(relativities)
    test_context_sents.append(docs[docid][index])
print(test_context_sents[0])

First recognized in 1900 by Max Planck, it was originally the proportionality constant between the minimal increment of energy, E, of a hypothetical electrically charged oscillator in a cavity that contained black body radiation, and the frequency, f, of its associated electromagnetic wave.
What is frequency also known as in science
f
Modern web browsers support a combination of standards-based and de facto HTML and XHTML, which should be rendered in the same way by all browsers.


<b>Get answer indices in context</b>

In [9]:
tokenized_train_context = [word_tokenize(context) for context in train_context_sents]
tokenized_train_qs = [word_tokenize(query) for query in train_queries]
tokenized_train_answers = [word_tokenize(answer) for answer in train_answers]

In [104]:
#indices of answer words in context 
train_answers_indices = []
for i,answer in enumerate(tokenized_train_answers):
    start_w = answer[0]
    start_i = -1
    end_w = answer[-1]
    end_i = -1
    
    has_answer = False
    for j,word in enumerate(tokenized_train_context[i]):
        if start_w == word:
            start_i = j
        if end_w == word:
            end_i = j
        
        answer_indices = []
        #answer should be less than 5 words, and start index should smaller than end_i
        if start_i!=-1 and end_i - start_i <= 5 and end_i >= start_i :
            for index in range(start_i, end_i+1):
                answer_indices.append(index)
            train_answers_indices.append(answer_indices)
            has_answer = True
            break
    if not has_answer:
        train_answers_indices.append([])
        
print(tokenized_train_context[0])
print(tokenized_train_answers[0])
print(train_answers_indices[0])
    
tokenized_test_context = [word_tokenize(context) for context in test_context_sents]
tokenized_test_qs = [word_tokenize(query) for query in test_qs]

['The', 'most', 'urgent', 'unit', 'on', 'the', 'list', 'for', 'redefinition', 'is', 'the', 'kilogram', ',', 'whose', 'value', 'has', 'been', 'fixed', 'for', 'all', 'science', '(', 'since', '1889', ')', 'by', 'the', 'mass', 'of', 'a', 'small', 'cylinder', 'of', 'platinum–iridium', 'alloy', 'kept', 'in', 'a', 'vault', 'just', 'outside', 'Paris', '.']
['cylinder']
[31]


In [11]:
embed_train_context = [[embedding[vocab[word]] for word in context if word in vocab] for context in tokenized_train_context]
embed_train_qs = [[embedding[vocab[word]] for word in q if word in vocab] for q in tokenized_train_qs]
embed_train_answers = [[embedding[vocab[word]] for word in answer if word in vocab] for answer in tokenized_train_answers]
embed_test_context = [[embedding[vocab[word]] for word in context if word in vocab] for context in tokenized_test_context]
embed_test_qs = [[embedding[vocab[word]] for word in q if word in vocab] for q in tokenized_test_qs]
print(embed_train_answers[0])

[['0.53501', '0.51619', '1.2869', '-0.44228', '-0.6153', '1.5837', '0.77163', '-0.74072', '-0.59075', '0.44771', '0.35128', '0.16073', '-0.43567', '1.5196', '-0.32378', '0.0080341', '-0.37728', '1.537', '-0.44858', '-2.0515', '0.050697', '-1.3163', '-0.074289', '-0.13453', '0.1929', '-0.46173', '0.51555', '1.3476', '-0.38719', '0.38547', '1.7052', '-0.84175', '0.16006', '1.0468', '1.2382', '0.33616', '0.95423', '0.14869', '0.48572', '0.44431', '0.54502', '-0.20186', '-0.43291', '0.1342', '-0.58757', '0.40151', '0.93869', '0.16292', '-0.1962', '-0.99166']]


<b>Deep learning model</b>

In [12]:
max_context_size = 0
for tokenized_doc in tokenized_docs:
    for tokenized_sent in tokenized_doc:
        len_sent = len(tokenized_sent)
        if max_context_size < len_sent:
            max_context_size = len_sent

max_query_size = 0
for query in tokenized_train_qs_all:
    len_query = len(query)
    if max_query_size < len_query:
        max_query_size = len_query
for query in tokenized_test_qs_all:
    len_query = len(query)
    if max_query_size < len_query:
        max_query_size = len_query
        
max_answer_size = 0
for answer in tokenized_train_answers:
    len_answer = len(answer)
    if max_answer_size < len_answer:
        max_answer_size = len_answer

print(max_context_size)
print(max_query_size)
print(max_answer_size)

382
59
8



training data

In [43]:
trainig_size = len(embed_train_qs)

training_contexts = np.zeros(
    (trainig_size, max_context_size, embedding_size),
    dtype='float32')
training_queries = np.zeros(
    (trainig_size, max_query_size, embedding_size),
    dtype='float32')
#answers are indices in context
# training_answers = np.zeros(
#     (trainig_size, max_answer_size, max_context_size),
#     dtype='float32')

#answers are start and end indices in context
training_answers = np.zeros(
    ( 2, trainig_size, max_context_size),
    dtype='float32')

#similarity matrix of context words to context words
training_cc = np.zeros(
    (trainig_size, max_context_size, max_context_size),
    dtype='float32')
#similarity matrix of query words to query words
training_qq = np.zeros(
    (trainig_size, max_query_size, max_query_size),
    dtype='float32')
#similarity matrix of context words to query words
training_qc = np.zeros(
    (trainig_size, max_query_size, max_context_size),
    dtype='float32')
training_input = np.zeros(
    (trainig_size, max_query_size, max_context_size),
    dtype='float32')

for i, embed_context in enumerate(embed_train_context):
    for j, embed_word in enumerate(embed_context):
        training_contexts[i,j] = embed_word
        
for i, embed_query in enumerate(embed_train_qs):
    for j, embed_word in enumerate(embed_query):
        training_queries[i,j] = embed_word

for i, answer_indices in enumerate(train_answers_indices):
    if len(answer_indices) > 1:
        training_answers[0, i, answer_indices[0]] = 1.
        training_answers[1, i, answer_indices[-1]] = 1.
        
for i in range(trainig_size):
    training_cc[i] = np.dot(training_contexts[i], training_contexts[i].T)
    training_qq[i] = np.dot(training_queries[i], training_queries[i].T)
    training_qc[i] = np.dot(training_queries[i], training_contexts[i].T)
    training_input[i] = np.dot( np.dot(training_qq[i], training_qc[i]), training_cc[i])

testing data

In [99]:
testing_size = len(embed_test_qs)

testing_contexts = np.zeros(
    (testing_size, max_context_size, embedding_size),
    dtype='float32')
testing_size_queries = np.zeros(
    (testing_size, max_query_size, embedding_size),
    dtype='float32')

#similarity matrix of context words to context words
testing_cc = np.zeros(
    (testing_size, max_context_size, max_context_size),
    dtype='float32')
#similarity matrix of query words to query words
testing_qq = np.zeros(
    (testing_size, max_query_size, max_query_size),
    dtype='float32')
#similarity matrix of context words to query words
testing_qc = np.zeros(
    (testing_size, max_query_size, max_context_size),
    dtype='float32')
testing_input = np.zeros(
    (testing_size, max_query_size, max_context_size),
    dtype='float32')

for i, embed_context in enumerate(embed_test_context):
    for j, embed_word in enumerate(embed_context):
        testing_contexts[i,j] = embed_word
        
for i, embed_query in enumerate(embed_test_qs):
    for j, embed_word in enumerate(embed_query):
        testing_queries[i,j] = embed_word
        
for i in range(testing_size):
    testing_cc[i] = np.dot(testing_contexts[i], testing_contexts[i].T)
    testing_qq[i] = np.dot(testing_queries[i], testing_queries[i].T)
    testing_qc[i] = np.dot(testing_queries[i], testing_contexts[i].T)
    testing_input[i] = np.dot( np.dot(testing_qq[i], testing_qc[i]), testing_cc[i])

NameError: name 'testing_queries' is not defined

'''
testing_size = len(embed_test_qs)

input_contexts = np.zeros(
    (testing_size, max_context_size, embedding_size),
    dtype='float32')

input_queries = np.zeros(
    (testing_size, max_query_size, embedding_size),
    dtype='float32')
'''

In [116]:
import keras.backend as K
from keras.models import Model
from keras import layers
from keras.layers import Input, GRU, Dense, Reshape, Flatten

batch_size = 128  
epochs = 12
qc_relations_size = max_context_size*max_query_size

<b>model 1 pure GRU</b>

In [44]:
qc_relations = Input(shape=(None, max_context_size))
qc_relations_dense = Dense(max_query_size, activation='relu')(qc_relations)
encoder_outputs = GRU(output_dim = max_context_size, activation = 'tanh')(qc_relations_dense) #output_dim #activation

start = Dense(max_context_size, activation='softmax')(encoder_outputs)
end = Dense(max_context_size, activation='softmax')(encoder_outputs)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
model = Model(qc_relations, [start, end])
model.compile(optimizer='adam', loss='categorical_crossentropy') 
model.fit(training_input, [training_answers[0], training_answers[1]] ,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

In [117]:
my_answers = []
for i,embed_query in enumerate(embed_train_qs):
    start_idx, end_idx = model.predict(training_input[i:i+1])
    start_idx = np.argmax(start_idx)
    end_idx = np.argmax(eand_idx)
    print(train_answers_indices[i], "\t", start_idx, end_idx)
    print("Answer:", train_answers[i], "My answer:", tokenized_train_context[i][start_idx:end_idx+1])

ValueError: Error when checking model : the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[[2629954.5, 1538674.2, 2095450. , ...,       0. ,       0. ,
               0. ],
        [2796211. , 1635883.6, 2227734. , ...,       0. ,       0. ,
               0. ],
        [2092697.8,...

<b>model 2 GRU seq2seq</b>

In [69]:
decoder_input_data = np.zeros(
    (trainig_size, 2, max_context_size),
    dtype='float32')

decoder_target_data = np.zeros(
    (trainig_size, 2, max_context_size),
    dtype='float32')

for i, answer_indices in enumerate(train_answers_indices):
    if len(answer_indices) > 1:
        decoder_input_data[i, 1, answer_indices[0]] = 1.
        decoder_target_data[i, 0, answer_indices[0]] = 1.
        decoder_target_data[i, 1, answer_indices[-1]] = 1.

In [119]:
qc_relations = Input(shape=(None, max_context_size))
encoder_outputs, encoder_state= GRU(output_dim = max_context_size,
                                    return_state = True,
                                    activation = 'tanh')(qc_relations)

decoder_inputs = Input(shape=(None, max_context_size))
decoder_gru = GRU(output_dim = max_context_size,
                  return_sequences = True,
                  return_state = True,
                  activation = 'tanh')
decoder_outputs, _ = decoder_gru(decoder_inputs,
                                 initial_state = encoder_state)
decoder_dense = Dense(max_context_size,
                      activation='softmax')

start_end = decoder_dense(decoder_outputs)

  after removing the cwd from sys.path.
  # Remove the CWD from sys.path while we load stuff.


In [120]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([qc_relations, decoder_inputs], start_end)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy') #optimizer
model.fit([training_input, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

#define inference model
encoder_model = Model(qc_relations, encoder_state)

decoder_state_inputs = Input(shape=(max_context_size,))
decoder_outputs, decoder_state = decoder_gru(decoder_inputs,
                                             initial_state = decoder_state_inputs)
start_end = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs, decoder_state_inputs],
    [start_end, decoder_state])

Train on 19400 samples, validate on 4850 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [96]:
def get_start_end(question):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(question)

    # Generate empty target sequence of length 1.
    Go_index = np.zeros((1, 1, max_context_size))
    
    start_index , states_value = decoder_model.predict( [Go_index, states_value])
    end_index, _ = decoder_model.predict( [start_index, states_value])

    return start_index, end_index

In [114]:
my_answers = []
for i,embed_query in enumerate(embed_train_qs):
    start_idx, end_idx = get_start_end(training_input[i:i+1])
    start_idx = np.argmax(start_idx)
    end_idx = np.argmax(end_idx)
    print(train_answers_indices[i], "\t", start_idx, end_idx)
    print("Answer:", train_answers[i], "My answer:", tokenized_train_context[i][start_idx:end_idx+1])

[31] 	 1 10
Answer: cylinder My answer: ['most', 'urgent', 'unit', 'on', 'the', 'list', 'for', 'redefinition', 'is', 'the']
[3, 4, 5] 	 3 5
Answer: time vs. energy My answer: ['time', 'vs.', 'energy']
[15, 16, 17] 	 15 17
Answer: quantum of action My answer: ['quantum', 'of', 'action']
[11] 	 16 17
Answer: 1913 My answer: ['to', 'overcome']
[7] 	 5 9
Answer: h My answer: ['existence', 'of', 'h', '(', 'but']
[7] 	 5 9
Answer: h My answer: ['existence', 'of', 'h', '(', 'but']
[42] 	 34 24
Answer: f My answer: []
[9] 	 13 15
Answer: 1839 My answer: ['is', 'usually', 'reserved']
[19] 	 6 10
Answer: 1911 My answer: ['the', '``', 'ultraviolet', 'catastrophe', "''"]
[78, 79] 	 78 79
Answer: multiphoton effect My answer: ['multiphoton', 'effect']
[5, 6] 	 5 6
Answer: nuclear magneton My answer: ['nuclear', 'magneton']
[9, 10] 	 9 10
Answer: 555 nanometres My answer: ['555', 'nanometres']
[6] 	 16 18
Answer: 1911 My answer: ['quanta', "''", '.']
[22, 23] 	 22 23
Answer: hot atoms My answer: ['h

[4] 	 6 8
Answer: co-producer My answer: ['the', '1987', 'film']
[2] 	 5 6
Answer: five My answer: ['nominations', ',']
[16] 	 12 13
Answer: 26-minute My answer: ['theatrical', 'release']
[13] 	 7 8
Answer: actress My answer: ['also', 'met']
[13] 	 6 9
Answer: four My answer: ['work', ',', 'Universal', 'signed']
[23] 	 1 2
Answer: seven-year My answer: ['vice', 'president']
[18] 	 1 3
Answer: 1989 My answer: ['two', 'forays', 'into']
[20] 	 13 14
Answer: cinematography My answer: ['of', 'America']
[13] 	 6 9
Answer: four My answer: ['work', ',', 'Universal', 'signed']
[7] 	 15 17
Answer: poorly My answer: ['limited', 'release', '.']
[23] 	 16 18
Answer: 1996 My answer: ["'s", 'Chair', ',']
[24] 	 8 10
Answer: 1980 My answer: [',', 'with', 'financial']
[13] 	 3 4
Answer: six My answer: ['received', 'positive']
[18] 	 49 50
Answer: 1982 My answer: ['London', 'which']
[15, 16, 17] 	 15 17
Answer: reportedly close friends My answer: ['reportedly', 'close', 'friends']
[50] 	 35 36
Answer: t

[4, 5] 	 4 5
Answer: important buildings My answer: ['important', 'buildings']
[14] 	 1 2
Answer: 1950s My answer: ['90', 'mm']
[23, 24] 	 23 24
Answer: the u.s. army My answer: ['the', 'army']
[6, 7] 	 6 7
Answer: maximum ceiling My answer: ['maximum', 'ceiling']
[9] 	 6 8
Answer: .50-inch My answer: ['anything', 'larger', 'than']
[25, 26, 27, 28, 29] 	 25 29
Answer: `` flaktürme '' flak towers My answer: ['``', 'Flaktürme', "''", 'flak', 'towers']
[9, 10] 	 9 10
Answer: homeland air defence My answer: ['homeland', 'defence']
[1] 	 3 4
Answer: 1925 My answer: ['British', 'adopted']
[8, 9] 	 8 9
Answer: the laser My answer: ['the', 'laser']
[38] 	 14 15
Answer: 1990 My answer: ['artillery', 'arm']
[6, 7, 8] 	 6 8
Answer: machine-gun based weapons My answer: ['machine-gun', 'based', 'weapons']
[4] 	 13 31
Answer: eight My answer: ['advanced', 'by', 'the', 'late', '1930s', 'for', 'development', 'work', 'on', 'sound', 'locating', 'acoustic', 'devices', 'to', 'be', 'generally', 'halted', '

[33, 34, 35, 36] 	 33 36
Answer: less than 20 % My answer: ['less', 'than', '20', '%']
[0, 1] 	 0 1
Answer: 83 % My answer: ['83', '%']
[1, 2, 3, 4] 	 1 4
Answer: childless , unmarried girls My answer: ['childless', ',', 'unmarried', 'girls']
[17, 18] 	 17 18
Answer: 81 mi My answer: ['81', 'mi']
[10, 11] 	 10 11
Answer: declined markedly My answer: ['declined', 'markedly']
[5, 6, 7] 	 5 7
Answer: all young girls My answer: ['all', 'young', 'girls']
[12] 	 14 15
Answer: 1972 My answer: ['an', 'International']
[11, 12] 	 11 12
Answer: 55 % My answer: ['55', '%']
[3, 4, 5] 	 3 5
Answer: a traditional diviner My answer: ['a', 'traditional', 'diviner']
[12, 13] 	 12 13
Answer: a minority My answer: ['a', 'minority']
[27] 	 17 18
Answer: 2013 My answer: ['would', 'not']
[16, 17] 	 16 17
Answer: home appliances My answer: ['home', 'appliances']
[3, 4, 5] 	 5 6
Answer: about 45 % My answer: ['%', 'of']
[4, 5] 	 4 5
Answer: twelve jurists My answer: ['twelve', 'jurists']
[15, 16, 17] 	 15 17
A

[44, 45] 	 44 45
Answer: race-conscious policy My answer: ['race-conscious', 'policy']
[5] 	 11 13
Answer: white My answer: ['not', 'be', 'covered']
[16] 	 15 16
Answer: federal My answer: ['of', 'federal']
[10, 11, 12] 	 10 12
Answer: more than triple My answer: ['more', 'than', 'triple']
[1] 	 7 8
Answer: 1947 My answer: [',', 'To']
[29, 30, 31] 	 29 31
Answer: high dropout rates My answer: ['high', 'dropout', 'rates']
[7, 8, 9] 	 7 9
Answer: more than 60 My answer: ['more', 'than', '60']
[7] 	 1 3
Answer: white My answer: ['UNC-Chapel', 'Hill', 'lawsuit']
[34, 35, 36] 	 34 36
Answer: equality of opportunity My answer: ['equality', 'of', 'opportunity']
[5, 6, 7] 	 5 7
Answer: against affirmative action My answer: ['against', 'affirmative', 'action']
[24, 25] 	 24 25
Answer: four times My answer: ['four', 'times']
[20, 21] 	 24 25
Answer: eight percent My answer: ['action', 'programs']
[14, 15] 	 14 15
Answer: 3.1 % My answer: ['3.1', '%']
[25] 	 12 13
Answer: reverse My answer: ['gen

[3] 	 10 11
Answer: labrys My answer: ['the', 'Cretan']
[10] 	 10 11
Answer: paion My answer: ['paion', "''"]
[4] 	 43 49
Answer: kouros My answer: []
[4] 	 4 5
Answer: kouros My answer: ['kouros', '(']
[6] 	 0 1
Answer: paeans My answer: ['Hymns', 'sung']
[7, 8, 9] 	 7 9
Answer: early archaic period My answer: ['early', 'archaic', 'period']
[13] 	 0 1
Answer: dolphins My answer: ['An', 'etiology']
[24] 	 23 24
Answer: amber My answer: ['of', 'amber']
[7] 	 13 14
Answer: pella My answer: ["''", 'and']
[12] 	 34 35
Answer: apella My answer: ['the', 'limits']
[4, 5] 	 4 5
Answer: the bow My answer: ['the', 'bow']
[12] 	 14 15
Answer: apella My answer: ['ἀπέλλα', ')']
[5, 6] 	 5 6
Answer: peripteral temples My answer: ['peripteral', 'temples']
[18] 	 23 24
Answer: physician My answer: ['the', 'Iliad']
[7] 	 19 20
Answer: paean My answer: ['to', 'implore']
[] 	 14 15
Answer: a crow My answer: ['they', 'were']
[] 	 14 15
Answer: keep away evil , My answer: ['in', 'popular']
[17, 18] 	 17 18

[15] 	 33 20
Answer: thin My answer: []
[25] 	 18 22
Answer: weapons My answer: ['material', 'for', 'making', 'houses', ',']
[23] 	 36 37
Answer: decay My answer: ['the', 'trunk']
[24] 	 11 12
Answer: density My answer: ['color', 'than']
[25] 	 44 46
Answer: large My answer: ['a', 'hand', 'lens']
[19] 	 11 15
Answer: conifers My answer: ['in', 'deciduous', 'trees', 'but', 'near']
[19, 20] 	 19 20
Answer: decay organisms My answer: ['decay', 'organisms']
[14] 	 7 9
Answer: timber-framed My answer: ['the', 'world', 'today']
[15] 	 1 3
Answer: warmth My answer: ['a', 'woodstove', 'or']
[35] 	 10 12
Answer: thick My answer: ['black', 'locust', ',']
[20] 	 13 17
Answer: age My answer: ['than', 'near', 'the', 'base', ',']
[12] 	 13 14
Answer: quality My answer: [',', 'that']
[3] 	 8 9
Answer: strong My answer: ['of', 'wood']
[4] 	 10 12
Answer: denser My answer: ['the', 'season', '.']
[10] 	 26 27
Answer: spruce My answer: ['great', 'as']
[44] 	 28 30
Answer: elasticity My answer: ['modulus'

[32] 	 14 15
Answer: little My answer: ['its', 'uniformity']
[1] 	 4 6
Answer: density My answer: ['varies', 'with', 'species']
[35, 36, 37] 	 35 37
Answer: over a year My answer: ['over', 'a', 'year']
[34] 	 18 22
Answer: paper My answer: ['material', 'for', 'making', 'houses', ',']
[7] 	 9 11
Answer: resin My answer: [',', 'waxes', 'and']
[10] 	 6 6
Answer: hardwood My answer: ['as']
[18] 	 12 13
Answer: wider My answer: ['more', 'rapid']
[5, 6] 	 5 6
Answer: inner heartwood My answer: ['inner', 'heartwood']
[9] 	 13 15
Answer: thicker My answer: ['size', 'than', 'trees']
[25, 26] 	 25 26
Answer: model building My answer: ['model', 'building']
[31] 	 30 32
Answer: structural My answer: ['composite', 'structural', 'unit']
[6, 7] 	 6 7
Answer: fat lighter My answer: ['fat', 'lighter']
[4, 5] 	 4 5
Answer: tree-ring widths My answer: ['tree-ring', 'widths']
[1] 	 13 14
Answer: ring-porous My answer: ['the', 'large']
[27, 28, 29] 	 27 29
Answer: the year before My answer: ['the', 'year',

[12] 	 3 5
Answer: duralumin My answer: ['for', 'the', 'phenomenon']
[30, 31, 32] 	 30 32
Answer: harden over time My answer: ['harden', 'over', 'time']
[] 	 13 17
Answer: the secondary constituents My answer: ['only', 'two', 'constituents', ',', 'like']
[3, 4] 	 3 4
Answer: heat-treatable alloys My answer: ['heat-treatable', 'alloys']
[10, 11, 12] 	 10 12
Answer: remove excess impurities My answer: ['remove', 'excess', 'impurities']
[23] 	 8 10
Answer: steel My answer: ['began', 'to', 'develop']
[20] 	 9 10
Answer: copper My answer: ['bronze', ',']
[9] 	 11 13
Answer: 1900s My answer: ['such', 'as', 'various']
[1] 	 2 4
Answer: 1906 My answer: [',', 'precipitation', 'hardening']
[7, 8] 	 7 8
Answer: quaternary alloy My answer: ['quaternary', 'alloy']
[9] 	 9 10
Answer: bronze My answer: ['bronze', ',']
[4, 5] 	 4 5
Answer: intermetallic alloys My answer: ['intermetallic', 'alloys']
[4, 5, 6] 	 4 6
Answer: lower melting point My answer: ['lower', 'melting', 'point']
[21, 22, 23] 	 21 2

[27] 	 17 19
Answer: 43 My answer: ['an', 'increase', 'of']
[14] 	 8 9
Answer: 6,000 My answer: ['and', 'a']
[9] 	 5 6
Answer: 1795 My answer: ['Paris', 'was']
[11] 	 27 28
Answer: 7 My answer: ['9', 'percent']
[5, 6] 	 5 6
Answer: 5.8 million My answer: ['5.8', 'million']
[10] 	 21 22
Answer: 1860 My answer: ['the', '20']
[19] 	 27 28
Answer: 2004 My answer: ['time', 'since']
[24] 	 20 11
Answer: 75,000 My answer: []
[32] 	 28 29
Answer: five My answer: ['were', 'the']
[25] 	 3 4
Answer: 70,852 My answer: ['2012', 'census']
[7] 	 15 17
Answer: 106 My answer: ['plus', 'separate', 'parishes']
[28] 	 35 37
Answer: 64 My answer: ['and', 'the', 'Paris']
[20] 	 4 5
Answer: 110 My answer: ['eighty', 'male']
[5] 	 4 5
Answer: 5th My answer: ['the', '5th']
[25, 26] 	 25 26
Answer: five percent My answer: ['five', 'percent']
[49] 	 35 25
Answer: 44 My answer: []
[24] 	 7 8
Answer: 1991 My answer: ['Pont', 'de']
[1] 	 8 9
Answer: 987 My answer: ['(', 'comte']
[1, 2, 3] 	 1 3
Answer: 14 june 1940

[4] 	 11 13
Answer: angiosperms My answer: ['or', 'Magnoliophyta', ',']
[16] 	 15 16
Answer: magnoliids My answer: ['the', 'magnoliids']
[28] 	 37 35
Answer: wasps My answer: []
[10, 11] 	 10 11
Answer: the ovule My answer: ['the', 'ovule']
[5, 6, 7] 	 5 7
Answer: form and elaboration My answer: ['form', 'and', 'elaboration']
[28, 29] 	 28 29
Answer: abominable mystery My answer: ['abominable', 'mystery']
[44, 45, 46] 	 44 46
Answer: other dicotyledonous plants My answer: ['other', 'dicotyledonous', 'plants']
[15] 	 10 11
Answer: fruit My answer: ['the', 'ovule']
[2] 	 8 10
Answer: classification My answer: ['considerable', 'revision', '.']
[8, 9] 	 8 9
Answer: fused together My answer: ['fused', 'together']
[6] 	 11 12
Answer: angiosperms My answer: ['all', 'plant-based']
[] 	 19 22
Answer: the cronquist system My answer: ['Magnoliophyta', '(', 'from', 'the']
[5, 6] 	 5 6
Answer: seed-producing plants My answer: ['seed-producing', 'plants']
[11] 	 1 3
Answer: coherent My answer: ['mos

KeyboardInterrupt: 

In [115]:
import csv
with open('result.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["id","answer"])
    for i,embed_query in enumerate(embed_test_qs):
        start_idx, end_idx = get_start_end(testing_input[i:i+1])
        start_idx = np.argmax(start_idx)
        end_idx = np.argmax(end_idx)
        my_answer = tokenized_test_context[i][start_idx:end_idx+1]
        writer.writerow([str(i), " ".join(my_answer)])