## Import packages

In [1]:
import json
import pickle
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from tqdm import tqdm

import config
import helper
import models

  from ._conv import register_converters as _register_converters


## Load dataset and RNN model

In [2]:
# Test questions: a list of records (see the printed sample in the next cell).
with open(config.DATA_PATH + "testing.json", "rb") as f:
    testing_data = json.load(f)

# Vocabulary used to index text (later indexed with "<PAD>" and passed to
# helper.text_to_index).
# SECURITY: pickle.load executes arbitrary code from the file; only load a
# vocabulary.pickle that comes from a trusted source.
with open(config.DATA_PATH + "vocabulary.pickle", "rb") as f:
    voc = pickle.load(f)

# Source documents; each record is later accessed via d['docid'] and d['text'].
with open(config.DATA_PATH + "documents.json", "rb") as f:
    docs = json.load(f)

In [3]:
# Show the first test record to see which keys each question carries.
print(testing_data[0])

{'question': 'Modern browser support standards-based and defacto what?', 'docid': 410, 'id': 0}


In [4]:
# For every test question, pick the paragraph of its source document that is
# most similar to the question under TF-IDF and store it in `context`, keyed
# by the question id.

# Index documents by docid once instead of scanning `docs` for every question.
# setdefault keeps the first record for a docid, matching a first-match scan.
docs_by_id = dict()
for d in docs:
    docs_by_id.setdefault(d['docid'], d)

# Many questions share a document, so fit each document's vectorizer only once.
# The paragraph matrix is kept sparse to avoid holding dense copies in memory.
tfidf_cache = dict()

context = dict()
for t in tqdm(testing_data):

    question_text = t['question']
    docid = t['docid']

    d = docs_by_id.get(docid)
    if d is None:
        # No document with this docid: the question gets no context entry.
        continue

    if docid not in tfidf_cache:
        # min_df=1 keeps every term (any term in the vocabulary occurs in at
        # least one paragraph). An integer min_df of 0 selects the same terms
        # but newer scikit-learn releases validate that an integer min_df
        # is >= 1.
        tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, min_df=1)
        tfidf_cache[docid] = (tfidf_vectorizer, tfidf_vectorizer.fit_transform(d['text']))
    tfidf_vectorizer, d_tfidf = tfidf_cache[docid]

    q_tfidf = tfidf_vectorizer.transform([question_text])

    # One similarity score per paragraph: dot product of each paragraph's
    # TF-IDF row with the question's TF-IDF vector.
    para_tfidf_sim = d_tfidf.dot(q_tfidf.T).toarray().ravel()
    matched_para_id = np.argmax(para_tfidf_sim)

    context[t['id']] = d['text'][matched_para_id]

100%|██████████| 3618/3618 [01:15<00:00, 48.24it/s]


In [5]:
# Word-embedding matrix stored next to the dataset files.
emb_mat = np.load(config.DATA_PATH + "word_embedding_matrix.npy")

# Project-local model built from the embedding matrix. Its attributes
# (context_input, question_input, dropout_keep_prob, output_layer_1/2) are fed
# to and fetched from the TensorFlow session in the prediction cell.
# NOTE(review): presumably this constructs the TF graph whose variables the
# Saver restores later -- confirm in models.py.
rm = models.RnnModel(emb_mat)

In [6]:
# Show a second test record (same docid as the first in the captured output).
print(testing_data[1])

{'question': 'What do people typically call a web browser?', 'docid': 410, 'id': 1}


## Define helper functions

In [7]:
def f1_score(p, t):
    """Token-level F1 between a predicted answer and a reference answer.

    Both strings are lower-cased and tokenized with ``nltk.word_tokenize``.
    Tokens are matched as multisets, so a repeated token only counts as many
    times as it occurs in both strings.

    Args:
        p: Predicted answer text.
        t: Reference (ground-truth) answer text.

    Returns:
        The F1 score as a float. 0.0 when the two strings share no token,
        which includes the case where either string has no tokens.
    """
    p_tokens = nltk.word_tokenize(p.lower())
    t_tokens = nltk.word_tokenize(t.lower())

    # Multiset intersection: per-token minimum of the two counts.
    common = Counter(p_tokens) & Counter(t_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        # Also guards the divisions below: both token lists are non-empty
        # whenever at least one token is shared.
        return 0.0

    precision = num_same / len(p_tokens)
    recall = num_same / len(t_tokens)
    return (2 * precision * recall) / (precision + recall)


def generate_batch(batch_sample, context, voc):
    """Encode a batch of questions and their contexts as padded index arrays.

    Args:
        batch_sample: Sequence of records, each with 'question' and 'id' keys.
        context: Mapping from a record's 'id' to its context text.
        voc: Vocabulary handed to ``helper.text_to_index``; must contain the
            "<PAD>" entry used as the padding value.

    Returns:
        A ``(batch_q, batch_c)`` pair: the questions and the contexts, each
        run through ``pad_sequences`` with post-padding.
    """

    def _pad(sequences):
        # Pad at the end of each sequence with the vocabulary's <PAD> index.
        return keras.preprocessing.sequence.pad_sequences(
            sequences,
            value=voc["<PAD>"],
            padding='post')

    # Encode question and context together per sample, in sample order.
    encoded = [
        (helper.text_to_index(sample['question'], voc),
         helper.text_to_index(context[sample['id']], voc))
        for sample in batch_sample
    ]
    question_ids = [pair[0] for pair in encoded]
    context_ids = [pair[1] for pair in encoded]

    return _pad(question_ids), _pad(context_ids)

## Get predictions

In [8]:
# Run the restored model over the test set in batches and collect one
# {"id", "text"} answer record per question in `test_ans`.
test_ans = []

with tf.Session() as sess:
    # Restore the trained weights from the checkpoint.
    saver = tf.train.Saver()
    saver.restore(sess, "model/RNN")

    # total is the exact number of questions and each step advances by the
    # real batch length, so the bar ends at 100% instead of overshooting on
    # the last (shorter) batch. The context manager closes the bar.
    with tqdm(total=len(testing_data)) as pbar:
        for start in range(0, len(testing_data), config.BATCH_SIZE):

            batch_data = testing_data[start: start + config.BATCH_SIZE]
            batch_q, batch_c = generate_batch(batch_data, context, voc)

            # NOTE(review): keep probability 1 presumably disables dropout at
            # inference time -- confirm against models.RnnModel.
            pred_s, pred_e = sess.run([rm.output_layer_1, rm.output_layer_2],
                                      feed_dict={rm.context_input: batch_c,
                                                 rm.question_input: batch_q,
                                                 rm.dropout_keep_prob: 1})

            for i, sample in enumerate(batch_data):
                # generate_batch looks contexts up by the sample's 'id', so
                # decode against the same key rather than the list position.
                # NOTE(review): the model input was tokenized by
                # helper.text_to_index while the answer span is cut from
                # text_to_word_sequence tokens -- confirm both tokenize
                # identically, otherwise the span indices are misaligned.
                text = keras.preprocessing.text.text_to_word_sequence(context[sample['id']])

                # pred_s[i] / pred_e[i] are used as inclusive start / end
                # token positions of the answer span.
                test_ans.append({
                    "id": sample['id'],
                    "text": " ".join(text[pred_s[i]: pred_e[i] + 1]),
                })

            pbar.update(len(batch_data))

INFO:tensorflow:Restoring parameters from model/RNN


3648it [19:32,  4.06it/s]                          

## Write to CSV file

In [10]:
# Write the predictions to disk. Only the answer text is kept as a data
# column; the DataFrame's positional row index is written by to_csv as the
# first, unnamed column. Dropping 'id' directly replaces the earlier
# "overwrite 'id' with the index, then drop it" pair, which had no effect on
# the written file. errors='ignore' keeps an empty `test_ans` from raising.
df = pd.DataFrame(test_ans).drop(['id'], axis=1, errors='ignore')

df.to_csv("dataset/test.csv")