## Import packages

In [None]:
import models
import tensorflow as tf
import numpy as np
import pickle
import constants as const
from collections import Counter
from tqdm import tqdm

## Load dataset and RNN model

In [None]:
# Evaluation examples; the prediction cell reads the 'question' and 'text'
# fields of each entry.
with open("testing_data.pickle", "rb") as data_fh:
    testing_data = pickle.load(data_fh)

# Vocabulary used by convert_word_to_embedding_index to map a token to its
# row in the embedding matrix.
with open("/mnt/new_vocabulary.pickle", "rb") as vocab_fh:
    voc = pickle.load(vocab_fh)

# Word-embedding matrix handed to the RNN model constructor.
emb_mat = np.load("/mnt/new_word_embedding_matrix.npy")
rm = models.RnnModel(emb_mat)

## Define helper functions

In [None]:
def find_max_length(lst):
    """Return the length of the longest element of ``lst``.

    ``lst`` must be a non-empty iterable of sized objects; an empty
    iterable makes ``max`` raise ``ValueError``.
    """
    return max(map(len, lst))

def convert_word_to_embedding_index(word, voc):
    """Look up ``word`` in the mapping ``voc`` and return its stored index.

    Words that are not present in ``voc`` map to index 0.
    """
    return voc.get(word, 0)

def f1_score(p, t):
    """Return the token-level F1 overlap between two strings.

    Both strings are lower-cased and split with NLTK's ``word_tokenize``;
    the overlap is the multiset intersection of the two token lists.

    Args:
        p: Predicted answer text.
        t: Reference (ground-truth) answer text.

    Returns:
        The harmonic mean of token precision and recall, or 0 when the two
        strings share no tokens (which also covers empty input).
    """
    # Imported here because the file only ever binds ``word_tokenize`` (in a
    # later cell), never the ``nltk`` name itself; the original
    # ``nltk.word_tokenize`` call therefore raised NameError.
    import nltk

    p_tokens = nltk.word_tokenize(p.lower())
    t_tokens = nltk.word_tokenize(t.lower())

    # Counter & Counter keeps each shared token with its minimum count.
    common = Counter(p_tokens) & Counter(t_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        # Returning early also avoids dividing by an empty token list.
        return 0

    precision = 1.0 * num_same / len(p_tokens)
    recall = 1.0 * num_same / len(t_tokens)
    return (2 * precision * recall) / (precision + recall)

## Get predictions

In [None]:
from nltk import word_tokenize

# One {"id": ..., "text": ...} dict per test example, in dataset order.
test_ans = []

with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, "model/rnn")
    print("load sucessfully")

    # Build the argmax ops once. Creating them inside the batch loop would add
    # two new nodes to the graph on every iteration.
    pred_start_point = tf.argmax(rm.output_layer_1, axis=1)
    pred_end_point = tf.argmax(rm.output_layer_2, axis=1)

    # The context manager closes the bar even if a batch fails.
    with tqdm(total=len(testing_data)) as pbar:
        for start in range(0, len(testing_data), const.BATCH_SIZE):
            # The final slice may hold fewer than BATCH_SIZE examples.
            batch_data = testing_data[start:start + const.BATCH_SIZE]

            # Tokenize each context once; the tokens are reused below to
            # rebuild the answer text from the predicted span.
            c_tokens = [word_tokenize(ins['text']) for ins in batch_data]

            q_list = [
                [convert_word_to_embedding_index(w, voc)
                 for w in word_tokenize(ins['question'])]
                for ins in batch_data
            ]
            c_list = [
                [convert_word_to_embedding_index(w, voc) for w in tokens]
                for tokens in c_tokens
            ]

            # Right-pad every row with 0 so the batch forms a rectangular
            # matrix (0 is also the index used for out-of-vocabulary words).
            max_q = find_max_length(q_list)
            for row in q_list:
                row.extend([0] * (max_q - len(row)))
            batch_q = np.asarray(q_list)

            max_c = find_max_length(c_list)
            for row in c_list:
                row.extend([0] * (max_c - len(row)))
            batch_c = np.asarray(c_list)

            pred_s, pred_e = sess.run(
                [pred_start_point, pred_end_point],
                feed_dict={
                    rm.context_input: batch_c,
                    rm.question_input: batch_q,
                    rm.dropout_keep_prob: 1,
                },
            )

            # Iterate over the examples actually in this batch rather than
            # range(BATCH_SIZE), which overruns on a short final batch.
            for i, tokens in enumerate(c_tokens):
                # If the predicted end precedes the start, swap them so the
                # slice below is a forward span.
                first, last = sorted((int(pred_s[i]), int(pred_e[i])))
                test_ans.append({
                    "id": start + i,
                    "text": " ".join(tokens[first:last + 1]),
                })

            pbar.update(len(batch_data))

## Write predictions to a CSV file

In [None]:
import pandas as pd

# Collect the per-example answer dicts into a table and preview it.
df = pd.DataFrame(test_ans)
display(df)

# Promote the 'id' column to the index so it becomes the first CSV column
# instead of appearing next to a separate positional index.
df = df.set_index('id')

df.to_csv("test.csv")