## Import packages

In [None]:
import numpy as np
import json
import pickle
import nltk
from tqdm import tqdm

## Load GloVe pre-trained word vectors 

In [None]:
file = '/mnt/glove.6B.100d.txt'

def load_GloVe(filename):
    """Read a GloVe text file into a vocabulary map and a list of vectors.

    Each non-blank line is split on single spaces: the first field is the
    token and the remaining fields are the vector components.

    Args:
        filename: Path to the GloVe text file.

    Returns:
        A ``(vocab, embd)`` tuple. ``vocab`` maps each token to the index of
        its row in ``embd``; ``embd`` is a list of rows, each a list of the
        vector components as strings (they are not converted to floats here).
        If a token appears more than once, ``vocab`` keeps the last index.
    """
    vocab = dict()
    embd = []
    # Explicit UTF-8 keeps decoding independent of the platform locale, and
    # the context manager closes the file even if parsing raises.
    with open(filename, 'r', encoding='utf-8') as glove_file:
        # Iterate lazily instead of readlines() so the whole file is never
        # held in memory at once.
        for line in glove_file:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) would otherwise add
                # a bogus '' token and an empty, ragged embedding row.
                continue
            row = stripped.split(' ')
            # Index by the current row count so vocab stays aligned with embd
            # even when blank lines were skipped.
            vocab[row[0]] = len(embd)
            embd.append(row[1:])
    return vocab, embd

vocab, embd = load_GloVe(file)
# load_GloVe returns the vector components as strings; without an explicit
# dtype np.asarray would build (and np.save would persist) a unicode-string
# array instead of a numeric embedding matrix.
embedding = np.asarray(embd, dtype=np.float32)
np.save("/mnt/new_word_embedding_matrix", embedding)

# Persist the token -> row-index mapping that goes with the saved matrix.
with open('vocabulary.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Preprocess training data

Load training set

In [None]:
# Load the training set from training.json.
# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, which can mis-decode non-ASCII text.
file_name = "training.json"
with open(file_name, encoding="utf-8") as json_data:
    training_set = json.load(json_data)

Load documents

In [None]:
# Load the document collection from /mnt/documents.json.
# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, which can mis-decode non-ASCII text.
file_name = "/mnt/documents.json"
with open(file_name, encoding="utf-8") as json_data:
    documents = json.load(json_data)

Generate a list of dictionaries used for RNN training

In [None]:
from nltk import word_tokenize

def sublist(lst1, lst2):
    """Return True when every distinct element of ``lst1`` occurs in ``lst2``.

    This is a set-containment test: element order and repetition are ignored.
    """
    return set(lst1).issubset(set(lst2))

def generate_dataset(dataset):
    """Convert raw question/answer records into tokenised span examples.

    For every record the question (``"question"``) and answer (``"text"``)
    are lower-cased and tokenised with ``word_tokenize``. The paragraph
    ``documents[docid]["text"][answer_paragraph]`` is lower-cased, split into
    sentences and tokenised; the first sentence for which
    ``sublist(answer_tokens, sentence_tokens)`` holds becomes the context.

    The answer is then located by scanning context positions left to right.
    A position matches when each answer token is contained (``in``, i.e.
    substring containment) in the context token at the same offset; near the
    end of the context only the answer tokens that still fit are compared,
    and the end index is clamped to the last context token.

    Records with no usable context sentence or no matching position are
    dropped, and the number of dropped records is printed.

    Args:
        dataset: Iterable of records with keys ``"question"``, ``"text"``,
            ``"docid"`` and ``"answer_paragraph"``.

    Returns:
        A list of dicts with keys ``"question"`` and ``"context"`` (token
        lists) and ``"start"`` / ``"end"`` (0/1 lists as long as the context,
        with a single 1 at the start / end position of the answer).
    """
    examples = []
    dropped = 0

    for record in dataset:
        question_tokens = word_tokenize(record["question"].lower())
        answer_tokens = word_tokenize(record["text"].lower())

        doc_id = record["docid"]
        para_idx = record["answer_paragraph"]
        paragraph = documents[doc_id]["text"][para_idx].lower()
        sentence_tokens = [
            word_tokenize(sentence)
            for sentence in nltk.sent_tokenize(paragraph)
        ]

        # First sentence whose tokens cover all answer tokens, if any.
        context = next(
            (tokens for tokens in sentence_tokens
             if sublist(answer_tokens, tokens)),
            None,
        )
        if not context:
            dropped += 1
            continue

        # Earliest position at which the answer tokens line up with the
        # context tokens (zip stops at the shorter of the two sequences).
        context_len = len(context)
        start_pos = next(
            (pos for pos in range(context_len)
             if all(ans_tok in ctx_tok
                    for ans_tok, ctx_tok in zip(answer_tokens, context[pos:]))),
            None,
        )
        if start_pos is None:
            dropped += 1
            continue

        # Clamp the end so it never points past the last context token.
        end_pos = min(start_pos + len(answer_tokens) - 1, context_len - 1)

        start = [0] * context_len
        start[start_pos] = 1
        end = [0] * context_len
        end[end_pos] = 1

        examples.append({
            "question": question_tokens,
            "context": context,
            "start": start,
            "end": end,
        })

    print('The number of instances which have been removed:', dropped)
    return examples

In [None]:
# Build the training examples and pickle them for later use.
training_data = generate_dataset(training_set)

with open('/mnt/training_data.pickle', 'wb') as pickle_file:
    pickle.dump(training_data, pickle_file, pickle.HIGHEST_PROTOCOL)

## Preprocess development data

In [None]:
# Load the development set from devel.json.
# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, which can mis-decode non-ASCII text.
file_name = "devel.json"
with open(file_name, encoding="utf-8") as json_data:
    devel_set = json.load(json_data)

In [None]:
# Build the development examples and pickle them for later use.
devel_data = generate_dataset(devel_set)

with open('devel_data.pickle', 'wb') as pickle_file:
    pickle.dump(devel_data, pickle_file, pickle.HIGHEST_PROTOCOL)

## Tf-idf term weighting

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer

In [None]:
# Load the test questions from testing.json.
# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, which can mis-decode non-ASCII text.
file_name = "testing.json"
with open(file_name, encoding="utf-8") as json_data:
    testing_set = json.load(json_data)

### Get Top N similar sentences

In [None]:
from gensim.summarization import bm25

def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``. There is no guard for
    zero-norm inputs; the division is performed as-is.
    """
    magnitude = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / magnitude

In [None]:
def _standardize(scores):
    """Return the z-scores of ``scores`` as a flat float array.

    When all scores are equal the standard deviation is zero; dividing by it
    would yield NaN and poison the combined ranking, so an all-zero array is
    returned in that case (the ranker simply contributes nothing).
    """
    scores = np.asarray(scores, dtype=float).ravel()
    spread = np.std(scores)
    if spread == 0:
        return np.zeros_like(scores)
    return (scores - np.mean(scores)) / spread


# Group the test questions by document once, instead of rescanning the whole
# test set for every document. Question order within a document is preserved.
questions_by_doc = {}
for question in testing_set:
    questions_by_doc.setdefault(question['docid'], []).append(question)

testing_data = []

# NOTE(review): the slice start 410 is hard-coded; presumably the earlier
# documents are not referenced by the test set -- confirm.
for doc in tqdm(documents[410:]):
    doc_questions = questions_by_doc.get(doc['docid'])
    if not doc_questions:
        # No question refers to this document, so skip fitting its models.
        continue

    tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize, lowercase=True)

    # Candidate answers are the individual sentences of every paragraph.
    raw_text = []
    for para in doc['text']:
        raw_text.extend(nltk.sent_tokenize(para))

    td_mat = tfidf.fit_transform(raw_documents=raw_text)

    bm_model = bm25.BM25([nltk.word_tokenize(sentence) for sentence in raw_text])
    average_idf = sum(float(value) for value in bm_model.idf.values()) / len(bm_model.idf)

    for question in doc_questions:
        query = tfidf.transform([question['question']])
        # Use the sparse matrices' own .dot(): np.dot is not aware of scipy
        # sparse types and is not a reliable way to multiply them.
        tf_idf_scores = query.dot(td_mat.T).toarray()

        bm_25_scores = bm_model.get_scores(nltk.word_tokenize(question['question']), average_idf)

        # Put both rankers on a comparable scale before summing them, then
        # keep the single best-scoring sentence.
        scores = _standardize(tf_idf_scores) + _standardize(bm_25_scores)
        idx = int(np.argmax(scores))

        testing_data.append({
            "question": question['question'],
            "id": question["id"],
            "text": raw_text[idx],
        })

In [None]:
# Pickle the retrieved question/sentence pairs for later use.
with open('testing_data.pickle', 'wb') as pickle_file:
    pickle.dump(testing_data, pickle_file, pickle.HIGHEST_PROTOCOL)