In [31]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import numpy as np
import pandas as pd
import json
from textblob import TextBlob
import nltk
from scipy import spatial
import torch
import spacy
# Load spaCy's English pipeline via the 'en' shortcut name.
# NOTE(review): the 'en' shortcut is presumably unavailable in spaCy >= 3
# (a full package name is required there) — confirm the installed version.
# NOTE(review): `en_nlp` is not used in the cells visible here — confirm it is needed.
en_nlp = spacy.load('en')

## Convert JSON to a pandas DataFrame

In [32]:
# Load the v1.1 train and dev JSON files into DataFrames (train first, then dev).
train, valid = (
    pd.read_json(json_path)
    for json_path in ("data/train-v1.1.json", "data/dev-v1.1.json")
)

In [33]:
# Flatten the nested JSON into parallel lists holding one entry per question,
# then assemble them into a single table.
# To save time, only the first 1% of the original rows are processed.
contexts, questions, answers_text, answers_start = [], [], [], []
n_rows = int(train.shape[0] / 100)
for row_idx in range(n_rows):
    for paragraph in train.iloc[row_idx, 0]['paragraphs']:
        for qa in paragraph['qas']:
            # Only the first listed answer of each question is kept.
            questions.append(qa['question'])
            answers_start.append(qa['answers'][0]['answer_start'])
            answers_text.append(qa['answers'][0]['text'])
            # The paragraph text is repeated for every question asked about it.
            contexts.append(paragraph['context'])
df = pd.DataFrame({
    "context": contexts,
    "question": questions,
    "answer_start": answers_start,
    "text": answers_text,
})

In [51]:
# df.to_csv("data/train.csv", index = None)

## Create a dictionary of sentence embeddings for faster computation (to save time, only the first 1% of the original data is used for now)

In [34]:
# Unique paragraph texts, in order of first appearance
# (each context is repeated in df once per question).
paras = df["context"].drop_duplicates().tolist()

In [35]:
# Number of unique paragraphs.
len(paras)

191

In [36]:
# Join every paragraph into one space-separated text and let TextBlob
# split that text into sentences; keep each sentence's raw string.
# NOTE(review): because the paragraphs are joined first, a sentence could
# presumably span a paragraph boundary — confirm this is acceptable.
blob = TextBlob(" ".join(paras))
sentences = [sentence.raw for sentence in blob.sentences]

In [37]:
# Load the pickled InferSent encoder that is used below to embed sentences.
# The map_location callable returns each storage unchanged, which keeps the
# loaded tensors on the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code — only load this checkpoint from a trusted source.
infersent = torch.load('InferSent/encoder/infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# Point the encoder at the GloVe word-vector file.
infersent.set_glove_path("InferSent/dataset/GloVe/glove.840B.300d.txt")
# Alternative word-vector file, currently disabled:
# infersent.set_w2v_path("InferSent/dataset/wiki-news-300d-1M.vec")

In [38]:
# Build the encoder's vocabulary from the paragraph sentences; the output
# below reports how many of their words have GloVe vectors.
# NOTE(review): only `sentences` is passed here, so words that occur solely in
# the questions are presumably out-of-vocabulary when questions are encoded
# later — confirm.
infersent.build_vocab(sentences, tokenize=True)

Found 5871(/6007) words with glove vectors
Vocab size : 5871


In [24]:
# NOTE(review): stale inspection cell — per the recorded NameError it was
# executed before `dict_embeddings` was created, so it fails on a fresh kernel
# when the notebook is run top to bottom.
dict_embeddings

NameError: name 'dict_embeddings' is not defined

In [39]:
# Encode every sentence on its own and store the result keyed by the
# sentence text; a progress index is printed every 100 sentences.
dict_embeddings = {}
for idx, sentence in enumerate(sentences):
    if idx % 100 == 0:
        print(idx)
    dict_embeddings[sentence] = infersent.encode([sentence], tokenize=True)

0


KeyboardInterrupt: 

In [22]:
# All questions as a plain Python list, in row order of df.
questions = df["question"].tolist()

In [21]:
# Number of questions (one per row of df).
len(questions)

1314

In [29]:
# Encode every question on its own and add it to the same dictionary that
# holds the sentence embeddings (dict_embeddings must already exist);
# a progress index is printed every 100 questions.
for idx, question in enumerate(questions):
    if idx % 100 == 0:
        print(idx)
    dict_embeddings[question] = infersent.encode([question], tokenize=True)

0


KeyboardInterrupt: 

In [30]:
# Spot check: look up the embedding stored for the question at index label 0
# and show its first (only) row as a 1-D vector.
dict_embeddings[df["question"][0]][0]

array([ 0.1095634 ,  0.1142294 ,  0.04428943, ...,  0.02811733,
       -0.01866924,  0.12806854], dtype=float32)

In [99]:
# Split the embeddings into two halves by insertion order: entries at even
# positions go to d1, entries at odd positions go to d2.
# d1 and d2 are intended as the train and test set.
d1 = dict(list(dict_embeddings.items())[0::2])
d2 = dict(list(dict_embeddings.items())[1::2])

In [100]:
# Inspect d1.
# NOTE(review): this displays the entire dictionary; consider showing only a
# few entries to keep the notebook output small.
d1

{'Architecturally, the school has a Catholic character.': array([[ 0.05519996,  0.05013141,  0.04787038, ...,  0.00821209,
         -0.03642813,  0.044685  ]], dtype=float32),
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".': array([[0.11262652, 0.11146841, 0.14750297, ..., 0.00293285, 0.03322018,
         0.06657628]], dtype=float32),
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.': array([[ 0.04149357,  0.0703306 ,  0.03724371, ...,  0.01096805,
         -0.02892281,  0.0428066 ]], dtype=float32),
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.': array([[ 0.04795522,  0.16508998,  0.09383532, ...,  0.05321149,
         -0.01826634,  0.10806957]], dtype=float32),
 'The nine student-run outlets include three newspapers, both a radio and television sta

In [65]:
# Inspect d2.
# NOTE(review): this displays the entire dictionary; consider showing only a
# few entries to keep the notebook output small.
d2

{"Atop the Main Building's gold dome is a golden statue of the Virgin Mary.": array([[ 0.07475325,  0.11794458,  0.06240867, ...,  0.01915886,
         -0.02436746,  0.10806957]], dtype=float32),
 'Next to the Main Building is the Basilica of the Sacred Heart.': array([[ 0.08010551,  0.11775322,  0.02186233, ...,  0.01656766,
         -0.01024127,  0.04706628]], dtype=float32),
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.': array([[ 0.10776819,  0.0805801 ,  0.10461736, ...,  0.01522135,
         -0.03814263,  0.14945611]], dtype=float32),
 "As at most other universities, Notre Dame's students run a number of news media outlets.": array([[0.09720325, 0.09345725, 0.05466026, ..., 0.08443642, 0.00817084,
         0.02197512]], dtype=float32),
 'Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in t

In [66]:
# with open('data/dict_embeddings1.pickle', 'wb') as handle:
#     pickle.dump(d1, handle)

In [67]:
# with open('data/dict_embeddings2.pickle', 'wb') as handle:
#     pickle.dump(d2, handle)