In [1]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import numpy as np
import pandas as pd
import json
from textblob import TextBlob
import nltk
from scipy import spatial
import torch
import spacy
en_nlp = spacy.load('en_core_web_sm')
from InferSent.models import InferSent

## Convert JSON to a pandas DataFrame

In [2]:
# Load the SQuAD v1.1 dev set; the resulting frame has one row per article
# with a nested `data` dict and a `version` column.
valid = pd.read_json("data/dev-v1.1.json")

In [3]:
# Sanity check: (number of articles, number of columns) in the raw frame.
valid.shape

(48, 2)

In [4]:
# Preview the first three articles to see the nested `data` layout.
valid.head(3)

Unnamed: 0,data,version
0,"{'title': 'Super_Bowl_50', 'paragraphs': [{'co...",1.1
1,"{'title': 'Warsaw', 'paragraphs': [{'context':...",1.1
2,"{'title': 'Normans', 'paragraphs': [{'context'...",1.1


In [5]:
# Inspect the first paragraph of the second article (row 1, `data` column):
# each paragraph holds a `context` string and a list of `qas` entries.
valid.iloc[1,0]['paragraphs'][0]

{'context': 'One of the most famous people born in Warsaw was Maria Skłodowska-Curie, who achieved international recognition for her research on radioactivity and was the first female recipient of the Nobel Prize. Famous musicians include Władysław Szpilman and Frédéric Chopin. Though Chopin was born in the village of Żelazowa Wola, about 60 km (37 mi) from Warsaw, he moved to the city with his family when he was seven months old. Casimir Pulaski, a Polish general and hero of the American Revolutionary War, was born here in 1745.',
 'qas': [{'answers': [{'answer_start': 188, 'text': 'Nobel Prize'},
    {'answer_start': 188, 'text': 'Nobel Prize'},
    {'answer_start': 188, 'text': 'Nobel Prize'}],
   'question': 'What was Maria Curie the first female recipient of?',
   'id': '5733a5f54776f41900660f45'},
  {'answers': [{'answer_start': 517, 'text': '1745'},
    {'answer_start': 517, 'text': '1745'},
    {'answer_start': 517, 'text': '1745'}],
   'question': 'What year was Casimir Pulask

In [6]:
# valid.iloc[1,0]['paragraphs'][0]

In [7]:
# Flatten the nested article -> paragraphs -> qas structure into four parallel
# lists with one entry per question, then assemble them into a flat DataFrame.
# Only the first listed answer of each question is kept.
contexts, questions, answers_text, answers_start = [], [], [], []
for article in valid.iloc[:, 0]:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            first_answer = qa['answers'][0]
            questions.append(qa['question'])
            answers_start.append(first_answer['answer_start'])
            answers_text.append(first_answer['text'])
            # The paragraph text is repeated for every question asked about it.
            contexts.append(paragraph['context'])

df = pd.DataFrame({
    "context": contexts,
    "question": questions,
    "answer_start": answers_start,
    "text": answers_text,
})

In [8]:
# Sanity check: one row per question, four columns.
df.shape

(10570, 4)

In [9]:
# Persist the flattened question/answer table without writing the row index.
df.to_csv("data/valid.csv", index=False)

## Create dictionary of sentence embeddings for faster computation

In [11]:
# Unique paragraph texts, in order of first appearance (each context is
# repeated once per question in `df`).
paras = df["context"].drop_duplicates().tolist()

In [12]:
# Number of distinct paragraphs.
len(paras)

2067

In [13]:
# Concatenate every paragraph into a single text and let TextBlob split it
# into sentences; keep the raw string of each sentence.
# NOTE(review): splitting runs on the joined text, so paragraph boundaries are
# not treated specially by this step - confirm that is intended.
joined_text = " ".join(paras)
blob = TextBlob(joined_text)
sentences = [sent.raw for sent in blob.sentences]

In [14]:
# Number of sentences produced by the split (duplicates included).
len(sentences)

10330

**My Code Start**

In [16]:
# Choose which InferSent release to load. The version number selects the
# checkpoint file here and the word-vector file chosen in a later cell.
model_version = 1
MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version

# Encoder settings handed to the InferSent constructor.
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)

# Deserialize the weights onto the CPU so the checkpoint loads on machines
# without a GPU, even if it was saved from CUDA tensors. load_state_dict
# copies the values into the model's own parameters, so the model can still
# be moved to the GPU afterwards.
model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [17]:
# Keep the model on the CPU unless `use_cuda` is switched on.
use_cuda = False
if use_cuda:
    model = model.cuda()

In [18]:
# Version 1 reads the GloVe word vectors; any other version reads the
# fastText vectors.
if model_version == 1:
    W2V_PATH = 'InferSent/dataset/GloVe/glove.840B.300d.txt'
else:
    W2V_PATH = 'InferSent/dataset/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [19]:

# Load embeddings of K most frequent words
# model.build_vocab_k_words(K=100000)

Vocab size : 100000


**End**

In [None]:
# infersent = torch.load('InferSent/dataset/encoder/infersent1.pkl', map_location=lambda storage, loc: storage)
# infersent.set_glove_path("InferSent/dataset/GloVe/glove.840B.300d.txt")

In [20]:
# Alias: the cells below refer to the loaded encoder as `infersent`.
infersent = model

In [21]:
# Build the encoder vocabulary from the context sentences only (the question
# texts are not part of this call).
infersent.build_vocab(sentences, tokenize=True)

Found 25944(/28184) words with w2v vectors
Vocab size : 25944


In [22]:
# Cache one embedding per context sentence, keyed by the raw sentence text.
# Each sentence is encoded on its own, so every value is the encoder output
# for a single-item batch.
dict_embeddings = {}
for idx, sentence in enumerate(sentences):
    # Lightweight progress marker every 1000 sentences.
    if idx % 1000 == 0:
        print(idx)
    dict_embeddings[sentence] = infersent.encode([sentence], tokenize=True)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [23]:
# All question strings, in the same row order as `df`.
questions = df["question"].tolist()

In [24]:
# Number of questions to embed (matches the row count of `df`).
len(questions)

10570

In [25]:
# The vocabulary was built from the context sentences only, so words that
# occur only in questions have no word vector yet. Add them before encoding;
# InferSent encodes a sentence using only the words it has vectors for.
infersent.update_vocab(questions, tokenize=True)

# Add one embedding per question to the same lookup used for the sentences,
# keyed by the question text.
for i in range(len(questions)):
    # Lightweight progress marker every 1000 questions.
    if i%1000==0:
        print(i)
    dict_embeddings[questions[i]] = infersent.encode([questions[i]], tokenize=True)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [38]:
# dict_embeddings['Architecturally, the school has a Catholic character.'][0]

array([ 0.05519997,  0.05013141,  0.04787038, ...,  0.00821209,
       -0.03642813,  0.04468501], dtype=float32)

In [26]:
# Shallow copy of the embedding lookup (same keys, same array objects).
d_valid = dict(dict_embeddings)
# d2 = {key:dict_embeddings[key] for i, key in enumerate(dict_embeddings) if i % 2 == 1}

In [None]:
# d1

In [None]:
# d2

In [27]:
# Serialize the text -> embedding lookup to disk so later steps can reload it
# instead of re-encoding every sentence and question.
with open('data/dict_embeddings1_valid.pickle', 'wb') as handle:
    pickle.dump(d_valid, handle)

In [41]:
# with open('data/dict_embeddings2_valid.pickle', 'wb') as handle:
#     pickle.dump(d2, handle)

In [28]:
# Remove the `dict_embeddings` name; the embeddings themselves stay reachable
# through `d_valid`, which was built from the same values.
del dict_embeddings