In [1]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import numpy as np
import pandas as pd
import json
from textblob import TextBlob
import nltk
from scipy import spatial
import torch
import spacy

In [2]:
# Load the small English spaCy pipeline from an explicit on-disk model directory.
# NOTE(review): the path looks like it points inside a local virtualenv
# (python3.7 site-packages, model 2.2.5), so it only resolves when the notebook
# is run from the directory that contains `SQuAD/` - confirm before sharing.
en_nlp = spacy.load('SQuAD/lib/python3.7/site-packages/en_core_web_sm/en_core_web_sm-2.2.5')

## Convert JSON to pandas DataFrame

In [3]:
# Read the training JSON file. The displayed head of this frame shows two
# columns, `version` and `data`, with one article dict per row in `data`.
train = pd.read_json("data/train-v2.0.json")

In [4]:
# Read the dev (validation) JSON file into its own frame.
valid = pd.read_json("data/dev-v2.0.json")

In [5]:
# Read the test JSON file into its own frame.
test = pd.read_json("data/test-v2.0.json")

In [14]:
train.shape, valid.shape, test.shape

((442, 2), (16, 2), (20, 2))

In [15]:
train.head(3)

Unnamed: 0,version,data
0,v2.0,"{'title': 'Beyoncé', 'paragraphs': [{'qas': [{..."
1,v2.0,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'..."
2,v2.0,{'title': 'Sino-Tibetan_relations_during_the_M...


In [6]:
# Flatten the nested article -> paragraph -> question structure held in
# train["data"] into four parallel lists (one entry per question), then build
# a DataFrame with columns: context, question, answer_start, text.
contexts = []
questions = []
answers_text = []
answers_start = []
for article in train["data"]:
    for sub_para in article['paragraphs']:
        for q_a in sub_para['qas']:
            # Only the first listed answer is kept. When a question has no
            # first answer (missing or empty 'answers'), both answer fields
            # fall back to '' - the same placeholder the notebook used before,
            # but decided by an explicit check instead of a bare `except:`
            # that would also hide unrelated errors and KeyboardInterrupt.
            answer_list = q_a.get('answers') or []
            first_answer = answer_list[0] if answer_list else {}

            questions.append(q_a['question'])
            answers_start.append(first_answer.get('answer_start', ''))
            answers_text.append(first_answer.get('text', ''))
            # The paragraph text is repeated for every question asked about it.
            contexts.append(sub_para['context'])
df = pd.DataFrame({"context":contexts, "question": questions, "answer_start": answers_start, "text": answers_text})

In [7]:
df.shape

(130319, 4)

In [8]:
df.head(10)

Unnamed: 0,context,question,answer_start,text
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,166,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,276,late 1990s
5,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what R&B group was she the lead singer?,320,Destiny's Child
6,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What album made her a worldwide known artist?,505,Dangerously in Love
7,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Who managed the Destiny's Child group?,360,Mathew Knowles
8,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyoncé rise to fame?,276,late 1990s
9,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What role did Beyoncé have in Destiny's Child?,290,lead singer


In [36]:
# Persist the flattened question table without writing the row index column.
df.to_csv("data/train.csv", index=False)

## Create dictionary of sentence embeddings for faster computation

In [8]:
# Unique paragraph texts, kept in order of first appearance in df["context"].
paras = df["context"].drop_duplicates().tolist()

In [38]:
len(paras)

19029

In [40]:
paras[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [9]:
# Join every unique paragraph into one space-separated document and let
# TextBlob split it into sentences; keep the raw text of each sentence.
# NOTE(review): because paragraphs are joined before splitting, a sentence
# could straddle two paragraphs if the splitter does not break at the seam -
# confirm this matches how sentences are looked up later.
joined_text = " ".join(paras)
blob = TextBlob(joined_text)
sentences = [sent.raw for sent in blob.sentences]

In [41]:
sentences[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress.'

In [42]:
len(sentences)

93280

In [10]:
# Construct the InferSent encoder and restore its weights from the checkpoint
# file selected by `model_version`.
from models import InferSent

model_version = 2
MODEL_PATH = "InferSent-master/encoder/infersent%s.pkl" % model_version

# Hyper-parameters handed to the InferSent constructor.
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=model_version,
)

infersent = InferSent(params_model)
# Left as the final expression so the cell displays the key-matching result.
infersent.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [47]:
# Make sure the NLTK 'punkt' tokenizer data is available locally
# (presumably needed by the tokenize=True calls on the encoder - confirm).
# `nltk` is already imported in the first cell; the re-import here is harmless.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/yyy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Tell the encoder where the fastText word-vector file lives; presumably this
# is the file build_vocab reads when it reports how many words have vectors.
infersent.set_w2v_path("InferSent-master/fastText/crawl-300d-2M.vec")

In [12]:
# Build the encoder vocabulary from the corpus sentences.
# NOTE(review): only `sentences` is passed here; the questions that are encoded
# later are not part of this vocabulary build - confirm that is intended.
infersent.build_vocab(sentences, tokenize=True)

Found 89681(/110286) words with w2v vectors
Vocab size : 89681


In [13]:
# Encode each corpus sentence on its own and store the result under the
# sentence text. Every sentence is passed as a one-element batch, so each
# stored value is whatever `encode` returns for that single-item list.
dict_embeddings = {}
for i, sentence in enumerate(sentences):
    try:
        dict_embeddings[sentence] = infersent.encode([sentence], tokenize=True)
    except Exception:
        # Best-effort: keep the key with an empty placeholder when encoding
        # fails. Catching `Exception` instead of using a bare `except:` means
        # a kernel interrupt (KeyboardInterrupt) still stops this long loop
        # rather than being recorded as a failed sentence.
        dict_embeddings[sentence] = []
    # Lightweight progress indicator.
    if i % 1000 == 0:
        print(i)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000


In [14]:
# Plain Python list of every question string, in DataFrame row order.
questions = df["question"].tolist()

In [15]:
len(questions)

130319

In [17]:
# Encode each question on its own and add it to the same lookup dict that
# already holds the sentence embeddings, keyed by the question text.
for i, question in enumerate(questions):
    try:
        dict_embeddings[question] = infersent.encode([question], tokenize=True)
    except Exception:
        # Best-effort: keep the key with an empty placeholder when encoding
        # fails. Catching `Exception` instead of using a bare `except:` means
        # a kernel interrupt (KeyboardInterrupt) still stops this long loop
        # rather than being recorded as a failed question.
        dict_embeddings[question] = []
    # Lightweight progress indicator.
    if i % 2000 == 0:
        print(i)

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000
76000
78000
80000
82000
84000
86000
88000
90000
92000
94000
96000
98000
100000
102000
104000
106000
108000
110000
112000
114000
116000
118000
120000
122000
124000
126000
128000
130000


In [18]:
dict_embeddings['Architecturally, the school has a Catholic character.'][0]

array([ 0.00746889, -0.05086312,  0.00736476, ...,  0.04118386,
        0.01421822, -0.0138564 ], dtype=float32)

In [19]:
# Split the embeddings into two dicts by insertion position:
# entries at even positions go to d1, entries at odd positions go to d2.
d1 = dict(list(dict_embeddings.items())[0::2])
d2 = dict(list(dict_embeddings.items())[1::2])

In [20]:
d1

{'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress.': array([[ 0.00746889, -0.04449936,  0.11347561, ...,  0.02631407,
          0.07244834,  0.02589748]], dtype=float32),
 "Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time.": array([[ 0.00746889,  0.17613176,  0.10349542, ...,  0.02955914,
          0.03982133, -0.01385639]], dtype=float32),
 'Following the disbandment of Destiny\'s Child in June 2005, she released her second solo album, B\'Day (2006), which contained hits "Déjà Vu", "Irreplaceable", and "Beautiful Liar".': array([[ 0.00746889, -0.06522691,  0.17579783, ...,  0.03920721,
          0.05661291,  0.00816607]], dtype=float32),
 'Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce

In [21]:
d2

{"Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.": array([[ 0.00746889, -0.07552826,  0.1798204 , ...,  0.01542308,
          0.05224251,  0.0067659 ]], dtype=float32),
 'Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".': array([[0.00746889, 0.05835425, 0.16314448, ..., 0.02819629, 0.12729868,
         0.01142828]], dtype=float32),
 'Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009).': array([[ 0.00746889, -0.05096488,  0.15526353, ...,  0.03228499,
          0.05260491,  0.02064428]], dtype=float32),
 'Beyoncé took a hiatus from music in 

In [22]:
# Serialize the first half of the embeddings (d1) to disk with pickle.
# NOTE(review): this file goes to `infdata/` while the second half and every
# other file in this notebook use `data/` - confirm the directory is intended.
with open('infdata/dict_embeddings1.pickle', 'wb') as handle:
    pickle.dump(d1, handle)

In [23]:
# Serialize the second half of the embeddings (d2) to disk with pickle.
with open('data/dict_embeddings2.pickle', 'wb') as handle:
    pickle.dump(d2, handle)

In [24]:
# Remove the combined dict's name now that both halves have been written out.
# The stored values are still referenced by d1 and d2.
del dict_embeddings