In [3]:
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
en_nlp = spacy.load('en_core_web_sm')
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [4]:
# !conda update pandas --y

In [5]:
train = pd.read_csv("data/train.csv")

In [6]:
train.shape

(87599, 4)

### Loading Embedding dictionary

In [7]:
with open("data/dict_embeddings1.pickle", "rb") as f:
    d1 = pickle.load(f)

In [8]:
with open("data/dict_embeddings2.pickle", "rb") as f:
    d2 = pickle.load(f)

In [9]:
dict_emb = dict(d1)
dict_emb.update(d2)

In [10]:
len(dict_emb)

179862

In [11]:
del d1, d2

## Data Processing

In [12]:
def get_target(x):
    idx = -1
    for i in range(len(x["sentences"])):
        if x["text"] in x["sentences"][i]: idx = i
    return idx

<matplotlib.axes._subplots.AxesSubplot at 0x1643903d7c8>

In [14]:
train.shape

(87599, 4)

In [15]:
train.dropna(inplace=True)

In [16]:
train.shape

(87598, 4)

In [17]:
def process_data(train):
    
    print("step 1")
    train['sentences'] = train['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])
    
    print("step 2")
    train["target"] = train.apply(get_target, axis = 1)
    
    print("step 3")
    train['sent_emb'] = train['sentences'].apply(lambda x: [dict_emb[item][0] if item in\
                                                           dict_emb else np.zeros(4096) for item in x])
    print("step 4")
    train['quest_emb'] = train['question'].apply(lambda x: dict_emb[x] if x in dict_emb else np.zeros(4096) )
        
    return train   

In [18]:
train = process_data(train)

step 1
step 2
step 3
step 4


In [19]:
train.head(3)

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...",5,"[[0.05519997, 0.05013141, 0.047870375, 0.01624...","[[0.11010079, 0.114229396, 0.11560896, 0.05489..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...",2,"[[0.05519997, 0.05013141, 0.047870375, 0.01624...","[[0.10951651, 0.11030624, 0.052100066, 0.03053..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building,"[Architecturally, the school has a Catholic ch...",3,"[[0.05519997, 0.05013141, 0.047870375, 0.01624...","[[0.011956469, 0.14930709, 0.026600497, 0.0527..."


## Predicted Cosine & Euclidean Index

In [20]:
def cosine_sim(x):
    li = []
    for item in x["sent_emb"]:
        li.append(spatial.distance.cosine(item,x["quest_emb"][0]))
    return li   

In [21]:
def pred_idx(distances):
    return np.argmin(distances)   

In [22]:
def predictions(train):
    
    train["cosine_sim"] = train.apply(cosine_sim, axis = 1)
    train["diff"] = (train["quest_emb"] - train["sent_emb"])**2
    train["euclidean_dis"] = train["diff"].apply(lambda x: list(np.sum(x, axis = 1)))
    del train["diff"]
    
    print("cosine start")
    
    train["pred_idx_cos"] = train["cosine_sim"].apply(lambda x: pred_idx(x))
    train["pred_idx_euc"] = train["euclidean_dis"].apply(lambda x: pred_idx(x))
    
    return train
    

In [23]:
predicted = predictions(train)

cosine start


In [24]:
predicted.head(3)

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...",5,"[[0.05519997, 0.05013141, 0.047870375, 0.01624...","[[0.11010079, 0.114229396, 0.11560896, 0.05489...","[0.42473626136779785, 0.364050030708313, 0.347...","[14.563858, 15.262213, 17.398178, 14.272491, 1...",5,5
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...",2,"[[0.05519997, 0.05013141, 0.047870375, 0.01624...","[[0.10951651, 0.11030624, 0.052100066, 0.03053...","[0.45407456159591675, 0.32262009382247925, 0.3...","[12.889506, 12.285218, 16.843704, 8.361172, 11...",3,3
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building,"[Architecturally, the school has a Catholic ch...",3,"[[0.05519997, 0.05013141, 0.047870375, 0.01624...","[[0.011956469, 0.14930709, 0.026600497, 0.0527...","[0.3958578109741211, 0.2917083501815796, 0.309...","[11.857297, 11.392319, 15.061656, 7.184714, 8....",3,3


In [25]:
predicted["cosine_sim"][0]

[0.42473626136779785,
 0.364050030708313,
 0.3477550148963928,
 0.3942416310310364,
 0.3710247874259949,
 0.18569016456604004,
 0.3519207239151001]

In [26]:
predicted["euclidean_dis"][0]

[14.563858, 15.262213, 17.398178, 14.272491, 13.339655, 9.336262, 15.720997]

## Accuracy

In [27]:
def accuracy(target, predicted):
    
    acc = (target==predicted).sum()/len(target)
    
    return acc

### Accuracy for  euclidean Distance

In [28]:
print(accuracy(predicted["target"], predicted["pred_idx_euc"]))

0.4472819014132743


### Accuracy for Cosine Similarity

In [29]:
print(accuracy(predicted["target"], predicted["pred_idx_cos"]))

0.6338158405443046


In [73]:
predicted.to_csv("train_detect_sent.csv", index=None)

In [31]:
predicted.iloc[75207,:]

context          Both the vertical and dipole antennas are simp...
question                             Are basic antennas expensive?
answer_start                                                    69
text                                        relatively inexpensive
sentences        [Both the vertical and dipole antennas are sim...
target                                                           0
sent_emb         [[0.06494937, 0.03690031, 0.12519251, -0.02735...
quest_emb        [[0.031715073, 0.07947657, 0.030824697, 0.0126...
cosine_sim       [0.3591885566711426, 0.46891170740127563, 0.44...
euclidean_dis    [11.512397, 21.817242, 12.696278, 17.895185, 1...
pred_idx_cos                                                     0
pred_idx_euc                                                     0
Name: 75208, dtype: object

In [32]:
ct,k = 0,0
for i in range(predicted.shape[0]):
    if predicted.iloc[i,10] != predicted.iloc[i,5]:
        k += 1
        if predicted.iloc[i,11] == predicted.iloc[i,5]:
            ct += 1

In [33]:
ct, k

(5513, 32077)

### Combining Accuracy

In [34]:
label = []
for i in range(predicted.shape[0]):
    if predicted.iloc[i,10] == predicted.iloc[i,11]:
        label.append(predicted.iloc[i,10])
    else:
        label.append((predicted.iloc[i,10],predicted.iloc[i,10]))

In [35]:
ct = 0
for i in range(75206):
    item = predicted["target"][i]
    try:
        if label[i] == predicted["target"][i]: ct +=1
    except:
        if item in label[i]: ct +=1
            

In [36]:
ct/75206

0.636917267239316

### Root Match

In [37]:
predicted = pd.read_csv("train_detect_sent.csv").reset_index(drop=True)

In [43]:
predicted.head()

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"['Architecturally, the school has a Catholic c...",5,"[array([ 0.05519997, 0.05013141, 0.04787038,...",[[ 0.11010079 0.1142294 0.11560896 ... 0.0...,"[0.42473626136779785, 0.364050030708313, 0.347...","[14.563858, 15.262213, 17.398178, 14.272491, 1...",5,5
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"['Architecturally, the school has a Catholic c...",2,"[array([ 0.05519997, 0.05013141, 0.04787038,...",[[ 0.10951651 0.11030624 0.05210007 ... -0.0...,"[0.45407456159591675, 0.32262009382247925, 0.3...","[12.889506, 12.285218, 16.843704, 8.361172, 11...",3,3
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building,"['Architecturally, the school has a Catholic c...",3,"[array([ 0.05519997, 0.05013141, 0.04787038,...",[[ 0.01195647 0.14930709 0.0266005 ... 0.0...,"[0.3958578109741211, 0.2917083501815796, 0.309...","[11.857297, 11.392319, 15.061656, 7.184714, 8....",3,3
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection,"['Architecturally, the school has a Catholic c...",4,"[array([ 0.05519997, 0.05013141, 0.04787038,...",[[ 0.0711433 0.05411833 -0.01395983 ... -0.0...,"[0.4900696873664856, 0.4060605764389038, 0.456...","[13.317537, 15.017246, 20.81268, 10.511387, 10...",3,3
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary,"['Architecturally, the school has a Catholic c...",1,"[array([ 0.05519997, 0.05013141, 0.04787038,...",[[0.16133597 0.15039583 0.09225756 ... 0.06351...,"[0.47775155305862427, 0.2891119718551636, 0.34...","[15.0888195, 11.612733, 16.684145, 9.71824, 12...",3,3


In [38]:
doc = en_nlp(predicted.iloc[0,1])

In [39]:
predicted.iloc[0,1]

'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [40]:
predicted.iloc[0,2]

515

In [41]:
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_

In [48]:
predicted.iloc[0,1]

'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [50]:
[to_nltk_tree(sent.root).pretty_print()  for sent in en_nlp(predicted.iloc[0,1]).sents]

                  appear                             
  __________________|____________________________     
 |      |      |    |         |           |      in  
 |      |      |    |         |           |      |    
 |      |      |    To       Mary         in   France
 |      |      |    |      ___|_____      |      |    
did allegedly  ?   whom  the      Virgin 1858 Lourdes



[None]

In [51]:
[to_nltk_tree(sent.root) .pretty_print() for sent in doc.sents][5]

                  appear                             
  __________________|____________________________     
 |      |      |    |         |           |      in  
 |      |      |    |         |           |      |    
 |      |      |    To       Mary         in   France
 |      |      |    |      ___|_____      |      |    
did allegedly  ?   whom  the      Virgin 1858 Lourdes



IndexError: list index out of range

In [54]:
for sent in doc.sents:
    roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]
    print(roots)

['to', 'appear', 'in']


In [55]:
def match_roots(x):
    question = x["question"].lower()
    sentences = en_nlp(x["context"].lower()).sents
    
    question_root = st.stem(str([sent.root for sent in en_nlp(question).sents][0]))
    
    li = []
    for i,sent in enumerate(sentences):
        roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]

        if question_root in roots: 
            for k,item in enumerate(ast.literal_eval(x["sentences"])):
                if str(sent) in item.lower(): 
                    li.append(k)
    return li

In [56]:
predicted["question"][21493]

'The end of what road was once home to Newgate Prison?'

In [57]:
predicted["context"][21493]

"10th Street (40°44′03″N 74°00′11″W\ufeff / \ufeff40.7342580°N 74.0029670°W\ufeff / 40.7342580; -74.0029670) begins at the FDR Drive and Avenue C. West of Sixth Avenue, it turns southward about 40 degrees to join the Greenwich Village street grid and continue to West Street on the Hudson River. Because West 4th Street turns northward at Sixth Avenue, it intersects 10th, 11th and 12th and 13th Streets in the West Village. The M8 bus operates on 10th Street in both directions between Avenue D and Avenue A, and eastbound between West Street and Sixth Avenue. 10th Street has an eastbound bike lane from West Street to the East River. In 2009, the two-way section of 10th Street between Avenue A and the East River had bicycle markings and sharrows installed, but it still has no dedicated bike lane. West 10th Street was previously named Amos Street for Richard Amos. The end of West 10th Street toward the Hudson River was once the home of Newgate Prison, New York City's first prison and the Uni

In [58]:
predicted["root_match_idx"] = predicted.apply(match_roots, axis = 1)

In [59]:
predicted["root_match_idx_first"]= predicted["root_match_idx"].apply(lambda x: x[0] if len(x)>0 else 0)

In [60]:
(predicted["root_match_idx_first"]==predicted["target"]).sum()/predicted.shape[0]

0.40405032078357955

In [61]:
predicted.to_csv("train_detect_sent.csv", index=None)

In [62]:
predicted[(predicted["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11) &  (predicted["root_match_idx_first"]>10)]       



Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc,root_match_idx,root_match_idx_first


In [63]:
len(ast.literal_eval(predicted.iloc[21493,4]))

7

In [64]:
question = predicted["question"][21493].lower()
sentences = en_nlp(predicted["context"][21493].lower()).sents
    
question_root = st.stem(str([sent.root for sent in en_nlp(question).sents][0]))
    
li = []
for i,sent in enumerate(sentences):
    roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]
    print(roots)

    if question_root in roots: li.append(i)

['street']
['°']
['w\ufeff']
['at', 'driv', 'of', 'turn', 'turn', 'join', 'to', 'on']
['turn', 'at', 'intersect', 'intersect', 'in']
['op', 'on', 'in', 'between', 'op', 'between', 'street']
['has', 'has', 'from', 'to']
['had', 'of', 'between', 'a', 'had', 'mark', 'has', 'has']
['nam', 'for']
['was', 'of', 'toward', 'was', 'of', 'prison', 'prison']


In [65]:
ast.literal_eval(predicted["sentences"][21493])

['10th Street (40°44′03″N 74°00′11″W\ufeff / \ufeff40.7342580°N 74.0029670°W\ufeff / 40.7342580; -74.0029670) begins at the FDR Drive and Avenue C. West of Sixth Avenue, it turns southward about 40 degrees to join the Greenwich Village street grid and continue to West Street on the Hudson River.',
 'Because West 4th Street turns northward at Sixth Avenue, it intersects 10th, 11th and 12th and 13th Streets in the West Village.',
 'The M8 bus operates on 10th Street in both directions between Avenue D and Avenue A, and eastbound between West Street and Sixth Avenue.',
 '10th Street has an eastbound bike lane from West Street to the East River.',
 'In 2009, the two-way section of 10th Street between Avenue A and the East River had bicycle markings and sharrows installed, but it still has no dedicated bike lane.',
 'West 10th Street was previously named Amos Street for Richard Amos.',
 "The end of West 10th Street toward the Hudson River was once the home of Newgate Prison, New York City's

In [66]:
predicted["context"][21493]

"10th Street (40°44′03″N 74°00′11″W\ufeff / \ufeff40.7342580°N 74.0029670°W\ufeff / 40.7342580; -74.0029670) begins at the FDR Drive and Avenue C. West of Sixth Avenue, it turns southward about 40 degrees to join the Greenwich Village street grid and continue to West Street on the Hudson River. Because West 4th Street turns northward at Sixth Avenue, it intersects 10th, 11th and 12th and 13th Streets in the West Village. The M8 bus operates on 10th Street in both directions between Avenue D and Avenue A, and eastbound between West Street and Sixth Avenue. 10th Street has an eastbound bike lane from West Street to the East River. In 2009, the two-way section of 10th Street between Avenue A and the East River had bicycle markings and sharrows installed, but it still has no dedicated bike lane. West 10th Street was previously named Amos Street for Richard Amos. The end of West 10th Street toward the Hudson River was once the home of Newgate Prison, New York City's first prison and the Uni

In [67]:
# en_nlp = spacy.load('en')
sentences = en_nlp(predicted["context"][21493].lower()).sents

In [68]:
for item in sentences:
    print(item)

10th street (40°44′03″n 74°
00′11″w﻿ / ﻿40.7342580°n 74.0029670°
w﻿ / 40.7342580
; -74.0029670) begins at the fdr drive and avenue c. west of sixth avenue, it turns southward about 40 degrees to join the greenwich village street grid and continue to west street on the hudson river.
because west 4th street turns northward at sixth avenue, it intersects 10th, 11th and 12th and 13th streets in the west village.
the m8 bus operates on 10th street in both directions between avenue d and avenue a, and eastbound between west street and sixth avenue.
10th street has an eastbound bike lane from west street to the east river.
in 2009, the two-way section of 10th street between avenue a and the east river had bicycle markings and sharrows installed, but it still has no dedicated bike lane.
west 10th street was previously named amos street for richard amos.
the end of west 10th street toward the hudson river was once the home of newgate prison, new york city's first prison and the united states' s

In [69]:
TfidfVectorizer(predicted["sentences"][0], ngram_range=(1,2))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input="['Architecturally, the school has a Catholic "
                      'character.\', "Atop the Main Building\'s gold dome is a '
                      'golden statue of the Virgin Mary.", \'Immediately in '
                      'front of the Main Building and facing it, is a copper '
                      'statue of Christ with arms upraised with the legend '
                      '"Ve...
                      'the end of the main drive (and in a direct line that '
                      'connects through 3 statues and the Gold Dome), is a '
                      "simple, modern stone statue of Mary.']",
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublin