In [57]:
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
en_nlp = spacy.load('en')
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [58]:
# !conda update pandas --y

In [59]:
# Load the raw training data from disk.
# NOTE(review): later cells (process_data, predictions) add columns to `train`
# in place, so re-run this cell to get the untouched frame back.
train = pd.read_csv("data/train.csv")

In [60]:
train.shape

(1314, 4)

### Loading Embedding dictionary

In [61]:
# Load the two pickled embedding dictionaries (merged into `dict_emb` below).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
with open("data/dict_embeddings1.pickle", "rb") as f:
    d1 = pickle.load(f)

with open("data/dict_embeddings2.pickle", "rb") as f:
    d2 = pickle.load(f)

In [62]:
# Merge both embedding dictionaries into one lookup table.
# key - sentence / question text, val - embedding vector
# On duplicate keys the entry from d2 replaces the one from d1.
dict_emb = dict(d1)
dict_emb.update(d2)

In [63]:
dict_emb['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'][0]

array([ 0.1095634 ,  0.1142294 ,  0.04428943, ...,  0.02811733,
       -0.01866924,  0.12806854], dtype=float32)

In [64]:
len(dict_emb)

2429

In [65]:
del d1, d2

## Data Processing

In [66]:
# find the index of the sentence that contains answer
def get_target(x):
    """Return the index of the sentence that contains the answer text.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x["text"]`` (the answer string) and ``x["sentences"]``
        (a sequence of sentence strings).

    Returns
    -------
    int
        Index of the LAST sentence in which the answer occurs as a substring,
        or -1 when no sentence contains it.
    """
    answer = x["text"]
    hits = [pos for pos, sentence in enumerate(x["sentences"]) if answer in sentence]
    # Every match is collected, but only the final one is reported.
    return hits[-1] if hits else -1

In [67]:
train.head(3)

Unnamed: 0,context,question,answer_start,text
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building


In [68]:
train.shape

(1314, 4)

In [69]:
train.dropna(inplace=True)

In [70]:
train.shape

(1314, 4)

In [71]:
dict_emb['Architecturally, the school has a Catholic character.']

array([[ 0.05519996,  0.05013141,  0.04787038, ...,  0.00821209,
        -0.03642813,  0.044685  ]], dtype=float32)

In [72]:
def process_data(train):
    """Add sentence, target and embedding columns to ``train`` (in place).

    Columns written:
      * ``sentences`` - the context split into raw sentence strings by TextBlob.
      * ``target``    - index of the sentence containing the answer (see
                        ``get_target``).
      * ``sent_emb``  - one vector per sentence, looked up in ``dict_emb``.
      * ``quest_emb`` - the question's entry in ``dict_emb``.

    Texts missing from ``dict_emb`` get an all-zero placeholder.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``context``, ``question`` and ``text`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns attached.
    """
    # Placeholder width; assumed to match the vectors stored in dict_emb
    # (same constant the notebook has always used) - TODO confirm.
    emb_dim = 4096

    print("step 1")
    train['sentences'] = train['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])

    print("step 2")
    train["target"] = train.apply(get_target, axis = 1)

    print("step 3")
    # Sentence vectors are stored as the first row of each dict_emb entry, so the
    # per-sentence placeholder is a flat vector.
    train['sent_emb'] = train['sentences'].apply(
        lambda x: [dict_emb[item][0] if item in dict_emb else np.zeros(emb_dim) for item in x])

    print("step 4")
    # The question entry is stored whole and consumers index it with [0], so the
    # placeholder must be 2-D as well; a flat np.zeros(emb_dim) would make
    # quest_emb[0] a scalar instead of a vector.
    train['quest_emb'] = train['question'].apply(
        lambda x: dict_emb[x] if x in dict_emb else np.zeros((1, emb_dim)))

    return train

In [73]:
train = process_data(train)

step 1
step 2
step 3
step 4


In [74]:
train.head(3)

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...",5,"[[0.055199962, 0.05013141, 0.047870375, 0.0162...","[[0.109563395, 0.114229396, 0.04428943, 0.0531..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...",2,"[[0.055199962, 0.05013141, 0.047870375, 0.0162...","[[0.10951651, 0.11030624, 0.05210008, 0.030539..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building,"[Architecturally, the school has a Catholic ch...",3,"[[0.055199962, 0.05013141, 0.047870375, 0.0162...","[[0.0026226756, 0.14930709, 0.030639833, 0.062..."


In [75]:
train.iloc[0,:]['sent_emb']

[array([ 0.05519996,  0.05013141,  0.04787038, ...,  0.00821209,
        -0.03642813,  0.044685  ], dtype=float32),
 array([ 0.07475325,  0.11794458,  0.06240867, ...,  0.01915886,
        -0.02436746,  0.10806957], dtype=float32),
 array([0.11262652, 0.11146841, 0.14750297, ..., 0.00293285, 0.03322018,
        0.06657628], dtype=float32),
 array([ 0.08010551,  0.11775322,  0.02186233, ...,  0.01656766,
        -0.01024127,  0.04706628], dtype=float32),
 array([ 0.04149357,  0.0703306 ,  0.03724371, ...,  0.01096805,
        -0.02892281,  0.0428066 ], dtype=float32),
 array([ 0.10776819,  0.0805801 ,  0.10461736, ...,  0.01522135,
        -0.03814263,  0.14945611], dtype=float32),
 array([ 0.04795522,  0.16508998,  0.09383532, ...,  0.05321149,
        -0.01826634,  0.10806957], dtype=float32)]

## Predicted Sentence Index by Cosine, Euclidean, and Other Distances

In [76]:
def cosine_sim(x):
    """Cosine distance between the question and every sentence of one row.

    Despite the name, the values come from ``scipy.spatial.distance.cosine``,
    i.e. they are distances: smaller means more similar.

    Parameters
    ----------
    x : mapping-like row
        ``x["sent_emb"]`` is an iterable of sentence vectors and
        ``x["quest_emb"][0]`` is the question vector.

    Returns
    -------
    list
        One distance per sentence, in sentence order.
    """
    question_vec = x["quest_emb"][0]
    return [spatial.distance.cosine(sentence_vec, question_vec)
            for sentence_vec in x["sent_emb"]]

In [77]:
def euc_dist(x):
    """Euclidean distance between the question and every sentence of one row.

    Parameters
    ----------
    x : mapping-like row
        ``x["sent_emb"]`` is an iterable of sentence vectors and
        ``x["quest_emb"][0]`` is the question vector.

    Returns
    -------
    list
        One distance per sentence, in sentence order.
    """
    question_vec = x["quest_emb"][0]
    distances = []
    for sentence_vec in x["sent_emb"]:
        distances += [spatial.distance.euclidean(sentence_vec, question_vec)]
    return distances

In [78]:
# Minkowski distance; with the default p = 1 this is the Manhattan distance
def min_dist(x,p=1):
    """Minkowski distance of order ``p`` between the question and each sentence.

    Parameters
    ----------
    x : mapping-like row
        ``x["sent_emb"]`` is an iterable of sentence vectors and
        ``x["quest_emb"][0]`` is the question vector.
    p : number, default 1
        Order of the norm handed to ``scipy.spatial.distance.minkowski``
        (p = 1 is the Manhattan distance).

    Returns
    -------
    list
        One distance per sentence, in sentence order.
    """
    minkowski = spatial.distance.minkowski
    question_vec = x["quest_emb"][0]
    return [minkowski(sentence_vec, question_vec, p) for sentence_vec in x["sent_emb"]]

In [79]:
def che_dist(x):
    """Chebyshev distance between the question and every sentence of one row.

    Parameters
    ----------
    x : mapping-like row
        ``x["sent_emb"]`` is an iterable of sentence vectors and
        ``x["quest_emb"][0]`` is the question vector.

    Returns
    -------
    list
        One distance per sentence, in sentence order.
    """
    question_vec = x["quest_emb"][0]
    return list(map(lambda sentence_vec: spatial.distance.chebyshev(sentence_vec, question_vec),
                    x["sent_emb"]))

In [80]:
def get_sent_by_idx(x):
    """Return the sentence selected by the cosine-based prediction.

    Parameters
    ----------
    x : mapping-like row
        ``x['sentences']`` is the sequence of sentences and
        ``x["pred_idx_cos"]`` is the index to pick from it.
    """
    sentences = x['sentences']
    chosen = x["pred_idx_cos"]
    return sentences[chosen]

In [81]:
def pred_idx(distances):
    """Return the position of the smallest value in ``distances``.

    Thin wrapper around ``np.argmin``; on ties the first minimum wins.
    """
    closest = np.argmin(distances)
    return closest

In [82]:
def predictions(train):
    """Attach per-sentence distances and predicted sentence indices to ``train``.

    The frame is modified in place and also returned. Columns written:
      * ``cosine_sim``     - cosine distances (``cosine_sim``).
      * ``euclidean_dis``  - SQUARED Euclidean distances, from summing the
                             squared differences (no square root is taken).
      * ``euclidean_dis2`` - Euclidean distances (``euc_dist``).
      * ``minkowski_dis``  - Minkowski distances with the default order
                             (``min_dist``).
      * ``chebyshev_dis``  - Chebyshev distances (``che_dist``).
      * ``pred_idx_cos`` / ``pred_idx_euc`` / ``pred_idx_min`` /
        ``pred_idx_che``   - argmin of the matching distance list.
      * ``pred_sen_cos``   - sentence text picked by ``pred_idx_cos``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must already hold ``sentences``, ``sent_emb`` and ``quest_emb``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    train["cosine_sim"] = train.apply(cosine_sim, axis=1)

    # Temporary column of squared element-wise differences; removed again below.
    train["diff"] = (train["quest_emb"] - train["sent_emb"]) ** 2
    train["euclidean_dis"] = train["diff"].apply(lambda sq: list(np.sum(sq, axis=1)))

    row_metrics = (
        ("euclidean_dis2", euc_dist),
        ("minkowski_dis", min_dist),
        ("chebyshev_dis", che_dist),
    )
    for column, metric in row_metrics:
        train[column] = train.apply(metric, axis=1)
    del train["diff"]

    print("cosine start")

    # Predicted index = position of the smallest distance in each list.
    index_columns = (
        ("pred_idx_cos", "cosine_sim"),
        ("pred_idx_euc", "euclidean_dis"),
        ("pred_idx_min", "minkowski_dis"),
        ("pred_idx_che", "chebyshev_dis"),
    )
    for target_column, distance_column in index_columns:
        train[target_column] = train[distance_column].apply(pred_idx)

    train["pred_sen_cos"] = train.apply(get_sent_by_idx, axis=1)
    return train
    

In [83]:
predicted = predictions(train)

cosine start


In [84]:
predicted

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,euclidean_dis2,minkowski_dis,chebyshev_dis,pred_idx_cos,pred_idx_euc,pred_idx_min,pred_idx_che,pred_sen_cos
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...",5,"[[0.055199962, 0.05013141, 0.047870375, 0.0162...","[[0.109563395, 0.114229396, 0.04428943, 0.0531...","[0.43262481689453125, 0.36864835023880005, 0.3...","[14.108408, 15.0115595, 18.043663, 13.160972, ...","[3.7561161518096924, 3.8744754791259766, 4.247...","[175.40135, 182.2885, 207.47821, 169.22188, 16...","[0.32886288, 0.3149855, 0.28589985, 0.26730207...",5,5,5,4,"It is a replica of the grotto at Lourdes, Fran..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...",2,"[[0.055199962, 0.05013141, 0.047870375, 0.0162...","[[0.10951651, 0.11030624, 0.05210008, 0.030539...","[0.454074501991272, 0.32262009382247925, 0.355...","[12.889506, 12.285219, 16.843704, 8.361172, 11...","[3.590195894241333, 3.5050275325775146, 4.1041...","[167.5769, 162.31854, 193.53157, 130.61359, 16...","[0.31238672, 0.27531803, 0.2836463, 0.3119438,...",3,3,3,4,Next to the Main Building is the Basilica of t...
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building,"[Architecturally, the school has a Catholic ch...",3,"[[0.055199962, 0.05013141, 0.047870375, 0.0162...","[[0.0026226756, 0.14930709, 0.030639833, 0.062...","[0.39596283435821533, 0.315723180770874, 0.346...","[11.590837, 12.180486, 16.56459, 7.4446793, 9....","[3.404531717300415, 3.4900553226470947, 4.0699...","[159.62509, 163.19696, 196.51852, 121.326935, ...","[0.26670542, 0.28385335, 0.2744509, 0.32841536...",3,3,3,4,Next to the Main Building is the Basilica of t...
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection,"[Architecturally, the school has a Catholic ch...",4,"[[0.055199962, 0.05013141, 0.047870375, 0.0162...","[[0.0711433, 0.054118328, -0.013959831, 0.0531...","[0.4900696873664856, 0.4060605764389038, 0.456...","[13.317537, 15.017247, 20.81268, 10.511387, 10...","[3.649320125579834, 3.875209093093872, 4.56209...","[170.12534, 184.275, 225.77197, 149.94484, 152...","[0.2894366, 0.26743698, 0.28701258, 0.310778, ...",3,3,3,1,Next to the Main Building is the Basilica of t...
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary,"[Architecturally, the school has a Catholic ch...",1,"[[0.055199962, 0.05013141, 0.047870375, 0.0162...","[[0.16131133, 0.15654242, 0.08214859, 0.043728...","[0.4777635931968689, 0.31209897994995117, 0.36...","[14.159465, 12.130126, 17.447664, 8.731176, 12...","[3.762906551361084, 3.48283314704895, 4.177040...","[178.42073, 160.24245, 199.47891, 131.98923, 1...","[0.30424595, 0.28583807, 0.28460413, 0.3058796...",3,3,3,4,Next to the Main Building is the Basilica of t...
5,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,248,September 1876,"[As at most other universities, Notre Dame's s...",2,"[[0.097203255, 0.09345725, 0.05466026, 0.04843...","[[0.016918726, 0.120841, -0.006960277, 0.05206...","[0.29326915740966797, 0.38319146633148193, 0.2...","[12.047197, 16.473732, 14.844393, 11.835869, 1...","[3.470907211303711, 4.058784484863281, 3.85284...","[163.33374, 193.62622, 180.39012, 163.30954, 1...","[0.31910977, 0.2881556, 0.4477546, 0.2942412, ...",5,4,4,4,The newspapers have varying publication intere...
6,"As at most other universities, Notre Dame's st...",How often is Notre Dame's the Juggler published?,441,twice,"[As at most other universities, Notre Dame's s...",3,"[[0.097203255, 0.09345725, 0.05466026, 0.04843...","[[0.084965, 0.11260415, 0.098718055, 0.0556499...","[0.3005536198616028, 0.4449201226234436, 0.345...","[12.456057, 19.187895, 17.29383, 12.208015, 11...","[3.5293140411376953, 4.380398750305176, 4.1585...","[161.95517, 209.23253, 196.72424, 162.78445, 1...","[0.24237432, 0.29191032, 0.43189096, 0.2942412...",5,4,4,0,The newspapers have varying publication intere...
7,"As at most other universities, Notre Dame's st...",What is the daily student paper at Notre Dame ...,598,The Observer,"[As at most other universities, Notre Dame's s...",9,"[[0.097203255, 0.09345725, 0.05466026, 0.04843...","[[0.0711433, 0.054118328, 0.026413964, 0.08664...","[0.24287962913513184, 0.38149869441986084, 0.3...","[10.405749, 17.056553, 16.048374, 12.674274, 1...","[3.225794553756714, 4.129958152770996, 4.00604...","[148.86406, 196.22449, 188.09366, 165.82748, 1...","[0.26918662, 0.35661048, 0.44465598, 0.2904780...",5,0,0,10,The newspapers have varying publication intere...
8,"As at most other universities, Notre Dame's st...",How many student news papers are found at Notr...,126,three,"[As at most other universities, Notre Dame's s...",9,"[[0.097203255, 0.09345725, 0.05466026, 0.04843...","[[0.07469624, 0.06573354, 0.121014304, 0.07063...","[0.19735431671142578, 0.4017585515975952, 0.39...","[8.394439, 17.361256, 19.493011, 14.162596, 13...","[2.897315740585327, 4.166684150695801, 4.41508...","[133.86192, 198.57562, 210.48413, 176.5583, 17...","[0.24077533, 0.32049227, 0.45311219, 0.2877127...",0,0,0,0,"As at most other universities, Notre Dame's st..."
9,"As at most other universities, Notre Dame's st...",In what year did the student paper Common Sens...,908,1987,"[As at most other universities, Notre Dame's s...",7,"[[0.097203255, 0.09345725, 0.05466026, 0.04843...","[[0.04265439, 0.13311036, 0.11229288, 0.097729...","[0.2252199649810791, 0.36460912227630615, 0.26...","[9.867832, 16.703403, 13.726372, 11.037147, 12...","[3.141310691833496, 4.086979866027832, 3.70491...","[146.4502, 196.8992, 174.75381, 155.818, 167.2...","[0.23578824, 0.29134822, 0.41784167, 0.2877127...",5,0,7,10,The newspapers have varying publication intere...


In [85]:
predicted['cosine_sim'][0][5]

0.19309091567993164

## Accuracy

In [86]:
def accuracy(target, predicted):
    """Fraction of positions where ``predicted`` equals ``target``.

    Parameters
    ----------
    target, predicted : array-like supporting element-wise ``==``
        For example two pandas Series or two NumPy arrays of equal length.

    Returns
    -------
    float
        Number of matching positions divided by ``len(target)``.
    """
    hits = (target == predicted).sum()
    total = len(target)
    return hits / total

### Accuracy for Euclidean Distance

In [87]:
print(accuracy(predicted["target"], predicted["pred_idx_euc"]))

0.3995433789954338


### Accuracy for Cosine Similarity and others

In [88]:
print(accuracy(predicted["target"], predicted["pred_idx_cos"]))

0.6149162861491628


In [89]:
print(accuracy(predicted["target"], predicted["pred_idx_min"]))

0.4117199391171994


In [90]:
print(accuracy(predicted["target"], predicted["pred_idx_che"]))

0.2815829528158295


In [91]:
# predicted.to_csv("train_detect_sent.csv", index=None)

### Root Match