In [1]:
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
en_nlp = spacy.load('en_core_web_sm')
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [2]:
# !conda update pandas --y

In [45]:
valid = pd.read_csv("data/valid2.csv")

In [46]:
valid.shape

(11873, 4)

### Loading Embedding dictionary

In [15]:
with open("data/dict_embeddings1_valid_2.pickle", "rb") as f:
    d = pickle.load(f)

In [16]:
# with open("data/dict_embeddings2.pickle", "rb") as f:
#     d2 = pickle.load(f)

In [17]:
dict_emb = dict(d)
# dict_emb.update(d2)

In [18]:
len(dict_emb)

18162

In [19]:
del d

## Data Processing

In [66]:
def get_target(x):
    idx = -1
    for i in range(len(x["sentences"])):
        if len(x["text"])>0:
            if x["text"] in x["sentences"][i]: idx = i
    return idx

In [64]:
valid = valid.fillna("")

In [56]:
# valid.dropna(inplace=True)

In [22]:
valid.shape

(11873, 4)

In [67]:
def process_data(train):
    
    print("step 1")
    train['sentences'] = valid['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])
    
    print("step 2")
    train["target"] = valid.apply(get_target, axis = 1)
    
    print("step 3")
    train['sent_emb'] = valid['sentences'].apply(lambda x: [dict_emb[item][0] if item in\
                                                           dict_emb else np.zeros(4096) for item in x])
    print("step 4")
    train['quest_emb'] = valid['question'].apply(lambda x: dict_emb[x] if x in dict_emb else np.zeros(4096) )
        
    return train   

In [68]:
valid = process_data(valid)

step 1
step 2
step 3
step 4


In [69]:
valid.head(3)

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,159,France,[The Normans (Norman: Nourmands; French: Norma...,0,"[[0.12450485, 0.08537765, 0.053539462, 0.13777...","[[0.17353246, 0.060284417, 0.0011510936, -0.00..."
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,94,10th and 11th centuries,[The Normans (Norman: Nourmands; French: Norma...,0,"[[0.12450485, 0.08537765, 0.053539462, 0.13777...","[[0.12760219, 0.050012887, 0.003247912, 0.0306..."
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,256,"Denmark, Iceland and Norway",[The Normans (Norman: Nourmands; French: Norma...,1,"[[0.12450485, 0.08537765, 0.053539462, 0.13777...","[[0.16916153, 0.048537504, 0.058460955, -0.023..."


## Predicted Cosine & Euclidean Index

In [72]:
def cosine_sim(x):
    li = []
    for item in x["sent_emb"]:
        li.append(spatial.distance.cosine(item,x["quest_emb"][0]))
    return li   

In [73]:
def pred_idx(distances):
    return np.argmin(distances)   

In [74]:
def predictions(train):
    
    train["cosine_sim"] = train.apply(cosine_sim, axis = 1)
    train["diff"] = (train["quest_emb"] - train["sent_emb"])**2
    train["euclidean_dis"] = train["diff"].apply(lambda x: list(np.sum(x, axis = 1)))
    del train["diff"]
    
    print("cosine start")
    
    train["pred_idx_cos"] = train["cosine_sim"].apply(lambda x: pred_idx(x))
    train["pred_idx_euc"] = train["euclidean_dis"].apply(lambda x: pred_idx(x))
    
    return train
    

In [75]:
predicted = predictions(valid)

cosine start


In [76]:
predicted.head(3)

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,159,France,[The Normans (Norman: Nourmands; French: Norma...,0,"[[0.12450485, 0.08537765, 0.053539462, 0.13777...","[[0.17353246, 0.060284417, 0.0011510936, -0.00...","[0.3918405771255493, 0.4257136583328247, 0.451...","[18.196575, 19.671942, 17.978249, 17.909397]",0,3
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,94,10th and 11th centuries,[The Normans (Norman: Nourmands; French: Norma...,0,"[[0.12450485, 0.08537765, 0.053539462, 0.13777...","[[0.12760219, 0.050012887, 0.003247912, 0.0306...","[0.3263145685195923, 0.36495232582092285, 0.40...","[15.742837, 17.370426, 16.294949, 15.831424]",0,0
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,256,"Denmark, Iceland and Norway",[The Normans (Norman: Nourmands; French: Norma...,1,"[[0.12450485, 0.08537765, 0.053539462, 0.13777...","[[0.16916153, 0.048537504, 0.058460955, -0.023...","[0.33490025997161865, 0.3215295672416687, 0.33...","[16.052065, 15.721034, 13.987843, 13.998224]",1,2


In [96]:
predicted[(predicted.target==-1)].index

Int64Index([    5,     6,     7,     8,    12,    13,    14,    15,    16,
               19,
            ...
            11853, 11854, 11860, 11861, 11862, 11863, 11869, 11870, 11871,
            11872],
           dtype='int64', length=5959)

In [90]:
maxLen

29

In [112]:
predicted["cosine_sim"][3]

[0.46332818269729614,
 0.3930584788322449,
 0.49910104274749756,
 0.48594987392425537]

## Accuracy

In [79]:
def accuracy(target, predicted):
    
    acc = (target==predicted).sum()/len(target)
    
    return acc

### Accuracy for  euclidean Distance

In [80]:
print(accuracy(predicted["target"], predicted["pred_idx_euc"]))

0.2201633959403689


### Accuracy for Cosine Similarity

In [81]:
print(accuracy(predicted["target"], predicted["pred_idx_cos"]))

0.30859934304725006


In [82]:
predicted.to_csv("valid_detect_sent2.csv", index=None)

In [32]:
predicted.shape

(10570, 12)

In [31]:
predicted.iloc[0,:]

context          Super Bowl 50 was an American football game to...
question         Which NFL team represented the AFC at Super Bo...
answer_start                                                   177
text                                                Denver Broncos
sentences        [Super Bowl 50 was an American football game t...
target                                                           1
sent_emb         [[0.089634605, 0.09790096, 0.05888891, 0.17397...
quest_emb        [[0.08884388, 0.09329621, 0.011793177, 0.09658...
cosine_sim       [0.17136400938034058, 0.18079036474227905, 0.2...
euclidean_dis         [10.409248, 10.875992, 15.322799, 16.594406]
pred_idx_cos                                                     0
pred_idx_euc                                                     0
Name: 0, dtype: object

In [102]:
ct,k = 0,0
for i in range(predicted.shape[0]):
    if predicted.iloc[i,10] != predicted.iloc[i,5]:
        k += 1
        if predicted.iloc[i,11] == predicted.iloc[i,5]:
            ct += 1

In [104]:
predicted.shape

(11873, 12)

In [103]:
ct, k

(413, 8209)

### Combining Accuracy

In [105]:
label = []
for i in range(predicted.shape[0]):
    if predicted.iloc[i,10] == predicted.iloc[i,11]:
        label.append(predicted.iloc[i,10])
    else:
        label.append((predicted.iloc[i,10],predicted.iloc[i,10]))

In [106]:
ct = 0
for i in range(11873):
    item = predicted["target"][i]
    try:
        if label[i] == predicted["target"][i]: ct +=1
    except:
        if item in label[i]: ct +=1
            

In [107]:
ct/11873

0.30859934304725006

In [108]:
predicted.shape

(11873, 12)

### Root Match

In [43]:
predicted = pd.read_csv("valid_detect_sent.csv").reset_index(drop=True)

In [44]:
predicted.head()

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,177,Denver Broncos,['Super Bowl 50 was an American football game ...,1,"[array([ 0.0896346 , 0.09790096, 0.05888891,...",[[ 0.08884388 0.09329621 0.01179318 ... 0.0...,"[0.17136400938034058, 0.18079036474227905, 0.2...","[10.409248, 10.875992, 15.322799, 16.594406]",0,0
1,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,249,Carolina Panthers,['Super Bowl 50 was an American football game ...,1,"[array([ 0.0896346 , 0.09790096, 0.05888891,...",[[ 0.08884388 0.1030563 -0.0122866 ... 0.0...,"[0.17528098821640015, 0.18317222595214844, 0.2...","[10.60577, 11.0015335, 15.33537, 16.480312]",0,0
2,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,403,"Santa Clara, California",['Super Bowl 50 was an American football game ...,2,"[array([ 0.0896346 , 0.09790096, 0.05888891,...",[[ 0.05568645 0.06275963 -0.03044511 ... 0.0...,"[0.2916830778121948, 0.32172471284866333, 0.39...","[16.539953, 17.848553, 19.231264, 21.330368]",0,0
3,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,177,Denver Broncos,['Super Bowl 50 was an American football game ...,1,"[array([ 0.0896346 , 0.09790096, 0.05888891,...",[[ 0.08884388 0.10037533 -0.03044511 ... 0.0...,"[0.20750784873962402, 0.22276735305786133, 0.3...","[12.396893, 13.205339, 19.081303, 19.976736]",0,0
4,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...,488,gold,['Super Bowl 50 was an American football game ...,3,"[array([ 0.0896346 , 0.09790096, 0.05888891,...",[[5.7403199e-02 1.3905521e-01 2.4461875e-02 .....,"[0.21517080068588257, 0.25752145051956177, 0.3...","[12.799313, 15.086236, 16.44073, 12.100648]",3,3


In [45]:
doc = en_nlp(predicted.iloc[0,1])

In [46]:
predicted.iloc[0,1]

'Which NFL team represented the AFC at Super Bowl 50?'

In [47]:
predicted.iloc[0,2]

177

In [48]:
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_

In [49]:
predicted.iloc[0,1]

'Which NFL team represented the AFC at Super Bowl 50?'

In [50]:
[to_nltk_tree(sent.root).pretty_print()  for sent in en_nlp(predicted.iloc[0,1]).sents]

               represented                   
  __________________|_________________        
 |         |                |         at     
 |         |                |         |       
 |        team             AFC       Bowl    
 |     ____|________        |     ____|____   
 ?  Which          NFL     the Super       50



[None]

In [53]:
[to_nltk_tree(sent.root) .pretty_print() for sent in doc.sents][0]

               represented                   
  __________________|_________________        
 |         |                |         at     
 |         |                |         |       
 |        team             AFC       Bowl    
 |     ____|________        |     ____|____   
 ?  Which          NFL     the Super       50



In [54]:
for sent in doc.sents:
    roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]
    print(roots)

['repres', 'repres', 'at']


In [55]:
def match_roots(x):
    question = x["question"].lower()
    sentences = en_nlp(x["context"].lower()).sents
    
    question_root = st.stem(str([sent.root for sent in en_nlp(question).sents][0]))
    
    li = []
    for i,sent in enumerate(sentences):
        roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]

        if question_root in roots: 
            for k,item in enumerate(ast.literal_eval(x["sentences"])):
                if str(sent) in item.lower(): 
                    li.append(k)
    return li

In [57]:
predicted["question"][10000]

'Which country today is a remnant of the Ottoman empire?'

In [58]:
predicted["context"][10000]

'With Istanbul as its capital and control of lands around the Mediterranean basin, the Ottoman Empire was at the center of interactions between the Eastern and Western worlds for six centuries. Following a long period of military setbacks against European powers, the Ottoman Empire gradually declined into the late nineteenth century. The empire allied with Germany in the early 20th century, with the imperial ambition of recovering its lost territories, but it dissolved in the aftermath of World War I, leading to the emergence of the new state of Turkey in the Ottoman Anatolian heartland, as well as the creation of modern Balkan and Middle Eastern states, thus ending Turkish colonial ambitions.'

In [59]:
predicted["root_match_idx"] = predicted.apply(match_roots, axis = 1)

In [60]:
predicted["root_match_idx_first"]= predicted["root_match_idx"].apply(lambda x: x[0] if len(x)>0 else 0)

In [72]:
(predicted["root_match_idx_first"]==predicted["target"]).sum()/predicted.shape[0]

0.4121097445600757

In [62]:
predicted.to_csv("valid_detect_sent.csv", index=None)

In [63]:
predicted[(predicted["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11) &  (predicted["root_match_idx_first"]>10)]       



Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc,root_match_idx,root_match_idx_first


In [65]:
len(ast.literal_eval(predicted.iloc[10000,4]))

3

In [66]:
question = predicted["question"][10000].lower()
sentences = en_nlp(predicted["context"][10000].lower()).sents
    
question_root = st.stem(str([sent.root for sent in en_nlp(question).sents][0]))
    
li = []
for i,sent in enumerate(sentences):
    roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]
    print(roots)

    if question_root in roots: li.append(i)

['with', 'as', 'capit', 'of', 'around', 'was', 'at', 'of', 'between', 'for']
['follow', 'of', 'against', 'declin', 'into']
['empir', 'with', 'in', 'with', 'recov', 'dissolv', 'in', 'of']
['lead', 'to', 'of', 'of', 'in', 'lead', 'of', 'end']


In [67]:
ast.literal_eval(predicted["sentences"][10000])

['With Istanbul as its capital and control of lands around the Mediterranean basin, the Ottoman Empire was at the center of interactions between the Eastern and Western worlds for six centuries.',
 'Following a long period of military setbacks against European powers, the Ottoman Empire gradually declined into the late nineteenth century.',
 'The empire allied with Germany in the early 20th century, with the imperial ambition of recovering its lost territories, but it dissolved in the aftermath of World War I, leading to the emergence of the new state of Turkey in the Ottoman Anatolian heartland, as well as the creation of modern Balkan and Middle Eastern states, thus ending Turkish colonial ambitions.']

In [68]:
predicted["context"][10000]

'With Istanbul as its capital and control of lands around the Mediterranean basin, the Ottoman Empire was at the center of interactions between the Eastern and Western worlds for six centuries. Following a long period of military setbacks against European powers, the Ottoman Empire gradually declined into the late nineteenth century. The empire allied with Germany in the early 20th century, with the imperial ambition of recovering its lost territories, but it dissolved in the aftermath of World War I, leading to the emergence of the new state of Turkey in the Ottoman Anatolian heartland, as well as the creation of modern Balkan and Middle Eastern states, thus ending Turkish colonial ambitions.'

In [69]:
# en_nlp = spacy.load('en')
sentences = en_nlp(predicted["context"][10000].lower()).sents

In [70]:
for item in sentences:
    print(item)

with istanbul as its capital and control of lands around the mediterranean basin, the ottoman empire was at the center of interactions between the eastern and western worlds for six centuries.
following a long period of military setbacks against european powers, the ottoman empire gradually declined into the late nineteenth century.
the empire allied with germany in the early 20th century, with the imperial ambition of recovering its lost territories, but it dissolved in the aftermath of world war
i, leading to the emergence of the new state of turkey in the ottoman anatolian heartland, as well as the creation of modern balkan and middle eastern states, thus ending turkish colonial ambitions.


In [71]:
TfidfVectorizer(predicted["sentences"][0], ngram_range=(1,2))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input="['Super Bowl 50 was an American football game to "
                      'determine the champion of the National Football League '
                      "(NFL) for the 2015 season.', 'The American Football "
                      'Conference (AFC) champion Denver Broncos defeated the '
                      'National Football Conference (NFC) champion Carol...
                      'with Roman numerals (under which the game would have '
                      'been known as "Super Bowl L"), so that the logo could '
                      "prominently feature the Arabic numerals 50.']",
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_