In [2]:
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
en_nlp = spacy.load('SQuAD/lib/python3.7/site-packages/en_core_web_sm/en_core_web_sm-2.2.5')
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [1]:
import xgboost as xgb

In [3]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
# !conda update pandas --y

In [2]:
train = pd.read_csv("data/train.csv")

In [3]:
train.shape

(130319, 4)

### Loading Embedding dictionary

In [4]:
with open("data/dict_embeddings1.pickle", "rb") as f:
    d1 = pickle.load(f)

In [5]:
with open("data/dict_embeddings2.pickle", "rb") as f:
    d2 = pickle.load(f)

In [6]:
dict_emb = dict(d1)
dict_emb.update(d2)

In [7]:
len(dict_emb)

223345

In [8]:
del d1, d2

## Data Processing

In [9]:
def get_target(x):
    idx = -1
    for i in range(len(x["sentences"])):
        if x["text"] in x["sentences"][i]: idx = i
    return idx

In [10]:
train.head(3)

Unnamed: 0,context,question,answer_start,text
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269.0,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207.0,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526.0,2003


In [11]:
train.shape

(130319, 4)

In [12]:
train = train.dropna().reset_index(drop=True)

In [13]:
train.shape

(86820, 4)

In [14]:
def process_data(train):
    
    print("step 1")
    train['sentences'] = train['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])
    
    print("step 2")
    train["target"] = train.apply(get_target, axis = 1)
    
    print("step 3")
    train['sent_emb'] = train['sentences'].apply(lambda x: [dict_emb[item][0] if item in\
                                                           dict_emb else np.zeros(4096) for item in x])
    print("step 4")
    train['quest_emb'] = train['question'].apply(lambda x: dict_emb[x] if x in dict_emb else np.zeros(4096) )
        
    return train   

In [15]:
train = process_data(train)

step 1
step 2
step 3
step 4


In [16]:
train.head(3)

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269.0,in the late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.08111103, -0.0..."
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207.0,singing and dancing,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.05823731, 0.07162362, -0.00..."
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526.0,2003,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,3,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.112324856, -0...."


## Predicted Cosine & Euclidean Index

In [17]:
def cosine_sim(x):
    li = []
    for item in x["sent_emb"]:
        li.append(spati al.distance.cosine(item,x["quest_emb"][0]))
    return li   

In [18]:
def pred_idx(distances):
    return np.argmin(distances)   

In [19]:
def predictions(train):
    
    train["cosine_sim"] = train.apply(cosine_sim, axis = 1)
    train["diff"] = (train["quest_emb"] - train["sent_emb"])**2
    train["euclidean_dis"] = train["diff"].apply(lambda x: list(np.sum(x, axis = 1)))
    del train["diff"]
    
    print("cosine start")
    
    train["pred_idx_cos"] = train["cosine_sim"].apply(lambda x: pred_idx(x))
    train["pred_idx_euc"] = train["euclidean_dis"].apply(lambda x: pred_idx(x))
    
    return train

In [20]:
predicted = predictions(train)

cosine start


In [21]:
predicted.head(3)

Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269.0,in the late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.08111103, -0.0...","[0.6082267463207245, 0.7082076072692871, 0.669...","[8.0352955, 9.462915, 6.914433, 9.608288]",0,2
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207.0,singing and dancing,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.05823731, 0.07162362, -0.00...","[0.5250032842159271, 0.51096311211586, 0.62473...","[6.9849253, 6.9543734, 6.4365134, 8.201017]",1,2
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526.0,2003,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,3,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.112324856, -0....","[0.4191964268684387, 0.4746186137199402, 0.584...","[5.770602, 6.6236463, 6.2470593, 7.1245747]",0,0


In [22]:
predicted["cosine_sim"][0]

[0.6082267463207245,
 0.7082076072692871,
 0.6695424020290375,
 0.6530875563621521]

In [23]:
predicted["euclidean_dis"][0]

[8.0352955, 9.462915, 6.914433, 9.608288]

## Accuracy

In [24]:
def accuracy(target, predicted):
    
    acc = (target==predicted).sum()/len(target)
    
    return acc

### Accuracy for  Euclidean Distance

In [25]:
print(accuracy(predicted["target"], predicted["pred_idx_euc"]))

0.4889541580281041


### Accuracy for Cosine Similarity

In [26]:
print(accuracy(predicted["target"], predicted["pred_idx_cos"]))

0.597293250403133


In [27]:
predicted.to_csv("data/train_detect_sent_unsupervised.csv", index=None)

In [28]:
predicted.iloc[75207,:]

context          From early 1944 until the days leading up to t...
question         Who did the network of fortifications on the i...
answer_start                                                   495
text                                                  the defender
sentences        [From early 1944 until the days leading up to ...
target                                                           3
sent_emb         [[0.0074688885, 0.021900535, 0.1008227, -0.026...
quest_emb        [[0.0074688885, -0.028091114, 0.1041385, -0.00...
cosine_sim       [0.6578376293182373, 0.7051503360271454, 0.645...
euclidean_dis              [8.349659, 7.570962, 6.673405, 4.85423]
pred_idx_cos                                                     3
pred_idx_euc                                                     3
Name: 75207, dtype: object

In [29]:
ct,k = 0,0
for i in range(predicted.shape[0]):
    if predicted.iloc[i,10] != predicted.iloc[i,5]:
        k += 1
        if predicted.iloc[i,11] == predicted.iloc[i,5]:
            ct += 1

In [30]:
ct, k

(3754, 34963)

### Combining Accuracy

In [31]:
label = []
for i in range(predicted.shape[0]):
    if predicted.iloc[i,10] == predicted.iloc[i,11]:
        label.append(predicted.iloc[i,10])
    else:
        label.append((predicted.iloc[i,10],predicted.iloc[i,10]))

In [32]:
ct = 0
for i in range(75206):
    item = predicted["target"][i]
    try:
        if label[i] == predicted["target"][i]: ct +=1
    except:
        if item in label[i]: ct +=1
            

In [33]:
ct/75206

0.5988351993192033

### Root Match

In [35]:
predicted = pd.read_csv("data/train_detect_sent_unsupervised.csv").reset_index(drop=True)

In [39]:
predicted.columns

Index(['context', 'question', 'answer_start', 'text', 'sentences', 'target',
       'sent_emb', 'quest_emb', 'cosine_sim', 'euclidean_dis', 'pred_idx_cos',
       'pred_idx_euc'],
      dtype='object')

In [40]:
answer_start = predicted['answer_start']
predicted.drop(labels=['answer_start'], axis=1, inplace=True)
predicted.insert(0, 'answer_start', answer_start)
predicted.head()

Unnamed: 0,answer_start,context,question,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,269.0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,1,"[array([ 0.00746889, -0.04449936, 0.11347561,...",[[ 0.00746889 -0.04716371 0.08111103 ... 0.0...,"[0.6082267463207245, 0.7082076072692871, 0.669...","[8.0352955, 9.462915, 6.914433, 9.608288]",0,2
1,207.0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,1,"[array([ 0.00746889, -0.04449936, 0.11347561,...",[[ 0.00746889 -0.05823731 0.07162362 ... 0.0...,"[0.5250032842159271, 0.51096311211586, 0.62473...","[6.9849253, 6.9543734, 6.4365134, 8.201017]",1,2
2,526.0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,3,"[array([ 0.00746889, -0.04449936, 0.11347561,...",[[ 0.00746889 -0.04716371 0.11232486 ... 0.0...,"[0.4191964268684387, 0.4746186137199402, 0.584...","[5.770602, 6.6236463, 6.2470593, 7.1245747]",0,0
3,166.0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,1,"[array([ 0.00746889, -0.04449936, 0.11347561,...",[[ 0.00746889 -0.03403048 0.08177199 ... 0.0...,"[0.5637381374835968, 0.5945537984371185, 0.651...","[7.3106747, 7.8408995, 6.481517, 8.796563]",0,2
4,276.0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,1,"[array([ 0.00746889, -0.04449936, 0.11347561,...",[[ 0.00746889 -0.06298249 0.10006339 ... 0.0...,"[0.4646044373512268, 0.5985883176326752, 0.589...","[6.2806025, 8.1350765, 6.1614323, 8.060831]",0,2


In [41]:
doc = en_nlp(predicted.iloc[0,1])

In [42]:
predicted.iloc[0,1]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [43]:
predicted.iloc[0,2]

'When did Beyonce start becoming popular?'

In [44]:
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_

In [45]:
[to_nltk_tree(sent.root).pretty_print()  for sent in en_nlp(predicted.iloc[0,2]).sents]

          start              
  __________|___________      
 |    |     |     |  becoming
 |    |     |     |     |     
When did Beyonce  ?  popular 



[None]

In [46]:
[to_nltk_tree(sent.root) .pretty_print() for sent in doc.sents][5]

                                                                  is                                                                                      
  ________________________________________________________________|________________________________________________                                        
 |                                             Carter                                                            singer                                   
 |      _________________________________________|___________________________________             _________________|_______________                        
 |     |       |       |     |   |   |   |                    |                     born         |     |      |                songwriter                 
 |     |       |       |     |   |   |   |                    |                      |           |     |      |     _______________|_________              
 |     |       |       |     |   |   |   |                   say   

IndexError: list index out of range

In [47]:
for sent in doc.sents:
    roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]
    print(roots)

['is', 'is', 'sing', 'songwrit', 'produc']
['in', 'houston', 'perform', 'in', 'as', 'in', 'as', 'of']
['by', 'fath', 'becam', 'of', 'of']
['saw', 'saw', 'of', 'in', 'est', 'as', 'earn', 'feat', 'singl', 'in', 'singl']


In [48]:
def match_roots(x):
    question = x["question"].lower()
    sentences = en_nlp(x["context"].lower()).sents
    
    question_root = st.stem(str([sent.root for sent in en_nlp(question).sents][0]))
    
    li = []
    for i,sent in enumerate(sentences):
        roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]

        if question_root in roots: 
            for k,item in enumerate(ast.literal_eval(x["sentences"])):
                if str(sent) in item.lower(): 
                    li.append(k)
    return li

In [49]:
predicted["question"][21493]

'How do Afro-multiracials identify in the 21st century?'

In [50]:
predicted["context"][21493]

"According to Dr. Carlos Moore, resident scholar at Brazil's University of the State of Bahia, in the 21st century Afro-multiracials in the Arab world, including Arabs in North Africa, self-identify in ways that resemble multi-racials in Latin America. He claims that black-looking Arabs, much like black-looking Latin Americans, consider themselves white because they have some distant white ancestry."

In [51]:
predicted["root_match_idx"] = predicted.apply(match_roots, axis = 1)

In [52]:
predicted["root_match_idx_first"]= predicted["root_match_idx"].apply(lambda x: x[0] if len(x)>0 else 0)

In [53]:
(predicted["root_match_idx_first"]==predicted["target"]).sum()/predicted.shape[0]

0.4039391845196959

In [54]:
predicted.to_csv("data/train_detect_sent_unsupervised_v3.csv", index=None)

In [55]:
predicted[(predicted["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11) &  (predicted["root_match_idx_first"]>10)]

Unnamed: 0,answer_start,context,question,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc,root_match_idx,root_match_idx_first


In [56]:
len(ast.literal_eval(predicted.iloc[21493,4]))

2

In [57]:
question = predicted["question"][21493].lower()
sentences = en_nlp(predicted["context"][21493].lower()).sents
    
question_root = st.stem(str([sent.root for sent in en_nlp(question).sents][0]))
    
li = []
for i,sent in enumerate(sentences):
    roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]
    print(roots)

    if question_root in roots: li.append(i)

['to', 'moor', 'at', 'of', 'of', 'in', 'in', 'in', 'includ', 'in', 'in', 'in']
['claim', 'consid', 'lik', 'whit', 'hav', 'hav']


In [58]:
ast.literal_eval(predicted["sentences"][21493])

["According to Dr. Carlos Moore, resident scholar at Brazil's University of the State of Bahia, in the 21st century Afro-multiracials in the Arab world, including Arabs in North Africa, self-identify in ways that resemble multi-racials in Latin America.",
 'He claims that black-looking Arabs, much like black-looking Latin Americans, consider themselves white because they have some distant white ancestry.']

In [59]:
predicted["context"][21493]

"According to Dr. Carlos Moore, resident scholar at Brazil's University of the State of Bahia, in the 21st century Afro-multiracials in the Arab world, including Arabs in North Africa, self-identify in ways that resemble multi-racials in Latin America. He claims that black-looking Arabs, much like black-looking Latin Americans, consider themselves white because they have some distant white ancestry."

In [65]:
# en_nlp = spacy.load('en')
sentences = en_nlp(predicted["context"][21493].lower()).sents

In [66]:
for item in sentences:
    print(item)

according to dr. carlos moore, resident scholar at brazil's university of the state of bahia, in the 21st century afro-multiracials in the arab world, including arabs in north africa, self-identify in ways that resemble multi-racials in latin america.
he claims that black-looking arabs, much like black-looking latin americans, consider themselves white because they have some distant white ancestry.


In [62]:
TfidfVectorizer(predicted["sentences"][0], ngram_range=(1,2))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input="['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ "
                      'bee-YON-say) (born September 4, 1981) is an American '
                      "singer, songwriter, record producer and actress.', "
                      '"Born and raised in Houston, Texas, she performed in '
                      'various singing and dancing competitions as a child,...
                      'her as a solo artist worldwide, earned five Grammy '
                      'Awards and featured the Billboard Hot 100 number-one '
                      'singles "Crazy in Love" and "Baby Boy".\']',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='