In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import dill
from IPython.core import display as ICD  # to print multiple nice pandas tables
import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import tqdm
from tqdm import tqdm_notebook, tnrange
# Switch matplotlib over to seaborn's default plot styling for the whole session.
sns.set()

In [2]:
import datasets
import classify
from utils import tabular
from utils import subtract_baseline

In [3]:
from collections import defaultdict
# Two independent nested tables whose inner dicts are created on first access.
# `results` is later filled as results[dataset_name][(model, split)] = score.
results, dump = defaultdict(dict), defaultdict(dict)

In [12]:
import spacy

In [13]:
import tqdm
import numpy as np

# Load the spaCy 'en_vectors_web_lg' package once at module level; the object is
# called as nlp_lg(text) by SpacyWordVectorModel.vectorize to obtain doc.vector.
nlp_lg = spacy.load('en_vectors_web_lg')

class SpacyWordVectorModel(object):
    """Wrap an estimator so that it is trained on spaCy document vectors.

    Each sample is converted to a single vector by running it through the
    module-level ``nlp_lg`` pipeline and reading ``doc.vector``.  The wrapped
    estimator only ever sees the row-stacked matrix of those vectors.

    Parameters
    ----------
    cls : object, optional
        Estimator exposing ``fit(X, Y)`` and ``predict(X)``.  It has to be set
        before ``fit``, ``predict`` or ``score`` is called.
    """

    def __init__(self, cls=None):
        self.cls = cls

    def fit(self, X, Y):
        """Embed the samples in ``X`` and fit the wrapped estimator on them.

        The embedded training matrix is kept on ``self.stacked``.

        Returns
        -------
        SpacyWordVectorModel
            ``self``, so that calls can be chained.
        """
        self.stacked = self._embed(X)
        self.cls.fit(self.stacked, Y)
        return self

    def predict(self, X):
        """Embed the samples in ``X`` and return the estimator's predictions."""
        return self.cls.predict(self._embed(X))

    def _embed(self, X):
        """Vectorize every sample of ``X`` and stack the vectors row-wise.

        Shared by ``fit`` and ``predict`` so both embed samples the same way.
        """
        # NOTE(review): tqdm_notebook is kept for compatibility with the tqdm
        # version this notebook imports; recent tqdm releases deprecate it in
        # favour of tqdm.notebook.tqdm.
        vectors = [self.vectorize(x) for x in tqdm.tqdm_notebook(X)]
        return np.vstack(vectors)

    def vectorize(self, x):
        """Return the ``doc.vector`` of one sample.

        ``x`` is normally an iterable of token strings, which is joined with
        single spaces.  A plain string is passed through unchanged; joining it
        would insert a space between every character.
        """
        text = x if isinstance(x, str) else ' '.join(x)
        return nlp_lg(text).vector

    def score(self, X, Y):
        """Return ``1 - mean((prediction - Y) ** 2)``.

        When predictions and labels are both 0/1 this equals the accuracy.
        """
        Yhat = self.predict(X)
        return 1 - ((Yhat - Y) ** 2).mean()

    def update(self, X, Y):
        """Do nothing.

        NOTE(review): presumably kept so this class matches the interface of
        other models in the project -- confirm against the callers.
        """
        pass

In [17]:
# Start from an empty table: anything stored in `results` by earlier cells is discarded.
results = defaultdict(dict)

In [18]:
# Fit the spaCy-vector model around the project's SkClassifier on every dataset
# and store its train/test score in `results` under the 'logistic' key.
for dataset in datasets.ALL_DATASETS + datasets.TREC_DATASETS:
    dataset_name = dataset.name()
    print('', dataset_name, sep='\n')

    model = SpacyWordVectorModel(classify.SkClassifier())
    model.fit(dataset.train_samples(), dataset.train_labels())

    # Both splits are scored before anything is written to `results`.
    train_p = model.score(dataset.train_samples(), dataset.train_labels())
    test_p = model.score(dataset.test_samples(), dataset.test_labels())
    results[dataset_name].update({
        ('logistic', 'train'): train_p,
        ('logistic', 'test'): test_p,
    })
    print(train_p, test_p)
    


CRDataset









0.8278145695364238 0.8174603174603174

MRDataset









0.7893070699964826 0.7928772258669166

SUBJDataset









0.922375 0.931

MPQADataset









0.8900282885431401 0.883129123468426

TRECDataset-HUM









0.9258559126233985 0.9278523489932886

TRECDataset-ABBR









0.9922285234194497 0.9899328859060402

TRECDataset-ENTY









0.8634740600714136 0.848993288590604

TRECDataset-NUM









0.9317370300357067 0.8959731543624161

TRECDataset-DESC









0.8911993278722957 0.8640939597315436

TRECDataset-LOC









0.9477000630119723 0.9446308724832215


In [23]:
from sklearn.svm import SVC
# Same evaluation as for the logistic model, but wrapping a default-configured
# sklearn SVC; scores are stored in `results` under the 'svc' key.
for dataset in datasets.ALL_DATASETS + datasets.TREC_DATASETS:
    print()
    print(dataset.name())

    model = SpacyWordVectorModel(SVC())
    model.fit(dataset.train_samples(), dataset.train_labels())

    train_p, test_p = (
        model.score(dataset.train_samples(), dataset.train_labels()),
        model.score(dataset.test_samples(), dataset.test_labels()),
    )
    per_dataset = results[dataset.name()]
    per_dataset[('svc', 'train')] = train_p
    per_dataset[('svc', 'test')] = test_p
    print(train_p, test_p)
    


CRDataset









0.6380794701986755 0.6375661375661376

MRDataset









0.7412357837964592 0.7432052483598875

SUBJDataset









0.895125 0.896

MPQADataset









0.86999057048562 0.8595664467483506

TRECDataset-HUM









0.8720856962822936 0.8691275167785235

TRECDataset-ABBR









0.9840369670237346 0.9832214765100671

TRECDataset-ENTY









0.7742070993488763 0.7734899328859061

TRECDataset-NUM









0.8304977945809704 0.8305369127516778

TRECDataset-DESC









0.8101239235454736 0.802013422818792

TRECDataset-LOC









0.851291745431632 0.8506711409395973


In [24]:
# Show the collected scores: one column per dataset, one row per (model, split) key.
pd.DataFrame(results)

Unnamed: 0,Unnamed: 1,CRDataset,MPQADataset,MRDataset,SUBJDataset,TRECDataset-ABBR,TRECDataset-DESC,TRECDataset-ENTY,TRECDataset-HUM,TRECDataset-LOC,TRECDataset-NUM
logistic,test,0.81746,0.883129,0.792877,0.931,0.989933,0.864094,0.848993,0.927852,0.944631,0.895973
logistic,train,0.827815,0.890028,0.789307,0.922375,0.992229,0.891199,0.863474,0.925856,0.9477,0.931737
svc,test,0.637566,0.859566,0.743205,0.896,0.983221,0.802013,0.77349,0.869128,0.850671,0.830537
svc,train,0.638079,0.869991,0.741236,0.895125,0.984037,0.810124,0.774207,0.872086,0.851292,0.830498


In [26]:
# Pickle file used to cache the scores collected in `results`.
to_file = 'dumps/wordvec_results.pickle'

# Writing is switched off by default; set to True to overwrite the cache file.
SAVE_RESULTS = False
if SAVE_RESULTS:
    # The context manager closes the handle; a bare open() inside the call
    # would leave the file open until garbage collection.
    with open(to_file, 'wb') as fh:
        pickle.dump(results, fh)

In [35]:
# Read the cached scores back from `to_file`; the context manager makes sure
# the file handle is closed again.
# NOTE(review): pickle.load executes code embedded in the file, so only load
# dumps that were produced by this notebook.
with open(to_file, 'rb') as fh:
    loaded = pickle.load(fh)

In [36]:
# Show the scores read back from the pickle file as a table.
pd.DataFrame(loaded)

Unnamed: 0,Unnamed: 1,CRDataset,MPQADataset,MRDataset,SUBJDataset,TRECDataset-ABBR,TRECDataset-DESC,TRECDataset-ENTY,TRECDataset-HUM,TRECDataset-LOC,TRECDataset-NUM
logistic,test,0.81746,0.883129,0.792877,0.931,0.989933,0.864094,0.848993,0.927852,0.944631,0.895973
logistic,train,0.827815,0.890028,0.789307,0.922375,0.992229,0.891199,0.863474,0.925856,0.9477,0.931737
svc,test,0.637566,0.859566,0.743205,0.896,0.983221,0.802013,0.77349,0.869128,0.850671,0.830537
svc,train,0.638079,0.869991,0.741236,0.895125,0.984037,0.810124,0.774207,0.872086,0.851292,0.830498


In [71]:
# Build the score table and split its columns into the TREC datasets and the rest.
df = pd.DataFrame(loaded)
trec = [column for column in df.columns if 'TREC' in column]
nottrec = [column for column in df.columns if column not in trec]

In [72]:
# Table 1: the non-TREC datasets, rounded to two decimals.  The 'Dataset'
# suffix and all double spaces are stripped from the generated LaTeX.
latex = (
    df[nottrec]
    .round(2)
    .to_latex()
    .replace('Dataset', '')
    .replace('  ', '')
)
print(tabular(latex, 'Precision for pretrained word vectors'))

# Table 2: the TREC datasets; the 'TREC-' prefix is stripped as well.
trec_df = df[trec]
latex = (
    trec_df
    .round(2)
    .to_latex()
    .replace('Dataset', '')
    .replace('TREC-', '')
    .replace('  ', '')
)
print(tabular(latex, 'Precision for pretrained word vectors on TREC dataset'))


\begin{table}[h]
\begin{center}

\begin{tabular}{llrrrr}
\toprule
 &&CR &MPQA &MR &SUBJ \\
\midrule
logistic & test & 0.82 & 0.88 & 0.79 & 0.93 \\
 & train & 0.83 & 0.89 & 0.79 & 0.92 \\
svc & test & 0.64 & 0.86 & 0.74 & 0.90 \\
 & train & 0.64 & 0.87 & 0.74 & 0.90 \\
\bottomrule
\end{tabular}

\caption[Precision for pretrained word vectors]{Precision for pretrained word vectors}
\label{tab:}
\end{center}
\end{table}



\begin{table}[h]
\begin{center}

\begin{tabular}{llrrrrrr}
\toprule
 &&ABBR &DESC &ENTY &HUM &LOC &NUM \\
\midrule
logistic & test &0.99 &0.86 &0.85 & 0.93 & 0.94 & 0.90 \\
 & train &0.99 &0.89 &0.86 & 0.93 & 0.95 & 0.93 \\
svc & test &0.98 &0.80 &0.77 & 0.87 & 0.85 & 0.83 \\
 & train &0.98 &0.81 &0.77 & 0.87 & 0.85 & 0.83 \\
\bottomrule
\end{tabular}

\caption[Precision for pretrained word vectors on TREC dataset]{Precision for pretrained word vectors on TREC dataset}
\label{tab:}
\end{center}
\end{table}




In [73]:
# Rebuild the frame from the pickled scores before subtracting the baseline.
# subtract_baseline() is relied on to modify `df` in place (its return value
# is discarded), so without this reset every re-run of the cell would subtract
# the baseline one more time.
df = pd.DataFrame(loaded)
subtract_baseline(df)

# Table 1: improvements on the non-TREC datasets, rounded to two decimals.
# The 'Dataset' suffix and all double spaces are stripped from the LaTeX.
latex = df[nottrec].round(2).to_latex()
latex = latex.replace('Dataset','')
latex = latex.replace('  ','')
print(tabular(latex, 'Precision improvements for pretrained word vectors'))

# Table 2: improvements on the TREC datasets; the 'TREC-' prefix is stripped too.
trec_df = df[trec]
latex = trec_df.round(2).to_latex()
latex = latex.replace('Dataset','')
latex = latex.replace('TREC-','')
latex = latex.replace('  ','')
print(tabular(latex, 'Precision improvements for pretrained word vectors on TREC dataset'))


\begin{table}[h]
\begin{center}

\begin{tabular}{llrrrr}
\toprule
 &&CR &MPQA &MR &SUBJ \\
\midrule
logistic & test & 0.18 & 0.20 & 0.29 & 0.43 \\
 & train & 0.19 & 0.20 & 0.29 & 0.42 \\
svc & test &-0.00 & 0.17 & 0.24 & 0.40 \\
 & train & 0.00 & 0.18 & 0.24 & 0.40 \\
\bottomrule
\end{tabular}

\caption[Precision improvements for pretrained word vectors]{Precision improvements for pretrained word vectors}
\label{tab:}
\end{center}
\end{table}



\begin{table}[h]
\begin{center}

\begin{tabular}{llrrrrrr}
\toprule
 &&ABBR &DESC &ENTY &HUM &LOC &NUM \\
\midrule
logistic & test &0.01 &0.08 &0.07 & 0.14 & 0.10 & 0.07 \\
 & train &0.01 &0.11 &0.09 & 0.14 & 0.10 & 0.10 \\
svc & test & -0.00 &0.02 & -0.00 & 0.09 & 0.00 & 0.00 \\
 & train & -0.00 &0.03 &0.00 & 0.09 & 0.01 & 0.00 \\
\bottomrule
\end{tabular}

\caption[Precision improvements for pretrained word vectors on TREC dataset]{Precision improvements for pretrained word vectors on TREC dataset}
\label{tab:}
\end{center}
\end{table}


