In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import dill
from IPython.core import display as ICD  # to print multiple nice pandas tables
import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import tqdm
from tqdm import tqdm_notebook, tnrange
sns.set()

In [2]:
import datasets
import classify
from simple_model import SimpleModel
from testing import test_simple_model

In [3]:
from collections import defaultdict
# Two independent accumulators whose missing keys materialise as empty dicts
# on first access. NOTE(review): `dump` is not referenced by any other visible
# cell -- confirm it is still needed.
results, dump = defaultdict(dict), defaultdict(dict)

In [4]:
import spacy

In [5]:
import tqdm
import numpy as np

# Load the spaCy package 'en_vectors_web_lg' once at module level;
# SpacyWordVectorModel.vectorize reads this global for every sample.
nlp_lg = spacy.load('en_vectors_web_lg')

class SpacyWordVectorModel(object):
    """Classifier wrapper that represents each sample by its spaCy document vector.

    Every sample is converted to a single vector by `vectorize`, the vectors
    are stacked row-wise into one 2-D array, and that array is handed to the
    wrapped classifier.

    Parameters
    ----------
    cls : object, optional
        Wrapped estimator. `fit` calls ``cls.fit(features, Y)`` and `predict`
        calls ``cls.predict(features)``. The default ``None`` is only a
        placeholder: `fit` and `predict` fail unless an estimator is supplied.

    Attributes
    ----------
    stacked : numpy.ndarray
        Feature matrix built from the training samples; set by `fit`.
    """

    def __init__(self, cls=None):
        self.cls = cls

    def _embed(self, X):
        """Vectorize every sample in `X` and stack the vectors row-wise.

        Shows a notebook progress bar while iterating. Raises ``ValueError``
        (from ``np.vstack``) when `X` is empty.
        """
        return np.vstack([self.vectorize(x) for x in tqdm.tqdm_notebook(X)])

    def fit(self, X, Y):
        """Embed the training samples `X` and fit the wrapped classifier on them.

        The embedded training matrix is kept on ``self.stacked``.
        """
        self.stacked = self._embed(X)
        self.cls.fit(self.stacked, Y)

    def predict(self, X):
        """Embed the samples `X` and return the wrapped classifier's predictions."""
        return self.cls.predict(self._embed(X))

    def vectorize(self, x):
        """Return the document vector for one sample.

        `x` is joined with single spaces before being passed to the
        module-level `nlp_lg` pipeline, so it is expected to be an iterable of
        strings (a plain string would be split into characters by the join).
        The result is the `vector` attribute of the parsed doc (in spaCy this
        is, by default, the average of the token vectors).
        """
        doc = nlp_lg(' '.join(x))
        return doc.vector

    def score(self, X, Y):
        """Return ``1 - mean((prediction - Y) ** 2)`` for the samples `X`.

        For 0/1 labels and 0/1 predictions this equals the accuracy.
        Predictions and labels are converted to float arrays first, so plain
        Python lists and boolean arrays are accepted (element-wise ``-`` is
        not defined for either of those).
        """
        Yhat = np.asarray(self.predict(X), dtype=float)
        Y = np.asarray(Y, dtype=float)
        return 1 - ((Yhat - Y) ** 2).mean()

    def update(self, X, Y):
        """Do nothing; `X` and `Y` are accepted and ignored.

        NOTE(review): presumably kept so this class exposes the same methods
        as other models in the project -- confirm against callers.
        """
        pass

In [7]:
# Start from an empty results table again: this discards anything stored in
# `results` by earlier cells, so the evaluation loops must be (re-)run after it.
results = defaultdict(dict)

In [8]:
# Evaluate the word-vector model wrapped around classify.SkClassifier on every
# dataset (general ones plus the TREC ones) and record the scores under the
# 'logistic' key of `results`.
for dataset in datasets.ALL_DATASETS + datasets.TREC_DATASETS:
    print()
    print(dataset.name())

    # A fresh classifier per dataset so nothing carries over between runs.
    model = SpacyWordVectorModel(classify.SkClassifier())
    model.fit(dataset.train_samples(), dataset.train_labels())

    # Score on the training split first, then on the held-out split.
    train_p = model.score(dataset.train_samples(), dataset.train_labels())
    test_p = model.score(dataset.test_samples(), dataset.test_labels())

    results[dataset.name()].update({
        ('logistic', 'train'): train_p,
        ('logistic', 'test'): test_p,
    })
    print(train_p, test_p)
    


CRDataset









0.8278145695364238 0.8174603174603174

MRDataset









0.7893070699964826 0.7928772258669166

SUBJDataset









0.922375 0.931

MPQADataset









0.8900282885431401 0.883129123468426

TRECDataset-HUM









0.9258559126233985 0.9278523489932886

TRECDataset-ABBR









0.9922285234194497 0.9899328859060402

TRECDataset-ENTY









0.8634740600714136 0.848993288590604

TRECDataset-NUM









0.9317370300357067 0.8959731543624161

TRECDataset-DESC









0.8911993278722957 0.8640939597315436

TRECDataset-LOC









0.9477000630119723 0.9446308724832215


In [38]:
from sklearn.svm import SVC
# Same evaluation as the logistic run, but with a default-parameter SVC as the
# wrapped classifier and only over datasets.ALL_DATASETS; scores are stored
# under the 'svc' key of `results`.
for dataset in datasets.ALL_DATASETS:
    print()
    print(dataset.name())

    # A fresh SVC per dataset so nothing carries over between runs.
    model = SpacyWordVectorModel(SVC())
    model.fit(dataset.train_samples(), dataset.train_labels())

    # Score on the training split first, then on the held-out split.
    train_p = model.score(dataset.train_samples(), dataset.train_labels())
    test_p = model.score(dataset.test_samples(), dataset.test_labels())

    results[dataset.name()].update({
        ('svc', 'train'): train_p,
        ('svc', 'test'): test_p,
    })
    print(train_p, test_p)
    


CRDataset









0.6380794701986755 0.6375661375661376

MRDataset









0.7442842068237777 0.7085285848172447

SUBJDataset









0.89625 0.919

MPQADataset









0.8672795851013673 0.8699340245051838


In [39]:
# Tabulate the scores: one column per dataset name, one row per
# (classifier, split) key stored in `results`.
pd.DataFrame(results)

Unnamed: 0,Unnamed: 1,CRDataset,MPQADataset,MRDataset,SUBJDataset
logistic,test,0.820106,0.882187,0.750703,0.93
logistic,train,0.825166,0.891561,0.795169,0.92225
svc,test,0.637566,0.869934,0.708529,0.919
svc,train,0.638079,0.86728,0.744284,0.89625


In [40]:
# Destination of the pickled results, relative to the current working directory.
to_file = 'dumps/wordvec_results.pickle'
# NOTE(review): presumably a manual switch -- set to False to avoid overwriting
# an existing dump when re-running the notebook.
if True:
    # The context manager flushes and closes the file even if pickling fails;
    # a bare open() would leave the handle to the garbage collector.
    with open(to_file, 'wb') as fh:
        pickle.dump(results, fh)

In [42]:
# Read the results back from `to_file`. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code -- only load dumps that this
# notebook (or another trusted source) wrote.
with open(to_file, 'rb') as fh:
    loaded = pickle.load(fh)

In [43]:
# Render the reloaded results so the pickle round-trip can be checked by eye
# against the table built directly from `results`.
pd.DataFrame(loaded)

Unnamed: 0,Unnamed: 1,CRDataset,MPQADataset,MRDataset,SUBJDataset
logistic,test,0.820106,0.882187,0.750703,0.93
logistic,train,0.825166,0.891561,0.795169,0.92225
svc,test,0.637566,0.869934,0.708529,0.919
svc,train,0.638079,0.86728,0.744284,0.89625
