In [1]:
# Reload edited project modules automatically before each cell runs, so changes
# to the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import dill
from IPython.core import display as ICD  # to print multiple nice pandas tables
import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import tqdm
from tqdm import tqdm_notebook, tnrange
# Apply seaborn's default styling to all subsequent matplotlib figures.
sns.set()

In [3]:
import datasets
import classify
from simple_model import SimpleModel
from testing import test_simple_model
from utils import subtract_baseline, tabular

In [4]:
# Location of the pickled experiment results consumed by the rest of the notebook.
dump_file = 'dumps/results_baselines_simple_model.pickle'
# NOTE(security): unpickling can execute arbitrary code; only load dumps that
# were produced by this project.
# The context manager guarantees the file handle is closed once loading finishes.
with open(dump_file, 'rb') as dump_handle:
    loaded_results = pickle.load(dump_handle)

In [5]:
# Wrap the unpickled results in a DataFrame for the reshaping steps that follow.
df = pd.DataFrame(data=loaded_results)

In [6]:
# Move the row index into ordinary columns (used below as level_0, level_1, level_2).
df = df.reset_index(drop=False)

In [7]:
# Collect every level_1 label that does not mention 'mylogistic'.
nice = list(filter(lambda label: 'mylogistic' not in label, df['level_1']))

In [8]:
# Keep only the rows whose level_1 label survived the 'mylogistic' filter.
keep_mask = df['level_1'].isin(nice)
df = df.loc[keep_mask]

In [9]:
# level_1 labels are underscore-separated (e.g. 'logistic_lsa200_tfchi2');
# split each label once and spread the first three pieces into their own columns.
label_parts = [label.split('_') for label in df['level_1'].values]
df['classifier'] = [parts[0] for parts in label_parts]
df['lsa'] = [parts[1] for parts in label_parts]
df['scheme'] = [parts[2] for parts in label_parts]

In [10]:
# level_1 has been split into classifier/lsa/scheme above, and level_0 is not
# used again in this notebook, so both columns are dropped.
df = df.drop(labels=['level_0', 'level_1'], axis='columns')

In [11]:
# Index by (classifier, lsa, scheme, level_2), then keep only the 'logistic'
# classifier, which removes the classifier level from the index.
df = (
    df
    .set_index(['classifier', 'lsa', 'scheme', 'level_2'])
    .loc['logistic']
)

In [12]:
# Persist the cleaned logistic-regression results under the 'default' HDF key.
df.to_hdf('clean_dumps/batch_results.hdf', key='default')

In [13]:
# Partition the columns in one pass: names containing 'TREC' vs. all others.
trec, nottrec = [], []
for column in df.columns:
    (trec if 'TREC' in column else nottrec).append(column)

In [14]:
# Show train/test rows for every lsa/scheme combination, non-TREC columns only.
df.loc[pd.IndexSlice[:, :, ['train', 'test']], nottrec]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CRDataset,MPQADataset,MRDataset,SUBJDataset
lsa,scheme,level_2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,,test,0.780423,0.846371,0.770384,0.919
,,train,0.972185,0.91042,0.979365,0.994125
,tfchi2,test,0.748677,0.812441,0.675726,0.849
,tfchi2,train,0.785099,0.85396,0.706062,0.842875
,tfgr,test,0.753968,0.813384,0.675726,0.849
,tfgr,train,0.787748,0.854668,0.711455,0.8455
,tfidf,test,0.761905,0.847314,0.758201,0.916
,tfidf,train,0.999007,0.979137,1.0,1.0
,tfig,test,0.753968,0.813384,0.675726,0.849
,tfig,train,0.787748,0.854668,0.711455,0.8455


In [15]:
# Rows with an empty lsa label (the BOW baseline in the captions below):
# train/test rows, non-TREC columns.
bow = df.loc[''].loc[pd.IndexSlice[:, ['train', 'test']], nottrec]

In [16]:
# Render the BOW accuracies (rounded to 3 decimals) as a captioned LaTeX table.
latex = tabular(bow.round(3).to_latex(), 'Accuracy for BOW baseline')
# Strip the escaped 'level_2' index name and the 'Dataset' suffix from the headers.
# Raw string: '\_' is not a valid Python escape sequence and triggers a warning in
# newer Python versions; the raw literal has the same runtime value.
latex = latex.replace(r'level\_2', '').replace('Dataset', '')
print(latex)


\begin{table}[H]
\begin{center}

\begin{tabular}{llrrrr}
\toprule
{} &      &  CR &  MPQA &  MR &  SUBJ \\
scheme &  &            &              &            &              \\
\midrule
None & test &      0.780 &        0.846 &      0.770 &        0.919 \\
{} & train &      0.972 &        0.910 &      0.979 &        0.994 \\
tfchi2 & test &      0.749 &        0.812 &      0.676 &        0.849 \\
{} & train &      0.785 &        0.854 &      0.706 &        0.843 \\
tfgr & test &      0.754 &        0.813 &      0.676 &        0.849 \\
{} & train &      0.788 &        0.855 &      0.711 &        0.846 \\
tfidf & test &      0.762 &        0.847 &      0.758 &        0.916 \\
{} & train &      0.999 &        0.979 &      1.000 &        1.000 \\
tfig & test &      0.754 &        0.813 &      0.676 &        0.849 \\
{} & train &      0.788 &        0.855 &      0.711 &        0.846 \\
tfor & test &      0.772 &        0.839 &      0.773 &        0.904 \\
{} & train &      0.891 &        0.

In [17]:
# subtract_baseline modifies the frame it is given in place. Work on a copy so
# re-running this cell does not subtract the baseline a second time.
bow_improvement = bow.copy()
subtract_baseline(bow_improvement)
latex = tabular(bow_improvement.round(3).to_latex(), 'Accuracy improvements for BOW baseline')
# Raw string: '\_' is not a valid Python escape sequence and triggers a warning in
# newer Python versions; the raw literal has the same runtime value.
latex = latex.replace(r'level\_2', '').replace('Dataset', '')
print(latex)


\begin{table}[H]
\begin{center}

\begin{tabular}{llrrrr}
\toprule
{} &      &  CR &  MPQA &  MR &  SUBJ \\
scheme &  &            &              &            &              \\
\midrule
None & test &      0.143 &        0.159 &      0.270 &        0.419 \\
{} & train &      0.335 &        0.223 &      0.479 &        0.494 \\
tfchi2 & test &      0.111 &        0.125 &      0.176 &        0.349 \\
{} & train &      0.147 &        0.166 &      0.206 &        0.343 \\
tfgr & test &      0.116 &        0.126 &      0.176 &        0.349 \\
{} & train &      0.150 &        0.167 &      0.211 &        0.346 \\
tfidf & test &      0.124 &        0.160 &      0.258 &        0.416 \\
{} & train &      0.361 &        0.291 &      0.500 &        0.500 \\
tfig & test &      0.116 &        0.126 &      0.176 &        0.349 \\
{} & train &      0.150 &        0.167 &      0.211 &        0.346 \\
tfor & test &      0.135 &        0.151 &      0.273 &        0.404 \\
{} & train &      0.253 &        0.

In [18]:
# Drop the reference to the BOW frame; it is not used again in the cells below.
bow=None

In [19]:
# Rows with an empty lsa label (the BOW baseline in the captions below):
# train/test rows, TREC columns only.
bow_trec = df.loc[''].loc[pd.IndexSlice[:, ['train', 'test']], trec]

In [20]:
# Render the BOW accuracies on the TREC columns as a captioned LaTeX table.
latex = tabular(bow_trec.round(3).to_latex(), 'Accuracy for BOW baseline on TREC datasets')
# Clean-up order matters: drop the escaped index name, squeeze runs of three
# spaces, then shorten the column headers.
# Raw string: '\_' is not a valid Python escape sequence and triggers a warning in
# newer Python versions; the raw literal has the same runtime value.
latex = (
    latex
    .replace(r'level\_2', '')
    .replace('   ', '')
    .replace('TRECDataset-', '')
)
print(latex)


\begin{table}[H]
\begin{center}

\begin{tabular}{llrrrrrr}
\toprule
{} &&  ABBR &  DESC &  ENTY &  HUM &  LOC &  NUM \\
scheme &  & & & &&&\\
\midrule
None & test & 0.995 & 0.930 & 0.883 &0.921 &0.963 &0.960 \\
{} & train & 0.995 & 0.983 & 0.983 &0.986 &0.988 &0.988 \\
tfchi2 & test & 0.993 & 0.864 & 0.780 &0.886 &0.950 &0.928 \\
{} & train & 0.993 & 0.879 & 0.789 &0.897 &0.949 &0.922 \\
tfgr & test & 0.990 & 0.862 & 0.780 &0.888 &0.950 &0.936 \\
{} & train & 0.991 & 0.879 & 0.788 &0.896 &0.950 &0.933 \\
tfidf & test & 0.995 & 0.930 & 0.883 &0.918 &0.973 &0.970 \\
{} & train & 1.000 & 0.999 & 1.000 &1.000 &1.000 &1.000 \\
tfig & test & 0.990 & 0.862 & 0.780 &0.888 &0.950 &0.936 \\
{} & train & 0.991 & 0.879 & 0.788 &0.896 &0.950 &0.933 \\
tfor & test & 0.990 & 0.847 & 0.837 &0.923 &0.953 &0.960 \\
{} & train & 0.990 & 0.951 & 0.940 &0.967 &0.969 &0.970 \\
tfrf & test & 0.990 & 0.842 & 0.834 &0.908 &0.948 &0.948 \\
{} & train & 0.991 & 0.917 & 0.909 &0.942 &0.950 &0.955 \\
\bottomrule


In [21]:
# NOTE: subtract_baseline modifies bow_trec in place, so re-running this cell
# without re-running the cell that builds bow_trec subtracts the baseline again.
subtract_baseline(bow_trec)
latex = tabular(bow_trec.round(3).to_latex(), 'Accuracy improvements for BOW baseline on TREC datasets')
# Clean-up order matters: drop the escaped index name, squeeze runs of three
# spaces, then shorten the column headers.
# Raw string: '\_' is not a valid Python escape sequence and triggers a warning in
# newer Python versions; the raw literal has the same runtime value.
latex = (
    latex
    .replace(r'level\_2', '')
    .replace('   ', '')
    .replace('TRECDataset-', '')
)
print(latex)


\begin{table}[H]
\begin{center}

\begin{tabular}{llrrrrrr}
\toprule
{} &&  ABBR &  DESC &  ENTY &  HUM &  LOC &  NUM \\
scheme &  & & & &&&\\
\midrule
None & test & 0.011 & 0.148 & 0.108 &0.138 &0.117 &0.129 \\
{} & train & 0.010 & 0.202 & 0.209 &0.202 &0.142 &0.158 \\
tfchi2 & test & 0.009 & 0.083 & 0.006 &0.102 &0.104 &0.097 \\
{} & train & 0.009 & 0.098 & 0.015 &0.113 &0.103 &0.092 \\
tfgr & test & 0.006 & 0.081 & 0.006 &0.104 &0.104 &0.106 \\
{} & train & 0.007 & 0.097 & 0.014 &0.112 &0.103 &0.103 \\
tfidf & test & 0.011 & 0.148 & 0.108 &0.134 &0.127 &0.139 \\
{} & train & 0.016 & 0.218 & 0.226 &0.216 &0.154 &0.170 \\
tfig & test & 0.006 & 0.081 & 0.006 &0.104 &0.104 &0.106 \\
{} & train & 0.007 & 0.097 & 0.014 &0.112 &0.103 &0.103 \\
tfor & test & 0.006 & 0.066 & 0.063 &0.139 &0.107 &0.129 \\
{} & train & 0.006 & 0.170 & 0.166 &0.183 &0.123 &0.140 \\
tfrf & test & 0.006 & 0.061 & 0.060 &0.124 &0.102 &0.118 \\
{} & train & 0.007 & 0.136 & 0.135 &0.158 &0.104 &0.125 \\
\bottomrule


In [22]:
# Train/test rows for every lsa/scheme combination, non-TREC columns only.
all_res = df.loc[pd.IndexSlice[:, :, ['train', 'test']], nottrec]

In [23]:
def multireplace(txt, lis):
    """Apply a sequence of string replacements to ``txt``, in order.

    Args:
        txt: The string to transform.
        lis: An iterable of ``(old, new)`` pairs. Each pair is applied to the
            result of the previous one, so later pairs see earlier substitutions.

    Returns:
        The string obtained after all replacements have been applied.
    """
    result = txt
    for pair in lis:
        old, new = pair
        result = result.replace(old, new)
    return result

In [24]:
# subtract_baseline modifies the frame it is given in place. Take an explicit
# copy of the selection so the subtraction does not write into a slice of
# all_res (which triggers pandas' SettingWithCopyWarning).
lsas = all_res.loc[['lsa200','lsa300','lsa400']].copy()
subtract_baseline(lsas)
tex = lsas.round(3).to_latex()
# Raw string for the escaped index name: '\_' is not a valid Python escape
# sequence and triggers a warning in newer Python versions; the raw literal has
# the same runtime value.
tex = multireplace(tex, [
    ('Dataset',''),
    (r'level\_2',''),
    ('lsa','')
])
print(tabular(tex, 'Accuracy improvements for LSA baseline'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[d.name()] -= d.bias()



\begin{table}[H]
\begin{center}

\begin{tabular}{lllrrrr}
\toprule
{} & {} &      &  CR &  MPQA &  MR &  SUBJ \\
 & scheme &  &            &              &            &              \\
\midrule
200 & None & test &      0.093 &        0.095 &      0.198 &        0.360 \\
{} & None & train &      0.162 &        0.103 &      0.216 &        0.390 \\
{} & tfchi2 & test &      0.093 &        0.095 &      0.198 &        0.360 \\
{} & tfchi2 & train &      0.162 &        0.103 &      0.215 &        0.390 \\
{} & tfgr & test &      0.093 &        0.094 &      0.198 &        0.359 \\
{} & tfgr & train &      0.162 &        0.103 &      0.215 &        0.390 \\
{} & tfidf & test &      0.093 &        0.095 &      0.197 &        0.360 \\
{} & tfidf & train &      0.162 &        0.103 &      0.215 &        0.389 \\
{} & tfig & test &      0.093 &        0.094 &      0.197 &        0.360 \\
{} & tfig & train &      0.162 &        0.103 &      0.216 &        0.390 \\
{} & tfor & test &      0.093 &  

In [25]:
# Print one LaTeX table per LSA dimensionality.
for dim in [200,300,400]:
    tex = lsas.loc['lsa'+str(dim)].round(3).to_latex()
    # Raw string for the escaped index name: '\_' is not a valid Python escape
    # sequence and triggers a warning in newer Python versions; the raw literal
    # has the same runtime value.
    tex = multireplace(tex, [
        ('Dataset',''),
        (r'level\_2',''),
        ('lsa','')
    ])
    print(tabular(tex, 'Accuracy improvements for LSA baseline with {} dimensions'.format(dim)))


\begin{table}[H]
\begin{center}

\begin{tabular}{llrrrr}
\toprule
{} &      &  CR &  MPQA &  MR &  SUBJ \\
scheme &  &            &              &            &              \\
\midrule
None & test &      0.093 &        0.095 &      0.198 &        0.360 \\
{} & train &      0.162 &        0.103 &      0.216 &        0.390 \\
tfchi2 & test &      0.093 &        0.095 &      0.198 &        0.360 \\
{} & train &      0.162 &        0.103 &      0.215 &        0.390 \\
tfgr & test &      0.093 &        0.094 &      0.198 &        0.359 \\
{} & train &      0.162 &        0.103 &      0.215 &        0.390 \\
tfidf & test &      0.093 &        0.095 &      0.197 &        0.360 \\
{} & train &      0.162 &        0.103 &      0.215 &        0.389 \\
tfig & test &      0.093 &        0.094 &      0.197 &        0.360 \\
{} & train &      0.162 &        0.103 &      0.216 &        0.390 \\
tfor & test &      0.093 &        0.094 &      0.197 &        0.360 \\
{} & train &      0.162 &        0.

In [26]:
# Light-to-green colormap used to shade the table cells by value.
cm = sns.light_palette(color="green", as_cmap=True)
all_res.style.background_gradient(cm)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CRDataset,MPQADataset,MRDataset,SUBJDataset
lsa,scheme,level_2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,,test,0.780423,0.846371,0.770384,0.919
,,train,0.972185,0.91042,0.979365,0.994125
,tfchi2,test,0.748677,0.812441,0.675726,0.849
,tfchi2,train,0.785099,0.85396,0.706062,0.842875
,tfgr,test,0.753968,0.813384,0.675726,0.849
,tfgr,train,0.787748,0.854668,0.711455,0.8455
,tfidf,test,0.761905,0.847314,0.758201,0.916
,tfidf,train,0.999007,0.979137,1.0,1.0
,tfig,test,0.753968,0.813384,0.675726,0.849
,tfig,train,0.787748,0.854668,0.711455,0.8455


In [27]:
# Display the unstyled accuracies for reference.
all_res

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CRDataset,MPQADataset,MRDataset,SUBJDataset
lsa,scheme,level_2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,,test,0.780423,0.846371,0.770384,0.919
,,train,0.972185,0.91042,0.979365,0.994125
,tfchi2,test,0.748677,0.812441,0.675726,0.849
,tfchi2,train,0.785099,0.85396,0.706062,0.842875
,tfgr,test,0.753968,0.813384,0.675726,0.849
,tfgr,train,0.787748,0.854668,0.711455,0.8455
,tfidf,test,0.761905,0.847314,0.758201,0.916
,tfidf,train,0.999007,0.979137,1.0,1.0
,tfig,test,0.753968,0.813384,0.675726,0.849
,tfig,train,0.787748,0.854668,0.711455,0.8455


In [28]:
# NOTE(review): subtract_baseline appears to modify all_res in place (its return
# value is ignored and the displayed values change), so re-running this cell
# alone would subtract the baseline again. Re-run the cell that builds all_res first.
subtract_baseline(all_res)
all_res

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CRDataset,MPQADataset,MRDataset,SUBJDataset
lsa,scheme,level_2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,,test,0.142807,0.158647,0.270384,0.419
,,train,0.33457,0.222696,0.479365,0.494125
,tfchi2,test,0.111061,0.124717,0.175726,0.349
,tfchi2,train,0.147483,0.166236,0.206062,0.342875
,tfgr,test,0.116352,0.12566,0.175726,0.349
,tfgr,train,0.150132,0.166944,0.211455,0.3455
,tfidf,test,0.124289,0.15959,0.258201,0.416
,tfidf,train,0.361391,0.291413,0.5,0.5
,tfig,test,0.116352,0.12566,0.175726,0.349
,tfig,train,0.150132,0.166944,0.211455,0.3455


In [29]:
# Light-to-green colormap used to shade the table cells by value.
cm = sns.light_palette(color="green", as_cmap=True)
df.style.background_gradient(cm)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CRDataset,MPQADataset,MRDataset,SUBJDataset,TRECDataset-ABBR,TRECDataset-DESC,TRECDataset-ENTY,TRECDataset-HUM,TRECDataset-LOC,TRECDataset-NUM
lsa,scheme,level_2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,,test,0.780423,0.846371,0.770384,0.919,0.994966,0.92953,0.88255,0.921141,0.963087,0.959732
,,train,0.972185,0.91042,0.979365,0.994125,0.994539,0.983407,0.982777,0.985717,0.988448,0.988448
,,valid,0.814324,0.829406,0.753283,0.898,0.993277,0.934454,0.880672,0.92437,0.961345,0.969748
,tfchi2,test,0.748677,0.812441,0.675726,0.849,0.993289,0.864094,0.780201,0.885906,0.949664,0.927852
,tfchi2,train,0.785099,0.85396,0.706062,0.842875,0.993069,0.879437,0.78891,0.89666,0.94938,0.922075
,tfchi2,valid,0.761273,0.802074,0.659475,0.834,0.996639,0.895798,0.784874,0.90084,0.952941,0.929412
,tfgr,test,0.753968,0.813384,0.675726,0.849,0.989933,0.862416,0.780201,0.887584,0.949664,0.936242
,tfgr,train,0.787748,0.854668,0.711455,0.8455,0.990548,0.878807,0.78807,0.89582,0.94959,0.933207
,tfgr,valid,0.755968,0.804901,0.660413,0.842,0.994958,0.890756,0.788235,0.904202,0.951261,0.944538
,tfidf,test,0.761905,0.847314,0.758201,0.916,0.994966,0.92953,0.88255,0.917785,0.973154,0.969799


In [30]:
# Light-to-green colormap used to shade the table cells by value.
cm = sns.light_palette(color="green", as_cmap=True)
# Start again from the raw results (this rebinds df) and subtract the baseline.
df = pd.DataFrame(data=loaded_results)
subtract_baseline(df)
# Keep only the rows whose third index level is 'test'.
df = df.sort_index().loc[pd.IndexSlice[:, :, 'test'], :]
df.style.background_gradient(cm)

Unnamed: 0,Unnamed: 1,Unnamed: 2,CRDataset,MPQADataset,MRDataset,SUBJDataset,TRECDataset-ABBR,TRECDataset-DESC,TRECDataset-ENTY,TRECDataset-HUM,TRECDataset-LOC,TRECDataset-NUM
simple,constant__,test,-4.97565e-05,0.00030623,-0.000468604,0.0,-0.000817502,0.000293173,-0.000703616,-4.51036e-05,-0.000464567,5.97622e-05
simple,logistic__None,test,0.142807,0.158647,0.270384,0.419,0.0109275,0.147944,0.108357,0.137539,0.116985,0.129254
simple,logistic__tfchi2,test,0.111061,0.124717,0.175726,0.349,0.00924961,0.0825079,0.00600779,0.102304,0.103562,0.0973752
simple,logistic__tfgr,test,0.116352,0.12566,0.175726,0.349,0.00589391,0.0808301,0.00600779,0.103982,0.103562,0.105764
simple,logistic__tfidf,test,0.124289,0.15959,0.258201,0.416,0.0109275,0.147944,0.108357,0.134183,0.127052,0.139322
simple,logistic__tfig,test,0.116352,0.12566,0.175726,0.349,0.00589391,0.0808301,0.00600779,0.103982,0.103562,0.105764
simple,logistic__tfor,test,0.134871,0.151107,0.273196,0.404,0.00589391,0.0657294,0.0630548,0.139217,0.106918,0.129254
simple,logistic__tfrf,test,0.10577,0.147337,0.239456,0.386,0.00589391,0.0606959,0.0596991,0.124116,0.101884,0.117509
simple,logistic_lsa200_None,test,0.0928074,0.0945569,0.197751,0.3595,0.00606169,0.050461,0.0576856,0.12076,0.0950052,0.117509
simple,logistic_lsa200_tfchi2,test,0.0925428,0.0948397,0.198219,0.3597,0.00606169,0.0506287,0.0580212,0.12076,0.0948374,0.117509


In [31]:
cm = sns.light_palette("green", as_cmap=True)
# Results of the 'simple' model only; the remaining index levels are (label, split).
df_results = pd.DataFrame(loaded_results).loc['simple']
# Every label except the constant and 'mylog*' entries, plus the single constant
# baseline. The baseline appears in the results index as 'constant__' (empty lsa
# and scheme parts); the previous 'constant__None' matched no row, so the
# baseline was missing from the selection.
model_labels = [x for x in df_results.index.levels[0] if 'constant' not in x and 'mylog' not in x]
interesting = model_labels + ['constant__']

# Keep the test-split rows of the selected labels and drop the split level.
df = df_results.loc[(interesting, 'test'),:]
df.index = df.index.droplevel(1)