In [7]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import tqdm
import glob
from tqdm import tqdm_notebook, tnrange
import pyperclip
sns.set()

In [9]:
import experiments as exp
import utils

# Batch results

In [10]:
# Merge every pickled result file into one nested mapping,
# results[k1][k2] = value, then keep only the rows under the 'batch' key.
result_pattern = exp.result_file_pattern
# Sort the matches: glob returns files in arbitrary order, and when two files
# share a (k1, k2) pair the later one wins, so a fixed order keeps the merge
# reproducible between runs.
result_fnames = sorted(glob.glob(result_pattern.format('*')))
results = defaultdict(dict)
for fname in result_fnames:
    # The context manager closes each handle as soon as the file is read
    # instead of leaking one open descriptor per result file.
    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at result files produced by trusted runs.
    with open(fname, 'rb') as fh:
        d = pickle.load(fh)
    for k1, inner in d.items():
        for k2, value in inner.items():
            results[k1][k2] = value
my_df = pd.DataFrame(results).loc['batch']

In [11]:
# Flatten the index into columns and discard 'level_4'
# (presumably the cross-validation fold id -- TODO confirm).
my_df_no_index = my_df.reset_index().drop(columns=['level_4'])

# Average all rows that share the four remaining index levels.
crossvalidated = (
    my_df_no_index
    .groupby(['level_0', 'level_1', 'level_2', 'level_3'], as_index=True)
    .mean()
)
crossvalidated.index.names = ['scheme', 'alpha', 'lsa', 'set']

# Move 'alpha' to the outermost level so that crossvalidated.loc[alpha]
# selects every row for a single alpha value.
crossvalidated = crossvalidated.swaplevel(0, 1)

In [12]:
# Persist the aggregated batch results to HDF5 under the key 'default'.
crossvalidated.to_hdf('clean_dumps/batch_results.hdf', key='default')

## Load LSA results

In [13]:
# Load the stored baseline results, keep only the three LSA rows and relabel
# them with their integer suffix (200 / 300 / 400).
lsa_df = (
    pd.read_hdf('clean_dumps/baseline_results.hdf')
    .loc[['lsa200', 'lsa300', 'lsa400']]
    .rename(index={'lsa200': 200, 'lsa300': 300, 'lsa400': 400})
)
lsa_df.index.names = ['lsa', 'scheme', 'set']

# Reorder the index to (scheme, lsa, set).
lsa_df = lsa_df.swaplevel(0, 1)

In [14]:
def form(x):
    r"""Render ``x`` as text, wrapping strictly positive values in ``\textbf{...}``.

    Args:
        x: Value to format; it is compared against 0 with ``>``.

    Returns:
        ``"\textbf{<x>}"`` when ``x > 0``, otherwise the plain ``str.format``
        rendering of ``x`` (this includes zero, negatives and NaN).
    """
    # Guard clause: anything that is not strictly positive stays unformatted.
    if not x > 0:
        return "{}".format(x)
    return "\\textbf{{{}}}".format(x)


In [15]:
# Difference between the alpha=0.1 batch results and the LSA results.
diff = crossvalidated.loc[0.1] - lsa_df

# Keep only the rows whose third index level ('set') equals 'test'.
test_dif = diff.loc[pd.IndexSlice[:, :, 'test'], :]

# Show the rounded differences, styled cell by cell with utils.color_positives
# (presumably highlights positive values -- confirm in utils).
test_dif.round(2).style.applymap(utils.color_positives)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CRDataset,MPQADataset,MRDataset,SUBJDataset,TRECDataset-ABBR,TRECDataset-DESC,TRECDataset-ENTY,TRECDataset-HUM,TRECDataset-LOC,TRECDataset-NUM
scheme,lsa,set,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,200,test,0.01,0.02,0.06,0.02,0,0.01,0.01,-0.0,0.01,-0.0
,300,test,0.02,0.02,0.05,-0.0,0,0.01,0.01,0.01,0.01,-0.0
,400,test,0.03,0.01,0.04,0.01,0,0.01,-0.01,0.01,0.02,0.0
tfchi2,200,test,0.01,0.0,0.01,0.01,0,0.03,0.03,0.02,-0.01,0.02
tfchi2,300,test,0.0,-0.0,0.02,0.01,0,0.01,0.01,-0.0,0.01,0.03
tfchi2,400,test,0.01,0.0,0.03,0.02,0,-0.01,0.02,0.01,0.01,0.02
tfgr,200,test,0.01,-0.0,0.01,0.02,0,-0.01,0.01,0.02,0.0,0.01
tfgr,300,test,0.01,-0.0,0.01,0.01,0,0.01,0.01,0.02,0.01,0.02
tfgr,400,test,0.03,0.01,0.01,0.02,0,0.03,0.01,0.02,0.0,0.01
tfidf,200,test,0.04,0.06,0.07,0.01,0,0.02,0.0,0.01,0.01,0.01


In [23]:
# Export the test-set differences as two LaTeX tables (non-TREC and TREC
# columns), print them and copy them to the clipboard.
to_table = test_dif.reset_index(level='set').drop(columns='set')

# One entry per table:
# (columns to render, header text to strip, caption, LaTeX label).
table_specs = [
    (utils.NOTREC, 'Dataset',
     'Accuracy increase over LSA', 'tab:batch:results'),
    (utils.TREC, 'TRECDataset-',
     'Accuracy increase over LSA on TREC datasets', 'tab:batch:results:trec'),
]

tables = []
for columns, strip, caption, label in table_specs:
    subset = to_table[columns].round(2)
    # to_latex expects one formatter per *rendered* column, so size the list
    # from the subset; sizing it from the full table relied on pandas not
    # validating the length.
    latex = subset.to_latex(
        formatters=[utils.bold_positives] * len(subset.columns))
    latex = utils.multireplace(latex, [
        # Undo the escaping applied to the markup emitted by the formatter.
        ('\\textbackslash', '\\'),
        ('  ', ' '),
        # Shorten the column headers.
        (strip, ''),
        ('\\{', '{'),
        ('\\}', '}'),
        # Add a vertical rule between the index columns and the data columns.
        ('llrrrr', 'll|rrrr'),
    ])
    tables.append(utils.tabular(latex, caption, label))

toprint = '\n\n\n\n'.join(tables)

print(toprint)
pyperclip.copy(toprint)


\begin{table}[H]
\begin{center}

\begin{tabular}{ll|rrrr}
\toprule
   &   &   CR &  MPQA &   MR &  SUBJ \\
scheme & lsa &        &        &        &        \\
\midrule
None & 200 &     -0.01 & \textbf{0.01} & \textbf{0.02} &      0.0 \\
   & 300 &      0.0 &      0.0 &     -0.01 &     -0.01 \\
   & 400 & \textbf{0.01} &     -0.01 & \textbf{0.01} & \textbf{0.01} \\
tfchi2 & 200 & \textbf{0.02} &      0.0 &     -0.02 &      0.0 \\
   & 300 &     -0.01 &      0.0 &     -0.01 &      0.0 \\
   & 400 &     -0.01 &     -0.0 &     -0.02 &      0.0 \\
tfgr & 200 &     -0.01 & \textbf{0.01} &     -0.01 &      0.0 \\
   & 300 &     -0.0 &      0.0 & \textbf{0.01} &     -0.01 \\
   & 400 &      0.0 &      0.0 &     -0.01 & \textbf{0.02} \\
tfidf & 200 & \textbf{0.01} & \textbf{0.01} &      0.0 &      0.0 \\
   & 300 &     -0.01 & \textbf{0.01} & \textbf{0.01} &     -0.01 \\
   & 400 & \textbf{0.01} &      0.0 &     -0.01 & \textbf{0.01} \\
tfig & 200 & \textbf{0.01} & \textbf{0.01} &     -0.01 & 

In [24]:
# For each remaining alpha, export the test-set differences to the LSA
# results as two LaTeX tables (non-TREC and TREC columns); print everything
# and copy it to the clipboard.
toprint = ''
for alpha in [0.01, 0.001]:
    diff = crossvalidated.loc[alpha] - lsa_df
    test_dif = diff.loc[(slice(None), slice(None), 'test'), :]
    # The Styler that used to be built here was never displayed inside the
    # loop (only a cell's last expression is rendered), so it was dropped.
    to_table = test_dif.reset_index(level='set').drop(columns='set')

    # One entry per table:
    # (columns to render, header text to strip, caption, LaTeX label).
    table_specs = [
        (utils.NOTREC, 'Dataset',
         'Accuracy increase over LSA for $\\alpha=%s$' % str(alpha),
         'tab:batch:results' + str(alpha)),
        (utils.TREC, 'TRECDataset-',
         'Accuracy increase over LSA for $\\alpha=%s$ on TREC datasets' % str(alpha),
         'tab:batch:results:trec' + str(alpha)),
    ]

    alpha_tables = []
    for columns, strip, caption, label in table_specs:
        subset = to_table[columns].round(2)
        # to_latex expects one formatter per *rendered* column, so size the
        # list from the subset rather than from the full table.
        latex = subset.to_latex(
            formatters=[utils.bold_positives] * len(subset.columns))
        latex = utils.multireplace(latex, [
            # Undo the escaping applied to the markup emitted by the formatter.
            ('\\textbackslash', '\\'),
            ('  ', ' '),
            # Shorten the column headers.
            (strip, ''),
            ('\\{', '{'),
            ('\\}', '}'),
            # Add a vertical rule between the index and the data columns.
            ('llrrrr', 'll|rrrr'),
        ])
        alpha_tables.append(utils.tabular(latex, caption, label))

    # NOTE(review): the separator is only placed between the two tables of one
    # alpha, not between consecutive alphas; kept as-is so the generated text
    # is unchanged.
    toprint += '\n\n\n\n'.join(alpha_tables)

print(toprint)
pyperclip.copy(toprint)


\begin{table}[H]
\begin{center}

\begin{tabular}{ll|rrrr}
\toprule
   &   &   CR &  MPQA &   MR &  SUBJ \\
scheme & lsa &        &        &        &        \\
\midrule
None & 200 &     -0.01 & \textbf{0.01} & \textbf{0.03} & \textbf{0.01} \\
   & 300 & \textbf{0.02} &     -0.0 & \textbf{0.03} &     -0.01 \\
   & 400 & \textbf{0.01} & \textbf{0.01} & \textbf{0.02} &     -0.0 \\
tfchi2 & 200 &     -0.01 &      0.0 &      0.0 & \textbf{0.01} \\
   & 300 &     -0.02 &      0.0 &     -0.0 & \textbf{0.01} \\
   & 400 &     -0.01 & \textbf{0.01} &     -0.0 &      0.0 \\
tfgr & 200 & \textbf{0.02} &     -0.0 & \textbf{0.02} & \textbf{0.01} \\
   & 300 &     -0.02 &     -0.0 & \textbf{0.03} & \textbf{0.01} \\
   & 400 &      0.0 & \textbf{0.01} &     -0.01 & \textbf{0.01} \\
tfidf & 200 &      0.0 & \textbf{0.01} & \textbf{0.05} &      0.0 \\
   & 300 &     -0.01 & \textbf{0.02} & \textbf{0.03} & \textbf{0.01} \\
   & 400 &      0.0 & \textbf{0.01} & \textbf{0.02} & \textbf{0.01} \\
tfig & 200