In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datasets
from itertools import chain 
from collections import Counter
import numpy as np
import pandas as pd

In [3]:
def get_stats(examples):
    words = list(chain(*examples))
    c = Counter(words)
    sc = sorted(list(c.items()))
    slens = [len(x) for x in examples]
    return {
        "#words": len(words),
        "#unique words": len(c),
        "\specialcell{#words with\\\\$1$ apperance}": len([x for x in c if c[x]==1]),
        "#examples": len(examples),
        "\specialcell{avg sentence\\\\length}": np.mean(slens),
        "\specialcell{max sentence\\\\length}": np.max(slens),
        "\specialcell{median sentence\\\\length}": np.median(slens),        
        "counter": c
    }

def explore_dataset(dataset):
    # number of tokens
    # number of unique tokens
    # same for positive, and negative
    stats = get_stats(dataset.samples)
    mean = np.mean(dataset.labels)
    stats['bias'] = max(mean, 1-mean)
    del stats['counter']
    return stats
        

In [4]:
all_stats = {}
for dataset in datasets.ALL_DATASETS+[datasets.TRECDataset()]:
    all_stats[dataset.name()[:-7]] = explore_dataset(dataset)

In [5]:
q = pd.DataFrame(all_stats)
q

Unnamed: 0,CR,MPQA,MR,SUBJ,TRECDatas
#examples,3775.0,10606.0,10662.0,10000.0,5952.0
#unique words,5674.0,6238.0,20325.0,22636.0,8968.0
#words,75932.0,32779.0,230162.0,246015.0,58468.0
\specialcell{#words with\\$1$ apperance},2714.0,3117.0,10160.0,11152.0,5338.0
\specialcell{avg sentence\\length},20.114437,3.090609,21.587132,24.6015,9.823253
\specialcell{max sentence\\length},106.0,44.0,62.0,122.0,37.0
\specialcell{median sentence\\length},18.0,2.0,21.0,23.0,9.0
bias,0.637616,0.687724,0.5,0.5,0.984039


In [6]:
df = pd.DataFrame(all_stats).round(2)
df.loc['bias']['TREC'] = None
raw_tex = df.to_latex() 
raw_tex = raw_tex.replace('textbackslashspecialcell\\','specialcell')
raw_tex = raw_tex.replace('textbackslash','')
raw_tex = raw_tex.replace('length\\','length')
raw_tex = raw_tex.replace('apperance\\','apperance')
raw_tex = raw_tex.replace('.000','')
raw_tex = raw_tex.replace('.00','')
raw_tex = raw_tex.replace('.0 &',' &')
raw_tex = raw_tex.replace('lrrrr','l|rrrr')
raw_tex = raw_tex.replace('\\\\','\\\\\hline')
raw_tex = raw_tex.replace('\hlinelength','length')
raw_tex = raw_tex.replace('.0 \\\\\hline', ' \\\\\hline ')
raw_tex = raw_tex.replace('\\hline\n\\bottomrule', '\n\\bottomrule')
raw_tex = raw_tex.replace('SUBJ \\\\\\hline', 'SUBJ \\\\')
raw_tex = raw_tex.replace('0.5 &', '0.50 &')
raw_tex = raw_tex.replace('24.6 &','24.60 &')
raw_tex = raw_tex.replace('words with\\\\\\hline\$1\\$','words with\\\$1$')

print(raw_tex)

\begin{tabular}{l|rrrrr}
\toprule
{} &        CR &      MPQA &         MR &      SUBJ &  TRECDatas \\\hline
\midrule
\#examples                                &   3775 &  10606 &   10662 &   10000 &    5952 \\\hline
\#unique words                            &   5674 &   6238 &   20325 &   22636 &    8968 \\\hline
\#words                                   &  75932 &  32779 &  230162 &  246015 &   58468 \\\hline
\specialcell{\#words with\\$1$ apperance} &   2714 &   3117 &   10160 &   11152 &    5338 \\\hline
\specialcell{avg sentence\\length}       &     20.11 &      3.09 &      21.59 &      24.60 &       9.82 \\\hline
\specialcell{max sentence\\length}       &    106 &     44 &      62 &     122 &      37 \\\hline
\specialcell{median sentence\\length}    &     18 &      2 &      21 &      23 &       9 \\\hline
bias                                     &      0.64 &      0.69 &       0.50 &       0.50 &       0.98 \\
\bottomrule
\end{tabular}



In [102]:
trec_stats = {} 
for lab in datasets.TRECDataset.SUPPORTED_LABELS:
    dataset = datasets.TRECDataset(task_label=lab)
    trec_stats[dataset.name().replace('TRECDataset-','')] = explore_dataset(dataset)

In [103]:
q = pd.DataFrame(trec_stats)
q.loc[['bias']]

Unnamed: 0,ABBR,DESC,ENTY,HUM,LOC,NUM
bias,0.984039,0.781586,0.774194,0.783602,0.846102,0.830477


In [104]:
df = pd.DataFrame(trec_stats).loc[['bias']].round(2)
raw_tex = df.to_latex() 
print(raw_tex)

\begin{tabular}{lrrrrrr}
\toprule
{} &  ABBR &  DESC &  ENTY &   HUM &   LOC &   NUM \\
\midrule
bias &  0.98 &  0.78 &  0.77 &  0.78 &  0.85 &  0.83 \\
\bottomrule
\end{tabular}



In [82]:
for lab in datasets.TRECDataset.SUPPORTED_LABELS:
    d = datasets.TRECDataset(task_label=lab)
    print(lab, " ".join(d.positives[0]))

DESC how did serfdom develop in and then leave russia ?
HUM what contemptible scoundrel stole the cork from my lunch ?
NUM when was ozzy osbourne born ?
ENTY what films featured the character popeye doyle ?
LOC what sprawling u.s. state boasts the most airports ?
ABBR what is the full form of .com ?


In [38]:
for dataset in datasets.ALL_DATASETS:
    print(dataset.name())
    print('\\emph{'+' '.join(dataset.positives[0])+'}')
    print()
    print('\\emph{'+' '.join(dataset.negatives[0])+'}')
    print()
    
for dataset in datasets.ALL_DATASETS:
    print(dataset.name())
    print('\\emph{'+' '.join(dataset.positives[1])+'}')
    print()
    print('\\emph{'+' '.join(dataset.negatives[1])+'}')
    print()

CRDataset
\emph{im a more happier person after discovering the i/p button ! .}

\emph{weaknesses are minor : the feel and layout of the remote control are only so-so ; . it does n 't show the complete file names of mp3s with really long names ; . you must cycle through every zoom setting ( 2x , 3x , 4x , 1/2x , etc . ) before getting back to normal size [ sorry if i 'm just ignorant of a way to get back to 1x quickly ] .}

MRDataset
\emph{the rock is destined to be the 21st century 's new `` conan `` and that he 's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .}

\emph{simplistic , silly and tedious .}

SUBJDataset
\emph{the movie begins in the past where a young boy named sam attempts to save celebi from a hunter .}

\emph{smart and alert , thirteen conversations about one thing is a small gem .}

MPQADataset
\emph{are also being encouraged}

\emph{complaining}

CRDataset
\emph{but , if you 're looking for my opinion of the apex