### topics

* cross validation over the whole data set
* t-test on the basis of cross validation results

In [1]:
# get the fasttext embeddings 

#import torch
import numpy as np

def load_emb_from_file(filepath):
    """Load word embeddings from a fastText-style text (``.vec``) file.

    Every data line has the form ``word num1 num2 ...`` with single spaces
    as separators. An optional first line holding exactly two integers
    (``<number of words> <dimension>``, as written by fastText) is treated
    as a header and skipped, and blank lines are ignored.

    Parameters
    ----------
    filepath : str or path-like
        Location of the UTF-8 encoded embedding file.

    Returns
    -------
    word_to_index : dict
        Maps each word to the position of its vector in ``embeddings``.
        If a word occurs more than once, the last occurrence wins.
    embeddings : list of numpy.ndarray
        One 1-d float array per data line, in file order.
    """
    word_to_index = {}
    embeddings = []
    # read as UTF-8 explicitly: the locale default may not decode umlauts
    with open(filepath, "r", encoding="utf-8") as fp:
        for line_no, line in enumerate(fp):
            # strip the newline (and a possible trailing space) so that the
            # last field is always a clean number
            fields = line.rstrip().split(" ")
            if fields == [""]:
                continue  # blank line
            if line_no == 0 and len(fields) == 2 and all(f.isdigit() for f in fields):
                continue  # "<count> <dim>" header, not an embedding
            # index by position in `embeddings` so skipped lines leave no gaps
            word_to_index[fields[0]] = len(embeddings)
            embeddings.append(np.array([float(val) for val in fields[1:]]))
    return word_to_index, embeddings

In [2]:
# widx: word -> position of its vector in emb; emb: list of embedding vectors.
# The original absolute path only exists on the author's machine, so the
# location can be overridden with the FASTTEXT_VEC environment variable;
# without it the original path is used unchanged.
import os

EMB_PATH = os.environ.get("FASTTEXT_VEC", "/home/klenner/Lehre/ml20/cc.de.300.vec")
widx,emb=load_emb_from_file(EMB_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/home/klenner/Lehre/ml20/cc.de.300.vec'

In [None]:
import pandas as pd
from sklearn.utils import shuffle

# data of learning task: polarity lexicon for German
# both columns (lemma, polarity tag) are read as a two-level index
polex=pd.read_csv("/home/klenner/Lehre/ml20/polexNeutNomen",header=None,
                  index_col=[0,1],usecols=[0,1],names=['lemma', 'pol'])

# polarity tag -> class label; every other tag falls into class 2
label_of = {'POS': 1, 'PRO': 1, 'NEG': 0, 'CON': 0}

ids=[]     # gather all embedding indices
index={}   # map embedding index to class label
oov=0      # out-of-vocabulary counter

for lemma, pol in polex.index:  # (lemma, polarity) pairs
    try:
        emb_id = widx[lemma]    # position of the lemma's vector in emb
    except KeyError:
        # only a missing lemma counts as out of vocabulary;
        # any other error should surface instead of being swallowed
        oov += 1
        continue
    index[emb_id] = label_of.get(pol, 2)
    ids.append(emb_id)          # all ids for the data split below

# sklearn.utils.shuffle returns a shuffled copy and leaves its argument
# untouched, so the result has to be assigned back
ids = shuffle(ids, random_state=0)
X=[emb[i] for i in ids]
y=[index[i] for i in ids]
len(ids)

In [None]:
from sklearn.linear_model import Perceptron
from sklearn import svm
from sklearn.neural_network import MLPClassifier

# fit() returns the estimator itself, so construction and training are chained
perceptron = Perceptron(tol=1e-3).fit(X, y)

# from here on the name `svm` refers to the fitted classifier,
# no longer to the imported sklearn module
svm = svm.SVC(kernel='linear', C=1.0, random_state=0).fit(X, y)

# two hidden layers (300 and 10 units); 20% of the training data is held
# out for early stopping
mlp = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    activation='relu',
    hidden_layer_sizes=(300,10),
    random_state=1,
    validation_fraction=0.2,
    early_stopping=True,
)
mlp.fit(X, y)

### 1. Statement

cross validation produces stable accuracy results

* we compare 5-fold to 10-fold to 50-fold cross validation
* cross validation
    * split the data set into n folds
    * take each fold once as test and 
    * n-1 times as part of training
    * take the average of results
    
the average scores vary only slightly, they are stable

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross validation of the perceptron: one accuracy value per fold
scores = cross_val_score(estimator=perceptron, X=X, y=y, cv=5)

print(
    "all scores:", scores,
    "\naverage score of 5 fold cv:", scores.mean(),
)

In [None]:
# same estimator with 10 folds; the fold-wise scores keep their own name
perceptron_scores10 = cross_val_score(estimator=perceptron, X=X, y=y, cv=10)
print(
    "10 fold cv:",
    perceptron_scores10.mean(),
)

In [None]:
# 50-fold cross validation of the perceptron. The printed label is built from
# the same fold count that is passed to cross_val_score, so the text cannot
# disagree with the cv argument.
n_folds = 50
scores = cross_val_score(perceptron, X, y, cv=n_folds)
print(n_folds, "fold cv:", scores.mean())

### 2. Statement

since cv results are stable, we can more reliably apply the t-test

* we compare mlp with perceptron and svm
* we do it with the results of the n folds
    * this is valid only if the folds are determined equally in each run 
    * otherwise the test folds could differ and we don't have proper pairs 
    * reason: we are applying the paired t-test
    * sklearn guarantees this when `cv` is an integer: the folds are built without shuffling, so every run yields the same split (see next cell)
    * sklearn produces stratified folds, i.e. the class distribution is preserved in each fold

In [None]:
# If we run the split twice, the results don't differ:
# given a corpus, every splitting into folds gives the same folds, which is
# crucial for the t-test comparison. The cell splits twice and checks this.

from sklearn.model_selection import StratifiedKFold

Xcv = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
ycv = np.array([0, 0, 1, 1])

skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(Xcv, ycv)

print(skf)

# first pass: only remember the folds
folds_first_run = [(train.tolist(), test.tolist()) for train, test in skf.split(Xcv, ycv)]

# second pass: print the folds and remember them for the comparison
folds_second_run = []
for train_index, test_index in skf.split(Xcv, ycv):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = Xcv[train_index], Xcv[test_index]
    y_train, y_test = ycv[train_index], ycv[test_index]
    folds_second_run.append((train_index.tolist(), test_index.tolist()))

assert folds_first_run == folds_second_run, "folds differ between two runs of the split"

In [None]:
# we compare mlp to perceptron
# both estimators are scored with 10-fold cross validation on the same X, y

perceptron_scores10 = cross_val_score(estimator=perceptron, X=X, y=y, cv=10)
mlp_scores10 = cross_val_score(estimator=mlp, X=X, y=y, cv=10)

# average accuracy over the ten folds
res_perceptron = perceptron_scores10.mean()
res_mlp = mlp_scores10.mean()

print("mlp:", res_mlp, "perceptron:", res_perceptron)

In [None]:
# fold-wise accuracies of both classifiers, the input of the paired t-test
print(
    "mlp scores 10 fold cv\n", mlp_scores10,
    "\n\nperceptron scores 10 fold cv\n", perceptron_scores10,
)


        $\hat\sigma_{\bar{x}} = \frac{\hat{\sigma}}{\sqrt{n}}$, with $\hat{\sigma} \approx s$
        
        


In [None]:
# paired t-test, null hypothesis is: both classifier have equal performance
from scipy.stats import t

# df holds the number of folds, i.e. the number of paired differences;
# the degrees of freedom of the paired t-test are df-1
df=10  # number of folds
length=df

# critical value: 99% of the t-distribution with df-1 degrees of freedom lies below it
t_crit=t.ppf(0.99,df-1)
print("99% of values are smaller than",t_crit)

# subtract the accuracy scores fold by fold and take the mean difference
diff=mlp_scores10-perceptron_scores10
mean=diff.mean()

# t-value: mean difference divided by its standard error (sample std, ddof=1)
t_val=mean/(np.std(diff,ddof=1)/np.sqrt(length))

# state the conclusion that the computed values actually support
if t_val > t_crit:
    print("\nt_val is higher than this, namely:",t_val,"- we thus reject that both have equal performance")
else:
    print("\nt_val is not higher than this, namely:",t_val,"- we thus cannot reject that both have equal performance")

In [None]:
# let's compare mlp with svm
# again 10-fold cross validation for both estimators on the same X, y

mlp_scores10 = cross_val_score(estimator=mlp, X=X, y=y, cv=10)
svm_scores10 = cross_val_score(estimator=svm, X=X, y=y, cv=10)

# average accuracy over the ten folds
res_mlp = mlp_scores10.mean()
res_svm = svm_scores10.mean()

print("mlp:", res_mlp, "svm:", res_svm)

In [None]:
# paired t-test mlp vs. svm on the fold-wise accuracy differences
diff=mlp_scores10-svm_scores10
mean=diff.mean()

# number of paired differences, taken from the data itself
n_pairs=len(diff)

# the standard error needs the sample standard deviation (ddof=1), as in the
# mlp/perceptron comparison; the numpy default ddof=0 would inflate the t-value
t_val=mean/(np.std(diff,ddof=1)/np.sqrt(n_pairs))

# critical value: 95% of the t-distribution with n_pairs-1 degrees of freedom lies below it
t_crit=t.ppf(0.95,n_pairs-1)
print("95% of values are smaller than",t_crit)

# state the conclusion that the computed values actually support
if t_val > t_crit:
    print("\nt_val is higher than this, namely:",t_val,"- we thus reject the null hypothesis")
else:
    print("\nt_val is smaller than this, namely:",t_val,"- we thus cannot reject the null hypothesis")
            