Hierarchical evaluation
========

In [142]:
import numpy as np
import pandas as pd
import glob
import os.path
import sys

from svecon.HierarchicalGridSearchCV import HierarchicalGridSearchCV
from svecon.EmptyTransformer import EmptyTransformer

from sklearn.cross_validation import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder

from metric_learn import LMNN, NCA, LFDA, Covariance
from metric_learn import ITML_Supervised, SDML_Supervised, LSML_Supervised, RCA_Supervised

# Plotly setup: register account credentials, then switch to offline/notebook
# rendering, which is what the plotting cells below use via `py.iplot`.
import plotly
# NOTE(review): the username and API key are hard-coded in source. Consider
# loading them from the environment or a local, untracked credentials file,
# and rotating this key if the notebook has been shared.
plotly.tools.set_credentials_file(username='sveco', api_key='8701ghzf0i')
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode()

# Input datasets are read from `datasetsDirectory`; per-pipeline result CSVs
# and the log file are written to `resultsDirectory`.
datasetsDirectory = 'datasets'
resultsDirectory = 'datasets-results-all'

# `exist_ok=True` replaces the previous exists()/makedirs() pair, which could
# fail if the directory appeared between the check and the creation.
os.makedirs(resultsDirectory, exist_ok=True)

# Defaults shared by every cross-validated grid search in this notebook.
default_n_jobs = 8
default_random_state = 789
default_n_folds = 10
default_shuffle = True

import logging
# Configure the root logger: drop any handlers left over from earlier cell
# runs and send everything (DEBUG and above) to a file in the results folder.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.handlers = []
fh = logging.FileHandler("{}/error.log".format(resultsDirectory))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
# Optional console handler, currently disabled:
# fh = logging.StreamHandler(sys.stdout)
# fh.setLevel(logging.ERROR)
# logger.addHandler(fh)

In [143]:
# Cross-validation splitters keyed by dataset name, so that every technique
# evaluated on the same dataset is scored on the same folds.
cv_per_dataset = {}
def evaluateClassifier(X, y, pipeline, parameters, name=None, datasetName=None):
    """Run a hierarchical grid search and return its results as a DataFrame.

    X, y        -- feature matrix and labels passed to the search's `fit`.
    pipeline    -- list of pipeline stages handed to HierarchicalGridSearchCV.
    parameters  -- list of parameter grids, one per stage.
    name        -- technique label stored in the `technique` column.
    datasetName -- dataset label stored in the `dataset` column; also the key
                   under which the fold splitter is cached.

    Each row of the result is one entry of `grid_scores_` with its nested
    `scores`, `params` and `times` mappings flattened into top-level columns.
    """
    try:
        cv = cv_per_dataset[datasetName]
    except KeyError:
        cv = StratifiedKFold(y, n_folds=default_n_folds, shuffle=default_shuffle, random_state=default_random_state)
        cv_per_dataset[datasetName] = cv

    search = HierarchicalGridSearchCV(pipeline, parameters, n_jobs=default_n_jobs, verbose=4, cv=cv)
    search.fit(X, y)

    nestedKeys = ('scores', 'params', 'times')
    rows = []
    for entry in search.grid_scores_:
        # Start from the entry itself, then let the nested mappings add
        # (or override) top-level keys in the same order as before.
        row = {**entry}
        for key in nestedKeys:
            row.update(entry[key])
        rows.append(row)

    # The nested mappings themselves are no longer needed as columns.
    for row in rows:
        for key in nestedKeys:
            row.pop(key)

    df = pd.DataFrame(rows)
    rowCount = df.shape[0]
    df['technique'] = pd.Series([name] * rowCount, index=df.index)
    df['dataset'] = pd.Series([datasetName] * rowCount, index=df.index)
    return df

def evaluatePipeline(X,y,datasetName,pipeline):
    """Run one pipeline function on a dataset and save its results as CSV.

    `pipeline` is a callable taking (X, y, datasetName) and returning a
    DataFrame. The result file name uses the callable's `__name__` with its
    first eight characters removed (the `pipeline` prefix of the functions in
    this file). If that file already exists, the run is skipped.
    Returns None in both cases.
    """
    technique = pipeline.__name__[8:]
    resFilename = '{}/{}_{}_result.csv'.format(resultsDirectory, datasetName, technique)

    # An existing result file marks this dataset/technique pair as done.
    if os.path.isfile(resFilename):
        logging.info("\t`{}` using `{}` already finished, skipping".format(datasetName, technique))
        return None

    logging.info("\t`{}` using `{}` started".format(datasetName, technique))
    pipeline(X, y, datasetName).to_csv(resFilename)
    logging.info("\t`{}` using `{}` finished".format(datasetName, technique))

def noConstraints(Y):
    """Return four candidate constraint counts scaled by the number of classes.

    With c distinct values in `Y`, the result is
    (10*c^2, 20*c^2, 40*c^2, 80*c^2).
    """
    classCount = len(set(Y))
    return tuple(factor * classCount * classCount for factor in (10, 20, 40, 80))

In [144]:
# Parameter grids for the hierarchical grid search. Keys follow the
# `<step name>__<parameter>` convention, so each key must name a constructor
# argument of the estimator registered under that step name in the pipelines.

# Grid for the final kNN stage, shared by every pipeline.
defaultKnnParams = {
    'knn__n_neighbors': (1, 2, 4, 8, 16, 32, 64, 128),
}
# Iteration budgets tried for the iterative metric learners.
defaultIters = (50, 250, 500, 1000)
# Shared preprocessing steps: mean imputation of NaNs and standardisation.
defaultImputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=False)
defaultStandardizer = StandardScaler(copy=False, with_mean=True, with_std=True)

paramsLmnn = {
    'lmnn__k': (1, 2, 4, 8, 16, 32),
    'lmnn__regularization': (.1, .5, .9),
    'lmnn__max_iter': defaultIters,
    'lmnn__learn_rate': (1e-7,)#, 1e-8, 1e-9),
}

# `itml__num_constraints` is added per dataset by the ITML pipeline functions.
paramsItml = {
#     'itml__num_constraints': (10, 100, 1000, 10000),
    'itml__gamma': (.01, .1, 1., 10.),
    'itml__max_iters': defaultIters,
}

paramsSdml = {
    'sdml__num_constraints': (10000, 100000),
    'sdml__use_cov': (True, False),
    'sdml__balance_param': (0.1, .25, .5, .75, 1),
    'sdml__sparsity_param': (.01, .05, .1, .25)
}

# `lsml__num_constraints` is added per dataset by the LSML pipeline functions.
paramsLsml = {
#     'lsml__num_constraints': (100, 1000, 10000, 100000),
    'lsml__max_iter': defaultIters,
}

paramsNca = {
    'nca__max_iter': defaultIters,
    # The NCA estimators in this file are built with `learning_rate=...`, so
    # the grid key must use that name (it was previously `nca__learn_rate`,
    # which does not match any NCA constructor argument used here).
    'nca__learning_rate': (0.1, 0.01),
}

# `lfda__k` is added per dataset by the LFDA pipeline functions.
paramsLfda = {
    'lfda__metric': ('weighted', 'orthonormalized'),
}

paramsRca = {
    'rca__num_chunks': (10, 50, 100, 500, 1000),
    'rca__chunk_size': (1, 2, 3, 5, 7, 10, 16, 32),
}

def pipelineKnn(X,y,datasetName):
    """Evaluate kNN on imputed features, with an `EmptyTransformer` middle stage.

    Only the kNN stage has a parameter grid. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "kNN".
    """
    preprocessStage = Pipeline([('imputer', defaultImputer)])
    transformStage = Pipeline([('empty', EmptyTransformer())])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    stages = [preprocessStage, transformStage, knnStage]
    grids = [{}, {}, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "kNN", datasetName)

def pipelineCovKnn(X,y,datasetName):
    """Evaluate imputer -> `Covariance` -> kNN.

    Only the kNN stage has a parameter grid. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "Cov+kNN".
    """
    preprocessStage = Pipeline([('imputer', defaultImputer)])
    metricStage = Pipeline([('cov', Covariance())])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    stages = [preprocessStage, metricStage, knnStage]
    grids = [{}, {}, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "Cov+kNN", datasetName)

def pipelineLmnnKnn(X,y,datasetName):
    """Evaluate imputer -> LMNN -> kNN.

    The LMNN stage is searched over `paramsLmnn` and the kNN stage over
    `defaultKnnParams`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "LMNN+kNN".
    """
    lmnn = LMNN(
        k=3,
        min_iter=50,
        max_iter=1000,
        learn_rate=1e-07,
        regularization=0.5,
        convergence_tol=0.001,
    )
    stages = [
        Pipeline([('imputer', defaultImputer)]),
        Pipeline([('lmnn', lmnn)]),
        Pipeline([('knn', KNeighborsClassifier())]),
    ]
    grids = [{}, paramsLmnn, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "LMNN+kNN", datasetName)


def pipelineItmlKnn(X,y,datasetName):
    """Evaluate imputer -> supervised ITML -> kNN.

    The ITML grid is `paramsItml` plus `itml__num_constraints`, whose
    candidates come from `noConstraints(y)`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "ITML+kNN".
    """
    itml = ITML_Supervised(
        gamma=1.,
        max_iters=1000,
        convergence_threshold=1e-3,
        num_constraints=None,
        bounds=None,
        A0=None,
    )
    stages = [
        Pipeline([('imputer', defaultImputer)]),
        Pipeline([('itml', itml)]),
        Pipeline([('knn', KNeighborsClassifier())]),
    ]
    itmlGrid = dict(paramsItml)
    itmlGrid['itml__num_constraints'] = noConstraints(y)
    grids = [{}, itmlGrid, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "ITML+kNN", datasetName)


def pipelineSdmlKnn(X,y,datasetName):
    """Evaluate imputer -> supervised SDML -> kNN.

    The SDML stage is searched over `paramsSdml` and the kNN stage over
    `defaultKnnParams`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "SDML+kNN".
    """
    sdml = SDML_Supervised(
        balance_param=0.5,
        sparsity_param=0.01,
        use_cov=True,
        num_constraints=None,
    )
    stages = [
        Pipeline([('imputer', defaultImputer)]),
        Pipeline([('sdml', sdml)]),
        Pipeline([('knn', KNeighborsClassifier())]),
    ]
    grids = [{}, paramsSdml, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "SDML+kNN", datasetName)


def pipelineLsmlKnn(X,y,datasetName):
    """Evaluate imputer -> supervised LSML -> kNN.

    The LSML grid is `paramsLsml` plus `lsml__num_constraints`, whose
    candidates come from `noConstraints(y)`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "LSML+kNN".
    """
    lsml = LSML_Supervised(
        tol=1e-3,
        max_iter=1000,
        prior=None,
        num_constraints=None,
    )
    stages = [
        Pipeline([('imputer', defaultImputer)]),
        Pipeline([('lsml', lsml)]),
        Pipeline([('knn', KNeighborsClassifier())]),
    ]
    lsmlGrid = dict(paramsLsml)
    lsmlGrid['lsml__num_constraints'] = noConstraints(y)
    grids = [{}, lsmlGrid, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "LSML+kNN", datasetName)


def pipelineNcaKnn(X,y,datasetName):
    """Evaluate imputer -> NCA -> kNN.

    The NCA stage is searched over `paramsNca` and the kNN stage over
    `defaultKnnParams`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "NCA+kNN".
    """
    nca = NCA(max_iter=100, learning_rate=0.01)
    stages = [
        Pipeline([('imputer', defaultImputer)]),
        Pipeline([('nca', nca)]),
        Pipeline([('knn', KNeighborsClassifier())]),
    ]
    grids = [{}, paramsNca, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "NCA+kNN", datasetName)


def pipelineLfdaKnn(X,y,datasetName):
    """Evaluate imputer -> LFDA -> kNN.

    The LFDA grid is `paramsLfda` plus `lfda__k`, searched over
    1 .. X.shape[1]-1. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "LFDA+kNN".
    """
    lfda = LFDA(dim=None, k=7, metric='weighted')
    stages = [
        Pipeline([('imputer', defaultImputer)]),
        Pipeline([('lfda', lfda)]),
        Pipeline([('knn', KNeighborsClassifier())]),
    ]
    lfdaGrid = dict(paramsLfda)
    lfdaGrid['lfda__k'] = tuple(range(1, X.shape[1]))
    grids = [{}, lfdaGrid, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "LFDA+kNN", datasetName)


def pipelineRcaKnn(X,y,datasetName):
    """Evaluate imputer -> supervised RCA -> kNN.

    The RCA stage is searched over `paramsRca` and the kNN stage over
    `defaultKnnParams`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "RCA+kNN".
    """
    rca = RCA_Supervised(dim=None, num_chunks=100, chunk_size=2)
    stages = [
        Pipeline([('imputer', defaultImputer)]),
        Pipeline([('rca', rca)]),
        Pipeline([('knn', KNeighborsClassifier())]),
    ]
    grids = [{}, paramsRca, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "RCA+kNN", datasetName)






def pipelineStandKnn(X,y,datasetName):
    """Evaluate kNN on imputed and standardised features.

    The middle stage is an `EmptyTransformer`; only the kNN stage has a
    parameter grid. Returns the DataFrame produced by `evaluateClassifier`,
    labelled "stand+kNN".
    """
    preprocessStage = Pipeline([
        ('imputer', defaultImputer),
        ('standardizer', defaultStandardizer),
    ])
    transformStage = Pipeline([('empty', EmptyTransformer())])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    stages = [preprocessStage, transformStage, knnStage]
    grids = [{}, {}, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "stand+kNN", datasetName)

def pipelineStandCovKnn(X,y,datasetName):
    """Evaluate imputer + standardiser -> `Covariance` -> kNN.

    Only the kNN stage has a parameter grid. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "stand+Cov+kNN".
    """
    preprocessStage = Pipeline([
        ('imputer', defaultImputer),
        ('standardizer', defaultStandardizer),
    ])
    metricStage = Pipeline([('cov', Covariance())])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    stages = [preprocessStage, metricStage, knnStage]
    grids = [{}, {}, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "stand+Cov+kNN", datasetName)

def pipelineStandLmnnKnn(X,y,datasetName):
    """Evaluate imputer + standardiser -> LMNN -> kNN.

    The LMNN stage is searched over `paramsLmnn` and the kNN stage over
    `defaultKnnParams`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "stand+LMNN+kNN".
    """
    preprocessStage = Pipeline([
        ('imputer', defaultImputer),
        ('standardizer', defaultStandardizer),
    ])
    lmnn = LMNN(
        k=3,
        min_iter=50,
        max_iter=1000,
        learn_rate=1e-07,
        regularization=0.5,
        convergence_tol=0.001,
    )
    metricStage = Pipeline([('lmnn', lmnn)])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    stages = [preprocessStage, metricStage, knnStage]
    grids = [{}, paramsLmnn, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "stand+LMNN+kNN", datasetName)


def pipelineStandItmlKnn(X,y,datasetName):
    """Evaluate imputer + standardiser -> supervised ITML -> kNN.

    The ITML grid is `paramsItml` plus `itml__num_constraints`, whose
    candidates come from `noConstraints(y)`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "stand+ITML+kNN".
    """
    preprocessStage = Pipeline([
        ('imputer', defaultImputer),
        ('standardizer', defaultStandardizer),
    ])
    itml = ITML_Supervised(
        gamma=1.,
        max_iters=1000,
        convergence_threshold=1e-3,
        num_constraints=None,
        bounds=None,
        A0=None,
    )
    metricStage = Pipeline([('itml', itml)])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    itmlGrid = dict(paramsItml)
    itmlGrid['itml__num_constraints'] = noConstraints(y)

    stages = [preprocessStage, metricStage, knnStage]
    grids = [{}, itmlGrid, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "stand+ITML+kNN", datasetName)


def pipelineStandSdmlKnn(X,y,datasetName):
    """Evaluate imputer + standardiser -> supervised SDML -> kNN.

    The SDML stage is searched over `paramsSdml` and the kNN stage over
    `defaultKnnParams`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "stand+SDML+kNN".
    """
    preprocessStage = Pipeline([
        ('imputer', defaultImputer),
        ('standardizer', defaultStandardizer),
    ])
    sdml = SDML_Supervised(
        balance_param=0.5,
        sparsity_param=0.01,
        use_cov=True,
        num_constraints=None,
    )
    metricStage = Pipeline([('sdml', sdml)])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    stages = [preprocessStage, metricStage, knnStage]
    grids = [{}, paramsSdml, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "stand+SDML+kNN", datasetName)


def pipelineStandLsmlKnn(X,y,datasetName):
    """Evaluate imputer + standardiser -> supervised LSML -> kNN.

    The LSML grid is `paramsLsml` plus `lsml__num_constraints`, whose
    candidates come from `noConstraints(y)`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "stand+LSML+kNN".
    """
    preprocessStage = Pipeline([
        ('imputer', defaultImputer),
        ('standardizer', defaultStandardizer),
    ])
    lsml = LSML_Supervised(
        tol=1e-3,
        max_iter=1000,
        prior=None,
        num_constraints=None,
        verbose=False,
    )
    metricStage = Pipeline([('lsml', lsml)])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    lsmlGrid = dict(paramsLsml)
    lsmlGrid['lsml__num_constraints'] = noConstraints(y)

    stages = [preprocessStage, metricStage, knnStage]
    grids = [{}, lsmlGrid, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "stand+LSML+kNN", datasetName)


def pipelineStandNcaKnn(X,y,datasetName):
    """Evaluate imputer + standardiser -> NCA -> kNN.

    The NCA stage is searched over `paramsNca` and the kNN stage over
    `defaultKnnParams`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "stand+NCA+kNN".
    """
    preprocessStage = Pipeline([
        ('imputer', defaultImputer),
        ('standardizer', defaultStandardizer),
    ])
    metricStage = Pipeline([('nca', NCA(max_iter=100, learning_rate=0.01))])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    stages = [preprocessStage, metricStage, knnStage]
    grids = [{}, paramsNca, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "stand+NCA+kNN", datasetName)


def pipelineStandLfdaKnn(X,y,datasetName):
    """Evaluate imputer + standardiser -> LFDA -> kNN.

    The LFDA grid is `paramsLfda` plus `lfda__k`, searched over
    1 .. X.shape[1]-1. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "stand+LFDA+kNN".
    """
    preprocessStage = Pipeline([
        ('imputer', defaultImputer),
        ('standardizer', defaultStandardizer),
    ])
    metricStage = Pipeline([('lfda', LFDA(dim=None, k=7, metric='weighted'))])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    lfdaGrid = dict(paramsLfda)
    lfdaGrid['lfda__k'] = tuple(range(1, X.shape[1]))

    stages = [preprocessStage, metricStage, knnStage]
    grids = [{}, lfdaGrid, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "stand+LFDA+kNN", datasetName)


def pipelineStandRcaKnn(X,y,datasetName):
    """Evaluate imputer + standardiser -> supervised RCA -> kNN.

    The RCA stage is searched over `paramsRca` and the kNN stage over
    `defaultKnnParams`. Returns the DataFrame produced by
    `evaluateClassifier`, labelled "stand+RCA+kNN".
    """
    preprocessStage = Pipeline([
        ('imputer', defaultImputer),
        ('standardizer', defaultStandardizer),
    ])
    rca = RCA_Supervised(dim=None, num_chunks=100, chunk_size=2)
    metricStage = Pipeline([('rca', rca)])
    knnStage = Pipeline([('knn', KNeighborsClassifier())])

    stages = [preprocessStage, metricStage, knnStage]
    grids = [{}, paramsRca, defaultKnnParams]
    return evaluateClassifier(X, y, stages, grids, "stand+RCA+kNN", datasetName)

In [147]:
import glob
import os

# Collect every CSV in the datasets directory in a deterministic order.
# Exclusions are matched on the file name only, so they no longer depend on
# the directory name or path separator, and a missing file is simply ignored
# instead of raising ValueError as `list.remove` did.
excludedDatasets = {'soybean-large.csv'}
datasets = sorted(
    path for path in glob.glob("{}/*.csv".format(datasetsDirectory))
    if os.path.basename(path) not in excludedDatasets
)

# NOTE(review): this slice keeps only the third dataset of the sorted list
# (index 2); remove it to evaluate all datasets.
datasets = datasets[2:3]
logging.info("Datasets: " + str(datasets))

In [149]:
# Pipelines run for every dataset, in this order. Commented-out entries are
# currently disabled.
activePipelines = (
    pipelineKnn,
    pipelineCovKnn,
    # pipelineLmnnKnn,
    pipelineItmlKnn,
    pipelineSdmlKnn,
    pipelineLsmlKnn,
    # pipelineNcaKnn,
    pipelineLfdaKnn,
    pipelineRcaKnn,

    pipelineStandKnn,
    pipelineStandCovKnn,
    # pipelineStandLmnnKnn,
    pipelineStandItmlKnn,
    pipelineStandSdmlKnn,
    pipelineStandLsmlKnn,
    # pipelineStandNcaKnn,
    pipelineStandLfdaKnn,
    pipelineStandRcaKnn,
)

for filename in datasets:
    results = []
    # Strip the "<datasetsDirectory>/" prefix and the ".csv" suffix.
    datasetName = filename[len(datasetsDirectory)+1:-4]

    logging.info("Starting `{}` dataset".format(datasetName))

    # The first line of each file is skipped; the following line is the header.
    data = pd.read_csv(filename, sep=',', skiprows=1, header=0)

    # Labels come from the `class` column; every other column is a feature.
    y = data['class']
    X = data.drop(['class'], axis=1).values

    le = LabelEncoder()
    y = le.fit_transform(y)

    # Debug output: per-class index sets and a random class index.
    # NOTE(review): the randint call also draws from NumPy's global RNG, so
    # it is kept to leave the random stream seen by later code unchanged.
    known_label_idx, = np.where(y >= 0)
    known_labels = y[known_label_idx]
    uniq, lookup = np.unique(known_labels, return_inverse=True)
    all_inds = [set(np.where(lookup==c)[0]) for c in range(len(uniq))]
    print(len(all_inds)-1)
    print(np.random.randint(0, high=len(all_inds)-1))

    for pipelineFn in activePipelines:
        evaluatePipeline(X, y, datasetName, pipelineFn)

1
0


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done  97 tasks       | elapsed:    6.0s
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:   20.6s
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed:  1.9min


KeyboardInterrupt: 

Statistics
===========

In [None]:
# Load every per-pipeline result file and stack them into one frame indexed by
# (dataset, technique, row number within the original result file).
resultsMask = "{}/*_result.csv".format(resultsDirectory)
# `pd.read_csv(..., index_col=0)` replaces the deprecated
# `pd.DataFrame.from_csv`; the first CSV column is the row index that
# `to_csv` wrote when the results were saved.
alldata = pd.concat([pd.read_csv(x, index_col=0) for x in sorted(glob.glob(resultsMask))])
alldata = alldata.set_index(['dataset','technique', alldata.index])
alldata

In [None]:
# dataset+methods (best run)
# For each dataset: one box per technique showing the per-fold scores of that
# technique's best run (highest `mean`), plus a line through the best means.
showBestParams = False
# Per-fold score columns; identical for every dataset/technique, so computed once.
scores = [ x for x in alldata.columns.values if x[:5]=='score']
for dataset in alldata.index.levels[0]:
    traces = []
    means = []
    # Techniques that actually contributed a box, so the mean line's x values
    # stay aligned with `means` even when some techniques are skipped.
    plottedTechniques = []
    for technique in alldata.index.levels[1]:
        techniqueRows = alldata.loc[dataset].loc[technique]
        if techniqueRows.shape[0]==0:
            continue

        # NOTE(review): `iloc` needs a position; `Series.argmax` returned the
        # index label in older pandas releases — confirm against the installed
        # version (labels and positions coincide for a default 0..n-1 index).
        bestRowNum = techniqueRows['mean'].argmax()
        bestRow = techniqueRows.iloc[bestRowNum,:]
        plottedTechniques.append(technique)
        means.append( bestRow['mean'] )
        traces.append(
            go.Box(
                y=bestRow[scores].values,
                name=technique,
            )
        )

        if showBestParams:
            print("======",dataset,"======",technique,"======")
            print(bestRow)
            print()

    traces.append( go.Scatter( x=plottedTechniques, y=means, mode='lines', name='mean', marker=dict(color='black') ))

    layout = go.Layout(
        yaxis=dict(title='successrate', zeroline=False ),
        title=dataset,
    )
    fig = go.Figure(data=traces, layout=layout)
    py.iplot(fig)
#     plotly.plotly.image.save_as(fig, filename='{}/{}-plot.png'.format(resultsDirectory, dataset))


In [None]:
# methods across all datasets (best run)
# Pool, per technique, the best-run fold scores and best-run means from every
# dataset, then draw one box per technique and a line through the average of
# the best means.
scores = [ x for x in alldata.columns.values if x[:5]=='score']
valuesPerTechnique = {}
for dataset in alldata.index.levels[0]:
    for technique in alldata.index.levels[1]:
        techniqueRows = alldata.loc[dataset].loc[technique]
        if techniqueRows.shape[0]==0:
            continue

        bestRow = techniqueRows.iloc[techniqueRows['mean'].argmax(),:]

        # Register the technique even if this particular run is skipped below.
        collected = valuesPerTechnique.setdefault(technique, {'means':[], 'scores':[]})

        # SKIP ALL WHERE THERE IS SUCCESSRATE 0 IN ANY FOLD
        foldScores = bestRow[scores].values
        if any(x == 0 for x in foldScores):
            continue

        collected['means'].append(bestRow['mean'])
        collected['scores'].extend(foldScores)

traces = []
means = []
for technique in sorted(valuesPerTechnique):
    collected = valuesPerTechnique[technique]
    traces.append(go.Box(y=collected['scores'], name=technique))
    means.append((technique, np.mean(collected['means'])))

meanLine = go.Scatter(
    x=[technique for technique, _ in means],
    y=[average for _, average in means],
    mode='lines', name='mean', marker=dict(color='black')
)
traces.append(meanLine)

layout = go.Layout(
    yaxis=dict(title='successrate', zeroline=False ),
    title='Aggregated',
)

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)
#     plotly.plotly.image.save_as(fig, filename='{}/{}-plot.png'.format(resultsDirectory, dataset))


In [None]:
# times for best run
# For each dataset: stacked bars, one stack per technique, showing every
# `time*` column of that technique's best run (highest `mean`).
timeCols = [ x for x in alldata.columns.values if x[:4]=='time']
for dataset in alldata.index.levels[0]:

    # Find each technique's best row once, remembering which techniques were
    # kept so the bar x values stay aligned with the y values when a technique
    # is skipped.
    plottedTechniques = []
    bestRows = []
    for technique in alldata.index.levels[1]:
        techniqueRows = alldata.loc[dataset].loc[technique]
        if techniqueRows.shape[0]==0:
            continue

        bestRowNum = techniqueRows['mean'].argmax()
        plottedTechniques.append(technique)
        bestRows.append(techniqueRows.iloc[bestRowNum,:])

    traces = []
    for timeCol in timeCols:
        times = [ bestRow[timeCol] for bestRow in bestRows ]
        # Missing timings are drawn as zero-height segments.
        times = [0 if np.isnan(x) else x for x in times]

        traces.append(
            go.Bar(
                y=times,
                x=plottedTechniques,
                name=timeCol
            )
        )

    layout = go.Layout(
        barmode='stack',
        yaxis=dict(
            title='seconds',
            zeroline=False
        ),
        title=dataset,
    )
    fig = go.Figure(data=traces, layout=layout)
    py.iplot(fig)
#     plotly.plotly.image.save_as(fig, filename='{}/{}-plot.png'.format(resultsDirectory, dataset))


In [None]:
# average times
# For each dataset: stacked bars, one stack per technique, showing the mean of
# every `time*` column over all of that technique's runs.
timeCols = [ x for x in alldata.columns.values if x[:4]=='time']
for dataset in alldata.index.levels[0]:

    # Gather each technique's rows once, remembering which techniques were
    # kept so the bar x values stay aligned with the y values when a technique
    # is skipped.
    plottedTechniques = []
    rowsPerTechnique = []
    for technique in alldata.index.levels[1]:
        techniqueRows = alldata.loc[dataset].loc[technique]
        if techniqueRows.shape[0]==0:
            continue

        plottedTechniques.append(technique)
        rowsPerTechnique.append(techniqueRows)

    traces = []
    for timeCol in timeCols:
        times = [ techniqueRows[timeCol].mean() for techniqueRows in rowsPerTechnique ]
        # Missing timings are drawn as zero-height segments.
        times = [0 if np.isnan(x) else x for x in times]

        traces.append(
            go.Bar(
                y=times,
                x=plottedTechniques,
                name=timeCol
            )
        )

    layout = go.Layout(
        barmode='stack',
        yaxis=dict(
            title='seconds',
            zeroline=False
        ),
        title=dataset,
    )
    fig = go.Figure(data=traces, layout=layout)
    py.iplot(fig)
#     plotly.plotly.image.save_as(fig, filename='{}/{}-plot.png'.format(resultsDirectory, dataset))
