Datasets
========

In [None]:
# import csv
import numpy as np
# import scipy.linalg as la
# import matplotlib.pyplot as plt
import pandas as pd

# from svecon.KNNClassifierPerClass import KNNClassifierPerClass

from sklearn.cross_validation import StratifiedKFold#, cross_val_score, train_test_split
# from sklearn.utils import shuffle
# from sklearn import metrics
# from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler

from metric_learn import LMNN, NCA, LFDA

from plotly.offline import download_plotlyjs, init_notebook_mode
init_notebook_mode()

In [None]:
def evaluateKnn(X_train, y_train, X_test, y_test, k=1, classifier=KNeighborsClassifier, show_confusion=False):
    """Fit a classifier on the training split and report its test accuracy.

    Parameters
    ----------
    X_train, y_train : training samples and their labels, passed to ``fit``.
    X_test, y_test : held-out samples and labels used for the evaluation.
    k : single positional argument handed to ``classifier`` on construction.
    classifier : callable returning an object with ``fit`` and ``predict``.
    show_confusion : when true, also print the confusion matrix.

    Returns
    -------
    tuple
        ``(fitted_classifier, wrongVec)`` where ``wrongVec`` is the
        element-wise result of ``predicted != y_test``.
    """
    knn = classifier(k)
    knn.fit(X_train, y_train)

    predicted = knn.predict(X_test)
    wrongVec = predicted != y_test
    N = len(y_test)
    wrongCount = sum(wrongVec)  # counted once, reused for both printed figures
    print('{}% success ({}/{})'.format((1 - wrongCount / N) * 100, N - wrongCount, N))

    if show_confusion:
        # Imported here because the module-level import of confusion_matrix is
        # commented out; without this the branch fails with NameError.
        from sklearn.metrics import confusion_matrix
        print(confusion_matrix(y_test, predicted))

    return (knn, wrongVec)

In [None]:
def evaluateClassifierOLD(X, y, name=None, transformation=None):
    """Search for the best kNN neighbour count with 5-fold stratified CV.

    Every ``k`` from 1 up to the size of the smallest class is scored, and
    the first ``k`` reaching the highest mean accuracy is reported via print.

    Parameters
    ----------
    X : pandas DataFrame of features.
    y : pandas Series of class labels.
    name : label printed in front of the result line.
    transformation : optional callable
        ``transformation(X_train, y_train, X_test) -> (X_train, X_test)``,
        applied to float arrays of each fold before the classifier is fitted.
    """
    smallestClass = y.value_counts().min()

    # The splitter depends only on y and a fixed seed, so it is built once
    # rather than once per candidate k.
    cv = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=777)

    bestAccuracy = (0, 0, 0)
    for k in range(1, smallestClass + 1):
        classifier = KNeighborsClassifier(n_neighbors=k, n_jobs=1)

        scores = []
        for train, test in cv:
            # The fold indices are positions, not index labels, so they must
            # go through .iloc; .loc only works for a default 0..n-1 index.
            y_train = y.iloc[train].values
            y_test = y.iloc[test].values

            if transformation is None:
                X_train, X_test = X.iloc[train], X.iloc[test]
            else:
                X_train, X_test = transformation(
                    X.iloc[train].astype(float).values,
                    y_train,
                    X.iloc[test].astype(float).values,
                )

            classifier.fit(X_train, y_train)
            scores.append(classifier.score(X_test, y_test))

        meanScore = np.mean(scores)
        # Strict comparison keeps the smallest k among ties.
        if bestAccuracy[0] < meanScore:
            bestAccuracy = (meanScore, np.std(scores) * 2, k)

    print("{} -> Best Accuracy: {} (+/- {}) for K={}".format(name, bestAccuracy[0], bestAccuracy[1], bestAccuracy[2]))

In [None]:
from time import time

def evaluateClassifier(X, y, pipeline, parameters, name=None):
    """Grid-search ``pipeline`` over ``parameters`` and print a short report.

    Parameters
    ----------
    X : object whose ``.values`` holds the feature matrix.
    y : object whose ``.values`` holds the class labels.
    pipeline : estimator handed to ``GridSearchCV``.
    parameters : parameter grid handed to ``GridSearchCV``.
    name : label printed at the top of the report.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # NOTE(review): with shuffle=False the random_state is presumably
    # ignored by the splitter - confirm against the installed sklearn.
    cv = StratifiedKFold(y, n_folds=5, shuffle=False, random_state=777)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=True, cv=cv, iid=True)

    t0 = time()
    grid_search.fit(X.values, y.values)
    print(name)
    print("\tDone in {}s".format(time() - t0))
    # best_score_ is a fraction; scale it so the value matches the '%' suffix.
    print("\tBest score: {}%".format(grid_search.best_score_ * 100))
    print("\tBest params: {}".format(grid_search.best_params_))
    print()
    return grid_search
#     print("Grid scores", grid_search.grid_scores_)
#     print("Best parameters set:")
#     best_parameters = grid_search.best_estimator_.get_params()
#     for param_name in sorted(parameters.keys()):
#         print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
import glob, os

# Every CSV file directly under datasets/, in sorted order.
datasets = sorted(glob.glob("datasets/*.csv"))

# Display names: drop the 9-character directory prefix and the 4-character
# ".csv" extension from each path.
datasetsNames = [path[9:-4] for path in datasets]

print(datasetsNames)

In [None]:
# Work on the first entry of the dataset list.
filename = datasets[0]

# skiprows=1 discards the first line of the file; the line after it is then
# read as the column header.
data = pd.read_csv(filename, sep=',', skiprows=1, header=0)

# The 'class' column holds the labels; every other column is a feature.
y = data['class']
X = data.drop(['class'], axis=1)

# Base hyper-parameter grid reused by the pipelines below: candidate
# neighbour counts for the pipeline step named 'knn'.
defaultParams = {
    'knn__n_neighbors': (1, 2, 4, 8, 16, 32),
}
# Preprocessing steps shared between pipelines: mean imputation of missing
# values, and standardization (centering and scaling). Both are configured
# with copy=False.
defaultImputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=False)
defaultStandardizer = StandardScaler(copy=False, with_mean=True, with_std=True)

# pipeline = Pipeline([
#     ('imputer', defaultImputer),
#     ('knn', KNeighborsClassifier()),
# ])
# params = {
#     **defaultParams
# }
# # evaluateClassifier(X, y, pipeline, params, "Original+kNN")


# pipeline = Pipeline([
#     ('imputer', defaultImputer),
#     ('standardizer', defaultStandardizer),
#     ('knn', KNeighborsClassifier()),
# ])
# params = {
#     **defaultParams
# }
# evaluateClassifier(X, y, pipeline, params, "Standardized+kNN")


# LMNN experiment: impute -> standardize -> LMNN metric learning -> kNN.
pipeline = Pipeline(steps=[
    ('imputer', defaultImputer),
    ('standardizer', defaultStandardizer),
    (
        'lmnn',
        LMNN(
            k=3,
            min_iter=50,
            max_iter=1000,
            learn_rate=1e-7,
            regularization=0.5,
            convergence_tol=1e-3,
        ),
    ),
    ('knn', KNeighborsClassifier()),
])

# Start from the shared kNN grid and add the LMNN-specific search space.
# Currently disabled alternatives: larger lmnn__k values (4, 8, 16),
# lmnn__regularization (.1, .5, .9), lmnn__max_iter (100, 500, 1000),
# lmnn__learn_rate (1e-7, 1e-8, 1e-9).
params = dict(defaultParams)
params['lmnn__k'] = (1, 2)

gs = evaluateClassifier(X, y, pipeline, params, name="LMNN+kNN")

In [None]:
# Display the scores recorded on the fitted grid-search object `gs`
# (presumably one entry per parameter combination - confirm against the
# installed sklearn version).
gs.grid_scores_

In [None]:
# NCA experiment: impute -> NCA metric learning -> kNN.
pipeline = Pipeline([
    ('imputer', defaultImputer),
    ('nca', NCA(max_iter=100, learning_rate=0.01)),
    ('knn', KNeighborsClassifier()),
])
params = {
    **defaultParams,
    'nca__max_iter': (100,),#, 500, 1000),
    # The grid key must use the same keyword the NCA constructor takes above
    # ('learning_rate'); the previous 'nca__learn_rate' did not match it.
    'nca__learning_rate': (0.1, 0.01),# 1e-3, 1e-5),
}
evaluateClassifier(X, y, pipeline, params, "NCA+kNN")

In [None]:
# LFDA experiment: impute -> LFDA metric learning -> kNN.
pipeline = Pipeline(steps=[
    ('imputer', defaultImputer),
    (
        'lfda',
        LFDA(
            dim=None,
            k=7,
            metric='weighted',
        ),
    ),
    ('knn', KNeighborsClassifier()),
])

# Start from the shared kNN grid and add the LFDA-specific search space.
# Currently disabled alternatives: larger lfda__k values (4, 8, 16).
params = dict(defaultParams)
params['lfda__k'] = (1, 2, 3)
params['lfda__metric'] = ('weighted', 'orthonormalized')

evaluateClassifier(X, y, pipeline, params, name="LFDA+kNN")