Runtimes
========

In [None]:
import numpy as np
import pandas as pd
import glob, os, os.path, sys, warnings, math, time, re

%matplotlib inline
import matplotlib.pyplot as plt
from plotting.plots import *

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from techniques import sortedTechniques
from metric_learn import LMNN, NCA, LFDA, Covariance, MetricEvolution, NeuralNetworkTransformer, FullMatrixTransformer, DiagonalMatrixTransformer
from metric_learn import ITML_Supervised, SDML_Supervised, LSML_Supervised, RCA_Supervised


# Folder the benchmark CSV files are read from.
datasetsDirectory = 'datasets'
# Folder the pickled runtime lists are written to (one .pkl per configuration).
resultsDirectory = 'results/runtimes'

# Create the results folder together with any missing parents. `exist_ok=True`
# replaces the separate exists()-then-makedirs() check, so a folder that is
# already there (or appears between the check and the call) is not an error.
os.makedirs(resultsDirectory, exist_ok=True)

# np.set_printoptions(precision=7, suppress=True, threshold=np.nan)
# Print floats inside numpy arrays with exactly five decimal places.
np.set_printoptions(formatter={'float': "{0:0.5f}".format})

import pickle
def save_obj(obj, name):
    """Pickle ``obj`` into ``<resultsDirectory>/<name>.pkl``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. The highest pickle protocol available to the
    running interpreter is used.
    """
    target = '{}/{}.pkl'.format(resultsDirectory, name)
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from ``<resultsDirectory>/<name>.pkl``.

    Errors raised by ``open`` (e.g. when the file does not exist) or by
    ``pickle.load`` are not caught here and propagate to the caller.
    """
    source = '{}/{}.pkl'.format(resultsDirectory, name)
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored
    
def exists_obj(name):
    """Tell whether ``<resultsDirectory>/<name>.pkl`` exists on disk."""
    candidate = '{}/{}.pkl'.format(resultsDirectory, name)
    return os.path.exists(candidate)

In [None]:
import glob, os

# Every CSV file found directly inside the datasets folder, in sorted order.
datasets = sorted(glob.glob("{}/*.csv".format(datasetsDirectory)))

# Uncomment a line to leave that dataset out of the benchmark:
# datasets.remove('datasets/gaussians.csv')
# datasets.remove('datasets/ionosphere.csv')
# datasets.remove('datasets/mice-protein.csv')
# datasets.remove('datasets/sonar.csv')
# datasets.remove('datasets/soybean-large.csv')

# For each dataset print its path, its (rows, columns) shape and the number of
# distinct values in its 'class' column. The first line of each file is
# skipped and the second one is used as the header row.
for csv_path in datasets:
    frame = pd.read_csv(csv_path, sep=',', skiprows=1, header=0)
    print(csv_path, frame.shape, frame['class'].nunique())

In [None]:
# Keyword arguments passed to every MetricEvolution instance created below.
common_ea_params = dict(
    s__stats=None,
    s__n_gen=200,
    transformer='full',
)

# One tuple per benchmarked method:
#   (transformer kinds to time, method label used in result file names, estimator)
# The estimator instances are created once, here, and reused for every dataset.
bestAlgorithms = [
    (
        ['full'],
        'stand+Cov+kNN',
        Covariance(),
    ),
    (
        ['full'],
        'stand+LMNN+kNN',
        LMNN(k=4, regularization=0.9, learn_rate=1e-8, max_iter=200),
    ),
    (
        ['full'],
        'stand+NCA+kNN',
        NCA(max_iter=200, learning_rate=0.01),
    ),
    (
        ['full'],
        'stand+LFDA+kNN',
        LFDA(k=3, num_dims=None),
    ),

    (
        ['full', 'diagonal'],
        'stand+CMAES+kNN',
        MetricEvolution(**common_ea_params, strategy='cmaes', fitnesses=[('knn', {'n_neighbors': 8})]),
    ),
    (
        ['full', 'diagonal'],
        'stand+CMAESFme+kNN',
        MetricEvolution(**common_ea_params, strategy='cmaes', fitnesses='wfme'),
    ),

#     (['full', 'diagonal'], 'stand+JDE+kNN', MetricEvolution(**common_ea_params, strategy='jde', fitnesses='wfme')),
#     (['full', 'diagonal'], 'stand+JDEkNN+kNN', MetricEvolution(**common_ea_params, strategy='jde', fitnesses=[('knn', {'n_neighbors':8})])),
]

In [None]:
# Number of timing samples to collect per (dataset, method, transformer)
# configuration. It drives both the number of passes over the datasets and the
# "already done" check, so the two can no longer drift apart (the check used to
# be a temporary `>= 1`).
runsPerConfiguration = 10

# Datasets for which the 'full' transformer is not timed. They are matched by
# file stem, so the check works regardless of the path separator glob returns.
skipFullTransformer = {'digits10', 'digits6', 'mice-protein', 'sonar'}

for _ in range(runsPerConfiguration):
    for filename in datasets:
        # File name without folder and extension, e.g. 'datasets/sonar.csv' -> 'sonar'.
        datasetName = os.path.splitext(os.path.basename(filename))[0]

        data = pd.read_csv(filename, sep=',', skiprows=1, header=0)

        y = data['class']
        X = data.drop(['class'], axis=1).values

        # Encode the class labels as integers.
        le = LabelEncoder()
        y = le.fit_transform(y)

        # A fresh, unseeded random split is drawn on every pass.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

        # Mean-impute missing values; the imputer is fitted on the training part only.
        # NOTE(review): `sklearn.preprocessing.Imputer` was removed in
        # scikit-learn 0.22 (successor: `sklearn.impute.SimpleImputer`), so this
        # cell needs an older scikit-learn release - confirm the pinned version.
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=False)
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        # Standardise features using statistics of the training part only.
        standard = StandardScaler()
        X_train = standard.fit_transform(X_train)
        X_test = standard.transform(X_test)

        for transformers, methodName, method in bestAlgorithms:
            for transformer in transformers:
                if transformer == 'full' and datasetName in skipFullTransformer:
                    continue

                # One pickle file per configuration holds the list of runtimes so far.
                dump_filename = '{}__{}__{}'.format(datasetName, methodName, transformer)
                if not exists_obj(dump_filename):
                    save_obj([], dump_filename)

                prev_runtimes = load_obj(dump_filename)

                if len(prev_runtimes) >= runsPerConfiguration:
                    print('skipping {}, already exists'.format(dump_filename))
                    continue

                print("starting {} ({})".format(dump_filename, X.shape))
                print("{} runs so far: {}".format(len(prev_runtimes), prev_runtimes))

                try:
                    # The estimator instance is shared between runs; only the
                    # MetricEvolution ones are told which transformer to use.
                    me = method
                    if isinstance(me, MetricEvolution):
                        me.params['transformer'] = transformer

                    # time.clock() was removed in Python 3.8; perf_counter() is
                    # the documented high-resolution replacement. Only fit() is timed.
                    start = time.perf_counter()
                    me.fit(X_train, y_train)
                    end = time.perf_counter()

                    save_obj(prev_runtimes + [end - start], dump_filename)
                except Exception as error:
                    # Best effort: report the failure and carry on with the next
                    # configuration. KeyboardInterrupt is deliberately not caught
                    # so the run can still be stopped from the notebook.
                    print("Unexpected error:", repr(error))

In [None]:
# ## RENAMING results for full/diag
# results = []
# for file in glob.glob("{}/*.pkl".format(resultsDirectory)):
#     results.append(file)
# results.sort()
# results

# for x in results:
#     os.rename(x, x.replace('.pkl', '__full.pkl'))