Fitness evaluation
========

In [None]:
import numpy as np
import pandas as pd
import glob, os, os.path, sys, warnings, math, time, re

%matplotlib inline
import matplotlib.pyplot as plt
from plotting.plots import *

from techniques import sortedTechniques

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from metric_learn import LMNN, NCA, LFDA, Covariance, MetricEvolution, NeuralNetworkTransformer, FullMatrixTransformer, DiagonalMatrixTransformer
from metric_learn import ITML_Supervised, SDML_Supervised, LSML_Supervised, RCA_Supervised
ME = MetricEvolution

# Where the input CSV files live and where pickled results / figures are written.
datasetsDirectory = 'datasets'
resultsDirectory = 'results/fitness'
graphsDirectory = 'img/fitness'

# Make sure both output directories exist before anything tries to write to them.
for outputDirectory in (resultsDirectory, graphsDirectory):
    if not os.path.exists(outputDirectory):
        os.makedirs(outputDirectory)

# np.set_printoptions(precision=7, suppress=True, threshold=np.nan)
# Print every float in numpy arrays with exactly five decimals.
np.set_printoptions(formatter={'float': "{0:0.5f}".format})

import pickle
def save_obj(obj, name):
    """Pickle `obj` to `<resultsDirectory>/<name>.pkl`.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is serialised with `pickle.HIGHEST_PROTOCOL`.
    """
    target_path = '{}/{}.pkl'.format(resultsDirectory, name)
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled in `<resultsDirectory>/<name>.pkl`.

    Raises whatever `open` / `pickle.load` raise when the file is missing or
    cannot be unpickled.
    """
    source_path = '{}/{}.pkl'.format(resultsDirectory, name)
    with open(source_path, 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced by this notebook.
        loaded = pickle.load(handle)
    return loaded
    
def exists_obj(name):
    """Return True when `<resultsDirectory>/<name>.pkl` exists on disk."""
    candidate_path = '{}/{}.pkl'.format(resultsDirectory, name)
    return os.path.exists(candidate_path)

In [None]:
import glob, os

# Discover every CSV file in the datasets directory, in alphabetical order.
datasets = sorted(glob.glob("{}/*.csv".format(datasetsDirectory)))

# Individual datasets can be excluded from the discovered list, e.g.:
# datasets.remove('datasets/gaussians.csv')
# datasets.remove('datasets/ionosphere.csv')
# datasets.remove('datasets/mice-protein.csv')
# datasets.remove('datasets/sonar.csv')
# datasets.remove('datasets/soybean-large.csv')
# datasets.remove('datasets/digits6.csv')
# datasets.remove('datasets/digits10.csv')

# The discovered list is currently overridden by this hand-picked selection.
datasets = [
    'datasets/digits6.csv',
    'datasets/digits10.csv',
    'datasets/mice-protein.csv',
    'datasets/sonar.csv',
]

# Print path, table shape and number of distinct classes for each dataset.
for datasetPath in datasets:
    datasetFrame = pd.read_csv(datasetPath, sep=',', skiprows=1, header=0)
    print(datasetPath, datasetFrame.shape, datasetFrame['class'].nunique())

# datasets = ['datasets/breast-cancer.csv']

In [None]:
def evaluateAllIndividuals(me, individuals, X_train, y_train, X_test, y_test, dim=None, k=4):
    """Score every individual with a k-NN classifier on the test split.

    Each individual is first passed through `me._strategy.cut_individual`.
    When the resulting vector has one entry per column of `X_train`, a
    `DiagonalMatrixTransformer` is used; otherwise a
    `FullMatrixTransformer(n_components=dim)`. The transformer is fitted on
    the training data with the individual, a `KNeighborsClassifier` with
    `k` neighbours is fitted on the transformed training data, and its
    `score` on the transformed test data is recorded.

    Returns a list with one score per individual, in input order.
    """
    feature_count = X_train.shape[1]

    def score_one(raw_individual):
        genome = me._strategy.cut_individual(raw_individual)
        # One value per feature means a diagonal metric; anything else is
        # handed to the full-matrix transformer.
        if feature_count == len(genome):
            transformer = DiagonalMatrixTransformer()
        else:
            transformer = FullMatrixTransformer(n_components=dim)
        transformer.fit(X_train, y_train, genome)

        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(transformer.transform(X_train), y_train)
        return classifier.score(transformer.transform(X_test), y_test)

    return [score_one(candidate) for candidate in individuals]

from operator import attrgetter
def select_best_worst_mean(individuals):
    """Return the (best, worst, middle) individuals ranked by `fitness`.

    The individuals are ordered ascending by their `fitness` attribute.
    The first returned element is the last one in that order, the second is
    the first one, and the third is the element at index `len // 2`, i.e.
    the median element (despite the "mean" in the function name).

    Raises IndexError when `individuals` is empty.
    """
    ranked = list(individuals)
    ranked.sort(key=attrgetter("fitness"))
    middle_index = len(ranked) // 2
    return ranked[-1], ranked[0], ranked[middle_index]

In [None]:
# Keyword arguments shared by every MetricEvolution instance below.
common_params = {
    's__stats': 'identity',
    's__n_gen': 1000,
    'transformer': 'diagonal',
}

# One row per evaluated method: (label, strategy, fitnesses).
# The labels are also used as part of the result file names.
methodSpecs = [
    ('stand+CMAES+kNN', 'cmaes', [('knn', {'n_neighbors':8})]),
    ('stand+CMAESFme+kNN', 'cmaes', 'wfme'),
    ('stand+JDEkNN+kNN', 'jde', [('knn', {'n_neighbors':8})]),
    ('stand+JDE+kNN', 'jde', 'wfme'),
]
methods = [
    (label, MetricEvolution(**common_params, strategy=strategy, fitnesses=fitnesses))
    for label, strategy, fitnesses in methodSpecs
]

# Run every method on every dataset and pickle the per-generation results.
# A (dataset, method) pair whose result file already exists is skipped, so the
# cell can be re-run to resume an interrupted experiment.
for filename in datasets:
    # Dataset name = CSV file name without directory and extension; it does
    # not depend on the method, so compute it once per dataset.
    datasetName = os.path.splitext(os.path.basename(filename))[0]

    data = pd.read_csv(filename, sep=',', skiprows=1, header=0)

    # Encode the class labels BEFORE splitting. Previously the encoder was
    # applied after train_test_split, so its output was never used and
    # y_train / y_test still held the raw labels.
    le = LabelEncoder()
    y = le.fit_transform(data['class'])
    X = data.drop(['class'], axis=1).values

    # NOTE: no random_state is passed, so the split differs between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # Imputation and standardisation are fitted on the training part only and
    # then applied to the test part.
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=False)
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    standard = StandardScaler()
    X_train = standard.fit_transform(X_train)
    X_test = standard.transform(X_test)

    # Baseline: plain 4-NN on the standardised data, without a learned metric.
    knn = KNeighborsClassifier(n_neighbors=4)
    knn.fit(X_train, y_train)
    baseline = knn.score(X_test, y_test)

    for methodName, method in methods:
        diagParam = '*d' if common_params['transformer']=='diagonal' else ''
        dump_filename = '{}__{}{}'.format(datasetName, methodName, diagParam)
        if exists_obj(dump_filename):
            print('skipping {}, already exists'.format(dump_filename))
            continue

        print("starting dataset {} {} using {}".format(datasetName, X.shape, methodName))

        try:
            # time.clock() was removed in Python 3.8; perf_counter() is the
            # replacement for measuring elapsed time.
            start = time.perf_counter()

            # NOTE(review): the same estimator instance is refitted for every
            # dataset - confirm that fit() resets the strategy's logbook.
            me = method
            me.fit(X_train, y_train)

            end = time.perf_counter()

            # Each logbook record keeps that generation's population under 'id'.
            logbook = me._strategy.logbook
            individuals = np.asarray([[ind for ind in record['id']] for record in logbook])
            fitnesses = np.asarray([[ind.fitness.wvalues for ind in record['id']] for record in logbook])

            print(individuals.shape)

            # Per generation: best, worst and median individual by fitness.
            best, worst, median = zip(*[select_best_worst_mean(record['id']) for record in logbook])
            best, worst, median = np.asarray(best), np.asarray(worst), np.asarray(median)

            best_results = evaluateAllIndividuals(me, best, X_train, y_train, X_test, y_test)
            worst_results = evaluateAllIndividuals(me, worst, X_train, y_train, X_test, y_test)
            mean_results = evaluateAllIndividuals(me, median, X_train, y_train, X_test, y_test)

            # SORTING MULTIDIMENSIONAL ARRAY USING ARGMAX
            # index = fitnesses.argsort(axis=1)
            # x, y, z = np.indices(index.shape)
            # fitnesses.shape, fitnesses[x, index, z].argmin(axis=1).shape

            # The 'mean_*' keys are kept for compatibility with existing result
            # files; they hold the median individual of each generation.
            save_obj({
                'fitnesses': fitnesses,
                # 'individuals': individuals,
                'best_results': best_results,
                'worst_results': worst_results,
                'mean_results': mean_results,
                'best_individuals': best,
                'worst_individuals': worst,
                'mean_individuals': median,
                'runtime': end-start,
                'baseline': baseline,
                'X_train': X_train,
                'y_train': y_train,
                'X_test': X_test,
                'y_test': y_test,
            }, dump_filename)
        except Exception as error:
            # Best effort: report the failure and carry on with the next method.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
            print("Unexpected error:", sys.exc_info()[0], error)

In [None]:
# Collect every pickled result and group it as
# resultsByDataset[datasetName][methodName] -> loaded result object.
results = sorted(glob.glob("{}/*.pkl".format(resultsDirectory)))

resultsByDataset = {}
for resultPath in results:
    # Strip directory and extension with os.path so the parsing does not
    # depend on how many levels deep resultsDirectory is or on the separator.
    objectName = os.path.splitext(os.path.basename(resultPath))[0]

    # Result files are named '<dataset>__<method>'; anything else in the
    # directory is not one of ours and is skipped instead of crashing the cell.
    if '__' not in objectName:
        print('skipping {}, not a result file'.format(resultPath))
        continue

    # Split on the first '__' only, so a method name may itself contain '__'.
    datasetName, methodName = objectName.split('__', 1)

    resultsByDataset.setdefault(datasetName, {})[methodName] = load_obj(objectName)

In [None]:
# Keep the techniques whose first element starts with 'stand' and take the last four.
methodTitles = [technique for technique in sortedTechniques if technique[0][:5] == 'stand'][-4:]

In [None]:
# Legend entries passed to endGraphing for every figure.
legend = [
    "Maximal fitness",
    "Median fitness",
    "Successrate using best",
    "Successrate using worst",
#     "Successrate using mean",
    "Minimal fitness",
    "Baseline successrate",
]

# One figure per dataset, with one axis per method that has stored results.
for datasetName, alldata in sorted(resultsByDataset.items()):
    available = [(method, title) for method, title in methodTitles if method in alldata]

    fig, axes = startGraphing('`{}` dataset'.format(datasetName), 4, len(available), size=(10, 2), sharey=False)
    for axisIndex, (method, title) in enumerate(available):
        # The stored result dict is expanded into keyword arguments of plotFitness.
        plotFitness(axes[axisIndex], **alldata[method], title=title, min_gen=0, max_gen=1000, sigma=10)

    endGraphing(fig, legend, '{}/{}'.format(graphsDirectory, datasetName))

In [None]:
# ## REMOVING item from file
# results = []
# for file in glob.glob("{}/*.pkl".format(resultsDirectory)):
#     results.append(file)
# results.sort()

# resultsByDataset = {}
# for x in results:
#     _,_,filename = re.split('/|\\\\', x)
#     datasetName,methodName = filename[:-4].split('__')

#     X = load_obj(filename[:-4])
#     X.pop('individuals')
#     save_obj(X, filename[:-4])