Fitness evaluation
========

In [None]:
import numpy as np
import pandas as pd
import glob, os, os.path, sys, warnings, math, time, re

%matplotlib inline
import matplotlib.pyplot as plt
from plotting.plots import *

from techniques import sortedTechniques

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from metric_learn import LMNN, NCA, LFDA, Covariance, MetricEvolution, NeuralNetworkTransformer, FullMatrixTransformer, DiagonalMatrixTransformer
from metric_learn import ITML_Supervised, SDML_Supervised, LSML_Supervised, RCA_Supervised
ME = MetricEvolution

# Directory names used throughout the notebook.
datasetsDirectory = 'datasets'
resultsDirectory = 'results/fitness'
graphsDirectory = 'img/fitness'

# Create the output folders up front. `exist_ok=True` makes this idempotent
# and avoids the race between a separate existence check and the creation.
os.makedirs(resultsDirectory, exist_ok=True)
os.makedirs(graphsDirectory, exist_ok=True)

# np.set_printoptions(precision=7, suppress=True, threshold=np.nan)
# Print every float with exactly five decimals.
np.set_printoptions(formatter={'float': lambda x: "{0:0.5f}".format(x)})

import pickle
def save_obj(obj, name):
    """Pickle `obj` into ``<resultsDirectory>/<name>.pkl``.

    The file is (over)written in binary mode using the highest pickle
    protocol available.
    """
    target = '{}/{}.pkl'.format(resultsDirectory, name)
    with open(target, 'wb') as stream:
        pickle.dump(obj, stream, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Unpickle and return the object stored in ``<resultsDirectory>/<name>.pkl``."""
    source = '{}/{}.pkl'.format(resultsDirectory, name)
    with open(source, 'rb') as stream:
        restored = pickle.load(stream)
    return restored
    
def exists_obj(name):
    """Tell whether ``<resultsDirectory>/<name>.pkl`` is present on disk."""
    candidate = '{}/{}.pkl'.format(resultsDirectory, name)
    return os.path.exists(candidate)

def gfn(filename):
    """Return the path of `filename` inside the thesis graphs folder.

    The folder is created on demand. `exist_ok=True` keeps the call
    idempotent and free of the race between checking and creating.
    """
    odir = '../thesis-distance-metric-learning/thesis/graphs/fitness'
    os.makedirs(odir, exist_ok=True)
    return '{}/{}'.format(odir, filename)

In [None]:
import glob, os

# Every CSV found in the datasets directory, in sorted order.
datasets = sorted(glob.glob("{}/*.csv".format(datasetsDirectory)))

# datasets.remove('datasets/gaussians.csv')
# datasets.remove('datasets/ionosphere.csv')
# datasets.remove('datasets/mice-protein.csv')
# datasets.remove('datasets/sonar.csv')
# datasets.remove('datasets/soybean-large.csv')
# datasets.remove('datasets/digits6.csv')
# datasets.remove('datasets/digits10.csv')
# datasets.remove('datasets/mice-protein.csv')
# datasets.remove('datasets/sonar.csv')

# datasets = ['datasets/digits6.csv','datasets/digits10.csv','datasets/mice-protein.csv','datasets/sonar.csv']
# datasets = ['datasets\\balance-scale.csv']

# The discovered list is replaced by this explicit selection.
datasets = [
    'datasets/balance-scale.csv',
    'datasets/breast-cancer.csv',
    'datasets/gaussians.csv',
    'datasets/iris.csv',
    'datasets/pima-indians.csv',
    'datasets/wine.csv',
    'datasets/mice-protein.csv',
    'datasets/sonar.csv',
    'datasets/digits6.csv',
    'datasets/digits10.csv',
]

# Dataset names (file names without directory and extension) that the
# notebook treats as the "bigger" group.
biggerDatasets = ['mice-protein', 'sonar', 'digits6', 'digits10']

# Overview: path, table shape and number of distinct classes per dataset.
for csv_path in datasets:
    frame = pd.read_csv(csv_path, sep=',', skiprows=1, header=0)
    print(csv_path, frame.shape, frame['class'].nunique())
    
# datasets = ['datasets/breast-cancer.csv']

In [None]:
def evaluateAllIndividuals(me, individuals, X_train, y_train, X_test, y_test, dim=None, k=4):
    """Score each individual by the k-NN test accuracy of the transform it encodes.

    Every individual is first trimmed with ``me._strategy.cut_individual``.
    When the trimmed length equals the number of columns of `X_train`, it is
    fitted into a ``DiagonalMatrixTransformer``; otherwise into a
    ``FullMatrixTransformer(n_components=dim)``. A fresh
    ``KNeighborsClassifier(n_neighbors=k)`` is then trained on the transformed
    training data and scored on the transformed test data.

    Returns a list with one score per individual, in input order.
    """
    n_features = X_train.shape[1]

    def score(raw_individual):
        genome = me._strategy.cut_individual(raw_individual)
        if len(genome) == n_features:
            mapping = DiagonalMatrixTransformer()
        else:
            mapping = FullMatrixTransformer(n_components=dim)
        mapping.fit(X_train, y_train, genome)

        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(mapping.transform(X_train), y_train)
        return classifier.score(mapping.transform(X_test), y_test)

    return [score(candidate) for candidate in individuals]

from operator import attrgetter
def select_best_worst_mean(individuals):
    """Return ``(best, worst, middle)`` of `individuals` ranked by ``fitness``.

    ``best`` has the highest fitness, ``worst`` the lowest, and the third
    value is the element at index ``len // 2`` of the ascending ranking
    (i.e. the median element, despite the function's name).
    """
    ranked = list(individuals)
    ranked.sort(key=attrgetter("fitness"))
    middle = len(ranked) // 2
    return ranked[-1], ranked[0], ranked[middle]

In [None]:
# Run every evolutionary configuration on every dataset, once per transformer
# type, and pickle the statistics of each run. Runs whose result file already
# exists are skipped, so the cell can be re-executed to resume.
for transformer in ['diagonal', 'full']:
    common_params = {
        's__stats': 'identity',
        's__n_gen': 1000,
        'transformer': transformer,
    }
    methods = [
        ('stand+CMAES+kNN', MetricEvolution(**common_params, strategy='cmaes', fitnesses=[('knn', {'n_neighbors':8})])),
        ('stand+CMAESFme+kNN', MetricEvolution(**common_params, strategy='cmaes', fitnesses='wfme')),
        ('stand+JDEkNN+kNN', MetricEvolution(**common_params, strategy='jde', fitnesses=[('knn', {'n_neighbors':8})])),
        ('stand+JDE+kNN', MetricEvolution(**common_params, strategy='jde', fitnesses='wfme')),
        ('stand+logJDEkNN+kNN', MetricEvolution(**common_params, strategy='jde', s__population_size='log', fitnesses=[('knn', {'n_neighbors':8})])),
        ('stand+logJDE+kNN', MetricEvolution(**common_params, strategy='jde', s__population_size='log', fitnesses='wfme')),
    ]

    for filename in datasets:
        # File name without the directory prefix and the '.csv' suffix.
        datasetName = filename[len(datasetsDirectory)+1:-4]

        data = pd.read_csv(filename, sep=',', skiprows=1, header=0)

        # Encode the labels BEFORE splitting; encoding afterwards would leave
        # y_train / y_test with the raw labels and the encoded copy unused.
        le = LabelEncoder()
        y = le.fit_transform(data['class'])
        X = data.drop(['class'], axis=1).values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

        # Imputation and scaling are fitted on the training part only.
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=False)
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        standard = StandardScaler()
        X_train = standard.fit_transform(X_train)
        X_test = standard.transform(X_test)

        # Reference score: plain k-NN on the standardized data.
        knn = KNeighborsClassifier(n_neighbors=4)
        knn.fit(X_train, y_train)
        baseline = knn.score(X_test, y_test)

        for methodName, method in methods:
            dump_filename = '{}__{}__{}'.format(datasetName, methodName, common_params['transformer'])
            if exists_obj(dump_filename):
                print('skipping {}, already exists'.format(dump_filename))
                continue

            # NOTE(review): an unfinished statement (`if 'logJDE' in methodName)`)
            # used to sit here and made the whole cell a SyntaxError. It
            # presumably meant to restrict the logJDE runs to one transformer
            # per dataset (cf. `biggerDatasets`) -- TODO confirm and add the
            # complete condition if that filter is wanted.

            print("starting {} ({})".format(dump_filename, X.shape))

            try:
                # time.clock() no longer exists (removed in Python 3.8).
                start = time.perf_counter()

                me = method
                me.fit(X_train, y_train)

                end = time.perf_counter()

                logbook = me._strategy.logbook
                individuals = np.asarray([list(entry['id']) for entry in logbook])
                fitnesses = np.asarray([[ind.fitness.wvalues for ind in entry['id']] for entry in logbook])

                print(individuals.shape)

                # Best / worst / median individual of every logbook entry.
                best, worst, mean = zip(*[select_best_worst_mean(entry['id']) for entry in logbook])
                best, worst, mean = np.asarray(best), np.asarray(worst), np.asarray(mean)

                best_results = evaluateAllIndividuals(me, best, X_train, y_train, X_test, y_test)
                worst_results = evaluateAllIndividuals(me, worst, X_train, y_train, X_test, y_test)
                mean_results = evaluateAllIndividuals(me, mean, X_train, y_train, X_test, y_test)

                # SORTING MULTIDIMENSIONAL ARRAY USING ARGMAX
                # index = fitnesses.argsort(axis=1)
                # x, y, z = np.indices(index.shape)
                # fitnesses.shape, fitnesses[x, index, z].argmin(axis=1).shape

                save_obj({
                    'fitnesses': fitnesses,
                    # 'individuals': individuals,
                    'best_results': best_results,
                    'worst_results': worst_results,
                    'mean_results': mean_results,
                    'best_individuals': best,
                    'worst_individuals': worst,
                    'mean_individuals': mean,
                    'runtime': end - start,
                    'baseline': baseline,
                    'X_train': X_train,
                    'y_train': y_train,
                    'X_test': X_test,
                    'y_test': y_test,
                }, dump_filename)
            except Exception as exc:
                # Best effort: report the failure and go on with the next run.
                # Catching `Exception` (not everything) keeps Ctrl-C working.
                print("Unexpected error:", sys.exc_info()[0], exc)

In [None]:
# Index every pickled run as resultsByDataset[dataset][method][transformer].
results = sorted(glob.glob("{}/*.pkl".format(resultsDirectory)))

resultsByDataset = {}
for x in results:
    # Object name = file name without directory and '.pkl'; taking the
    # basename works regardless of how deep `resultsDirectory` is nested.
    objName = os.path.splitext(os.path.basename(x))[0]

    parts = objName.split('__')
    if len(parts) != 3:
        # Not a '<dataset>__<method>__<transformer>' file; leave it alone.
        print('skipping {}, unexpected file name'.format(x))
        continue
    datasetName, methodName, transformer = parts

    byMethod = resultsByDataset.setdefault(datasetName, {})
    byMethod.setdefault(methodName, {})[transformer] = load_obj(objName)
    
def findBestIndividual(dataset, method, transformer='full'):
    """Return the stored best individual with the highest test score.

    Looks up the run ``resultsByDataset[dataset][method][transformer]``,
    finds the position of the maximum in its ``'best_results'`` and returns
    the row of ``'best_individuals'`` at that position.
    """
    run = resultsByDataset[dataset][method][transformer]
    top = np.argmax(run['best_results'])
    return run['best_individuals'][top, :]

# Example lookup (displayed as the cell output).
findBestIndividual('balance-scale', 'stand+CMAES+kNN')

In [None]:
# methodTitles = list(filter(lambda x:x[0][:5]=='stand', sortedTechniques))[-4:]
# (result-file method name, title shown above the plot)
methodTitles = [
    # index 0-1: k-NN fitness
    ('stand+CMAES+kNN', 's:CMAES.kNN'),
    ('stand+JDEkNN+kNN', 's:jDE.kNN'),
    # index 2-3: fMe fitness
    ('stand+CMAESFme+kNN', 's:CMAES.fMe'),
    ('stand+JDE+kNN', 's:jDE.fMe'),
    # index 4-5: jDE with 'log' population size
    ('stand+logJDEkNN+kNN', 's:(log)jDE.kNN'),
    ('stand+logJDE+kNN', 's:(log)jDE.fMe'),
]

# Everything except the two (log) variants.
methodTitlesNormal = methodTitles[:4]

# jDE variants only: each plain variant followed by its (log) counterpart.
methodTitlesJde = [methodTitles[idx] for idx in (1, 4, 3, 5)]

# Legend entries handed to endGraphing.
legend = [
    "Maximal fitness",
    "Median fitness",
    "Test successrate using best",
    "Test successrate using worst",
    # "Successrate using mean",
    "Minimal fitness",
    "Baseline test successrate",
]

# Short transformer labels appended to plot titles.
transformerName = dict(full='full', diagonal='diag')

In [None]:
# One figure per dataset: a fitness panel for every (transformer, method)
# combination of the "normal" methods that has a stored run.
for datasetName, alldata in sorted(resultsByDataset.items()):
    # Collect exactly the panels that will be drawn, so that the subplot
    # count always matches the number of plotFitness calls.
    panels = [
        (transformer, method, title)
        for transformer in ['full', 'diagonal']
        for method, title in methodTitlesNormal
        if method in alldata and transformer in alldata[method]
    ]
    N = len(panels)
    if N == 0:
        print('skipped {} dataset, no data'.format(datasetName))
        continue

    cols = 4
    fig, axes = startGraphing('`{}` dataset'.format(datasetName), cols, N, size=(10, 3*math.ceil(N/cols)), sharey=False)
    baseline = None
    for i, (transformer, method, title) in enumerate(panels):
        data = alldata[method][transformer]
        # print(datasetName, title, math.ceil(data['runtime']))

        # Every panel of a dataset shows the baseline of the first drawn run.
        if baseline is None:
            baseline = data['baseline']
        else:
            data['baseline'] = baseline

        title += " ({})".format(transformerName[transformer])
        plotFitness(axes[i],**data, title=title, min_gen=30, max_gen=1000, sigma=3, ylabel=None)

    # Finish the figure once, after the panels of BOTH transformers are drawn;
    # doing it per transformer would finalize a half-filled figure first.
    endGraphing(fig, legend, filename=gfn(datasetName), move_title=.90, adjust_legend=.12)
#     break

In [None]:
# One figure per dataset comparing the jDE variants, using a single
# transformer per dataset.
for datasetName, alldata in sorted(resultsByDataset.items()):
    # Datasets listed in biggerDatasets use the diagonal runs, others the full ones.
    if datasetName in biggerDatasets:
        transformer = 'diagonal'
    else:
        transformer = 'full'

    # Keep only the variants that have a stored run for this transformer.
    panels = [
        (method, title)
        for method, title in methodTitlesJde
        if method in alldata and transformer in alldata[method]
    ]
    N = len(panels)

    if N == 0:
        print('skipped {} dataset, no data'.format(datasetName))
        continue

    cols = 4
    fig, axes = startGraphing('`{}` dataset'.format(datasetName), cols, N, size=(10, 3*math.ceil(N/cols)), sharey=False)
    baseline = None
    for i, (method, title) in enumerate(panels):
        data = alldata[method][transformer]
        # print(datasetName, title, math.ceil(data['runtime']))

        # Every panel reuses the baseline of the first drawn run.
        if baseline is None:
            baseline = data['baseline']
        else:
            data['baseline'] = baseline

        panel_title = title + " ({})".format(transformerName[transformer])
        plotFitness(axes[i], **data, title=panel_title, min_gen=30, max_gen=1000, sigma=3, ylabel=None)

    endGraphing(fig, legend, filename=gfn(datasetName+'_jde'), move_title=.80, adjust_legend=.2)
#     break

In [None]:
# ## REMOVING item from file
# results = []
# for file in glob.glob("{}/*.pkl".format(resultsDirectory)):
#     results.append(file)
# results.sort()

# resultsByDataset = {}
# for x in results:
#     _,_,filename = re.split('/|\\\\', x)
#     datasetName,methodName = filename[:-4].split('__')

#     X = load_obj(filename[:-4])
#     X.pop('individuals')
#     save_obj(X, filename[:-4])

In [None]:
# ## RENAMING results for full/diag
# results = []
# for file in glob.glob("{}/*.pkl".format(resultsDirectory)):
#     results.append(file)
# results.sort()
# results

# for x in results:
#     os.rename(x, x.replace('.pkl', '__full.pkl'))