Dimensionality reduction
========

In [None]:
import numpy as np
import pandas as pd
import glob, os, os.path, sys, warnings, math, time, re
# warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from metric_learn import LMNN, NCA, LFDA, Covariance, MetricEvolution, NeuralNetworkTransformer, FullMatrixTransformer
from metric_learn import ITML_Supervised, SDML_Supervised, LSML_Supervised, RCA_Supervised
ME = MetricEvolution

%matplotlib inline
from plotting.plots import *

# Directory the input CSV datasets are read from.
datasetsDirectory = 'datasets'
# Directory the pickled results are written to and read from.
resultsDirectory = 'results/dim-reduction'
graphsDirectory = 'img/dim-reduction'

# exist_ok replaces the exists()/makedirs() pair: no window between the check
# and the creation, and re-running the cell is a no-op.
os.makedirs(resultsDirectory, exist_ok=True)
os.makedirs(graphsDirectory, exist_ok=True)

# Print floats inside NumPy arrays with five decimal places.
np.set_printoptions(formatter={'float': lambda x: "{0:0.5f}".format(x)})

import pickle
def save_obj(obj, name):
    """Pickle ``obj`` into ``<resultsDirectory>/<name>.pkl`` with the highest protocol."""
    target = '{}/{}.pkl'.format(resultsDirectory, name)
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from ``<resultsDirectory>/<name>.pkl``."""
    source = '{}/{}.pkl'.format(resultsDirectory, name)
    with open(source, 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code; only load files this
        # notebook produced itself.
        loaded = pickle.load(handle)
    return loaded
    
def exists_obj(name):
    """Tell whether ``<resultsDirectory>/<name>.pkl`` is present on disk."""
    candidate = '{}/{}.pkl'.format(resultsDirectory, name)
    return os.path.exists(candidate)

def gfn(filename, folder='dimred'):
    """Return the output path for a graph file, creating its folder if needed.

    Args:
        filename: Name of the graph file; appended to the folder path as-is.
        folder: Sub-folder of the thesis ``graphs`` directory to place it in.

    Returns:
        ``'../thesis-distance-metric-learning/graphs/<folder>/<filename>'``.
    """
    odir = '../thesis-distance-metric-learning/graphs/{}'.format(folder)
    # exist_ok replaces the exists()/makedirs() pair, which could raise if the
    # directory appeared between the check and the creation.
    os.makedirs(odir, exist_ok=True)
    return '{}/{}'.format(odir, filename)

In [None]:
import glob, os

# Every CSV in the datasets directory, in sorted order.
datasets = sorted(glob.glob("{}/*.csv".format(datasetsDirectory)))

# Print each path with the shape of the parsed table; the first line of every
# file is skipped and the following line is used as the header row.
for path in datasets:
    print(path, pd.read_csv(path, sep=',', skiprows=1, header=0).shape)

In [None]:
# When True, `add` skips any dataset/method pair whose result file already exists.
skipDone = False
def makeGraphBuilder(datasetName, X, y, perRow=4):
    """Split and preprocess a dataset, and return a function that evaluates models on it.

    The data is split 90/10 with a fixed random_state, then mean-imputed and
    standardised using statistics fitted on the training part only.

    Args:
        datasetName: Prefix for the names of the saved result files.
        X: Feature matrix.
        y: Class labels aligned with ``X``.
        perRow: Unused; kept so existing callers keep working.

    Returns:
        ``add(methodName, model, canTransform=True)``, which fits ``model`` on
        the prepared split and stores the outcome via ``save_obj``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn
    # 0.22 in favour of `sklearn.impute.SimpleImputer`; this call needs an
    # older scikit-learn -- confirm the pinned version before upgrading.
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=False)
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    sscaler = StandardScaler()
    X_train = sscaler.fit_transform(X_train)
    X_test = sscaler.transform(X_test)

    def add(methodName, model, canTransform=True):
        """Fit ``model``, score a 4-NN classifier on its embedding and save the result.

        Args:
            methodName: Name of the method; the result is stored under
                ``'<datasetName>__<methodName>'``.
            model: Estimator providing ``fit_transform`` (and ``transform``
                when ``canTransform`` is True).
            canTransform: Set to False for models without ``transform``
                (e.g. t-SNE); they are fitted on train and test together.

        The result is written only when no previous result exists or the new
        score is strictly higher than the stored one. Exceptions are printed
        instead of raised so one failing method does not abort the others.
        """
        filename = '{}__{}'.format(datasetName, methodName)
        if skipDone and exists_obj(filename):
            print('skipping {}, already exists'.format(filename))
            return

        try:
            if canTransform:
                Xt_train = model.fit_transform(X_train, y_train)
                Xt_test = model.transform(X_test)
            else:
                # No transform(): embed train and test in one fit, then split
                # the embedding back. The test points (and their labels) are
                # therefore passed to the fit.
                Xt = model.fit_transform(np.concatenate((X_train, X_test)), np.concatenate((y_train, y_test)))
                l = len(X_train)
                Xt_train, Xt_test = Xt[:l], Xt[l:]

            knn = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)
            knn.fit(Xt_train, y_train)

            score = knn.score(Xt_test, y_test)
            # Boolean mask of the misclassified test points.
            wrong = knn.predict(Xt_test) != y_test

            print('{} has {}%'.format(methodName, score*100))

            if exists_obj(filename):
                previousRun = load_obj(filename)
                print('{} previously had {}%'.format(methodName, previousRun['score']*100))
                # Keep the stored run unless this one is strictly better.
                if previousRun['score'] >= score:
                    return

            print('SAVING')

            data = {
                'score': score,
                'X_train': Xt_train,
                'y_train': y_train,
                'X_test': Xt_test,
                'y_test': y_test,
                'wrong': wrong,
            }
            save_obj(data, filename)
        except Exception:
            # Best effort: report the failure and move on to the next method.
            # `Exception` (not a bare except) lets KeyboardInterrupt through,
            # so a long run can still be interrupted.
            import traceback
            traceback.print_exc()

    return add

def P(methods):
    """Chain already-constructed estimators into a scikit-learn ``Pipeline``.

    Each step is named after the lower-cased class name of its estimator
    (``type(x).__name__.lower()``).

    Args:
        methods: Estimator instances, in execution order.

    Returns:
        A ``Pipeline`` over those instances.
    """
    # A single definition replaces the earlier class-based variant (which was
    # immediately shadowed) and the closure that was built only to be called
    # on the spot.
    return Pipeline([(type(step).__name__.lower(), step) for step in methods])

In [None]:
# Dataset files to process, in run order (presumably ascending size, going by
# the name -- confirm). The two groups mirror the original blank-line split.
sortedDatasetsBySize = [
    'datasets/balance-scale.csv',
    'datasets/breast-cancer.csv',
    'datasets/gaussians.csv',
    'datasets/iris.csv',
    'datasets/pima-indians.csv',
    'datasets/wine.csv',
] + [
    'datasets/sonar.csv',
    'datasets/mice-protein.csv',
    'datasets/digits10.csv',
    'datasets/digits6.csv',
]

In [None]:
# Run every enabled embedding method on each dataset. `g` (the `add` function
# returned by makeGraphBuilder) fits the model, scores it and saves the result
# when it beats the stored one.
for filename in sortedDatasetsBySize:
    # e.g. 'datasets/iris.csv' -> 'iris'
    datasetName = os.path.splitext(os.path.basename(filename))[0]

    data = pd.read_csv(filename, sep=',', skiprows=1, header=0)
    y = data['class']
    X = data.drop(['class'], axis=1).values

    le = LabelEncoder()
    y = le.fit_transform(y)

    g = makeGraphBuilder(datasetName, X, y, perRow=3)

    # Keyword arguments shared by every MetricEvolution instance below.
    defaults = {
        'n_gen': 100,
    }

    print(datasetName)

    # time.clock() was removed in Python 3.8; perf_counter() is its replacement
    # for measuring elapsed time.
    start = time.perf_counter()

    # Disabled runs, kept as toggles because they match the current
    # g(methodName, model) signature.
#     g('pca', PCA(n_components=2))
#     g('tsne', TSNE(n_components=2)) # , 'perplexity':50
#     g('lfda', LFDA(num_dims=2, k=2))
#     g('cmaes', ME(t__n_components=2))

#     g('normpca', P([PCA(n_components=2) ]))
#     g('normlfda', P([LFDA(num_dims=2, k=2), StandardScaler() ]))
# #     g('normnca', P([NCA(num_dims=2) ]))
#     g('normtsne', P([TSNE(n_components=2) ]) ,canTransform=False)
    g('normcmaes', P([ME(**defaults, t__n_components=2) ]))
    g('normcmaestsne', P([ME(**defaults, ), TSNE(n_components=2) ]), canTransform=False)

    # Deep neural-network transformers, one per activation.
#     g('nnnone', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=(16,8,4,2,), activation=None)) ]))
    deepL = (16,8,4,2,)
    g('nnrelu', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=deepL, activation='relu')) ]))
    g('nnsigm', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=deepL, activation='sigm')) ]))
    g('nntanh', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=deepL, activation='tanh')) ]))

    # Shallow neural-network transformers, each followed by a StandardScaler.
    shallowL = (8,2,)
    g('nnsigmshallow', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=shallowL, activation='sigm')), StandardScaler() ]))
#     g('nnsigmdeep', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=(2,2,2,2,2,2,2,2,2,2,), activation='sigm')), StandardScaler() ]))

    g('nntanhshallow', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=shallowL, activation='tanh')), StandardScaler() ]))
#     g('nntanhdeep', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=(2,2,2,2,2,2,2,2,2,2,), activation='tanh')), StandardScaler() ]))

    g('nnrelushallow', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=shallowL, activation='relu')), StandardScaler() ]))
#     g('nnreludeep', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=(2,2,2,2,2,2,2,2,2,2,), activation='relu')), StandardScaler() ]))

#     g('nnnoneshallow', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=shallowL, activation=None)), StandardScaler() ]))
#     g('nnnonedeep', P([ME(**defaults, transformer=NeuralNetworkTransformer(layers=(2,2,2,2,2,2,2,2,2,2,), activation=None)), StandardScaler() ]))

    # The remaining historical experiments used the old g(method, params)
    # signature, which `add` no longer accepts, and were removed.
    print('{} finished in {:.1f}s'.format(datasetName, time.perf_counter() - start))

In [None]:
# Load every saved result, grouped as resultsByDataset[datasetName][methodName].
results = sorted(glob.glob("{}/*.pkl".format(resultsDirectory)))

resultsByDataset = {}

for path in results:
    # basename copes with any directory depth (and with Windows separators on
    # Windows), unlike splitting the path into exactly three components.
    objName = os.path.splitext(os.path.basename(path))[0]
    # Result files are named '<dataset>__<method>'; split on the first
    # separator only so the unpacking cannot see more than two parts.
    datasetName, methodName = objName.split('__', 1)
    resultsByDataset.setdefault(datasetName, {})[methodName] = load_obj(objName)

In [None]:
# (result key, subplot title) pairs for the neural-network comparison figure.
# Commented-out entries correspond to runs that are currently disabled.
methodTitles = [
#     ('nnnoneshallow', 'None shallow'),
#     ('nnnonedeep', 'None deep'),

    ('nnsigmshallow', 'Shallow NN.Sigm'),
#     ('nnsigmdeep', 'Sigm deep'),

    ('nntanhshallow', 'Shallow NN.Tanh'),
#     ('nntanhdeep', 'Tanh deep'),

    ('nnrelushallow', 'Shallow NN.ReLU'),
#     ('nnreludeep', 'ReLU deep'),

#     ('nnnone', 'None'),
    ('nnsigm', 'Deep NN.Sigmoid'),
    ('nntanh', 'Deep NN.Tanh'),
    ('nnrelu', 'Deep NN.ReLU'),
]
# One figure per dataset, one subplot per configured method.
for datasetName, alldata in resultsByDataset.items():
    # Every configured method gets a subplot (a placeholder when its result is
    # missing), so the grid is sized from the full list rather than from the
    # number of results found. Assumes startGraphing provides N axes -- confirm.
    N = len(methodTitles)
    cols = 3
    fig, axes = startGraphing('`{}` dataset'.format(datasetName), cols, N, size=(8, 3*(N//cols)))
    for i, (method, title) in enumerate(methodTitles):
        if method not in alldata:
            plotEmpty(axes[i], title, 'Memory Error', hideAxis=True)
            continue

        data = alldata[method]
        plotScatter(axes[i],title,**data, scoreIsAproximation='t-SNE' in title)
    endGraphing(fig, filename=gfn('{}'.format(datasetName), folder='dimrednn'), move_title=.89)
    plt.subplots_adjust(wspace=0)

In [None]:
# (result key, subplot title) pairs for the method comparison figure.
methodTitles = [
    ('normpca', 's:PCA'),
    ('normnca', 's:NCA'),
    ('normlfda', 's:LFDA'),
    ('normcmaes', 's:CMAES.kNN'),
    ('normtsne', 's:t-SNE'),
    ('normcmaestsne', 's:CMAES.kNN+t-SNE'),
]
# One figure per dataset, one subplot per configured method.
for datasetName, alldata in resultsByDataset.items():
    # Sized from the list instead of a hard-coded 6 so the grid follows any
    # change to methodTitles; methods without a result get a placeholder.
    N = len(methodTitles)
    cols = 3
    fig, axes = startGraphing('`{}` dataset'.format(datasetName), cols, N, size=(8, 3*(N//cols)))
    for i, (method, title) in enumerate(methodTitles):
        if method not in alldata:
            plotEmpty(axes[i], title, 'Memory Error', hideAxis=True)
            continue

        data = alldata[method]
        # Titles containing 't-SNE' are flagged as approximate scores.
        plotScatter(axes[i],title,**data, scoreIsAproximation='t-SNE' in title)
    endGraphing(fig, filename=gfn('{}'.format(datasetName)), move_title=.89)
    plt.subplots_adjust(wspace=0)