Dimensionality reduction
========

In [None]:
import numpy as np
import pandas as pd
import glob
import os.path
import sys

# from svecon.HierarchicalGridSearchCV import HierarchicalGridSearchCV
# from svecon.EmptyTransformer import EmptyTransformer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from metric_learn import LMNN, NCA, LFDA, Covariance, MetricEvolution, NeuralNetworkTransformer, FullMatrixTransformer
from metric_learn import ITML_Supervised, SDML_Supervised, LSML_Supervised, RCA_Supervised
ME = MetricEvolution

import plotly
from plotly import tools
# NOTE(review): the plotly username and API key are hard-coded in source, so
# anyone who can read this file can read them. Consider rotating the key and
# loading it from the environment or an untracked config file instead.
plotly.tools.set_credentials_file(username='sveco', api_key='8701ghzf0i')
import plotly.offline as py
import plotly.graph_objs as go
# Offline notebook mode is used for the inline figures (see py.iplot below);
# the online plotly.plotly API is still used later to export PNG images.
py.init_notebook_mode()

# Input and output locations, relative to the current working directory.
datasetsDirectory = 'datasets'
resultsDirectory = 'datasets-results-dim-reduction'
graphsDirectory = 'datasets-graphs-dim-reduction'

# Make sure both output directories exist before anything writes into them.
for _outputDirectory in (resultsDirectory, graphsDirectory):
    if not os.path.exists(_outputDirectory):
        os.makedirs(_outputDirectory)

# Shared experiment defaults.
default_n_jobs, default_random_state = 8, 789
default_n_folds, default_shuffle = 10, True

import logging

# Root logger: every record is written to error.log inside the results
# directory, while only ERROR and above are echoed to stdout. Existing
# handlers are dropped first so re-running the cell does not duplicate output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.handlers = []
for fh, _threshold in (
    (logging.FileHandler("{}/error.log".format(resultsDirectory)), logging.DEBUG),
    (logging.StreamHandler(sys.stdout), logging.ERROR),
):
    fh.setLevel(_threshold)
    logger.addHandler(fh)

# Marker styling shared by every scatter trace.
defaultScatterMarker = {
    'size': 10,
    'colorscale': 'Viridis',
    'opacity': 0.5,
}

# np.set_printoptions(precision=7, suppress=True, threshold=np.nan)
# Print floats with five fixed decimals.
np.set_printoptions(formatter={'float': "{0:0.5f}".format})

In [None]:
import glob, os

def rename(dir, pattern='*.csv'):
    """Shorten 'NeuralNetworkTransformer' to 'NT' in matching file names.

    Only the file-name component of each match is rewritten. The directory
    part of the path is left untouched, so a directory whose own name contains
    'NeuralNetworkTransformer' no longer produces a rename target inside a
    directory that does not exist.

    Args:
        dir: Directory to scan. The parameter name is kept for backward
            compatibility even though it shadows the builtin.
        pattern: Glob pattern, relative to ``dir``, selecting the files to
            consider.

    Raises:
        OSError: If ``os.rename`` fails for one of the files.
    """
    # Snapshot the matches first so renaming cannot disturb the directory scan.
    for pathAndFilename in list(glob.iglob(os.path.join(dir, pattern))):
        parent, filename = os.path.split(pathAndFilename)
        newFilename = filename.replace('NeuralNetworkTransformer', 'NT')
        if newFilename == filename:
            # Nothing to shorten; avoid a pointless rename call.
            continue
        os.rename(pathAndFilename, os.path.join(parent, newFilename))

# Shorten the transformer name in every CSV file of the results directory
# (the default pattern is '*.csv').
rename(resultsDirectory)

In [None]:
import glob, os

# Every CSV file directly inside the datasets directory, in sorted order.
datasets = sorted(glob.glob("{}/*.csv".format(datasetsDirectory)))

# Known datasets and their shapes; uncomment a remove() line to skip one.
# datasets.remove('datasets/soybean-large.csv')
# datasets/balance-scale.csv (625, 5)
# datasets/breast-cancer-wisconsin.csv (699, 10)
# datasets.remove('datasets/digits10.csv') # (1797, 65)
# datasets.remove('datasets/digits6.csv') # (1083, 65)
# datasets/ionosphere.csv (351, 35)
# datasets/iris.csv (150, 5)
# datasets.remove('datasets/mice-protein.csv') # (1080, 78)
# datasets/pima-indians-diabetes.csv (768, 9)
# datasets/sonar.csv (208, 61)
# datasets/soybean-large.csv (307, 36)
# datasets/wine.csv (178, 14)

# datasets = datasets[7:8]
logging.info("Datasets: " + str(datasets))

# Print each dataset path with the shape of its table; the first line of each
# file is skipped and the next one is used as the header.
for x in datasets:
    shape = pd.read_csv(x, sep=',', skiprows=1, header=0).shape
    print(x, shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import math
import warnings
warnings.filterwarnings('ignore')
import time

def makeScatter(X, y):
    """Build a plotly scatter trace from a two-column embedding.

    Args:
        X: Array whose transpose yields one row per embedding dimension.
        y: Per-point values used both as hover text and as marker colour.

    Returns:
        An empty ``go.Scatter`` when the embedding has zero or one dimension,
        otherwise a marker-mode trace of the two dimensions.

    Raises:
        AssertionError: If the embedding has more than two dimensions.
    """
    columns = X.T
    n_dims = columns.shape[0]

    # Nothing sensible to draw for degenerate embeddings.
    if n_dims == 0 or n_dims == 1:
        return go.Scatter()

    assert n_dims == 2

    # Shared marker styling, coloured by label.
    marker = dict(defaultScatterMarker, color=y)
    return go.Scatter(
        x=columns[0],
        y=columns[1],
        text=y,
        mode='markers',
        marker=marker,
    )

def makeGraphBuilder(datasetName, X, y, perRow=4):
    """Create two closures that collect 2-D embeddings and plot them together.

    The data is split once (10% held out, fixed ``random_state=42``) and every
    model registered through ``add`` is fitted and scored on that same split.

    Args:
        datasetName: Used in the figure title and in the exported PNG name.
        X: Feature matrix handed to ``train_test_split``.
        y: Labels handed to ``train_test_split``; also used for the class
            count in the figure title.
        perRow: Number of subplot columns in the final figure.

    Returns:
        A tuple ``(add, draw)``. ``add(method, params)`` fits one model and
        records its trace and score; ``draw()`` renders everything recorded
        so far and saves it as an image.
    """
    # Parallel lists: index i of each refers to the i-th registered model.
    traces = []
    models = []
    scores = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    def add(method, params):
        """Fit ``method(**params)``, embed the data, and record trace and score.

        Raises:
            ValueError: Re-raised from fitting/scoring after placeholders have
                been recorded for this model.
        """
        # Parameters are applied twice on purpose: factories produced by P()
        # ignore their keyword arguments, so set_params is what configures them.
        model = method(**params)
        model.set_params(**params)
        models.append(model)

        try:
            if callable(getattr(model, 'transform', None)): # e.g. TSNE has no separate transform()
                Xt_train = model.fit_transform(X_train, y_train)
                Xt_test = model.transform(X_test)
            else:
                # No transform(): embed train and test together, then split the
                # result back at the original train length.
                Xt = model.fit_transform(np.concatenate((X_train, X_test)), np.concatenate((y_train, y_test)))
                l = len(X_train)
                Xt_train, Xt_test = Xt[:l], Xt[l:]

#                 if isinstance(model, CMAES):
#                     print(len(model.hof[0]), model.hof[0].fitness)

            # NOTE(review): Xt_test is assigned in both branches above, so the
            # else branch below runs only if model.transform() returned None.
            if Xt_test is not None:
                # Score the embedding with a 4-nearest-neighbour classifier.
                knn = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)
                knn.fit(Xt_train, y_train)
                scores.append(knn.score(Xt_test, y_test))

                # Plot train and test points together in one trace.
                trace = makeScatter(
                    np.concatenate((Xt_train, Xt_test), axis=0),
                    np.concatenate((y_train, y_test), axis=0),
                )
            else:
                scores.append(None)
                trace = makeScatter(Xt_train, y_train)

            traces.append(trace)
        except ValueError:
            # Record placeholders so the three lists stay the same length,
            # then let the error propagate to the caller.
            scores.append(0)
            traces.append(go.Scatter())
            raise

    def draw():
        """Render all recorded traces as a subplot grid and export it as PNG."""
        # Number of subplot rows needed to fit every trace.
        rows = math.ceil(len(traces)*1.0/perRow)

        # Subplot title: model class name, plus the k-NN accuracy when known.
        titles = []
        for m,s in zip(models, scores):
            if s is None:
                titles.append('{}'.format(m.__class__.__name__))
            else:
                titles.append('{} {:.2f}%'.format(m.__class__.__name__, s*100))

        fig = tools.make_subplots(
            rows=rows,
            cols=perRow,
            print_grid=False,
            subplot_titles=titles
        )
        for i,t in enumerate(traces):
            # Grid position is 1-based: row from integer division, column from remainder.
            # NOTE(review): add() only ever appends go.Scatter objects, so the
            # None branch looks unreachable from here; also confirm that
            # py.plotly_empty exists before relying on it.
            if t is None: 
                fig.append_trace(py.plotly_empty(), 1+(i//perRow), 1+(i%perRow))
            else:
                fig.append_trace(t, 1+(i//perRow), 1+(i%perRow))

        fig['layout'].update(title='{} dataset {}, {} classes'.format(datasetName, X.shape, len(np.unique(y))), showlegend=False)
        # Compact size for the inline notebook display.
        fig['layout'].update(width=700)
        fig['layout'].update(height=300*rows)
        py.iplot(fig)
        # Enlarge the same figure before exporting it as an image.
        fig['layout'].update(titlefont={'size':28})
        fig['layout'].update(width=2100)
        fig['layout'].update(height=700*rows)
        plotly.plotly.image.save_as(fig, filename='{}/{}-plot.png'.format(graphsDirectory, datasetName))

    return add,draw

def P(methods):
    """Return a factory that chains ``methods`` into a fresh ``Pipeline``.

    Each step is named after the lower-cased class name and instantiated with
    its default constructor. The factory accepts keyword arguments but does
    not use them; callers configure the pipeline afterwards (``add`` in
    ``makeGraphBuilder`` does so through ``set_params``).

    Args:
        methods: Iterable of estimator classes, in pipeline order.

    Returns:
        A callable ``factory(**kwargs)`` producing a new ``Pipeline``.
    """
    def build_pipeline(**kwargs):
        steps = []
        for estimator_cls in methods:
            step_name = estimator_cls.__name__.lower()
            steps.append((step_name, estimator_cls()))
        return Pipeline(steps)

    return build_pipeline

In [None]:
# Main experiment: load a dataset, embed it into 2-D with each registered
# method, and draw all embeddings in one figure. Section timings are printed
# in seconds; sections whose experiments are commented out print ~0.
for filename in datasets[0:]:
    results = []
    # Dataset name = file name without directory or extension.
    datasetName = os.path.splitext(os.path.basename(filename))[0]
    
    # The first line of each CSV is skipped and the next one is the header.
    data = pd.read_csv(filename, sep=',', skiprows=1, header=0)
    y = data['class']
    X = data.drop(['class'], axis=1).values

    # Encode class labels as integers.
    le = LabelEncoder()
    y = le.fit_transform(y)
    
    # Replace missing values with the column mean.
    # NOTE(review): Imputer was removed from newer scikit-learn releases in
    # favour of sklearn.impute.SimpleImputer; confirm the pinned version.
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=False)
    X = imputer.fit_transform(X)
    
    g,draw = makeGraphBuilder(datasetName, X, y, perRow=3)
    
    # Shared MetricEvolution settings; only the commented experiments use them.
    defaults = {
        'transformer':'neuralnetwork',
        't__layers':(2,),
        
        'n_gen': 3,
        'class_separation': True,
        'verbose': False,
        'random_state': 41,
    }

    print(datasetName)
    
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # high-resolution replacement for measuring elapsed time.
    start = time.perf_counter()
    
    g(PCA, {'n_components':2})
    g(TSNE, {'n_components':2}) # , 'perplexity':50
    g(LFDA, {'dim':2, 'k':2})
    
#     end = time.perf_counter()
#     print(end - start)
#     start = time.perf_counter()
    
#     g(ME, {**defaults, 'classifier':'knn','c__n_neighbors':1,'c__n_jobs':-1,'c__weights':'uniform'})
#     g(ME, {**defaults, 'classifier':'svc',})
#     g(ME, {**defaults, 'classifier':'lsvc','c__dual':False,})
#     g(ME, {**defaults, 'transformer':NeuralNetworkTransformer(layers=(2,2,), activation='tanh'), 'c__n_neighbors':1, })
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,2,)), 'n_neighbors':1, 'n_gen':n_gen,'class_separation':0, 'verbose':verbose})
    
    end = time.perf_counter()
    print(end - start)
    start = time.perf_counter()
    
    # NOTE(review): for wide datasets the neural-network transformer is built
    # and then immediately overridden by 'diagonal'; kept as-is because the
    # commented experiments below toggle between the two.
    good_transformer = 'full'
    if X.shape[1]>15:
        good_transformer = NeuralNetworkTransformer(layers=(10,8,6,4,))
        good_transformer = 'diagonal'
    
#     g(P([ME, PCA]), {'pca__n_components':2, 'metricevolution__transformer': good_transformer})
#     g(P([ME, TSNE]), {'tsne__n_components':2, 'tsne__perplexity': 30, 'metricevolution__transformer': good_transformer})
#     if False and X.shape[1]>7:
#         g(ME, {'transformer': NeuralNetworkTransformer(layers=(10,8,6,4,2))})
#     else:
#     g(P([ME, TSNE]), {'tsne__n_components':2, 'tsne__perplexity': 30, 
#                       'metricevolution__transformer': 'full', 'metricevolution__evolution_strategy':'dde', 'metricevolution__verbose':True, 
#                       'metricevolution__fitnesses':('class_separation',)})
    
    end = time.perf_counter()
    print(end - start)
    start = time.perf_counter()
    
#     g(P([ME, TSNE]), {'tsne__n_components':2, 'tsne__perplexity': 30, 
#                       'metricevolution__transformer': 'full', 'metricevolution__evolution_strategy':'de', 'metricevolution__verbose':True})

    end = time.perf_counter()
    print(end - start)
    start = time.perf_counter()
    
#     g(P([ME, TSNE]), {'tsne__n_components':2, 'tsne__perplexity': 30, 
#                       'metricevolution__transformer': 'full', 'metricevolution__evolution_strategy':'cmaes', 'metricevolution__verbose':True})

    end = time.perf_counter()
    print(end - start)
    start = time.perf_counter()
    
#     g(ME, {'transformer': 'full', 't__n_components': 2, 'evolution_strategy':'metricevolution'})
#     g(ME, {'transformer': 'full', 't__n_components': 2, 'evolution_strategy':'metricevolution'})
#     g(ME, {'transformer': 'kmeans', 't__transformer': 'full'})
#     g(ME, {'transformer': 'kmeans', 't__transformer': 'diagonal', 't__n_clusters':2})
#     g(ME, {'transformer': 'kmeans', 't__transformer': 'diagonal', 't__n_clusters':2, 't__function':'product'})
#     g(P([ME, TSNE]), {'tsne__n_components':2, 'tsne__perplexity': 30, 'metricevolution__transformer': 'kmeans', 'metricevolution__t__transformer':'diagonal', 'metricevolution__t__n_classes': 'same'})
#     g(P([ME, TSNE]), {'tsne__n_components':2, 'tsne__perplexity': 30, 'metricevolution__transformer': 'kmeans', 'metricevolution__t__transformer':'diagonal', 'metricevolution__t__n_classes': 'same', 'metricevolution__t__function':'product'})

#     end = time.perf_counter()
#     print(end - start)
#     start = time.perf_counter()
    
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,)), 'n_neighbors':1, 'n_gen':n_gen,'classifier':'svm', 'verbose':verbose})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,)), 'n_neighbors':1, 'n_gen':n_gen,'classifier':'svm', 'verbose':verbose})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,2,)), 'n_neighbors':1, 'n_gen':n_gen,'classifier':'svm', 'verbose':verbose})

#     end = time.perf_counter()
#     print(end - start)
#     start = time.perf_counter()
    
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,)), 'n_neighbors':1, 'n_gen':n_gen,'classifier':'lsvm', 'verbose':verbose})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,)), 'n_neighbors':1, 'n_gen':n_gen,'classifier':'lsvm', 'verbose':verbose})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,2,)), 'n_neighbors':1, 'n_gen':n_gen,'classifier':'lsvm', 'verbose':verbose})

#     end = time.perf_counter()
#     print(end - start)
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,), activation='tanh'), 'n_neighbors':1, 'n_gen':n_gen,'class_separation':0, 'verbose':verbose})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,2,), activation='tanh'), 'n_neighbors':1, 'n_gen':n_gen,'class_separation':0, 'verbose':verbose})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(4,3,2,2,), activation='tanh'), 'n_neighbors':1, 'n_gen':n_gen,'class_separation':0, 'verbose':verbose})

#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,), activation='none'), 'n_neighbors':1, 'n_gen':n_gen,'class_separation':0, 'verbose':verbose})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,2,), activation='none'), 'n_neighbors':1, 'n_gen':n_gen,'class_separation':0, 'verbose':verbose})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(4,3,2,2,), activation='none'), 'n_neighbors':1, 'n_gen':n_gen,'class_separation':0, 'verbose':verbose})

#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,)), 'n_neighbors':4, 'n_gen':n_gen,'class_separation':0, 'verbose':True})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,)), 'n_neighbors':4, 'n_gen':n_gen,'class_separation':0, 'verbose':True})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,2,), activation='relu'), 'n_neighbors':4, 'n_gen':n_gen,'class_separation':0, 'verbose':True})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,2,2,2,)), 'n_neighbors':4, 'n_gen':n_gen,'class_separation':0, 'verbose':True})
    
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,)), 'n_neighbors':8, 'n_gen':n_gen,'class_separation':1, 'verbose':True})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(4,3,2,)), 'n_neighbors':8, 'n_gen':n_gen,'class_separation':1, 'verbose':True})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(5,4,3,2,)), 'n_neighbors':8, 'n_gen':n_gen,'class_separation':1, 'verbose':True})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(16,8,4,2,)), 'n_neighbors':8, 'n_gen':n_gen,'class_separation':1, 'verbose':True})
    
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(2,)), 'n_neighbors':8, 'n_gen':n_gen,'class_separation':-1, 'verbose':True})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(4,3,2,)), 'n_neighbors':8, 'n_gen':n_gen,'class_separation':-1, 'verbose':True})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(5,4,3,2,)), 'n_neighbors':8, 'n_gen':n_gen,'class_separation':-1, 'verbose':True})
#     g(ME, {'transformer':NeuralNetworkTransformer(layers=(16,8,4,2,)), 'n_neighbors':8, 'n_gen':n_gen,'class_separation':-1, 'verbose':True})

    draw()
    # Only the first dataset is processed; remove the break to run them all.
    break