Z-normalization vs Distance Metric Learning
======

In [2]:
import csv
import numpy as np
import scipy.linalg as la
import matplotlib.pyplot as plt
import pandas as pd

from svecon.KNNClassifierPerClass import KNNClassifierPerClass

from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs
# download_plotlyjs('https://cdn.plot.ly/plotly-latest.min.js')
py.init_notebook_mode()

defaultScatterMarker=dict(
    size=10,
    colorscale='Viridis',
    opacity=0.5
)

# np.set_printoptions(precision=7, suppress=True, threshold=np.nan)
np.set_printoptions(threshold=np.nan)
np.set_printoptions(formatter={'float': lambda x: "{0:0.10f}".format(x)})

from metric_learn import LMNN
from metric_learn import NCA

In [3]:
def evaluateKnn(X_train, y_train, X_test, y_test, k=1):
    knn = KNeighborsClassifier(k)
    knn.fit(X_train, y_train)
    
    predicted = knn.predict(X_test)
    wrongVec = predicted!=y_test
    N = len(y_test)
    print('{}% success ({}/{})'.format((1-sum(wrongVec)/N)*100, N-sum(wrongVec), N) )
    
    print(confusion_matrix(y_test, predicted, labels=range(len(set(y_test)))))
    
    return wrongVec

In [4]:
def evaluateKnnNormPerClass(X_train, y_train, X_test, y_test, k=1):
    knn = KNNClassifierPerClass(k)
    knn.fit(X_train, y_train)
    
    predicted = knn.predict(X_test)
    wrongVec = predicted!=y_test
    N = len(y_test)
    print('{}% success ({}/{})'.format((1-sum(wrongVec)/N)*100, N-sum(wrongVec), N) )
    
    print(confusion_matrix(y_test, predicted, labels=range(len(set(y_test)))))
    
    return wrongVec

In [5]:
def plotScatter(X, y, wrong=None):
    if wrong is None: wrong=np.array([0]*len(y))
    
    pca = PCA(n_components=2)
    pca.fit(X)

    print(pca.explained_variance_ratio_)

    X_train_pca = pca.transform(X).T
    
    print(X_train_pca.shape)
    
    trace1 = go.Scatter(x=X_train_pca[0], y=X_train_pca[1], #z=X_train_pca[2],
        text=y+1, mode='markers', marker={**defaultScatterMarker, 'color':y, 'size':wrong*10+10}
    )

    layout = go.Layout(
         margin=dict(l=0, r=0, b=0, t=0),
    )
    
    fig = go.Figure(data=[trace1], layout={})
    py.iplot(fig)

def plotScatterTT(X, y, X_test, y_test, wrong=None):
    trainLen = len(y)
    
    if wrong is None: wrong=np.zeros(trainLen)
    
    wrong = np.concatenate((np.zeros(trainLen), wrong))
    X = np.vstack((X, X_test))
    y = np.concatenate((y, y_test))
    
    y[:trainLen] = y[:trainLen]+len(set(y))
    
#     pca = PCA(n_components=2)
#     pca.fit(X)

#     print(pca.explained_variance_ratio_)

#     X_train_pca = pca.transform(X).T
    X_train_pca = X.T
#     print(X_train_pca.shape)
    
    trace1 = go.Scatter(x=X_train_pca[0], y=X_train_pca[1], #z=X_train_pca[2],
        text=y+1, mode='markers', marker={**defaultScatterMarker, 'color':y, 'size':wrong*10+10}
    )

    minl = np.amin(X_train_pca)
    maxl = np.amax(X_train_pca)
    minl -= (maxl-minl)/25
    maxl += (maxl-minl)/25
    
    layout = go.Layout(
#         margin=dict(l=0, r=0, b=0, t=0),
        xaxis=dict(
            range=[minl, maxl]
        ),
        yaxis=dict(
            range=[minl, maxl]
        ),
        width=800,height=800,
    )
    
    fig = go.Figure(data=[trace1], layout=layout)
    py.iplot(fig)

    
def plotScatter3d(X, y, wrong=None):
    if wrong is None: wrong=np.array([0]*len(y))
    
    if(X.shape[1] <= 3):
        X_train_pca = X.T
    else:
        pca = PCA(n_components=3)
        pca.fit(X)

        print(pca.explained_variance_ratio_)

        X_train_pca = pca.transform(X).T
        print(X_train_pca.shape)
    
    trace1 = go.Scatter3d(x=X_train_pca[0], y=X_train_pca[1], z=X_train_pca[2],
        text=y+1, mode='markers', marker={**defaultScatterMarker, 'color':y, 'size':wrong*10+10}
    )

    layout = go.Layout(
         margin=dict(l=0, r=0, b=0, t=0),
    )
    
    fig = go.Figure(data=[trace1], layout={})
    py.iplot(fig)
    
def plotScatter3dTT(X, y, X_test, y_test, wrong=None):
    trainLen = len(y)
    
    if wrong is None: wrong=np.zeros(trainLen)
    
    wrong = np.concatenate((np.zeros(trainLen), wrong))
    X = np.vstack((X, X_test))
    y = np.concatenate((y, y_test))
    
    y[:trainLen] = y[:trainLen]+len(set(y))
    
    if(X.shape[1] <= 3):
        X_train_pca = X.T
    else:
        pca = PCA(n_components=3)
        pca.fit(X)

        print(pca.explained_variance_ratio_)

        X_train_pca = pca.transform(X).T
        print(X_train_pca.shape)
    
    trace1 = go.Scatter3d(x=X_train_pca[0], y=X_train_pca[1], z=X_train_pca[2],
        text=y+1, mode='markers', marker={**defaultScatterMarker, 'color':y, 'size':wrong*10+10}
    )

    layout = go.Layout(
         margin=dict(l=0, r=0, b=0, t=0),
    )
    
    fig = go.Figure(data=[trace1], layout={})
    py.iplot(fig)

In [12]:
from sklearn.datasets import load_iris

iris_data = load_iris()

y_iris = iris_data['target']
X_iris = pd.DataFrame(data=iris_data['data'], index=range(len(y_iris)), columns=['F1', 'F2', 'F3', 'F4'])

X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(X_iris, y_iris, train_size=0.5, stratify=y_iris)

X_iris.describe()

Unnamed: 0,F1,F2,F3,F4
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


Original data
======

In [13]:
evaluateKnn(X_iris_train, y_iris_train, X_iris_test, y_iris_test, k=5)
evaluateKnnNormPerClass(X_iris_train.values, y_iris_train, X_iris_test.values, y_iris_test, k=5)
plotScatter(X_iris_train, y_iris_train)

96.0% success (72/75)
[[25  0  0]
 [ 0 23  2]
 [ 0  1 24]]
96.0% success (72/75)
[[25  0  0]
 [ 0 23  2]
 [ 0  1 24]]
[0.9349033803 0.0452966745]
(2, 75)


Normalized data
======

In [14]:
X_iris_train_norm = (X_iris_train - X_iris_train.mean()) / X_iris_train.std()
X_iris_test_norm = (X_iris_test - X_iris_train.mean()) / X_iris_train.std()

X_iris_train_norm.describe()

Unnamed: 0,F1,F2,F3,F4
count,75.0,75.0,75.0,75.0
mean,1.021405e-15,6.246855e-16,5.121829e-16,2.960595e-18
std,1.0,1.0,1.0,1.0
min,-1.702732,-2.099247,-1.565782,-1.446948
25%,-0.8966594,-0.6255166,-1.231054,-1.180965
50%,0.02456601,-0.1342732,0.3867964,0.2819422
75%,0.6579085,0.3569702,0.7215241,0.8139085
max,2.32763,2.567566,1.725707,1.611858


In [15]:
evaluateKnn(X_iris_train_norm, y_iris_train, X_iris_test_norm, y_iris_test, k=7)
evaluateKnnNormPerClass(X_iris_train.values, y_iris_train, X_iris_test.values, y_iris_test, k=7)
plotScatter(X_iris_train_norm, y_iris_train)

94.66666666666667% success (71/75)
[[25  0  0]
 [ 0 24  1]
 [ 0  3 22]]
96.0% success (72/75)
[[25  0  0]
 [ 0 23  2]
 [ 0  1 24]]
[0.7449009311 0.2171689219]
(2, 75)


LMNN
======

In [None]:
lmnn = LMNN(k=10, learn_rate=1e-5, max_iter=500)
lmnn.fit(X_iris_train.values, y_iris_train, verbose=False)
print("matrix L:", lmnn.L)
print("metric:", lmnn.metric())

In [None]:
evaluateKnn(lmnn.transform(X_iris_train), y_iris_train, lmnn.transform(X_iris_test), y_iris_test, k=7)
plotScatter(lmnn.transform(X_iris_train), y_iris_train)

Normalized data + LMNN
======

In [None]:
lmnn_norm = LMNN(k=7, learn_rate=1e-5, max_iter=1000)
lmnn_norm.fit(X_iris_train_norm.values, y_iris_train, verbose=False)
print(lmnn_norm.L)

In [None]:
evaluateKnn(lmnn_norm.transform(X_iris_train_norm), y_iris_train, lmnn_norm.transform(X_iris_test_norm), y_iris_test, k=30)
plotScatter(lmnn_norm.transform(X_iris_train_norm), y_iris_train)

NCA
=====

In [None]:
nca = NCA()
nca.fit(X_iris_train.values, y_iris_train)

In [None]:
evaluateKnn(nca.transform(X_iris_train), y_iris_train, nca.transform(X_iris_test), y_iris_test, k=7)
plotScatter(nca.transform(X_iris_train), y_iris_train)

Normalized data + NCA
=====

In [None]:
nca_norm = NCA()
nca_norm.fit(X_iris_train_norm.values, y_iris_train)

In [None]:
evaluateKnn(nca_norm.transform(X_iris_train_norm), y_iris_train, nca_norm.transform(X_iris_test_norm), y_iris_test, k=15)
plotScatter(nca_norm.transform(X_iris_train_norm), y_iris_train)

Multivariate normal distributions
====

In [61]:
c = 6
N = 100
X_mn =                   np.random.multivariate_normal([0, 15, 0], [[10, 0, 0], [0, 10, 0], [0, 0, 10]], N)
X_mn = np.vstack( [X_mn, np.random.multivariate_normal([15, 0, 0], [[10, 0, 0], [0, 10, 0], [0, 0, 10]], N)] )
X_mn = np.vstack( [X_mn, np.random.multivariate_normal([0, -15, 0], [[10, 0, 0], [0, 10, 0], [0, 0, 10]], N)] )
X_mn = np.vstack( [X_mn, np.random.multivariate_normal([-15, 0, 0], [[10, 0, 0], [0, 10, 0], [0, 0, 10]], N)] )
X_mn = np.vstack( [X_mn, np.random.multivariate_normal([0, 0, 15], [[10, 0, 0], [0, 10, 0], [0, 0, 10]], N)] )
X_mn = np.vstack( [X_mn, np.random.multivariate_normal([0, 0, -15], [[10, 0, 0], [0, 10, 0], [0, 0, 10]], N)] )

y_mn = np.ndarray.flatten(np.array([[x]*N for x in range(c)]))

X_mn_train, X_mn_test, y_mn_train, y_mn_test = train_test_split(X_mn, y_mn, train_size=0.7)
print(X_mn_train.shape, y_mn_train.shape, X_mn_test.shape, y_mn_test.shape)

(420, 3) (420,) (180, 3) (180,)


In [62]:
evaluateKnn(X_mn_train, y_mn_train, X_mn_test, y_mn_test, k=5)
plotScatter(X_mn, y_mn)

99.44444444444444% success (179/180)
[[30  0  0  0  0  0]
 [ 0 23  0  0  0  0]
 [ 0  0 31  0  0  0]
 [ 0  0  0 36  0  0]
 [ 0  0  0  1 31  0]
 [ 0  0  0  0  0 28]]
[0.3424558211 0.3333408905]
(2, 600)


Normalized data
=====

In [63]:
X_mn_train_norm = (X_mn_train - X_mn_train.mean(axis=0)) / X_mn_train.std(axis=0)
X_mn_test_norm = (X_mn_test - X_mn_train.mean(axis=0)) / X_mn_train.std(axis=0)

In [64]:
evaluateKnn(X_mn_train_norm, y_mn_train, X_mn_test_norm, y_mn_test, k=10)
plotScatter(X_mn_train_norm, y_mn_train)

100.0% success (180/180)
[[30  0  0  0  0  0]
 [ 0 23  0  0  0  0]
 [ 0  0 31  0  0  0]
 [ 0  0  0 36  0  0]
 [ 0  0  0  0 32  0]
 [ 0  0  0  0  0 28]]
[0.3427383744 0.3335199023]
(2, 420)


LMNN
=====

In [None]:
lmnn = LMNN(k=5, learn_rate=1e-7, max_iter=500)
lmnn.fit(X_mn_train, y_mn_train, verbose=False)
print(lmnn.L)

In [None]:
evaluateKnn(lmnn.transform(X_mn_train), y_mn_train, lmnn.transform(X_mn_test), y_mn_test, k=10)
plotScatter(lmnn.transform(X_mn_train), y_mn_train)

Normalized + LMNN
=====

In [None]:
lmnn_norm = LMNN(k=7, learn_rate=1e-5, max_iter=1000)
lmnn_norm.fit(X_mn_train_norm, y_mn_train, verbose=False)
print(lmnn.L)

In [None]:
evaluateKnn(lmnn_norm.transform(X_mn_train_norm), y_mn_train, lmnn_norm.transform(X_mn_test_norm), y_mn_test, k=30)
plotScatter(lmnn_norm.transform(X_mn_train_norm), y_mn_train)

NCA
=====

In [None]:
nca = NCA(max_iter=100)
nca.fit(X_mn_train, y_mn_train)

In [None]:
evaluateKnn(nca.transform(X_mn_train), y_mn_train, nca.transform(X_mn_test), y_mn_test, k=7)
plotScatter(nca.transform(X_mn_train), y_mn_train)

Normalized + NCA
=====

In [None]:
nca_norm = NCA()
nca_norm.fit(X_mn_train_norm, y_mn_train)

In [None]:
evaluateKnn(nca_norm.transform(X_mn_train_norm), y_mn_train, nca_norm.transform(X_mn_test_norm), y_mn_test, k=15)
plotScatter(nca_norm.transform(X_mn_train_norm), y_mn_train)

Unscaled 2D data
=======

In [65]:
Ns = [50, 500]
X_ns =                   np.random.multivariate_normal([0, 0], [[2, 0], [0, 100]], Ns[0])
X_ns = np.vstack( [X_ns, np.random.multivariate_normal([5, -15], [[2, 0], [0, 100]], Ns[1])] )
# X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0, 0, -110], [[10, 0, 0], [0, 1000, 0], [0, 0, 10]], N)] )
# X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0, 0, -100], [[10, 0, 0], [0, 1000, 0], [0, 0, 10]], N)] )
# X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0, 0, 15], [[10, 0, 0], [0, 10, 0], [0, 0, 10]], N)] )
# X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0, 0, -15], [[10, 0, 0], [0, 10, 0], [0, 0, 1000]], N)] )

y_ns = np.array([])
for i,N in enumerate(Ns):
    y_ns = np.concatenate( (y_ns, [i]*N) )

X_ns_train, X_ns_test, y_ns_train, y_ns_test = train_test_split(X_ns, y_ns, train_size=0.7)
print(X_ns_train.shape, y_ns_train.shape, X_ns_test.shape, y_ns_test.shape)

(385, 2) (385,) (165, 2) (165,)


In [66]:
wrong = evaluateKnn(X_ns_train, y_ns_train, X_ns_test, y_ns_test, k=1)
plotScatterTT(X_ns_train, y_ns_train, X_ns_test, y_ns_test, wrong)

95.15151515151516% success (157/165)
[[ 16   3]
 [  5 141]]


Normalized data
=====

In [67]:
X_ns_train_norm = (X_ns_train - X_ns_train.mean(axis=0)) / X_ns_train.std(axis=0)
X_ns_test_norm = (X_ns_test - X_ns_train.mean(axis=0)) / X_ns_train.std(axis=0)

In [68]:
wrong = evaluateKnn(X_ns_train_norm, y_ns_train, X_ns_test_norm, y_ns_test, k=10)
plotScatterTT(X_ns_train_norm, y_ns_train, X_ns_test_norm, y_ns_test, wrong)

97.57575757575758% success (161/165)
[[ 17   2]
 [  2 144]]


Unscaled data v2
=========

In [81]:
# Ns = [100,100]
# X_ns =                   np.random.multivariate_normal([0,0], [[100000000,0], [0,100]], Ns[0])
# X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0,75], [[100000000,0], [0,100]], Ns[1])] )

# Ns = [100, 100, 100, 100, 100]
# X_ns =                   np.random.multivariate_normal([0,0,0], [[100000,0,0], [0,100,0], [0,0,2]], Ns[0])
# X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0,10,0], [[100000,0,0], [0,100,0], [0,0,2]], Ns[1])] )
# X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0,-10,0], [[100000,0,0], [0,100,0], [0,0,2]], Ns[2])] )
# X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0,0,10], [[100000,0,0], [0,100, 0], [0,0,2]], Ns[3])] )
# X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0,0,-10], [[100000,0,0], [0,100,0], [0,0,2]], Ns[4])] )

Ns = [100, 100, 100, 100, 100]
X_ns =                   np.random.multivariate_normal([0,0,0,2], [[100000000,0,0,0], [0,100,0,0], [0,0,2,0], [0,0,0,1]], Ns[0])
X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0,10,0,-2], [[100000000,0,0,0], [0,100,0,0], [0,0,2,0], [0,0,0,1]], Ns[1])] )
X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0,-10,0,0], [[100000000,0,0,0], [0,100,0,0], [0,0,2,0], [0,0,0,1]], Ns[2])] )
X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0,0,10,-2], [[100000000,0,0,0], [0,100, 0,0], [0,0,2,0], [0,0,0,1]], Ns[3])] )
X_ns = np.vstack( [X_ns, np.random.multivariate_normal([0,0,-10,2], [[100000000,0,0,0], [0,100,0,0], [0,0,2,0], [0,0,0,1]], Ns[4])] )

y_ns = np.array([])
for i,N in enumerate(Ns):
    y_ns = np.concatenate( (y_ns, [i]*N) )

X_ns_train, X_ns_test, y_ns_train, y_ns_test = train_test_split(X_ns, y_ns, train_size=0.7, stratify=y_ns)
print(X_ns_train.shape, y_ns_train.shape, X_ns_test.shape, y_ns_test.shape)

(350, 4) (350,) (150, 4) (150,)


In [82]:
wrong = evaluateKnn(X_ns_train, y_ns_train, X_ns_test, y_ns_test, k=1)
plotScatterTT(X_ns_train, y_ns_train, X_ns_test, y_ns_test, wrong)

24.66666666666667% success (37/150)
[[ 8  7  5  4  6]
 [ 3 13  3  3  8]
 [ 6  2  4 12  6]
 [ 5  6  4  6  9]
 [ 8  5  6  5  6]]


In [83]:
X_ns_train_norm = (X_ns_train - X_ns_train.mean(axis=0)) / X_ns_train.std(axis=0)
X_ns_test_norm = (X_ns_test - X_ns_train.mean(axis=0)) / X_ns_train.std(axis=0)

In [87]:
wrong1 = evaluateKnn(X_ns_train_norm, y_ns_train, X_ns_test_norm, y_ns_test, k=1)
wrong2 = evaluateKnnNormPerClass(X_ns_train, y_ns_train, X_ns_test, y_ns_test, k=1)
plotScatterTT(X_ns_train_norm, y_ns_train, X_ns_test_norm, y_ns_test, wrong1)

81.33333333333333% success (122/150)
[[26  1  3  0  0]
 [ 0 23  7  0  0]
 [14  2 14  0  0]
 [ 0  0  0 30  0]
 [ 1  0  0  0 29]]
86.0% success (129/150)
[[26  1  3  0  0]
 [ 1 24  5  0  0]
 [ 8  2 20  0  0]
 [ 0  0  0 30  0]
 [ 1  0  0  0 29]]
