# Manifold Learning - How to improve Classification

Unsupervised Learning can be useful in the Supervised Setting as well

In [1]:
print(__doc__)

import random
import numpy as np
#import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
#from matplotlib.ticker import NullFormatter
#%matplotlib inline
from sklearn import manifold, datasets
from sklearn import random_projection
from sklearn.decomposition import PCA

from sklearn import neighbors, linear_model
from sklearn.model_selection import train_test_split

Automatically created module for IPython interactive environment


## Toy problem - Classify digits using KNN and Logistic Regression

In [2]:
digits = datasets.load_digits(n_class=10)
X_digits = digits.data
y_digits = digits.target
n_samples, n_features = X_digits.shape
print('Dataset dimensions:',X_digits.shape)

Dataset dimensions: (1797, 64)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_digits,y_digits,test_size=0.4,random_state=42, stratify=y_digits)

# 64 features
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
logistic = linear_model.LogisticRegression()

print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))
print('LogisticRegression score: %f'
      % logistic.fit(X_train, y_train).score(X_test, y_test))

KNN score: 0.977747
LogisticRegression score: 0.951321


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
print('Test set dimenstions:',X_test.shape)

Test set dimenstions: (719, 64)


## Number of KNN errors

In [6]:
round(X_test.shape[0]-X_test.shape[0]*knn.fit(X_train, y_train).score(X_test, y_test),0)

16.0

# Automated feature engineering

## Try out different manifold-learned features

In [27]:
n_additional_features = 20 # for all methods but tSNE

#transformer = random_projection.SparseRandomProjection(n_additional_features, random_state=42)
#transformer = random_projection.GaussianRandomProjection(n_additional_features, random_state=0)
#transformer = manifold.MDS(n_additional_features, max_iter=300, n_init=3) # works with 20 add feat
#transformer = manifold.TSNE(n_components=3, init='pca', random_state=42)
transformer = manifold.TSNE(n_components=3, perplexity=15, random_state=42) # random init
#transformer = PCA(n_additional_features) # works with 13 or 20 add feat

In [28]:
New_features_train = transformer.fit_transform(X_train)
X_train2 = np.c_[X_train, New_features_train]
#X_train2 = New_features_train

# Calculate transformation for test data by hand (there is no fit function for the transformation)
Handmade_fit_transform = np.linalg.lstsq(X_train, New_features_train, rcond=None)
Transformation_matrix = Handmade_fit_transform[0]

# Calculate new features 
X_test2 = np.c_[X_test, np.matmul(X_test, Transformation_matrix)]
#X_test2 = np.matmul(X_test, Transformation_matrix)

## Test error comparison without/with additional "manifold" features

In [29]:
print('KNN test score pure    : %f' % knn.fit(X_train, y_train).score(X_test, y_test))
print('KNN test score manifold: %f' % knn.fit(X_train2, y_train).score(X_test2, y_test))
print('LogisticRegression test score pure    : %f'
      % logistic.fit(X_train, y_train).score(X_test, y_test))
print('LogisticRegression test score manifold: %f'
      % logistic.fit(X_train2, y_train).score(X_test2, y_test))

KNN test score pure    : 0.977747
KNN test score manifold: 0.981919
LogisticRegression test score pure    : 0.947149
LogisticRegression test score manifold: 0.926287


## Training error comparison without/with additional "manifold" features

In [30]:
print('KNN train score pure    : %f' % knn.fit(X_train, y_train).score(X_train, y_train))
print('KNN train score manifold: %f' % knn.fit(X_train2, y_train).score(X_train2, y_train))
print('LogisticRegression train score pure    : %f'
      % logistic.fit(X_train, y_train).score(X_train, y_train))
print('LogisticRegression train score manifold: %f'
      % logistic.fit(X_train2, y_train).score(X_train2, y_train))

KNN train score pure    : 0.987941
KNN train score manifold: 0.989796
LogisticRegression train score pure    : 0.999072
LogisticRegression train score manifold: 1.000000


## Check against random feature vectors

In [31]:
nof_random_features = 60
X_train3 = np.c_[X_train,np.random.randint(low=0,high=16, size = [X_train.shape[0],nof_random_features])]
X_test3 = np.c_[X_test,np.random.randint(low=0,high=16, size = [X_test.shape[0],nof_random_features])]

In [32]:
print('KNN test score pure  : %f' % knn.fit(X_train, y_train).score(X_test, y_test))
print('KNN test score random: %f' % knn.fit(X_train3, y_train).score(X_test3, y_test))
print('LogisticRegression test score pure   : %f'
      % logistic.fit(X_train, y_train).score(X_test, y_test))
print('LogisticRegression test score random: %f'
      % logistic.fit(X_train3, y_train).score(X_test3, y_test))

KNN test score pure  : 0.977747
KNN test score random: 0.951321
LogisticRegression test score pure   : 0.947149
LogisticRegression test score random: 0.922114


## Just because we can do it - putting everything together

Not always a good idea because too many features might make it harder for the algorithms

In [33]:
nof_additional_features = 50
transformer1 = random_projection.SparseRandomProjection(nof_additional_features, random_state=42)
transformer2 = random_projection.GaussianRandomProjection(nof_additional_features, random_state=0)
transformer3 = manifold.MDS(nof_additional_features, max_iter=300, n_init=3,random_state=11) # works with 20 add feat
transformer4 = manifold.TSNE(n_components=3, perplexity=15, random_state=42)
transformer5 = PCA(nof_additional_features) # works with 13/20 add feat

In [34]:
Nft1 = transformer1.fit_transform(X_train)
Nft2 = transformer2.fit_transform(X_train)
Nft3 = transformer3.fit_transform(X_train)
Nft4 = transformer4.fit_transform(X_train)
Nft5 = transformer5.fit_transform(X_train)

# Calculate transformation for test data by hand (there is no fit function for the transformation)
Hft1 = np.linalg.lstsq(X_train, Nft1, rcond=None)
Hft2 = np.linalg.lstsq(X_train, Nft2, rcond=None)
Hft3 = np.linalg.lstsq(X_train, Nft3, rcond=None)
Hft4 = np.linalg.lstsq(X_train, Nft4, rcond=None)
Hft5 = np.linalg.lstsq(X_train, Nft5, rcond=None)
Mat1 = np.matmul(X_test,Hft1[0])
Mat2 = np.matmul(X_test,Hft2[0])
Mat3 = np.matmul(X_test,Hft3[0])
Mat4 = np.matmul(X_test,Hft4[0])
Mat5 = np.matmul(X_test,Hft5[0])

In [35]:
# Put together all features 
#X_train4 = np.c_[Nft1,Nft2,Nft3,Nft4,Nft5]
#X_test4 = np.c_[Mat1,Mat2,Mat3,Mat4,Mat5]
#X_train4 = np.c_[X_train,Nft1,Nft2,Nft3,Nft4,Nft5]
#X_test4 = np.c_[X_test,Mat1,Mat2,Mat3,Mat4,Mat5]
X_train4 = np.c_[X_train,Nft3,Nft4,Nft5] # try this with 50 additional features
X_test4 = np.c_[X_test,Mat3,Mat4,Mat5]
#X_train4 = np.c_[Nft3,Nft5] # try this with 50 additional features
#X_test4 = np.c_[Mat3,Mat5]

In [36]:
print('KNN test score pure: %f' % knn.fit(X_train, y_train).score(X_test, y_test))
print('KNN test score all : %f' % knn.fit(X_train4, y_train).score(X_test4, y_test))
print('LogisticRegression test score pure: %f'
      % logistic.fit(X_train, y_train).score(X_test, y_test))
print('LogisticRegression test score all : %f'
      % logistic.fit(X_train4, y_train).score(X_test4, y_test))

KNN test score pure: 0.977747
KNN test score all : 0.983310
LogisticRegression test score pure: 0.947149
LogisticRegression test score all : 0.944367


## Number of KNN errors now

In [37]:
round(X_test.shape[0]-X_test.shape[0]*knn.fit(X_train4, y_train).score(X_test4, y_test),0)

12.0