In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from time import time
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.decomposition import FastICA
from sklearn.ensemble import RandomForestClassifier  #Random Forest algorithm
from sklearn.manifold import TSNE

#...........................................................
#........Introduction.......................................
# In this work PCA for dimensionally reduction is applied
#  MNIST dataset is used.  MNIST contains
# 28*28 images of handwritten digits. The goal is to show that not all
#  28*28=784 features are needed to classify the digits.
#..........................................................
# Loading mnist train dataset and dividing it into x_train and y train

In [3]:
x_train =pd.read_csv("mnist_train.csv")
y_train = x_train['label']
del x_train['label']
# Loading mnist test dataset and dividing it into x_train and y train
x_test  =pd.read_csv("mnist_test.csv")
y_test  = x_test['label']
del x_test['label']

In [4]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(60000, 784) (60000,)
(10000, 784) (10000,)


#......................using all features.......................
# Linear Support Vector Machine (SVM) with all the 784 pixels of the MNIST images is used.
# a pipeline is set up  where scale is first applied, and then the classifier

In [5]:
steps = [('scaling', StandardScaler()), ('clf', SVC())]
pipeline = Pipeline(steps)
# train
t0 = time()
pipeline.fit(x_train, y_train)
# predict
y_pred = pipeline.predict(x_test)
# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")
# confusion matrix
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
# time taken
t_all_feats = time() - t0
print("Training and classification done in {}s".format(t_all_feats))

accuracy: 0.966 

[[ 968    0    1    1    0    3    3    2    2    0]
 [   0 1127    3    0    0    1    2    0    2    0]
 [   5    1  996    2    2    0    1   15    9    1]
 [   0    0    4  979    1    7    0   12    7    0]
 [   0    0   12    0  944    2    4    7    3   10]
 [   2    0    1   10    2  854    6    8    7    2]
 [   6    2    1    0    4    8  930    2    5    0]
 [   1    6   13    2    3    0    0  990    0   13]
 [   3    0    4    6    6    9    3   14  926    3]
 [   4    6    5   11   12    2    0   20    3  946]]
Training and classification done in 373.257372379303s


#.........................using PCA..................................................
# The next step is to train and predict using a dataset reduced with PCA,
# the number of components for the PCA model is reduced to 20.

In [6]:
# define pipeline steps
steps = [('scaling', StandardScaler()), ('reduce_dim', PCA(n_components=20)), ('clf', SVC())]
pipeline = Pipeline(steps)
# train
t0 = time()
pipeline.fit(x_train, y_train)
# predict
y_pred = pipeline.predict(x_test)
# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")
# confusion matrix
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
t_reduced_feats = time() - t0
print("Training and classification done in {}s".format(t_reduced_feats))
print("Speedup {}x".format(t_all_feats/t_reduced_feats))

accuracy: 0.962 

[[ 968    0    2    0    0    3    4    1    2    0]
 [   0 1126    3    0    1    1    1    0    3    0]
 [   3    1 1002    4    2    2    2    6    8    2]
 [   0    0    0  969    1    7    1    8   20    4]
 [   0    1    5    0  944    1    4    3    1   23]
 [   1    0    0   16    3  852    9    2    7    2]
 [   6    3    0    0    4    8  933    0    4    0]
 [   1    7   13    1    7    0    0  972    1   26]
 [   4    1    6   13    6   13    2    7  918    4]
 [   3    6    2   13   24    4    0   16    5  936]]
Training and classification done in 31.4891095161438s
Speedup 11.853538512670289x


#...................................Discussion..............................
# We get >11x speedup when preprocessing with PCA and an accuracy score
# that's quite comparable to having the whole dataset.

#.........................using FastICA..................................................
# The next step is to train and predict using a dataset reduced with FastICA,
# the number of components for the FastICA model is reduced to 20.


In [7]:
# define pipeline steps
steps = [('scaling', StandardScaler()), ('reduce_dim', FastICA(n_components=20)), ('clf', SVC())]
pipeline = Pipeline(steps)
# train
t2 = time()
pipeline.fit(x_train, y_train)
# predict
y_pred = pipeline.predict(x_test)
# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")
# confusion matrix
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
t_reduced_feats = time() - t2
print("Training and classification done in {}s".format(t_reduced_feats))
print("Speedup {}x".format(t_all_feats/t_reduced_feats))

accuracy: 0.9258 

[[ 955    0    2    1    1    6    8    3    4    0]
 [   0 1119    3    8    0    1    1    0    3    0]
 [   9    1  955   37    7    4    2    4   12    1]
 [   1   10   17  907    7   10    2   13   33   10]
 [   2    0    7    3  925    2    3    4    4   32]
 [  12    0    2   27   10  794   11    5   28    3]
 [   5    5    4    3    6   27  902    1    5    0]
 [   0    7   14    6   13    2    0  934    2   50]
 [   5    1    8   25    8   27   10    7  877    6]
 [   3    6    2   19   45    5    1   31    7  890]]
Training and classification done in 54.37892937660217s
Speedup 6.864007376723122x


#...................................Discussion..............................
# We get >6x speedup when preprocessing with ICA and an accuracy score
# that's quite comparable to having the whole dataset.


In [9]:
#......................using all features.......................
rf=RandomForestClassifier(n_estimators=3)
#training random Forest
t0 = time()
rf.fit(x_train,y_train)
pred=rf.predict(x_test)
# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=pred), "\n")
# confusion matrix
print(metrics.confusion_matrix(y_true=y_test, y_pred=pred))
t_all_feats = time() - t0
print("Training and classification done in {}s".format(t_all_feats))

accuracy: 0.8834 

[[ 961    1    3    2    1    3    3    3    2    1]
 [   1 1120    2    1    0    2    2    1    4    2]
 [  33   20  913   15    7    5    5   17   14    3]
 [  13    9   43  882    5   22    3    6   23    4]
 [  12   10   24   11  876    3    6    5    5   30]
 [  27   10   19   67   16  724    6    5   14    4]
 [  23   12   21   11   18   16  849    0    7    1]
 [   5   15   36    5   13    2    2  920    6   24]
 [  34   11   45   56   24   23   11    2  756   12]
 [  12   14   24   33   45   18    2   19    9  833]]
Training and classification done in 1.522531270980835s


#.........................using PCA..................................................
# The next step is to train and predict using a dataset reduced with PCA,
# the number of components for the PCA model is reduced to 20.


In [10]:
pca = PCA(n_components=20)
pca.fit(x_train)
train_pca = pca.transform(x_train)
test_pca = pca.transform(x_test)
# train
t1 = time()
rf.fit(train_pca,y_train)
y_pred=rf.predict(test_pca)
# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")
# confusion matrix
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
t_reduced_feats_PCA = time() - t1
print("Training and classification done in {}s".format(t_reduced_feats_PCA))
print("Speedup {}x".format(t_all_feats/t_reduced_feats_PCA))

accuracy: 0.8667 

[[ 923    3   18    8    1    9   10    1    4    3]
 [   4 1115    4    3    1    2    1    1    3    1]
 [  34   11  917   22   16    2    5    6   17    2]
 [  17    6   41  884    2   19    2    8   30    1]
 [  11    4   26    7  860    2    9    7    4   52]
 [  31   10   21   66   23  702    9    5   19    6]
 [  25    9   22    5   16   22  848    2    6    3]
 [  10   15   36   11   21    4    3  887    6   35]
 [  44    8   50   55   13   32    7    8  745   12]
 [  16    8   24   22  106    9    4   24   10  786]]
Training and classification done in 1.1006507873535156s
Speedup 1.383300942019693x


#...................................Discussion..............................
# We get >1.38x speedup when preprocessing with PCA and an accuracy score is better

# #.........................using ICA..................................................
# # The next step is to train and predict using a dataset reduced with fastICA,
# # the number of components for the fastICA model is reduced to 20.

In [11]:
ica = FastICA(n_components=20)
ica.fit(x_train)
train_ica = ica.transform(x_train)
test_ica = ica.transform(x_test)
# train
t2 = time()
rf.fit(train_ica,y_train)
y_pred=rf.predict(test_ica)
# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")
# confusion matrix
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
t_reduced_feats_ica = time() - t2
print("Training and classification done in {}s".format(t_reduced_feats_ica))
print("Speedup {}x".format(t_all_feats/t_reduced_feats_ica))



accuracy: 0.8432 

[[ 940    0    9    7    1   11    9    2    1    0]
 [   0 1119    7    3    1    0    1    0    3    1]
 [  34   10  895   40    9    7    5   10   18    4]
 [  24    9   87  796    6   39    5   10   26    8]
 [   9   15   26   20  824    5   12   10   10   51]
 [  30   10   31   80   17  679   12    5   22    6]
 [  37    7   23   14   30   14  827    1    5    0]
 [   7   11   46   21   21    3    0  890    6   23]
 [  21   15   61   74   36   45   13   10  687   12]
 [  19    5   25   34   81   32    6   21   11  775]]
Training and classification done in 1.501979112625122s
Speedup 1.0136833849305615x


# We get >1.03x speedup when preprocessing with FastICA and an accuracy score is better

#...................................Discussion..............................
# The results after using SVM and RandomForest as follows
# - While using SVM and RandomForest, when we look at the time taken, we can see that PCA is better than FactICA and better than using all features,
# and when we look at the accuracy, we can see that the accuracy is not notable affected while using PCA and FactICA  