# Import Libraries
# import tensorflow as tf


In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from time import time
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE

# import umap
#...........................................................
#........Introduction.......................................
# In this work, PCA is applied for dimensionality reduction.
# The MNIST dataset is used. MNIST contains
# 28*28 images of handwritten digits. The goal is to show that not all
# 28*28=784 features are needed to classify the digits.
#..........................................................
# Loading the MNIST train dataset and dividing it into x_train and y_train

In [2]:
# Read the MNIST training CSV and split off the 'label' column as the target:
# pop() removes the column from x_train and returns it as y_train.
x_train = pd.read_csv("mnist_train.csv")
y_train = x_train.pop('label')
# Same procedure for the MNIST test CSV, producing x_test and y_test.
x_test = pd.read_csv("mnist_test.csv")
y_test = x_test.pop('label')


In [3]:
# Sanity check: show (features shape, labels shape) for the train split,
# then for the test split, one line per split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

(60000, 784) (60000,)
(10000, 784) (10000,)


#......................using all features.......................
# A Support Vector Machine classifier (SVC with its default settings, i.e. an RBF kernel, not a linear one)
# is trained on all 784 pixels of the MNIST images.
# A pipeline is set up where scaling is applied first, and then the classifier.

In [4]:
# Baseline: standardize every input feature, then fit a support vector
# classifier (SVC with its default settings) on the full feature set.
steps = [('scaling', StandardScaler()), ('clf', SVC())]
pipeline = Pipeline(steps)
# Time only what the final message reports: training and classification.
t0 = time()
# train
pipeline.fit(x_train, y_train)
# predict
y_pred = pipeline.predict(x_test)
# Stop the clock before scoring/printing so that evaluation overhead is not
# counted as training or classification time.
t_all_feats = time() - t0
# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")
# confusion matrix
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
# time taken (kept in t_all_feats for later comparison)
print("Training and classification done in {}s".format(t_all_feats))

accuracy: 0.966 

[[ 968    0    1    1    0    3    3    2    2    0]
 [   0 1127    3    0    0    1    2    0    2    0]
 [   5    1  996    2    2    0    1   15    9    1]
 [   0    0    4  979    1    7    0   12    7    0]
 [   0    0   12    0  944    2    4    7    3   10]
 [   2    0    1   10    2  854    6    8    7    2]
 [   6    2    1    0    4    8  930    2    5    0]
 [   1    6   13    2    3    0    0  990    0   13]
 [   3    0    4    6    6    9    3   14  926    3]
 [   4    6    5   11   12    2    0   20    3  946]]
Training and classification done in 521.6522998809814s


#.........................using PCA..................................................
# The next step is to train and predict using a dataset reduced with PCA;
# the PCA model keeps only 50 components (down from the original 784 features).
# define pipeline steps

In [5]:
# Reduced pipeline: standardize, project onto 50 principal components, then
# fit the same default SVC on the reduced representation.
# random_state is fixed so that the PCA step is reproducible across runs when
# a randomized SVD solver is selected.
steps = [
    ('scaling', StandardScaler()),
    ('reduce_dim', PCA(n_components=50, random_state=0)),
    ('clf', SVC()),
]
pipeline = Pipeline(steps)
# Time only what the final message reports: training and classification.
t0 = time()
# train
pipeline.fit(x_train, y_train)
# predict
y_pred = pipeline.predict(x_test)
# Stop the clock before scoring/printing so that evaluation overhead is not
# counted as training or classification time.
t_reduced_feats = time() - t0
# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")
# confusion matrix
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Training and classification done in {}s".format(t_reduced_feats))
# t_all_feats is the baseline timing measured by the all-features cell.
print("Speedup {}x".format(t_all_feats/t_reduced_feats))

accuracy: 0.9715 

[[ 970    0    1    1    0    3    3    1    1    0]
 [   0 1127    4    1    0    1    1    0    1    0]
 [   4    0 1007    3    1    1    1    9    5    1]
 [   0    1    0  985    1    6    0    9    6    2]
 [   0    0    7    1  951    0    4    4    2   13]
 [   2    0    0   14    1  863    6    0    5    1]
 [   4    3    1    1    4    7  934    1    3    0]
 [   2    8   14    1    3    0    0  983    3   14]
 [   3    0    2    9    5    5    2    5  939    4]
 [   3    5    1    8   13    2    0   14    7  956]]
Training and classification done in 70.22853541374207s
Speedup 7.427925084977728x


#...................................Discussion..............................
# We get a >7x speedup when preprocessing with PCA, and the accuracy score (0.9715)
# is comparable to - here even slightly higher than - the score obtained with all 784 features (0.966).