# MNIST character recognition with SVM
# Part 1. Simple classification
The MNIST dataset contains 70,000 samples of handwritten digits 0-9, with labels. This is a standard benchmark problem for machine learning. We will use SVM to learn classification of the digits.

In [1]:
import sys
import time
import numpy as np
import pickle
import tensorflow as tf
from sklearn import model_selection, svm, preprocessing, pipeline
from sklearn.metrics import accuracy_score,confusion_matrix
from Fashion_MNIST_Loader.mnist_loader import MNIST
import matplotlib.pyplot as plt
import matplotlib.axes
from matplotlib import style
style.use('ggplot')

ModuleNotFoundError: No module named 'tensorflow'

## Load the data
TensorFlow Keras provides a convenient source for loading the data. Note that the loading function automatically splits the data into training and testing sets. For our purposes, we need training, validation, and (holdout) test sets, so the Keras test set is further split 80/20 into a validation set and a holdout set.

The images come in the form of 28x28 grayscale arrays, which need to be converted into 784-length feature vectors for SVM.

In [None]:
# Keras returns MNIST already partitioned into a training and a test set.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten every 28x28 image into a 784-element row; -1 keeps the sample count.
X_train = X_train.reshape(-1, 784)
X_test = X_test.reshape(-1, 784)

# Split the Keras test set: 80% for validation, 20% held out (fixed seed).
X_validate, X_holdout, y_validate, y_holdout = model_selection.train_test_split(
    X_test,
    y_test,
    test_size=0.2,
    random_state=0,
)

## Train SVM
The labels are for the ten classes 0-9, and are in the form of integers. The SVM classifier automatically determines that this is not a binary classification. What type of multi-classification will it perform by default? How could you specify a different type?

Note: The classification is quite intensive, and may take several minutes to compute. For the demo, only the first 10,000 samples are used.

In [None]:
print('SVM Classifier')

# Linear-kernel support vector classifier with penalty C = 10.
clf = svm.SVC(kernel='linear', C=10.)

# Fit on the first 10,000 training samples only, to keep the demo's run time down.
clf.fit(X_train[0:10000], y_train[0:10000])

## Check accuracy with validation data

In [None]:
# Evaluate the fitted classifier on the validation split.
#
# SVC prediction is the expensive step here, so it is done exactly once and
# every metric is derived from that single pass. For a classifier,
# `clf.score(X, y)` is the mean accuracy of `clf.predict(X)`, so reusing
# `accuracy` for `acc` reports the same number without predicting twice.
print('\nCalculating Accuracy of trained Classifier...')

print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(X_validate)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_validate, y_pred)
acc = accuracy  # same value clf.score(X_validate, y_validate) would return

print('\nCreating Confusion Matrix...')
conf_mat = confusion_matrix(y_validate, y_pred)

print('\nSVM Trained Classifier Accuracy: ',acc)
print('\nPredicted Values: ',y_pred)
print('\nAccuracy of Classifier on Validation Images: ',accuracy)
print('\nConfusion Matrix: \n',conf_mat)

# Plot the confusion matrix as a heat map (rows: true label, columns: predicted).
plt.matshow(conf_mat)
plt.title('Confusion Matrix for Validation Data')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

## Check accuracy with holdout test

In [None]:
# Final check of the Part 1 classifier on the holdout split.
print('\nMaking Predictions on Holdout Test Input Images...')
y_holdout_pred = clf.predict(X_holdout)

print('\nCalculating Accuracy of Trained Classifier on Test Data... ')
acc = accuracy_score(y_holdout, y_holdout_pred)

print('\n Creating Confusion Matrix for Test Data...')
conf_mat_test = confusion_matrix(y_holdout, y_holdout_pred)

# Report each result as "<caption> <value>", in the same order as computed.
for caption, value in (
    ('\nPredicted Labels for Test Images: ', y_holdout_pred),
    ('\nAccuracy of Classifier on Test Images: ', acc),
    ('\nConfusion Matrix for Test Data: \n', conf_mat_test),
):
    print(caption, value)



In [None]:
# Heat map of the holdout confusion matrix: rows are true labels, columns are
# predicted labels.
plt.matshow(conf_mat_test)
plt.colorbar()
plt.gca().set(
    title='Confusion Matrix for Test Data',
    ylabel='True label',
    xlabel='Predicted label',
)
plt.show()


## Representative images

In [None]:
# Pick 16 random holdout indices (repeats possible) and show each image with
# its true and predicted label in an 8-row by 2-column grid.
a = np.random.randint(0, len(X_holdout), 16)
fig, axs = plt.subplots(8, 2, figsize=(10, 14))

# axs.flat walks the grid row by row, matching the order of the drawn indices.
for panel, d in zip(axs.flat, a):
    panel.imshow(X_holdout[d].reshape(28, 28), interpolation='nearest')
    caption = 'Original Label: {0}  Predicted Label: {1}'.format(y_holdout[d], y_holdout_pred[d])
    panel.set_title(caption)

plt.show()

# Part 2. Hyperparameter sweep using grid search
Parameters such as `C` are not necessarily known in advance. `GridSearchCV` can try multiple combinations of parameters automatically, while performing k-fold cross-validation. (Note that grid search can take considerable time.)

Here there is no need to produce three sets of data, because grid search will automatically perform validation.

In [None]:
# Reload MNIST with only the train/test partition that Keras provides.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten the images to 784-element rows; -1 preserves the number of samples.
X_train = X_train.reshape(-1, 784)
X_test = X_test.reshape(-1, 784)

In [None]:
# Candidate values of C for the sweep, from very weak to very strong penalty.
parameters = dict(C=[0.0001, 1., 1e5])

# Linear SVC wrapped in a 3-fold cross-validated grid search; training-fold
# scores are recorded alongside the validation-fold scores.
svc = svm.SVC(kernel='linear')
clf = model_selection.GridSearchCV(
    svc,
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)

# Search on the first 10,000 training samples only.
clf.fit(X_train[0:10000], y_train[0:10000])

In [None]:
# Score the grid-search result on the Keras test set loaded for Part 2.
# This previously used X_holdout / y_holdout, which only exist as leftover
# state from Part 1 and cover just 20% of the test set. Part 2 makes no
# three-way split, so the full test set is the unseen data to score against.
print("score = % 3.2f" %clf.score(X_test, y_test))

# The value of C that achieved the best mean cross-validation score.
print("best parameters: ", clf.best_params_)

In [None]:
# The detailed results of cross-validation may optionally be accessed with this:
#clf.cv_results_

In [None]:
# Evaluate the grid-searched classifier on the full Keras test set.
print('\nMaking Predictions on Holdout Test Input Images...')
y_test_pred = clf.predict(X_test)

print('\nCalculating Accuracy of Trained Classifier on Test Data... ')
acc = accuracy_score(y_test, y_test_pred)

print('\n Creating Confusion Matrix for Test Data...')
conf_mat_test = confusion_matrix(y_test, y_test_pred)

# Report each result as "<caption> <value>", in the same order as computed.
for caption, value in (
    ('\nPredicted Labels for Test Images: ', y_test_pred),
    ('\nAccuracy of Classifier on Test Images: ', acc),
    ('\nConfusion Matrix for Test Data: \n', conf_mat_test),
):
    print(caption, value)

# Part 3. Fashion MNIST
The original MNIST dataset is for training and testing recognition of handwritten numerals. Fashion MNIST is a more difficult dataset of images of clothing items. It is meant to be a nearly drop-in replacement for character MNIST. The images are originally in 28x28 format, to be reshaped into 784-length arrays.

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Fashion MNIST comes from Keras with the same train/test layout as digit MNIST.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Text name for each integer class label; the list position is the label value.
LABEL_NAMES = [
    't_shirt',
    'trouser',
    'pullover',
    'dress',
    'coat',
    'sandal',
    'shirt',
    'sneaker',
    'bag',
    'ankle_boots',
]

# Flatten the images to 784-element rows; -1 preserves the number of samples.
x_train = x_train.reshape(-1, 784)
x_test = x_test.reshape(-1, 784)

## Examine images
Each image is grayscale. The images can be viewed by reshaping them back into 28x28 arrays. The labels are integer indices, which can be converted to text with `LABEL_NAMES`.

In [None]:
# Show an m x n grid of randomly chosen training images, each titled with the
# text name of its class.
m = 5
n = 5
whichimgs = np.random.randint(low=0, high=len(x_train), size=(m,n))
fig, axs = plt.subplots(m, n, sharex='col', sharey='row',
                       figsize=(8,8))

# Ticks are cleared per axes inside the loop. A figure-level plt.xticks([]) /
# plt.yticks([]) would only touch the current axes, so it is not used here.
for i in range(m):
    for j in range(n):
        idx = whichimgs[i,j]
        axs[i,j].imshow(x_train[idx].reshape(28,28))
        # Set the title once, using the text label. A second set_title with the
        # raw index used to overwrite this, so the names were never displayed.
        axs[i,j].set_title(LABEL_NAMES[y_train[idx]])
        axs[i,j].set_xticks([])
        axs[i,j].set_yticks([])