# MNIST Handwritten Digit classification using SGD Classifier

In [10]:
# Imports

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as ps
import sklearn as sk
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import tensorflow.keras as keras
from keras.datasets import mnist

In [11]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print(f'train input shape: {x_train.shape}')
print(f'train output shape: {y_train.shape}')
print(f'test input shape: {x_test.shape}')
print(f'test output shape: {y_test.shape}')

rows = 2
cols = 10
imagesPerRow = cols // rows
fig = ps.make_subplots(rows, cols)
for idx, img in enumerate(x_train[:cols]):
    fig.add_trace(px.imshow(img, binary_string=True).data[0], row=int(idx / imagesPerRow) + 1, col=idx % imagesPerRow + 1)

fig.show()

train input shape: (60000, 28, 28)
train output shape: (60000,)
test input shape: (10000, 28, 28)
test output shape: (10000,)


## Preparing data for learning

Mostly the same as in the `cnn.ipynb` but we also transform the image from a 2D array of `28x28` dimensions to a 1D array of `784` elements.

In [19]:
X = np.append(x_train, x_test, axis=0)
X = X.astype('float32')
X = keras.utils.normalize(X, axis=0)

lenX = X.shape[0]
X = np.array([X[i].flatten() for i in range(0, lenX)])
print(f'input shape: {X.shape}')

y = np.append(y_train, y_test, axis=0)
y = y.astype('float32')
print(f'output shape: {y.shape}')

print(X[0][200:300])
print(y[:5])

input shape: (70000, 784)
output shape: (70000,)
[0.0000000e+00 0.0000000e+00 0.0000000e+00 2.7354939e-03 1.0328551e-02
 8.9159748e-03 7.5740963e-03 6.6649374e-03 6.0825348e-03 5.7210918e-03
 5.5290204e-03 5.4744622e-03 5.5280072e-03 5.6400695e-03 2.2143552e-03
 2.1613017e-03 2.5297576e-03 2.1337918e-03 1.9491144e-03 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 8.6888362e-04 8.2670758e-03 7.8520989e-03 6.7766584e-03
 6.1477423e-03 5.8497037e-03 5.7507255e-03 4.5095682e-03 4.1668913e-03
 5.6463652e-03 5.5115474e-03 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 2.7775168e-03 4.4868058e-03 2.7138109e-03 6.0703307e-03 6.0921465e-03
 5.1131332e-03 2.8224458e-04

In [22]:
NO_FOLDS = 7
scores = []
kf = sk.model_selection.KFold(NO_FOLDS, shuffle=True, random_state=1)
for train_idx, test_idx in kf.split(X):

    # train
    trainX = X[train_idx]
    trainY = y[train_idx]

    # test data
    testX = X[test_idx]
    testY = y[test_idx]

    model = SGDClassifier(loss='log', penalty='l1')
    model.fit(trainX, trainY)
    predY = model.predict(testX)

    acc = accuracy_score(testY, predY)
    print(f'acc: {acc}')

    scores.append(acc)


acc: 0.7058
acc: 0.7038
acc: 0.6944
acc: 0.7009
acc: 0.709
acc: 0.7062
acc: 0.7147


In [23]:
print(f'Accuracy: mean={np.mean(scores)}, std={np.std(scores)}')
fig = px.box(pd.DataFrame(data=scores, columns=['Accuracy']), y='Accuracy')
fig.show()

Accuracy: mean=0.7049714285714286, std=0.005889146017575596
