# Digit Recognizer - MNIST data

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd

In [2]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [3]:
train_data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from sklearn.model_selection import train_test_split

In [17]:
# split data into data and labels
train_label = train_data['label'].values.ravel()
train = train_data.loc[:, train_data.columns != 'label']

In [18]:
X_train, X_valid, y_train, y_valid = train_test_split(train, train_label, test_size=0.2)

In [23]:
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(33600, 784)
(33600,)
(8400, 784)
(8400,)


Import machine learning algorithm to use for classification

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score

In [26]:
def train_valid_accuracy(algo, X_train = X_train, y_train = y_train, 
                         X_valid = X_valid, y_valid = y_valid):
    """
    Fit algorithm with training data and return accuracy score with validation data
    """
    train_algo = algo
    model = train_algo.fit(X_train, y_train)
    predict = model.predict(X_valid)
    return accuracy_score(y_valid, predict)

Use `train_valid_accuracy` to get accuracy score for each training algorithm.

In [33]:
print("KNeighborsClassifier:", train_valid_accuracy(KNeighborsClassifier()))
print("RandomForestClassifier:", train_valid_accuracy(RandomForestClassifier()))
print("AdaBoostClassifier:", train_valid_accuracy(AdaBoostClassifier()))
print("SVC:", train_valid_accuracy(SVC()))
print("DecisionTreeClassifier:", train_valid_accuracy(DecisionTreeClassifier()))
print("KMeans:", train_valid_accuracy(KMeans()))
print("GaussianMixture:", train_valid_accuracy(GaussianMixture()))
print("SGDClassifier:", train_valid_accuracy(SGDClassifier()))

KNeighborsClassifier: 0.9669047619047619
RandomForestClassifier: 0.9660714285714286
AdaBoostClassifier: 0.7204761904761905
SVC: 0.9764285714285714
DecisionTreeClassifier: 0.8509523809523809
KMeans: 0.24011904761904762
GaussianMixture: 0.09869047619047619
SGDClassifier: 0.8604761904761905


From the above scores, we see that `KNeightborsClassifier`, `RandomForestClassifier`, and `SVC` has the highest accuracy score. We use `RandomSearchCV` and `GridSearchCV` to find the best parameters for each classification algorithm and get the accuracy score.

In [35]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

### KNeighborsClassifier()

In [37]:
param_grid = [{
    'weights': ['uniform', 'distance'], 'n_neighbors': [3, 4, 5]
}]

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3)
grid_search.fit(X_train[:5000], y_train[:5000])

GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'n_neighbors': [3, 4, 5],
                          'weights': ['uniform', 'distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [38]:
grid_search.best_params_

{'n_neighbors': 4, 'weights': 'distance'}

In [39]:
y_pred = grid_search.best_estimator_.predict(X_valid)
print(accuracy_score(y_valid, y_pred))

0.940595238095238


### RandomForestClassifier()

In [41]:
param_grid = [{
    'n_estimators': [100, 150, 200]
}]

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
grid_search.fit(X_train[:5000], y_train[:5000])

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [42]:
grid_search.best_params_

{'n_estimators': 150}

In [43]:
y_pred = grid_search.best_estimator_.predict(X_valid)
print(accuracy_score(y_valid, y_pred))

0.9366666666666666


### SVC()

In [46]:
param_grid = [{
    'C': [1, 5, 10], 'gamma': ['scale', 'auto'], 'tol': [1e-2, 1e-3, 1e-4]
}]

random_search = RandomizedSearchCV(SVC(), param_grid, cv=3, n_jobs=3)
random_search.fit(X_train[:5000], y_train[:5000])

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=3,
                   param_distributions=[{'C': [1, 5, 10],
                                         'gamma': ['scale', 'auto'],
                                         'tol': [0.01, 0.001, 0.0001]}],
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [47]:
random_search.best_params_

{'tol': 0.001, 'gamma': 'scale', 'C': 5}

In [48]:
y_pred = random_search.best_estimator_.predict(X_valid)
print(accuracy_score(y_valid, y_pred))

0.9607142857142857


### Neural Network

We create a neural network model to see if we can get a better accuracy score than the `SVC model`.

In [50]:
import os

In [53]:
model = keras.models.Sequential([
    keras.layers.Dense(10, activation="relu"),
    keras.layers.Dense(784, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.SGD(1e-3),
             metrics=["accuracy"])

run_index = 1 
run_logdir = os.path.join(os.curdir, "mnist_logs", "run_{:03d}".format(run_index))

early_stopping_cb = keras.callbacks.EarlyStopping(patience=20)
checkpoint_cb = keras.callbacks.ModelCheckpoint("mnist_ann_model_1.h5", save_best_only=True)
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

history = model.fit(X_train, y_train, epochs=100, validation_data = (X_valid, y_valid),
                   callbacks=[early_stopping_cb, checkpoint_cb, tensorboard_cb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


In [54]:
model = keras.models.load_model("mnist_ann_model_1.h5")
model.evaluate(X_valid, y_valid)



[0.2614995241165161, 0.931071400642395]

In [56]:
model = keras.models.Sequential([
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.SGD(1e-3),
             metrics=["accuracy"])

run_index = 2
run_logdir = os.path.join(os.curdir, "mnist_logs", "run_{:03d}".format(run_index))

early_stopping_cb = keras.callbacks.EarlyStopping(patience=20)
checkpoint_cb = keras.callbacks.ModelCheckpoint("mnist_ann_model_2.h5", save_best_only=True)
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

history = model.fit(X_train, y_train, epochs=100, validation_data = (X_valid, y_valid),
                   callbacks=[early_stopping_cb, checkpoint_cb, tensorboard_cb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


In [57]:
model = keras.models.load_model("mnist_ann_model_2.h5")
model.evaluate(X_valid, y_valid)



[0.2562721073627472, 0.9395238161087036]

In [58]:
model = keras.models.Sequential([
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.SGD(1e-3),
             metrics=["accuracy"])

run_index = 3
run_logdir = os.path.join(os.curdir, "mnist_logs", "run_{:03d}".format(run_index))

early_stopping_cb = keras.callbacks.EarlyStopping(patience=20)
checkpoint_cb = keras.callbacks.ModelCheckpoint("mnist_ann_model_3.h5", save_best_only=True)
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

history = model.fit(X_train, y_train, epochs=100, validation_data = (X_valid, y_valid),
                   callbacks=[early_stopping_cb, checkpoint_cb, tensorboard_cb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


In [59]:
model = keras.models.load_model("mnist_ann_model_3.h5")
model.evaluate(X_valid, y_valid)



[0.28021055459976196, 0.9388095140457153]

To get a better accuracy score, we try running the model again with different learning rate.

In [60]:
model = keras.models.Sequential([
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.SGD(0.1),
             metrics=["accuracy"])

run_index = 4
run_logdir = os.path.join(os.curdir, "mnist_logs", "run_{:03d}".format(run_index))

early_stopping_cb = keras.callbacks.EarlyStopping(patience=20)
checkpoint_cb = keras.callbacks.ModelCheckpoint("mnist_ann_model_4.h5", save_best_only=True)
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

history = model.fit(X_train, y_train, epochs=100, validation_data = (X_valid, y_valid),
                   callbacks=[early_stopping_cb, checkpoint_cb, tensorboard_cb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


In [61]:
model = keras.models.load_model("mnist_ann_model_4.h5")
model.evaluate(X_valid, y_valid)



[2.3000805377960205, 0.11535714566707611]

Increasing the learning rate results in poor accuracy.

In [69]:
model = keras.models.Sequential([
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.SGD(1e-3),
             metrics=["accuracy"])

run_index = 5
run_logdir = os.path.join(os.curdir, "mnist_logs", "run_{:03d}".format(run_index))

early_stopping_cb = keras.callbacks.EarlyStopping(patience=20)
checkpoint_cb = keras.callbacks.ModelCheckpoint("mnist_ann_model_5.h5", save_best_only=True)
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

history = model.fit(X_train, y_train, epochs=100, validation_data = (X_valid, y_valid),
                   callbacks=[early_stopping_cb, checkpoint_cb, tensorboard_cb])

model.evaluate(X_valid, y_valid)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100


[0.3084529638290405, 0.9448809623718262]

We see that changing the amount of neurons in the second layers improves accuracy.

In [55]:
%load_ext tensorboard
%tensorboard --logdir=./mnist_logs --port=6006

By comparing the model we see that the `SVC()` model returns the highest accuracy score despite only using 5000 data to train.

In [71]:
test_data.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
random_search.best_params_

{'tol': 0.001, 'gamma': 'scale', 'C': 5}

In [73]:
predict = random_search.best_estimator_.predict(test_data)

In [81]:
submit_pd = pd.DataFrame({"ImageId": list(range(1,len(predict)+1)),
    "Label": predict})
submit_pd.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,7
4,5,3


In [82]:
submit_pd.to_csv('./Data/answer.csv', index=False)