# Classification using MNIST dataset
MNIST dataset is a database of handwritten digits. The goal is to identify the actual digit using this dataset where humans have made handwritten endtries for numbers. We will try multiple classification algorithms to figure out the most suitable one. Algorithms we will try:
1. SGDClassifier
2. RandomForest classifier
3. Logistic regression
4. Knearestneighbours Classifier
5. SVM classifier

In [1]:
#loading all the packages 
import pandas as pd
import numpy as np
import tensorflow_data_validation as tfdv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn import datasets
import time

import warnings
warnings.filterwarnings('ignore')

## Reading the data

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version = 1, as_frame = True)
print(mnist.keys())

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])


In [3]:
#Extracting required data for modelling purposes
X,y = mnist["data"], mnist["target"]
df = pd.concat([X,y], axis = 1)
print(df.shape)
df.head(5)

(70000, 785)


Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784,class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


## EDA

Let's take a look at typical summary stats for the data that we're going to be working with

In [4]:
#splitting up the data in train and test first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 22)
print(X_train.shape)
print(X_test.shape)

(52500, 784)
(17500, 784)


In [None]:
#visualizing summary stats for the train set
train_stats = tfdv.generate_statistics_from_dataframe(X_train)
tfdv.visualize_statistics(train_stats)

Observations:
1. There are a lot of 0 values but given that this is image data where there  aren't any pixels in a large portion of the image, this makes sense. We don't want to treat these values
2. There is no missing data that we need to impute in this dataset

In [7]:
#Taking a look at the table schema
train_schema = tfdv.infer_schema(statistics = train_stats)
tfdv.display_schema(train_schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'pixel1',FLOAT,required,,-
'pixel2',FLOAT,required,,-
'pixel3',FLOAT,required,,-
'pixel4',FLOAT,required,,-
'pixel5',FLOAT,required,,-
...,...,...,...,...
'pixel780',FLOAT,required,,-
'pixel781',FLOAT,required,,-
'pixel782',FLOAT,required,,-
'pixel783',FLOAT,required,,-


## Preprocessing pipeline

There is no need to create a transformation pipeline for this data as there is no need to scale, impute etc anything here. If required, we would have created a transformation pipeline using sklearn's Pipeline class to apply transformations to training and test data

## Hyperparameter and Model building functions

In [5]:
#creating an empty dataset to store model acuracies
modelComparisons = pd.DataFrame(columns = ['Model name', 'Accuracy', 'Precision', 'Recall','F1 score'])

### Let's first create the Hyperparameter tuning + modeling functions

In [18]:
#function to measure accuracy
def modelMetrics(actual, pred):
    accuracy = round(accuracy_score(actual,pred)*100,2)
    precision = round(precision_score(actual,pred,average='macro')*100,2)
    recall = round(recall_score(actual,pred,average='macro')*100,2)
    f1score = round(f1_score(actual, pred, average='macro'),2)
    
    print("Accuracy: ",accuracy,'%')
    print("Precision: ",precision,'%')
    print("Recall: ",recall,'%')
    print("F1 score: ",f1score)
    
    return(accuracy,precision,recall,f1score)

In [14]:
#hyperparameters and modelling for SGDclassifier
def sgdPara(model, iterations, X_training, y_training, X_testing, y_testing):
    start = time.time()
    global modelComparisons
    para_grid = {"loss": ["hinge"],
                "penalty": ["l2"] #, "elasticnet"
                }
    hp_tuning = RandomizedSearchCV(model, param_distributions = para_grid, n_iter = iterations)
    hp_tuning.fit(X_training, y_training)
    
    print('SGDClassifier Best Parameters:\n', hp_tuning.best_params_)
    y_pred = hp_tuning.predict(X_testing)
    
    accuracy, precision, recall, f1score = modelMetrics(y_pred, y_testing)
    
    #appending in table
    results = [['SGDClassifier', accuracy, precision, recall, f1score]]
    modelComparisons = modelComparisons.append(pd.DataFrame(results,columns = ['Model name', 'Accuracy', 'Precision', 'Recall','F1 score']),
               ignore_index = True)
    print("Done in {} seconds".format(time.time()-start))
    #return(accuracy,precision,recall,f1score)

#hyperparameters and modelling for RandomForestClassifier
def rfPara(model, X_training, y_training, X_testing, y_testing):
    start = time.time()
    global modelComparisons
    para_grid = {
    'max_depth': [10, 20, 30],#, 40, 50, 60, 70, 80, 90, 100, None
     'max_features': ['auto', 'sqrt'],
     'min_samples_leaf': [4],#1, 2,
     'min_samples_split': [10],#2, 5,
     'n_estimators': [ 1800, 2000] #200, 400, 600, 800, 1000, 1200, 1400, 1600,
    }
    hp_tuning = RandomizedSearchCV(model, param_distributions = para_grid)
    hp_tuning.fit(X_training, y_training)
    
    print('RandomForest Best Parameters:\n', hp_tuning.best_params_)
    y_pred = hp_tuning.predict(X_testing)
    
    accuracy, precision, recall, f1score = modelMetrics(y_pred, y_testing)
    
    #appending in table
    results = [['RandomForestClassifier', accuracy, precision, recall, f1score]]
    modelComparisons = modelComparisons.append(pd.DataFrame(results,columns = ['Model name', 'Accuracy', 'Precision', 'Recall','F1 score']),
               ignore_index = True)
    print("Done in {} seconds".format(time.time()-start))
    
#hyperparameters and modelling for LogisticRegression 
def lrPara(model, X_training, y_training, X_testing, y_testing):
    start = time.time()
    global modelComparisons
    para_grid = {"solver":["lbfgs","sag"],#
                 "penalty": ["l2","none"]#
                }
    hp_tuning = RandomizedSearchCV(model, param_distributions = para_grid)
    hp_tuning.fit(X_training, y_training)
    
    print('SGDClassifier Best Parameters:\n', hp_tuning.best_params_)
    y_pred = hp_tuning.predict(X_testing)
    
    accuracy, precision, recall, f1score = modelMetrics(y_pred, y_testing)
    
    #appending in table
    results = [['LogisticRegression', accuracy, precision, recall, f1score]]
    modelComparisons = modelComparisons.append(pd.DataFrame(results,columns = ['Model name', 'Accuracy', 'Precision', 'Recall','F1 score']),
               ignore_index = True)
    print("Done in {} seconds".format(time.time()-start))

## Model predictions and Accuracy

Building the models using the best params

In [16]:
#SGDClassifier
sgdModel = SGDClassifier()
sgdPara(sgdModel, 1, X_train, y_train, X_test, y_test)

SGDClassifier Best Parameters:
 {'penalty': 'l2', 'loss': 'hinge'}
Accuracy:  87.61 %
Precision:  87.61 %
Recall:  87.61 %
F1 score:  0.88
Done in 541.3047416210175 seconds


In [None]:
#RandomForest
rfModel = RandomForestClassifier()
rfPara(rfModel, X_train, y_train, X_test, y_test)

In [19]:
#Logistic regression
lrModel = LogisticRegression()
lrPara(lrModel, X_train, y_train, X_test, y_test)

SGDClassifier Best Parameters:
 {'solver': 'lbfgs', 'penalty': 'l2'}
Accuracy:  92.01 %
Precision:  91.89 %
Recall:  91.93 %
F1 score:  0.92
Done in 165.6199915409088 seconds


In [20]:
modelComparisons

Unnamed: 0,Model name,Accuracy,Precision,Recall,F1 score
0,LogisticRegression,92.01,92.01,92.01,0.92
1,SGDClassifier,87.61,87.61,87.61,0.88
2,LogisticRegression,92.01,91.89,91.93,0.92


## Best model selection