# Handwritten Digit Recognition 

## 1. Project Summary

In this project, we use a handwritten digit dataset to train a neural network classifier and evaluate it with cross-validation.

In [1]:
# Import packages

# 1. Basic packages
import pandas as pd
import numpy as np
import random

# 2. Keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv2D, MaxPooling2D, Reshape
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [2]:
# Load the training data; train.csv is resolved relative to the working directory.
# The functions below expect it to contain a 'label' column plus the feature columns.
dataSet = pd.read_csv("train.csv")

In [3]:
def crossValidation(data,
                    fold = 5,
                    random_state = None):
    
    """
    Summary:
    Shuffles the data, splits it into `fold` parts and, for every part, 
    trains a fresh model on the remaining parts (trainNN) and validates 
    it on the held-out part (validateNN). 
    
    Args:
    data: pandas dataframe, handed in slices to trainNN and validateNN
    fold: int, number of folds (at least 2)
    random_state: optional seed forwarded to DataFrame.sample so that the 
        shuffle, and therefore the folds, can be reproduced. 
        The default (None) keeps the shuffle unseeded. 
    
    Raises:
    ValueError: if fold is smaller than 2, because no rows would be 
        left for training
    """
    
    # Fail early with a readable message instead of an error from 
    # deep inside numpy / pandas
    if fold < 2:
        raise ValueError("fold must be at least 2, got {}".format(fold))
    
    # Shuffle the rows once
    shuffled = data.sample(frac = 1, random_state = random_state)
    
    # Split row positions (not the dataframe itself) into nearly equal folds
    positions = np.array_split(np.arange(len(shuffled)), fold)
    
    # start the cross validation
    for i in range(fold):
        
        # Held-out fold for testing, every other fold (in order) for training
        test = shuffled.iloc[positions[i]]
        trainPositions = np.concatenate(positions[:i] + positions[i + 1:])
        train = shuffled.iloc[trainPositions]
        
        # Training and testing
        model = trainNN(train, 
                        isCNN = 0)
        validateNN(test, model)
        

In [4]:
# Building a function to train a neural network model

def trainNN(trainingSet,
            isCNN = 0,
            loss = 'categorical_crossentropy', 
            optimizer = 'adam', 
            epochs = 5):
    
    """
    Summary:
    This function trains a dense neural network on the given training 
    data and returns the fitted model. 
    
    Args:
    trainingSet: pandas dataframe with a 'label' column (response); 
        every other column is used as a predictor
    isCNN: int, kept for backward compatibility. The predictors are 
        always used as flat rows and the input shape is read from the 
        data, so this flag does not change the network. 
    loss: string, loss function - refer to Keras documentation
    optimizer: string, optimization algorithm - refer to Keras documentation.
        'adam', 'SGD', 'RMSprop', etc.,
    epochs: int, number of training epochs
    
    Returns:
    The compiled and fitted Keras Sequential model
    
    """
    
    # Width of the output layer and of the one-hot encoded labels
    numClasses = 10
    
    # Data preprocessing - Normalize feature
    # Index.difference returns the column names sorted; validateNN selects 
    # its columns the same way, so train and test features line up. 
    features = trainingSet[trainingSet.columns.difference(['label'])]
    # Scale by 255 (assumes 8-bit pixel intensities - TODO confirm)
    X_train = (features / 255).values
    
    # Data preprocessing - Hot encoding
    # Fix the width so it always matches the output layer, even when the 
    # largest label is missing from this training set. 
    y_train = to_categorical(trainingSet['label'], num_classes = numClasses)

    # Input shape per sample, taken from the data (a 1-tuple for flat rows)
    inputShape = X_train.shape[1:]
        
    # Neural Network Structure
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape = inputShape))
    model.add(Dropout(0.1))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(numClasses, activation='softmax'))
    
    # Neural Network Training Specification
    model.compile(loss = loss,
                  optimizer = optimizer,
                  metrics = ['accuracy'])
    
    # Fit the model
    model.fit(X_train, 
              y_train, 
              epochs = epochs,
              verbose = False)
    
    return model

In [5]:
def validateNN(testingSet,
               model):
    
    """
    Summary:
    Evaluates a trained model on held-out data and prints the loss 
    and the accuracy. 
    
    Args:
    testingSet: pandas dataframe with a 'label' column (response); 
        every other column is used as a predictor
    model: trained Keras model, e.g. the one returned by trainNN
    
    Returns:
    The value returned by model.evaluate. Its first two entries are 
    printed as loss and accuracy (assumes the model was compiled with 
    a single accuracy metric, as trainNN does). 
    
    """
    
    # Preprocessing
    # Same column selection and scaling as in trainNN
    features = testingSet[testingSet.columns.difference(['label'])]
    X_test = (features / 255).values
    
    # Hot encoding
    # Take the width from the model output so that a test set which lacks 
    # the largest label still produces a matching target matrix. 
    y_test = to_categorical(testingSet['label'], 
                            num_classes = model.output_shape[-1])
    
    result = model.evaluate(X_test, y_test, verbose = False)
    print("-----------------------------------------")
    print("The loss is ",
          result[0], "\n")
    print("The accuracy is ",
          result[1], "\n")
    print("-----------------------------------------")
    
    # Hand the metrics back so callers can aggregate them across folds
    return result

In [6]:
# Run cross-validation on the loaded data with the default number of folds;
# one loss/accuracy report is printed per fold.
crossValidation(dataSet)


-----------------------------------------
The loss is  0.11363575707588877 

The accuracy is  0.9670237898826599 

-----------------------------------------
-----------------------------------------
The loss is  0.1063876793861744 

The accuracy is  0.9700000286102295 

-----------------------------------------
-----------------------------------------
The loss is  0.09799418065253468 

The accuracy is  0.9694047570228577 

-----------------------------------------
-----------------------------------------
The loss is  0.08287743925604792 

The accuracy is  0.9757142663002014 

-----------------------------------------
-----------------------------------------
The loss is  0.11147530789176623 

The accuracy is  0.9664285778999329 

-----------------------------------------
