In [41]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Implementing with keras library


In [72]:
import keras
from keras.models import Sequential
from keras.layers import Dense,Flatten, BatchNormalization,Conv2D,MaxPooling2D,Dropout
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


# Loading train and test datasets

In [73]:

# Read the labelled training images; the trailing expression shows (rows, columns).
train_csv_path = "../input/digit-recognizer/train.csv"
train = pd.read_csv(train_csv_path)
train.shape


In [74]:
# Preview only the first rows instead of rendering the whole frame.
train.head()

In [75]:

# Load the unlabelled test images and preview only the first rows.
test = pd.read_csv("../input/digit-recognizer/test.csv")
test.head()

The datasets contain a total of 70,000 images (42,000 for training and 28,000 for testing), each described by 784 pixel values.
Since we are applying a CNN (Convolutional Neural Network), the input to the model must be a 4D array, so we reshape the pixel values into a 4D array of shape (samples, height, width, channels).
Each image is 28 × 28 pixels with a single channel, since the images are grayscale.

In [76]:
# Split the training frame into pixel features (X) and digit labels (y).
X = train.drop(columns='label')
y = train['label']

# Turn each flat pixel row into a 28x28 single-channel image tensor.
# -1 lets NumPy infer the sample count instead of hard-coding 42000, so the
# cell keeps working if the frame is subsampled or a different file is loaded.
X = X.to_numpy(dtype='float32').reshape((-1, 28, 28, 1))

X.shape


In [78]:
# Display the dimensions of the test data.
test.shape

We then normalize the pixel values to the range 0 to 1, since training is faster when the inputs are normalized.

In [77]:
# Scale pixel intensities from [0, 255] down to [0, 1].
# Guarded so that re-running this cell does not divide by 255 a second time.
if X.max() > 1.0:
    X = X / 255.0

# Show the scaled pixel values of one training image as an annotated heatmap.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(X[9].reshape(28, 28), annot=True, cmap="binary", ax=ax)
ax.set_title("Normalized pixel values of training image 9");

We one-hot encode the class labels using to_categorical from the Keras library.

In [49]:
# One-hot encode the digit labels into 10-element indicator vectors.
digit_class_count = 10
y = to_categorical(y, num_classes=digit_class_count)


In [50]:
# Hold out 30% of the samples for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [51]:
# Display the dimensions of the training split.
X_train.shape

In [None]:
# Display the training split array itself.
X_train

# Model Building

In [52]:
# Four convolutional stages, each laid out as
#   Conv2D -> Conv2D -> BatchNormalization -> MaxPooling2D -> Dropout.
# Every tuple is (filters of first conv, filters of second conv, dropout rate).
# NOTE(review): the third stage narrows from 128 to 64 filters while every other
# stage keeps both convolutions the same width — confirm this is intentional.
conv_stages = [
    (32, 32, 0.1),
    (64, 64, 0.2),
    (128, 64, 0.3),
    (256, 256, 0.4),
]

model = Sequential()
for stage_index, (first_filters, second_filters, drop_rate) in enumerate(conv_stages):
    # Only the very first layer declares the 28x28 single-channel input shape.
    first_conv_kwargs = {'input_shape': (28, 28, 1)} if stage_index == 0 else {}
    model.add(Conv2D(first_filters, kernel_size=(3, 3), padding='same',
                     activation='relu', **first_conv_kwargs))
    model.add(Conv2D(second_filters, kernel_size=(3, 3), padding='same',
                     activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(Dropout(drop_rate))

# Classifier head: flatten the feature maps, one hidden dense layer,
# then a 10-way softmax output.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(10, activation='softmax'))

In [53]:
# Print the model's summary.
model.summary()

Altogether we have used:
Convolution layers = 8
Batch normalization layers = 4
Max pooling layers = 4
Dropout layers = 4
Flatten layer = 1
Dense (fully connected) layers = 2

1. Convolution layers with varying numbers of filters are used to extract different features of the image; each filter is 3 × 3 pixels. padding='same' adds one pixel of zero padding on each side, so with a stride of 1 the output has the same height and width as the input. The ReLU activation function introduces non-linearity into the feature maps by setting negative values to zero, so that stacked layers can learn more than a single linear transformation of the pixels.

2. Batch normalization layers are used to speed up training and to allow higher learning rates. They normalize each channel's activations using the mean and variance computed over the current mini-batch.

3. Max pooling layers reduce the spatial dimensions of the feature maps while keeping the strongest activation in each pooling window. As a feature map becomes smaller, it becomes increasingly independent of the exact location of the feature.

4. Dropout layers randomly set the outputs of a fraction of neurons to zero during training, so those neurons do not contribute to the next layer; the remaining outputs are scaled up so that the expected total is unchanged. They are important in training CNNs because they help prevent overfitting on the training data.

5. The Flatten layer reshapes the feature maps into a one-dimensional vector whose length equals the total number of elements in the feature maps.
For example: (None, 4, 4, 512) ----(flatten)----> (None, 8192). In this model the last pooling stage outputs (None, 1, 1, 256), so Flatten produces (None, 256).

6. A Dense layer is fully connected to its preceding layer: every neuron in the layer receives input from every neuron of the preceding layer.

In [54]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# and categorical accuracy as the reported metric.
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

We used the 'Adam' optimizer since it automatically adapts the learning rate for each parameter during training.
The loss function is categorical cross-entropy, which for each sample is computed as -log(predicted probability of the true class).

In [55]:
# Fit on the training split with the validation split passed alongside;
# verbose=0 is requested, and the value returned by fit() is kept in `training`.
fit_options = dict(
    validation_data=(X_val, y_val),
    batch_size=128,
    epochs=25,
    verbose=0,
)
training = model.fit(X_train, y_train, **fit_options)

Epochs: One epoch is finished when the network has seen the whole dataset.

Batch size: The number of examples used to train the network for a forward and backward pass. 

Example: 

number of training examples: 29400

batch size : 128

number of iterations to complete one epoch = ceil(29400 / 128) = 230

In [56]:
# Score the model on the validation split; the first entry of the result is
# reported as the loss and the second as the accuracy.
loss = model.evaluate(X_val, y_val, verbose=2)
val_loss, val_accuracy = loss[0], loss[1]
print("Val Loss", val_loss)
print("Val Accuracy", val_accuracy)

In [79]:
# Display the dimensions of the test data.
test.shape

In [80]:
# Convert the test frame to image tensors and apply the same /255 scaling that
# was applied to the training images: the model is fit on inputs in [0, 1], so
# feeding raw 0-255 pixels at prediction time would not match its training data.
# -1 infers the row count instead of hard-coding 28000. The isinstance guard
# makes re-running this cell a no-op rather than an AttributeError (or a second
# division) on an already-converted array.
if isinstance(test, pd.DataFrame):
    test = test.to_numpy(dtype='float32').reshape((-1, 28, 28, 1)) / 255.0

In [81]:
# Display the dimensions of the test data.
test.shape

In [85]:
# Predict per-class scores for every test image, then take the index of the
# highest score along the last axis as the predicted digit.
class_scores = model.predict(test, verbose=0)
predictions = np.argmax(class_scores, axis=-1)


In [93]:
# Display the predicted labels.
predictions


In [96]:
# Pair each prediction with a 1-based ImageId and write the table to submission.csv.
image_ids = list(range(1, len(predictions) + 1))
submissions = pd.DataFrame({"ImageId": image_ids, "Label": predictions})
submissions.to_csv("submission.csv", index=False, header=True)