In [None]:
import pandas as pd
import numpy as np

np.random.seed(1212)

import keras #Keras is a high-level, deep learning API developed by Google for implementing neural networks.
from keras.models import Model #a model is a function with learnable parameters that maps an input to an output.
from keras.layers import * #define the architecture and functionality of neural network models
from keras import optimizers  #change the attributes of machine/deep learning model such as weights and learning rate in order to reduce the losses.

In [None]:
from google.colab import files


uploaded = files.upload()

Saving test.csv to test.csv


In [None]:
from google.colab import files


uploaded = files.upload()

Saving train.csv to train.csv


In [None]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [None]:
df_train.head() # 784 features, 1 label

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_features = df_train.iloc[:, 1:785] #contain the training data without the target (label) column.
df_label = df_train.iloc[:, 0] #contain the target (label) column

X_test = df_test.iloc[:, 0:784] # selects all rows and column

print(X_test.shape) #get number of rows and columns

(28000, 784)


In [None]:
from sklearn.model_selection import train_test_split                     #split data into training and cross-validation (cv) sets
X_train, X_cv, y_train, y_cv = train_test_split(df_features, df_label,
                                                test_size = 0.2,         #20% data will used for cross validation
                                                random_state = 1212)     # ensures reproducibility of the results.

#x_train - 80% df_features, x_cv - 20% df_feature,
X_train = X_train.values.reshape(33600, 784) #(33600, 784)
X_cv = X_cv.values.reshape(8400, 784) #(8400, 784)

X_test = X_test.values.reshape(28000, 784)  #2-dimensional input data

In [None]:
print((min(X_train[1]), max(X_train[1])))
#original pixel values range from 0 to 255 ( 8-bit grayscale images)

In [None]:
# Feature Normalization
X_train = X_train.astype('float32'); X_cv= X_cv.astype('float32'); X_test = X_test.astype('float32')
X_train /= 255; X_cv /= 255; X_test /= 255   #dividing by 255 scales the values to a range between 0 and 1.

# Convert labels to One Hot Encoded
num_digits = 10
y_train = keras.utils.to_categorical(y_train, num_digits)  # convert categorical labels into a binary representation
y_cv = keras.utils.to_categorical(y_cv, num_digits)

In [None]:
# Printing 2 examples of labels after conversion
print(y_train[0]) # 2
print(y_train[3]) # 7

[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]


In [None]:
# Input Parameters
n_input = 784 # number of features in the input data - match the number of columns in X_train, X_cv, x_test
n_hidden_1 = 300  #300 neurons in the first hidden layer
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 200
num_digits = 10   #categories in the classification task  (0 to 9)

In [None]:
Inp = Input(shape=(784,))   #the batch size will be determined during the model's training.
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x) #'softmax' activation function is used in the output layer for multi-class classification to produce class probability, num_digits = 10(digits 0 -9)


In [None]:
# Our model would have '6' layers - input layer, 4 hidden layer and 1 output layer
model = Model(Inp, output)
model.summary() # We have 297,910 parameters to estimate. Parameters in a neural network are the weights and biases of the neurons in each layer.

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 784)]             0         
                                                                 
 Hidden_Layer_1 (Dense)      (None, 300)               235500    
                                                                 
 Hidden_Layer_2 (Dense)      (None, 100)               30100     
                                                                 
 Hidden_Layer_3 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_Layer_4 (Dense)      (None, 200)               20200     
                                                                 
 Output_Layer (Dense)        (None, 10)                2010      
                                                                 
Total params: 297,910
Trainable params: 297,910
Non-trainable

In [None]:
# Insert Hyperparameters
learning_rate = 0.1
training_epochs = 20 # Each epoch consists of one forward pass (prediction) and one backward pass (weight update) for all the training samples.
batch_size = 100     # Number of samples that are processed together in parallel
sgd = optimizers.SGD(lr=learning_rate)  #SGD - Stochastic Gradient Descent, updates the model's weights based on the gradients of the loss function with respect to the weights.

  super().__init__(name, **kwargs)


In [None]:
# We rely on the plain vanilla Stochastic Gradient Descent as our optimizing methodology
model.compile(loss='categorical_crossentropy', #Categorical Crossentropy is a common loss function used for multi-class classification problem
              optimizer='sgd',
              metrics=['accuracy'])  #accuracy metric measures the percentage of correctly predicted samples out of the total samples during training

In [None]:
history1 = model.fit(X_train, y_train,
                     batch_size = batch_size,
                     epochs = training_epochs,
                     verbose = 2,
                     validation_data=(X_cv, y_cv))

#X_train represents the training features (input data), and y_train represents the corresponding one-hot encoded labels (target values).
#batch size is number of samples that will be processed in each update of the model's weights during a single training iteration.
#The training_epochs hyperparameter determines the number of times the entire training dataset (X_train and y_train) will be passed through the neural network during the training process.
#A value of 2 means that progress will be displayed for each epoch, showing the training loss and metrics.
# the model will be evaluated on this data after each epoch to monitor its performance on unseen data and detect overfitting.

Epoch 1/20
336/336 - 4s - loss: 1.8235 - accuracy: 0.4861 - val_loss: 0.9412 - val_accuracy: 0.7963 - 4s/epoch - 11ms/step
Epoch 2/20
336/336 - 3s - loss: 0.5939 - accuracy: 0.8494 - val_loss: 0.4369 - val_accuracy: 0.8793 - 3s/epoch - 8ms/step
Epoch 3/20
336/336 - 2s - loss: 0.3876 - accuracy: 0.8887 - val_loss: 0.3506 - val_accuracy: 0.9011 - 2s/epoch - 7ms/step
Epoch 4/20
336/336 - 5s - loss: 0.3260 - accuracy: 0.9044 - val_loss: 0.3119 - val_accuracy: 0.9118 - 5s/epoch - 14ms/step
Epoch 5/20
336/336 - 3s - loss: 0.2908 - accuracy: 0.9141 - val_loss: 0.2867 - val_accuracy: 0.9152 - 3s/epoch - 8ms/step
Epoch 6/20
336/336 - 2s - loss: 0.2636 - accuracy: 0.9233 - val_loss: 0.2590 - val_accuracy: 0.9258 - 2s/epoch - 7ms/step
Epoch 7/20
336/336 - 2s - loss: 0.2438 - accuracy: 0.9288 - val_loss: 0.2443 - val_accuracy: 0.9280 - 2s/epoch - 7ms/step
Epoch 8/20
336/336 - 3s - loss: 0.2251 - accuracy: 0.9346 - val_loss: 0.2357 - val_accuracy: 0.9286 - 3s/epoch - 9ms/step
Epoch 9/20
336/336 - 3

In [None]:
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x)

# We rely on ADAM as our optimizing methodology instead of Stochastic Gradient Descent (SGD)
adam = keras.optimizers.Adam(lr=learning_rate)
model2 = Model(Inp, output)

model2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

  super().__init__(name, **kwargs)


In [None]:
history2 = model2.fit(X_train, y_train,
                      batch_size = batch_size,
                      epochs = training_epochs,
                      verbose = 2,
                      validation_data=(X_cv, y_cv))

Epoch 1/20
336/336 - 5s - loss: 0.3398 - accuracy: 0.8975 - val_loss: 0.1877 - val_accuracy: 0.9442 - 5s/epoch - 15ms/step
Epoch 2/20
336/336 - 3s - loss: 0.1234 - accuracy: 0.9616 - val_loss: 0.1089 - val_accuracy: 0.9654 - 3s/epoch - 9ms/step
Epoch 3/20
336/336 - 3s - loss: 0.0781 - accuracy: 0.9758 - val_loss: 0.1073 - val_accuracy: 0.9658 - 3s/epoch - 9ms/step
Epoch 4/20
336/336 - 3s - loss: 0.0541 - accuracy: 0.9829 - val_loss: 0.1057 - val_accuracy: 0.9683 - 3s/epoch - 9ms/step
Epoch 5/20
336/336 - 4s - loss: 0.0450 - accuracy: 0.9863 - val_loss: 0.0924 - val_accuracy: 0.9755 - 4s/epoch - 12ms/step
Epoch 6/20
336/336 - 4s - loss: 0.0390 - accuracy: 0.9872 - val_loss: 0.0974 - val_accuracy: 0.9732 - 4s/epoch - 12ms/step
Epoch 7/20
336/336 - 4s - loss: 0.0307 - accuracy: 0.9896 - val_loss: 0.0991 - val_accuracy: 0.9731 - 4s/epoch - 13ms/step
Epoch 8/20
336/336 - 4s - loss: 0.0240 - accuracy: 0.9924 - val_loss: 0.1060 - val_accuracy: 0.9726 - 4s/epoch - 12ms/step
Epoch 9/20
336/336 

In [None]:
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x)

learning_rate = 0.01  #made changes on the learning rate
adam = keras.optimizers.Adam(lr=learning_rate)
model2a = Model(Inp, output)

model2a.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
history2a = model2a.fit(X_train, y_train,
                        batch_size = batch_size,
                        epochs = training_epochs,
                        verbose = 2,
                        validation_data=(X_cv, y_cv))

Epoch 1/20
336/336 - 4s - loss: 0.3320 - accuracy: 0.9007 - val_loss: 0.1699 - val_accuracy: 0.9494 - 4s/epoch - 13ms/step
Epoch 2/20
336/336 - 3s - loss: 0.1195 - accuracy: 0.9639 - val_loss: 0.1090 - val_accuracy: 0.9643 - 3s/epoch - 10ms/step
Epoch 3/20
336/336 - 3s - loss: 0.0802 - accuracy: 0.9744 - val_loss: 0.1094 - val_accuracy: 0.9668 - 3s/epoch - 9ms/step
Epoch 4/20
336/336 - 3s - loss: 0.0571 - accuracy: 0.9817 - val_loss: 0.1020 - val_accuracy: 0.9702 - 3s/epoch - 8ms/step
Epoch 5/20
336/336 - 3s - loss: 0.0450 - accuracy: 0.9850 - val_loss: 0.1021 - val_accuracy: 0.9710 - 3s/epoch - 10ms/step
Epoch 6/20
336/336 - 3s - loss: 0.0349 - accuracy: 0.9885 - val_loss: 0.1350 - val_accuracy: 0.9646 - 3s/epoch - 10ms/step
Epoch 7/20
336/336 - 3s - loss: 0.0284 - accuracy: 0.9907 - val_loss: 0.0892 - val_accuracy: 0.9758 - 3s/epoch - 9ms/step
Epoch 8/20
336/336 - 3s - loss: 0.0266 - accuracy: 0.9919 - val_loss: 0.0942 - val_accuracy: 0.9749 - 3s/epoch - 9ms/step
Epoch 9/20
336/336 -

In [None]:
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x)

learning_rate = 0.5 #increase the learning rate
adam = keras.optimizers.Adam(lr=learning_rate)
model2b = Model(Inp, output)

model2b.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
history2b = model2b.fit(X_train, y_train,
                        batch_size = batch_size,
                        epochs = training_epochs,
                            validation_data=(X_cv, y_cv))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Input Parameters - increase the hidden layer
n_input = 784 # number of features
n_hidden_1 = 300
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 100
n_hidden_5 = 200
num_digits = 10

In [None]:
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
x = Dense(n_hidden_5, activation='relu', name = "Hidden_Layer_5")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x)

In [None]:
# Our model would have '7' layers - input layer, 5 hidden layer and 1 output layer
model3 = Model(Inp, output)
model3.summary() # We have 308,010 parameters to estimate

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 784)]             0         
                                                                 
 Hidden_Layer_1 (Dense)      (None, 300)               235500    
                                                                 
 Hidden_Layer_2 (Dense)      (None, 100)               30100     
                                                                 
 Hidden_Layer_3 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_Layer_4 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_Layer_5 (Dense)      (None, 200)               20200     
                                                                 
 Output_Layer (Dense)        (None, 10)                2010

In [None]:
# We rely on 'Adam' as our optimizing methodology
adam = keras.optimizers.Adam(lr=0.01)

model3.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
history3 = model3.fit(X_train, y_train,
                      batch_size = batch_size,
                      epochs = training_epochs,
                      validation_data=(X_cv, y_cv))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Input Parameters
n_input = 784 # number of features
n_hidden_1 = 300
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 200
num_digits = 10

In [None]:
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dropout(0.3)(x)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dropout(0.3)(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dropout(0.3)(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x)

In [None]:
# Our model would have '6' layers - input layer, 4 hidden layer and 1 output layer
model4 = Model(Inp, output)
model4.summary() # We have 297,910 parameters to estimate

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 784)]             0         
                                                                 
 Hidden_Layer_1 (Dense)      (None, 300)               235500    
                                                                 
 dropout (Dropout)           (None, 300)               0         
                                                                 
 Hidden_Layer_2 (Dense)      (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 Hidden_Layer_3 (Dense)      (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0   

In [None]:
model4.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
history = model4.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = training_epochs,
                    validation_data=(X_cv, y_cv))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
test_pred = pd.DataFrame(model4.predict(X_test, batch_size=200)) #takes the test data as input and returns an array of predicted probabilities for each class (digit) for every test image.
test_pred = pd.DataFrame(test_pred.idxmax(axis = 1))  #predicted labels (class indices) for the test data.
test_pred.index.name = 'ImageId'  #'ImageId' represents the image number (starting from 1) and 'Label' represents the predicted digit (class) for each test image.
test_pred = test_pred.rename(columns = {0: 'Label'}).reset_index()
test_pred['ImageId'] = test_pred['ImageId'] + 1

test_pred.head()



Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


In [None]:
test_pred.to_csv('mnist_submission.csv', index = False)

In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('mnist_submission.csv')

In [None]:
df

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3
...,...,...
27995,27996,9
27996,27997,7
27997,27998,3
27998,27999,9


