In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras 
from tensorflow.keras import layers
import tensorflow as tf
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
y = train['label']
train = train.drop(['label'],axis =1)
print(train)

       pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0           0       0       0       0       0       0       0       0       0   
1           0       0       0       0       0       0       0       0       0   
2           0       0       0       0       0       0       0       0       0   
3           0       0       0       0       0       0       0       0       0   
4           0       0       0       0       0       0       0       0       0   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
41995       0       0       0       0       0       0       0       0       0   
41996       0       0       0       0       0       0       0       0       0   
41997       0       0       0       0       0       0       0       0       0   
41998       0       0       0       0       0       0       0       0       0   
41999       0       0       0       0       0       0       0       0       0   

       pixel9  ...  pixel77

In [6]:
X = tf.convert_to_tensor(train)
y = tf.convert_to_tensor(y)
Xtest =tf.convert_to_tensor(test)
print(X)
print(y)

tf.Tensor(
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]], shape=(42000, 784), dtype=int64)
tf.Tensor([1 0 1 ... 7 6 9], shape=(42000,), dtype=int64)


In [7]:
unique, counts = np.unique(np.array(y), return_counts=True)#understand distribution of y_train so as to choose metric for classification
#As the distribution of y_train in this dataset is somewhat uniform we can use accuarcy as evaluation metric 
print (np.asarray((unique, counts)).T)

[[   0 4132]
 [   1 4684]
 [   2 4177]
 [   3 4351]
 [   4 4072]
 [   5 3795]
 [   6 4137]
 [   7 4401]
 [   8 4063]
 [   9 4188]]


In [18]:
X = tf.cast(X,float)
shape = tf.shape( X ) # get dynamic tensor shape
X = X/255.0
print(X)
Xtest = tf.cast(Xtest,float)
Xtest = X_test/255.0

tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(42000, 784), dtype=float32)


In [19]:
X_train, X_cv, y_train, y_cv = train_test_split(X.numpy(), y.numpy(), test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.30, random_state=42)
X_train, X_cv= tf.constant(X_train), tf.constant(X_cv)
y_train, y_cv = tf.constant(y_train), tf.constant(y_cv)
X_test, y_test = tf.constant(X_test), tf.constant(y_test)
print(X_cv.shape)
print(X_train.shape)
print(X_test.shape)

(8400, 784)
(23520, 784)
(10080, 784)


In [20]:
lr_schedule = keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=1e-2,decay_steps=100000,decay_rate=0.8)

In [17]:
initializer1 = tf.keras.initializers.GlorotNormal()#Glorot Normal initializer/Xavier initialization works better for sigmoid layers
initializer2 = tf.keras.initializers.HeNormal()#He initalizer works better for Relu layers
#In general if we add more sigmoid layers to our neural network then its accuracy decreases as number of sigmoid layers increase
#Instead use relu layers or tanh activations
#Create a deep neural network using keras library 
# We can use dropout and Batch Normalization also
#optimiser adam is best as compared to gradient descent or rms-prop 
dropouts = [0.09,0.1,0.15,0.18]
for d in dropouts:
    print("Dropout: ",d)
    model = tf.keras.Sequential([
    keras.layers.InputLayer(784),
    layers.Dense(512,activation ="relu" ,name ="first_hidden_layer" , kernel_initializer = initializer2),
    layers.BatchNormalization(),
    layers.Dropout(d),
    layers.Dense(200,activation ="relu" ,name ="second_hidden_layer" , kernel_initializer = initializer2),
    layers.BatchNormalization(),
    layers.Dropout(d),
    layers.Dense(128,activation ="relu" ,name ="third_hidden_layer" , kernel_initializer = initializer2),
    layers.BatchNormalization(),
    layers.Dropout(d),
    layers.Dense(10,activation ="softmax" ,name ="output_layer" , kernel_initializer = initializer1)])
    model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(), 
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"])
    model.fit(X_cv, y_cv, batch_size=32, epochs=10, verbose=1)

Dropout:  0.09
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Dropout:  0.1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Dropout:  0.15
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Dropout:  0.18
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
d =0.1 
model = tf.keras.Sequential([
    keras.layers.InputLayer(784),
    layers.Dense(512,activation ="relu" ,name ="first_hidden_layer" , kernel_initializer = initializer2),
    layers.BatchNormalization(),
    layers.Dropout(d),
    layers.Dense(256,activation ="relu" ,name ="second_hidden_layer" , kernel_initializer = initializer2),
    layers.BatchNormalization(),
    layers.Dropout(d),
    layers.Dense(64,activation ="relu" ,name ="third_hidden_layer" , kernel_initializer = initializer2),
    layers.BatchNormalization(),
    layers.Dropout(d),
    layers.Dense(10,activation ="softmax" ,name ="output_layer" , kernel_initializer = initializer1)])

In [22]:
model.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 first_hidden_layer (Dense)  (None, 512)               401920    
                                                                 
 batch_normalization_54 (Bat  (None, 512)              2048      
 chNormalization)                                                
                                                                 
 dropout_54 (Dropout)        (None, 512)               0         
                                                                 
 second_hidden_layer (Dense)  (None, 256)              131328    
                                                                 
 batch_normalization_55 (Bat  (None, 256)              1024      
 chNormalization)                                                
                                                                 
 dropout_55 (Dropout)        (None, 256)             

In [23]:
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(), 
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    metrics=["accuracy"])

In [24]:
model.fit(X, y, batch_size=32, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x267260a9370>

In [25]:
model.evaluate(X_test, y_test, batch_size=32, verbose=1)



[0.0032670360524207354, 0.9991071224212646]