# Deep Network

In [3]:
import tensorflow as tf
import tensorflow.keras as k
import numpy as np

In [6]:
(x_train, y_train), (x_test, y_test) = k.datasets.mnist.load_data()
x = tf.reshape(x_train, [-1, 28*28])
y = tf.one_hot(y_train, 10)
x_t = tf.reshape(x_test, [-1, 28*28])

## Deep & Wide NN

In [30]:
deep = k.Sequential()
deep.add(k.layers.Dense(units = 16, input_dim = 28*28, activation = 'relu'))
deep.add(k.layers.Dense(units = 16, activation = 'relu'))
deep.add(k.layers.Dense(units = 16, activation = 'relu'))
deep.add(k.layers.Dense(units = 16, activation = 'relu'))
deep.add(k.layers.Dense(units = 16, activation = 'relu'))
deep.add(k.layers.Dense(units = 16, activation = 'relu'))
deep.add(k.layers.Dense(units = 10, activation = 'softmax'))
deep.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.SGD(0.001), metrics = ['acc']) # lr > 0.1 didn't work - acc: 11%
deep.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_83 (Dense)             (None, 16)                12560     
_________________________________________________________________
dense_84 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_85 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_86 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_87 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_88 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_89 (Dense)             (None, 10)              

In [31]:
history_deep = deep.fit(x, y, epochs = 15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [32]:
wide = k.Sequential()
wide.add(k.layers.Dense(units = 256, input_dim = 28*28, activation = 'relu'))
wide.add(k.layers.Dense(units = 256, activation = 'relu'))
wide.add(k.layers.Dense(units = 10, activation = 'softmax'))
wide.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.SGD(0.001), metrics = ['acc'])
wide.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_90 (Dense)             (None, 256)               200960    
_________________________________________________________________
dense_91 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_92 (Dense)             (None, 10)                2570      
Total params: 269,322
Trainable params: 269,322
Non-trainable params: 0
_________________________________________________________________


In [39]:
history_wide = wide.fit(x, y, epochs = 10) ## acc > 99% in less then 5 epochs, got worried about overfitting

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [40]:
print('deep: {}% accuracy'.format(np.mean(deep.predict_classes(x_t) == y_test)*100))
print('wide: {}% accuracy'.format(np.mean(wide.predict_classes(x_t) == y_test)*100))

deep: 91.17% accuracy
wide: 95.73% accuracy


## Trial \#1
### 4 layers,  dropout (0, 0.2, 0.3, 0), 512 units, he_normal - 96.3%

In [14]:
He_dr_adam = k.Sequential()
He_dr_adam.add(k.layers.Dense(units = 512, input_dim = 28*28, activation = 'relu', kernel_initializer = 'he_normal'))
He_dr_adam.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'he_normal'))
He_dr_adam.add(k.layers.Dropout(0.2))
He_dr_adam.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'he_normal'))
He_dr_adam.add(k.layers.Dropout(0.3))
He_dr_adam.add(k.layers.Dense(units = 10, activation = 'softmax', kernel_initializer = 'he_normal'))
He_dr_adam.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.Adam(0.001), metrics = ['acc'])
He_dr_adam.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 512)               401920    
_________________________________________________________________
dense_18 (Dense)             (None, 512)               262656    
_________________________________________________________________
dropout_12 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 512)               262656    
_________________________________________________________________
dropout_13 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 10)                5130      
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
________________________________________________

In [15]:
history_He_dr_adam = He_dr_adam.fit(x, y, epochs = 15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [19]:
np.mean(He_dr_adam.predict_classes(x_t) == y_test)

0.9629

## Trial \#2
### 3 layers,  dropout (0.3, 0.3, 0.3), 256 units, he_normal - 96.7%
Turns out shallow layers give fast convergence & high training-accuracy, but low acc on testing.

In [27]:
He_dr_adam2 = k.Sequential()
He_dr_adam2.add(k.layers.Dense(units = 256, input_dim = 28*28, activation = 'relu', kernel_initializer = 'he_normal'))
He_dr_adam.add(k.layers.Dropout(0.3))
He_dr_adam2.add(k.layers.Dense(units = 256, activation = 'relu', kernel_initializer = 'he_normal'))
He_dr_adam.add(k.layers.Dropout(0.3))
He_dr_adam2.add(k.layers.Dense(units = 10, activation = 'softmax', kernel_initializer = 'he_normal'))
He_dr_adam.add(k.layers.Dropout(0.3))
He_dr_adam2.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.Adam(0.001), metrics = ['acc'])
He_dr_adam2.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 256)               200960    
_________________________________________________________________
dense_28 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_29 (Dense)             (None, 10)                2570      
Total params: 269,322
Trainable params: 269,322
Non-trainable params: 0
_________________________________________________________________


In [28]:
He_dr_adam2.fit(x, y, epochs = 15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f7370b13810>

In [30]:
np.mean(He_dr_adam2.predict_classes(x_t) == y_test)

96.7

## Trial \#3
### 5 layers,  dropout (0.3, 0.3, 0.3, 0.3, 0), 512 units, glorot_normal, 100 batch - 97.2%
Dropout at output layer might lead losses to blow up.

In [13]:
He_dr_adam3 = k.Sequential()
He_dr_adam3.add(k.layers.Dense(units = 512, input_dim = 28*28, activation = 'relu', kernel_initializer = 'glorot_normal' ))
He_dr_adam3.add(k.layers.Dropout(0.3))
He_dr_adam3.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'glorot_normal' ))
He_dr_adam3.add(k.layers.Dropout(0.3))
He_dr_adam3.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'glorot_normal' ))
He_dr_adam3.add(k.layers.Dropout(0.3))
He_dr_adam3.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'glorot_normal' ))
He_dr_adam3.add(k.layers.Dropout(0.3))
He_dr_adam3.add(k.layers.Dense(units = 10, activation = 'softmax', kernel_initializer = 'glorot_normal' ))
# He_dr_adam3.add(k.layers.Dropout(0.3))
He_dr_adam3.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.Adam(0.001), metrics = ['acc'])
He_dr_adam3.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 512)               401920    
_________________________________________________________________
dropout_15 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 512)               262656    
_________________________________________________________________
dropout_16 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 512)               262656    
_________________________________________________________________
dropout_17 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 512)              

In [14]:
He_dr_adam3.fit(x, y, epochs = 15, batch_size = 100)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f9e380bbfd0>

In [15]:
np.mean(He_dr_adam3.predict_classes(x_t) == y_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


0.9717

## Trial \#4
### 5 layers,  dropout (0.3, 0.3, 0.3, 0.3, 0), 512 units, he_normal, 25 batch - 96%
initializer?

In [30]:
He_dr_adam5 = k.Sequential()
He_dr_adam5.add(k.layers.Dense(units = 512, input_dim = 28*28, activation = 'relu', kernel_initializer = 'he_normal' ))
He_dr_adam5.add(k.layers.Dropout(0.3))
He_dr_adam5.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'he_normal' ))
He_dr_adam5.add(k.layers.Dropout(0.3))
He_dr_adam5.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'he_normal' ))
He_dr_adam5.add(k.layers.Dropout(0.3))
He_dr_adam5.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'he_normal' ))
He_dr_adam5.add(k.layers.Dropout(0.3))
He_dr_adam5.add(k.layers.Dense(units = 10, activation = 'softmax', kernel_initializer = 'he_normal' ))
He_dr_adam5.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.Adam(0.001), metrics = ['acc'])
He_dr_adam5.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_57 (Dense)             (None, 512)               401920    
_________________________________________________________________
dropout_47 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_58 (Dense)             (None, 512)               262656    
_________________________________________________________________
dropout_48 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_59 (Dense)             (None, 512)               262656    
_________________________________________________________________
dropout_49 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_60 (Dense)             (None, 512)             

In [31]:
He_dr_adam5.fit(x, y, epochs = 15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
 254/1875 [===>..........................] - ETA: 2:31 - loss: 0.3063 - acc: 0.9305

KeyboardInterrupt: 

In [32]:
np.mean(He_dr_adam5.predict_classes(x_t) == y_test)

0.9598

In [34]:
try1 = k.Sequential()
try1.add(k.layers.Dense(units = 256, input_dim = 28*28, activation = 'relu'))
try1.add(k.layers.Dense(units = 256, activation = 'relu'))
try1.add(k.layers.Dense(units = 10, activation = 'softmax'))
try1.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.SGD(0.001), metrics = ['acc'])

In [36]:
for i in range(15):
    try1.fit(x, y, epochs = 1)
    print('test:',np.mean(try1.predict_classes(x_t) == y_test)*100,'%')

test: 93.89 %
test: 94.54 %
test: 94.81 %
test: 95.08 %
test: 95.32000000000001 %

KeyboardInterrupt: 

In [39]:
try2 = k.Sequential()
try2.add(k.layers.Dense(units = 512, input_dim = 28*28, activation = 'relu', kernel_initializer = 'he_normal' ))
try2.add(k.layers.Dropout(0.3))
try2.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'he_normal' ))
try2.add(k.layers.Dropout(0.3))
try2.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'he_normal' ))
try2.add(k.layers.Dropout(0.3))
try2.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'he_normal' ))
try2.add(k.layers.Dropout(0.3))
try2.add(k.layers.Dense(units = 10, activation = 'softmax', kernel_initializer = 'he_normal' ))
try2.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.Adam(0.001), metrics = ['acc'])

In [41]:
for i in range(15):
    try2.fit(x, y, epochs = 1, batch_size = 100)
    print('test:',np.mean(try2.predict_classes(x_t) == y_test)*100,'%')

test: 96.89 %
test: 97.16 %
test: 97.14 %
test: 96.67999999999999 %
test: 97.43 %
test: 97.39999999999999 %
test: 97.43 %
test: 97.23 %
test: 97.28999999999999 %
test: 97.41 %
test: 97.28999999999999 %
 10/600 [..............................] - ETA: 1:28 - loss: 0.1181 - acc: 0.9720

KeyboardInterrupt: 

In [16]:
try3 = k.Sequential()
try3.add(k.layers.Dense(units = 512, input_dim = 28*28, activation = 'relu', kernel_initializer = 'glorot_normal' ))
try3.add(k.layers.Dropout(0.2))
try3.add(k.layers.Dense(units = 512, activation = 'relu', kernel_initializer = 'glorot_normal' ))
try3.add(k.layers.Dropout(0.2))
try3.add(k.layers.Dense(units = 10, activation = 'softmax', kernel_initializer = 'glorot_normal' ))
try3.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.Adam(0.001), metrics = ['acc'])

In [17]:
for i in range(15):
    print('='*10, 'step', i+1, '='*10)
    try3.fit(x, y, epochs = 1, batch_size = 10)
    print('test:',np.mean(try3.predict_classes(x_t) == y_test)*100,'%')

test: 89.61 %
test: 92.36999999999999 %
test: 92.92 %
test: 93.08 %
test: 91.23 %
test: 92.32000000000001 %
test: 92.29 %
test: 90.92 %
test: 90.84 %
 858/6000 [===>..........................] - ETA: 2:17 - loss: 0.5104 - acc: 0.8858

KeyboardInterrupt: 

In [14]:
try4 = k.Sequential()
try4.add(k.layers.Dense(units = 256, input_dim = 28*28, activation = 'relu', kernel_initializer = 'glorot_normal', bias_initializer = 'glorot_normal'))
try4.add(k.layers.Dropout(0.3))
try4.add(k.layers.Dense(units = 256, activation = 'relu', kernel_initializer = 'glorot_normal', bias_initializer = 'glorot_normal' ))
try4.add(k.layers.Dropout(0.3))
try4.add(k.layers.Dense(units = 256, activation = 'relu', kernel_initializer = 'glorot_normal' , bias_initializer = 'glorot_normal'))
try4.add(k.layers.Dropout(0.3))
try4.add(k.layers.Dense(units = 256, activation = 'relu', kernel_initializer = 'glorot_normal' , bias_initializer = 'glorot_normal'))
try4.add(k.layers.Dropout(0.3))
try4.add(k.layers.Dense(units = 10, activation = 'softmax', kernel_initializer = 'glorot_normal' , bias_initializer = 'glorot_normal'))
try4.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.Adam(0.001), metrics = ['acc'])

In [13]:
for i in range(15):
    print('='*10, 'step', i+1, '='*10)
    try4.fit(x, y, epochs = 1, batch_size = 100)
    print('test:',np.mean(try4.predict_classes(x_t) == y_test)*100,'%')



KeyboardInterrupt: 

In [4]:
learning_rate = 0.001
batch_size = 100
training_epochs = 15
nb_classes = 10
drop_rate = 0.3
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
print(x_train.shape)

x_train = x_train.reshape(x_train.shape[0], 28 * 28)
x_test = x_test.reshape(x_test.shape[0], 28 * 28)

y_train = tf.keras.utils.to_categorical(y_train, nb_classes)
y_test = tf.keras.utils.to_categorical(y_test, nb_classes)

tf.model = tf.keras.Sequential()
# Glorot normal initializer, also called Xavier normal initializer.
# see https://www.tensorflow.org/api_docs/python/tf/initializers

tf.model.add(tf.keras.layers.Dense(input_dim=784, units=512, kernel_initializer='glorot_normal', activation='relu'))
tf.model.add(tf.keras.layers.Dropout(drop_rate))
tf.model.add(tf.keras.layers.Dense(units=512, kernel_initializer='glorot_normal', activation='relu'))
tf.model.add(tf.keras.layers.Dropout(drop_rate))
tf.model.add(tf.keras.layers.Dense(units=512, kernel_initializer='glorot_normal', activation='relu'))
tf.model.add(tf.keras.layers.Dropout(drop_rate))
tf.model.add(tf.keras.layers.Dense(units=512, kernel_initializer='glorot_normal', activation='relu'))
tf.model.add(tf.keras.layers.Dropout(drop_rate))
tf.model.add(tf.keras.layers.Dense(units=nb_classes, kernel_initializer='glorot_normal', activation='softmax'))
tf.model.compile(loss='categorical_crossentropy',
                 optimizer=tf.keras.optimizers.Adam(lr=learning_rate), metrics=['accuracy'])
tf.model.summary()

history = tf.model.fit(x_train, y_train, batch_size=batch_size, epochs=training_epochs)

# predict 10 random hand-writing data
y_predicted = tf.model.predict(x_test)
for x in range(0, 10):
    random_index = random.randint(0, x_test.shape[0]-1)
    print("index: ", random_index,
          "actual y: ", np.argmax(y_test[random_index]),
          "predicted y: ", np.argmax(y_predicted[random_index]))

# evaluate test set
evaluation = tf.model.evaluate(x_test, y_test)
print('loss: ', evaluation[0])
print('accuracy', evaluation[1])

(60000, 28, 28)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               401920    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)

NameError: name 'random' is not defined

In [7]:
np.mean(tf.model.predict_classes(x_t) == y_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


0.9715

In [8]:
final = k.Sequential()
final.add(k.layers.Dense(input_dim = 784, units = 128, activation = 'relu'))
final.add(k.layers.Dropout(0.2))
final.add(k.layers.Dense(units = 10, activation = 'softmax'))
final.compile(loss = 'categorical_crossentropy', optimizer = k.optimizers.Adam(0.001), metrics = ['acc'])

In [9]:
final.fit(x / 255, y, epochs = 15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15

_NotOkStatusException: InvalidArgumentError: Error while reading CompositeTensor._type_spec.

In [10]:
np.mean(final.predict_classes(x_t / 255) == y_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


0.9811

to do :
* reproduce wide -> done in try1
* try 100 batch -> might matter
* try SGD -> much more slower

## Result :
- Input normalization & standardization DO MATTER
- Simpler models might converge well
- Batch size might matter : but why bigger=better?? contrary to studies. (10 < 25 < 100)