<a href="https://colab.research.google.com/github/yashveersinghsohi/Hands_On_ML_Book_Practice/blob/master/Chapter_11/Chapter11_Training_Deep_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vanishing/Exploding Gradients

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
fashion_mnist = tf.keras.datasets.fashion_mnist
(X_train_val, y_train_val), (X_test, y_test) = fashion_mnist.load_data()
X_train, y_train = X_train_val[:50000], y_train_val[:50000]
X_val, y_val = X_train_val[50000:], y_train_val[50000:]
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((50000, 28, 28),
 (50000,),
 (10000, 28, 28),
 (10000,),
 (10000, 28, 28),
 (10000,))

In [None]:
tf.random.set_seed(42)

model = tf.keras.models.Sequential([
  tf.keras.layers.InputLayer(shape=[28, 28]),
  tf.keras.layers.Flatten(),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(10, activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1)
loss = 'sparse_categorical_crossentropy'
metrics = ['accuracy']

model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

model.summary()

In [None]:
[(var.name, var.trainable) for var in model.layers[1].variables]

[('gamma', True),
 ('beta', True),
 ('moving_mean', False),
 ('moving_variance', False)]

In [None]:
callbacks = [
  tf.keras.callbacks.EarlyStopping(patience=5),
  tf.keras.callbacks.ModelCheckpoint('model.keras', save_best_only=True)
]
history = model.fit(
  X_train, y_train,
  epochs=20,
  callbacks=callbacks,
  validation_data=(X_val, y_val)
)

Epoch 1/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.7964 - loss: 0.5820 - val_accuracy: 0.8653 - val_loss: 0.3668
Epoch 2/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - accuracy: 0.8697 - loss: 0.3602 - val_accuracy: 0.8745 - val_loss: 0.3445
Epoch 3/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.8894 - loss: 0.3058 - val_accuracy: 0.8759 - val_loss: 0.3499
Epoch 4/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - accuracy: 0.9044 - loss: 0.2637 - val_accuracy: 0.8767 - val_loss: 0.3559
Epoch 5/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.9177 - loss: 0.2262 - val_accuracy: 0.8818 - val_loss: 0.3682
Epoch 6/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.9318 - loss: 0.1915 - val_accuracy: 0.8815 - val_loss: 0.3876
Epoch 7/20

In [None]:
model.evaluate(X_test, y_test, verbose=2, return_dict=True)

313/313 - 1s - 3ms/step - accuracy: 0.8732 - loss: 0.4807


{'accuracy': 0.873199999332428, 'loss': 0.4807271957397461}

# Transfer Learning

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
fashion_mnist = tf.keras.datasets.fashion_mnist
(X_train_val, y_train_val), (X_test, y_test) = fashion_mnist.load_data()
X_train, y_train = X_train_val[:50000], y_train_val[:50000]
X_val, y_val = X_train_val[50000:], y_train_val[50000:]
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((50000, 28, 28),
 (50000,),
 (10000, 28, 28),
 (10000,),
 (10000, 28, 28),
 (10000,))

In [3]:
y_train_8 = (y_train == 8).astype(np.int32)
y_val_8 = (y_val == 8).astype(np.int32)
y_test_8 = (y_test == 8).astype(np.int32)

In [4]:
tf.random.set_seed(42)

model_A = tf.keras.models.Sequential([
  tf.keras.layers.InputLayer(shape=[28, 28]),
  tf.keras.layers.Flatten(),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(10, activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1)
loss = 'sparse_categorical_crossentropy'
metrics = ['accuracy']

model_A.compile(loss=loss, optimizer=optimizer, metrics=metrics)

callbacks = [
  tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
  tf.keras.callbacks.ModelCheckpoint('model_A.keras', save_best_only=True)
]
history_A = model_A.fit(
  X_train, y_train,
  epochs=20,
  callbacks=callbacks,
  validation_data=(X_val, y_val)
)
model_A.evaluate(X_test, y_test, verbose=2, return_dict=True)

Epoch 1/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 9ms/step - accuracy: 0.7962 - loss: 0.5773 - val_accuracy: 0.8655 - val_loss: 0.3632
Epoch 2/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 8ms/step - accuracy: 0.8680 - loss: 0.3610 - val_accuracy: 0.8724 - val_loss: 0.3388
Epoch 3/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.8886 - loss: 0.3070 - val_accuracy: 0.8774 - val_loss: 0.3420
Epoch 4/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.9034 - loss: 0.2651 - val_accuracy: 0.8785 - val_loss: 0.3491
Epoch 5/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.9174 - loss: 0.2303 - val_accuracy: 0.8762 - val_loss: 0.3730
Epoch 6/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - accuracy: 0.9293 - loss: 0.1976 - val_accuracy: 0.8764 - val_loss: 0.3943
Epoch 7/20

{'accuracy': 0.8677999973297119, 'loss': 0.36821138858795166}

In [6]:
model_A.export('model_A_full')

Saved artifact at 'model_A_full'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 28, 28), dtype=tf.float32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 10), dtype=tf.float32, name=None)
Captures:
  134491863721664: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491863724832: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491863722896: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491863722544: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491863723424: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491863718496: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491863727296: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491864105376: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491863725536: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491863727120: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134491864106432: Tenso

In [14]:
model_A_loaded = tf.keras.models.load_model('model_A.keras')
model_B_on_A = tf.keras.Sequential(model_A_loaded.layers[:-1])
model_B_on_A.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# model_A_clone = tf.keras.model.clone_model(model_A_loaded)
# model_A_clone.set_weights(model_A_loaded.get_weights())

for layer in model_B_on_A.layers[:-1]: layer.trainable = False
model_B_on_A.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1), metrics=['accuracy'])
history_B_1 = model_B_on_A.fit(
  X_train, y_train_8,
  epochs=4,
  validation_data=(X_val, y_val_8)
)
model_B_on_A.evaluate(X_test, y_test_8, verbose=2, return_dict=True)

for layer in model_B_on_A.layers[:-1]: layer.trainable = True
model_B_on_A.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1), metrics=['accuracy'])
callbacks = [
  tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
  tf.keras.callbacks.ModelCheckpoint('model_A.keras', save_best_only=True)
]
history_B_2 = model_B_on_A.fit(
  X_train, y_train_8,
  epochs=16,
  callbacks=callbacks,
  validation_data=(X_val, y_val_8)
)
model_B_on_A.evaluate(X_test, y_test_8, verbose=2, return_dict=True)

Epoch 1/4
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8588 - loss: 0.3430 - val_accuracy: 0.9857 - val_loss: 0.0822
Epoch 2/4
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9894 - loss: 0.0620 - val_accuracy: 0.9898 - val_loss: 0.0460
Epoch 3/4
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9924 - loss: 0.0350 - val_accuracy: 0.9905 - val_loss: 0.0349
Epoch 4/4
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9933 - loss: 0.0255 - val_accuracy: 0.9912 - val_loss: 0.0300
313/313 - 1s - 2ms/step - accuracy: 0.9922 - loss: 0.0301
Epoch 1/16
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 9ms/step - accuracy: 0.9910 - loss: 0.0319 - val_accuracy: 0.9926 - val_loss: 0.0239
Epoch 2/16
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - accuracy: 0.9954 - loss: 0.0153 - 

{'accuracy': 0.992900013923645, 'loss': 0.02817167341709137}