# 10.9 Lab: Deep Learning

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow.keras as tk
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.regularizers import l1, l2
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing import sequence
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import decode_predictions
from keras.models import Model

%matplotlib inline


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
""" 
In Python, we use Keras as the deep learning interface, with TensorFlow as the backend.
pip install tensorflow   (this also installs keras; the imports above need both)
"""

## 10.9.1 A Single Layer Network on the Hitters Data

In [None]:
# in this exercise, we will use Hitters data set to predict the salary of a player
# I will skip the linear regression and Lasso part since we covered them in previous chapters

In [None]:
# Read the Hitters data, drop every row that contains an NA value, and
# re-index the remaining observations from 0.
# (This preparation code is copied from an earlier chapter.)
Hitters = pd.read_csv('https://raw.githubusercontent.com/tvanzyl/Sharing_ISL_python/master/data/Hitters.csv', header=0, na_values='NA')
Hitters = Hitters.dropna().reset_index(drop=True)

# One-hot encode the three categorical columns.
# dtype is forced to float64: pandas >= 2.0 returns bool dummies by default, and
# a frame mixing bool and float64 columns converts to an object array, which
# cannot be fed to the network as a numeric tensor.
dummies = pd.get_dummies(Hitters[['League', 'Division', 'NewLeague']], dtype='float64')

y = Hitters.Salary  # the response variable
X_prep = Hitters.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# keep one indicator column per categorical variable
X = pd.concat([X_prep, dummies[['League_A', 'Division_E', 'NewLeague_A']]], axis=1)

# A fixed random_state makes the split (and everything computed from it) reproducible.
# NOTE(review): test_size=0.66 holds out roughly two thirds of the rows for testing;
# confirm this is intended rather than a one-third test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.66, random_state=0)


In [None]:
# the DL model is used much like the other models implemented in sklearn:
# we first define the model, then fit the model, and finally predict the result

"""
I am listing out the hyperparameters to be tuned. 
From this simple example with only one layer, we can get a sense of the number of hyperparameters in a NN.

Every additional layer brings its own hyperparameters, so the size of the search space gets exponentially larger as the number of layers increases.
"""
# Define the model.
# Hyperparameters are collected here so they can be tuned in one place.
dropout_rate = 0.4           # fraction of units dropped by the Dropout layer
first_layout = 50            # number of units in the hidden Dense layer
epochs = 150                 # used by the fit cell below
batch_size = 32              # used by the fit cell below
activation = 'relu'
loss = 'mean_squared_error'
optimizer = 'rmsprop'
metrics = ['mae']

# Architecture: Dropout on the inputs -> Dense(first_layout) -> Dense(1).
# NOTE(review): because Dropout is the first layer, it drops input features rather
# than hidden units; confirm this placement is intended (the more common layout
# puts Dropout after the hidden Dense layer).
model = Sequential()
model.add(Dropout(rate=dropout_rate, input_shape=(X_train.shape[1],)))
model.add(Dense(first_layout, activation=activation))
model.add(Dense(1))  # single linear output for the regression target
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

# model.summary() prints the structure of the model itself and returns None,
# so it is called directly; wrapping it in print() emitted an extra "None" line.
model.summary()



In [None]:
# Fit the model on the training split using the epochs / batch_size chosen above.
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
# summary() prints the table and returns None, so it is not wrapped in print()
# (that printed a stray "None" after the table).
model.summary()

In [None]:
# Score the fitted model on the held-out data.
# model.evaluate returns the loss followed by the compiled metrics, computed in
# test mode; here that is the MSE loss and the MAE metric.
test_scores = model.evaluate(X_test, y_test)
mse_test, mae_test = test_scores
print('Test mse: %.3f, Test mae: %.3f' % (mse_test, mae_test))

## 10.9.2 A Multilayer Network on the MNIST Digit Data

In [None]:
# MNIST ships with keras.datasets, which also offers other well-known data sets
# such as cifar10 and fashion_mnist.
mnist_train, mnist_test = tk.datasets.mnist.load_data()
X_train, y_train = mnist_train
X_test, y_test = mnist_test

In [None]:
# The data set holds images: every observation is a 28*28 matrix of pixel values.
for image_summary in (X_train.shape, np.max(X_train)):
    print(image_summary)

# y_train holds the group (digit) label of each training image.
for label_summary in (y_train.shape, np.unique(y_train)):
    print(label_summary)

(60000, 28, 28)
255
(60000,)
[0 1 2 3 4 5 6 7 8 9]


In [None]:
# Preview the first nine training images on a 3x3 grid.
for position, digit in enumerate(X_train[:9], start=1):
    # position is the 1-based slot inside the 3x3 grid
    plt.subplot(3, 3, position)
    plt.imshow(digit, cmap='gray')
plt.show()

In [None]:
# Flatten each image (a 28x28 matrix) into a vector of 784 float32 values.
n_pixels = 28 * 28
X_train, X_test = (
    split.reshape((split.shape[0], n_pixels)).astype('float32')
    for split in (X_train, X_test)
)
""" 
We know that each pixel has its unique color code and also we know that it has a maximum value of 255. 
To perform Machine Learning, it is important to convert all the values from 0 to 255 for every pixel to 
a range of values from 0 to 1. The simplest way is to divide the value of every pixel by 255 to get the 
values in the range of 0 to 1.
"""
# Rescale the pixel intensities by the maximum value.
X_train, X_test = X_train / 255, X_test / 255

# One-hot encode the digit labels for the softmax output layer.
y_train, y_test = to_categorical(y_train), to_categorical(y_test)

In [None]:
# Two hidden layers (256 and 128 ReLU units), each followed by dropout, and a
# 10-way softmax output; the input is the flattened 28*28 image.
model = Sequential([
    Dense(256, input_dim=28 * 28, activation='relu'),
    Dropout(rate=0.4),
    Dense(128, activation='relu'),
    Dropout(rate=0.3),
    Dense(10, activation='softmax'),
])

In [None]:
# Compile, then train for 15 epochs. The test split is passed as validation
# data, so its accuracy is recorded after every epoch.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
fit_options = dict(epochs=15, batch_size=32, validation_data=(X_test, y_test))
history_model = model.fit(X_train, y_train, **fit_options)

# evaluate() returns [loss, accuracy]; only the accuracy is reported.
_, acc = model.evaluate(X_test, y_test)
print('Test accuracy: %.3f' % acc)

In [None]:
# Plot the per-epoch accuracy recorded during fitting, for the training data
# and for the validation (test) data.
history_dict = history_model.history
fig, ax = plt.subplots()
ax.set_title('Classification Accuracy')
ax.plot(history_dict['accuracy'], color='black', label='train')
ax.plot(history_dict['val_accuracy'], color='orange', label='test')
ax.legend(loc='best')
plt.show()
""" 
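One thing to notice is that the test accuracy is actually higher than the training accuracy. 
It looks uncommon, but there is a simple explanation: the dropout layers are active while the training 
accuracy is computed and switched off when the test data are evaluated. In addition, the training accuracy 
is averaged over all batches of an epoch (while the weights are still improving), whereas the test accuracy 
is measured once at the end of the epoch.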
"""

## 10.9.3 Convolutional Neural Networks

In [None]:
# Load the CIFAR-100 train / test splits bundled with keras.datasets.
cifar_train, cifar_test = tk.datasets.cifar100.load_data()
X_train, y_train = cifar_train
X_test, y_test = cifar_test

In [None]:
# Look at the shape of the data. Like MNIST these are images, but with colour
# channels: square images of 32x32 pixels with three colour channels.
for image_summary in (X_train.shape, np.max(X_train)):
    print(image_summary)

# y_train holds the group label of each training image.
for label_summary in (y_train.shape, np.unique(y_train)):
    print(label_summary)

In [None]:
# Show the first nine training images on a 3x3 grid.
# NOTE(review): the 'gray' colormap is presumably ignored for 3-channel images - confirm.
for position, picture in enumerate(X_train[:9], start=1):
    plt.subplot(3, 3, position)
    plt.imshow(picture, cmap='gray')
plt.show()

In [None]:
# The workflow is the same as before:
#   1. prepare the data
#   2. define the model architecture
#   3. fit the model
#   4. examine the model performance

# Step 1: convert the pixels to float32 and rescale them by 255.
X_train, X_test = (split.astype('float32') for split in (X_train, X_test))
X_train, X_test = X_train / 255, X_test / 255

# One-hot encode the class labels.
y_train, y_test = (to_categorical(labels) for labels in (y_train, y_test))

In [None]:
# A 3-block VGG-style architecture: every block is two 3x3 convolution layers
# followed by a 2x2 max-pooling layer, with 32, 64 and 128 filters respectively.
conv_options = dict(activation='relu', kernel_initializer='he_uniform', padding='same')

model = Sequential()
for n_filters in (32, 64, 128):
    model.add(Conv2D(n_filters, (3, 3), **conv_options))
    model.add(Conv2D(n_filters, (3, 3), **conv_options))
    model.add(MaxPooling2D((2, 2)))

# Output classifier part of the model: flatten, one hidden layer, and a
# 100-way softmax.
model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
model.add(Dense(100, activation='softmax'))

In [None]:
# Compile, then fit silently (verbose=0) for 5 epochs, with the test split
# passed as validation data.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
history_model = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_test, y_test),
    verbose=0,
)

# evaluate() returns [loss, accuracy]; report the accuracy only.
_, acc = model.evaluate(X_test, y_test)
print('Test accuracy: %.3f' % acc)

In [None]:
# Plot the accuracy recorded after each epoch for the train and test data.
history_dict = history_model.history
plt.title('Classification Accuracy')
for metric_key, line_color, line_label in (
    ('accuracy', 'black', 'train'),
    ('val_accuracy', 'orange', 'test'),
):
    plt.plot(history_dict[metric_key], color=line_color, label=line_label)
plt.legend(loc='best')
plt.show()

In [None]:
# Same three-block architecture, now with a Dropout(0.2) layer after every
# pooling layer and after the hidden Dense layer.
conv_options = dict(activation='relu', kernel_initializer='he_uniform', padding='same')
drop_rate = 0.2

model = Sequential()
for n_filters in (32, 64, 128):
    model.add(Conv2D(n_filters, (3, 3), **conv_options))
    model.add(Conv2D(n_filters, (3, 3), **conv_options))
    model.add(MaxPooling2D((2, 2)))
    model.add(Dropout(drop_rate))
model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
model.add(Dropout(drop_rate))
model.add(Dense(100, activation='softmax'))

# Compile and fit with the same settings as the model without dropout.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
fit_options = dict(epochs=5, batch_size=128, validation_data=(X_test, y_test), verbose=0)
history_model = model.fit(X_train, y_train, **fit_options)
_, acc = model.evaluate(X_test, y_test)
print('Test accuracy: %.3f' % acc)
""" 
In my run, adding dropout decreased the test accuracy from 37.6% to 34.1% 
(the exact numbers change from run to run because no random seed is fixed). 
This is mainly because the model is still underfitting after only 5 epochs.

Dropout helps when the model overfits; to see its benefit, we would need to 
train for many more epochs (~ 1000). 
"""

In [None]:
"""
Another regularization technique that is not mentioned in this lab is weight decay (similar in concept to Lasso and Ridge).
In this setup, we can add a penalty on the weights through the 'kernel_regularizer' argument, for example:
model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', kernel_regularizer=l2(0.001)))

Other regularization methods include data augmentation and early stopping. 
"""

## 10.9.4 Using Pretrained CNN Models

In [None]:
# Instead of the examples in the book, this cell uses a pretrained VGG model.
# VGG16 is used to keep things simple; other pretrained models are available
# in keras.applications.
# (VGG16 is imported at the top: from keras.applications.vgg16 import VGG16)
# load model
model = VGG16()
# summarize the model
model.summary()
# it is fun to check the number of parameters in the summary

In [None]:
# load_img() opens a file on the local disk, so it cannot be given a URL.
# The image is therefore downloaded first with get_file(), which returns the
# path of the local copy.
image_url = 'https://raw.githubusercontent.com/tvanzyl/Sharing_ISL_python/master/data/dog_test.jpg'
image_path = tk.utils.get_file('dog_test.jpg', origin=image_url)

# load the image from the local file, resized to 224x224
image = load_img(image_path, target_size=(224, 224))
# convert the image pixels to a numpy array
image = img_to_array(image)
# add a leading batch axis: the model expects (n_images, height, width, channels)
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# prepare the image for the VGG model
image = preprocess_input(image)

In [None]:
# `image` has already been through the VGG16 preprocess_input(), which (per the
# Keras documentation) reorders the channels from RGB to BGR and subtracts the
# ImageNet channel means. Plotting that array directly gives a clipped,
# wrongly coloured picture, so the preprocessing is undone for display only.
imagenet_bgr_means = np.array([103.939, 116.779, 123.68], dtype='float32')
display_image = image[0] + imagenet_bgr_means   # undo the zero-centering (new array; `image` is untouched)
display_image = display_image[..., ::-1]        # BGR -> RGB
display_image = np.clip(display_image, 0, 255).astype('uint8')

plt.imshow(display_image)
# the original had `plt.show` without parentheses, which never called the function
plt.show()

In [None]:
# Classify the prepared image with the pretrained model.
# predict the probability across all output classes
yhat = model.predict(image)
# convert the probabilities to class labels
label = decode_predictions(yhat)
# print the decoded predictions
print(label)

In [None]:
"""
There are other ways to use pre-trained models. The example shown above uses the model directly to do the prediction. 

Another use is as a feature extractor; one example is below. We could then fit a simpler model on top of these features.
"""
# Load a fresh VGG16 and cut it off before its final layer: the new model
# outputs the activations of the second-to-last layer instead of the class
# probabilities.
base_model = VGG16()
penultimate_output = base_model.layers[-2].output
model = Model(inputs=base_model.inputs, outputs=penultimate_output)

# Run the prepared image through the truncated model to get its features.
features = model.predict(image)
print(features.shape)

## 10.9.5 IMDb Document Classification

In [None]:
# Load the IMDb reviews bundled with keras.datasets, with num_words=10000.
imdb_train, imdb_test = tk.datasets.imdb.load_data(num_words=10000)
X_train, y_train = imdb_train
X_test, y_test = imdb_test

In [None]:
# Look at the shape of the data. Unlike MNIST and CIFAR these are not images:
# each entry of X_train is one review (review lengths are summarized in the
# next cell).
print(X_train.shape)

# y_train holds the group label of each training review.
for label_summary in (y_train.shape, np.unique(y_train)):
    print(label_summary)

In [None]:
# Summarize the review lengths: print their mean and standard deviation.
print("Review length: ")
result = list(map(len, X_train))
length_mean, length_std = np.mean(result), np.std(result)
print("Mean %.2f words (%f)" % (length_mean, length_std))

# Box plot of the review lengths.
plt.boxplot(result)
plt.show()

In [None]:
# Cap the length of every review at 500 entries, for both splits.
max_words = 500
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_words)
    for reviews in (X_train, X_test)
)

In [None]:
# Similar to before, we build a model. The one-hot-encoding variant is skipped;
# the model goes straight to an embedding layer.
# The embedding's vocabulary size (10000) matches num_words used when the data
# were loaded.
model = Sequential()
model.add(Embedding(10000, 32, input_length=max_words))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid output, trained with binary cross-entropy
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()

In [None]:
# Only 5 epochs are run to save compute; the test split doubles as the
# validation data.
fit_options = dict(epochs=5, batch_size=512, validation_data=(X_test, y_test))
history_model = model.fit(X_train, y_train, **fit_options)

# evaluate() returns [loss, accuracy]; report the accuracy only.
_, acc = model.evaluate(X_test, y_test)
print('Test accuracy: %.3f' % acc)

In [None]:
# Plot the per-epoch accuracy for the training and the test data.
history_dict = history_model.history
train_curve = history_dict['accuracy']
test_curve = history_dict['val_accuracy']

plt.title('Classification Accuracy')
plt.plot(train_curve, color='black', label='train')
plt.plot(test_curve, color='orange', label='test')
plt.legend(loc='best')
plt.show()

# similar to before, we can add dropout and other regularizations

## 10.9.6 Recurrent Neural Networks

In [None]:
# To expedite the run, vocabulary_size = 5000 is used here.
vocabulary_size = 5000
(X_train, y_train), (X_test, y_test) = tk.datasets.imdb.load_data(num_words=vocabulary_size)

# Cap the length of each review at 500.
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

# Embedding -> LSTM -> single sigmoid output.
embedding_size = 32
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))  # this is the key part of this section: the LSTM layer
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [None]:
# Only 10 epochs are run to save compute; this takes ~10 mins on the author's machine.
# In theory, this RNN setup should produce better results than the
# embedding + dense network of section 10.9.5.
history_model = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=512,
    validation_data=(X_test, y_test),
)

# evaluate() returns [loss, accuracy]; report the accuracy only.
test_scores = model.evaluate(X_test, y_test)
_, acc = test_scores
print('Test accuracy: %.3f' % acc)

In [None]:
# Plot the accuracy recorded after each epoch, on the current axes.
history_dict = history_model.history
ax = plt.gca()
ax.set_title('Classification Accuracy')
for metric_key, line_color, line_label in (
    ('accuracy', 'black', 'train'),
    ('val_accuracy', 'orange', 'test'),
):
    ax.plot(history_dict[metric_key], color=line_color, label=line_label)
ax.legend(loc='best')
plt.show()

""" 
I hope that by now you have a good understanding of NN models and of the reasons why they are becoming popular.
These models are easy to define, easy to train, easy to use, and easy to apply to new use cases.
"""

In [None]:
# End of Chapter 10