# Dropout Regularization

Link to the Youtube tutorial video: https://www.youtube.com/watch?v=lcI8ukTUEbo&list=PLeo1K3hjS3uu7CxAacxVndI4bE_o3BDtO&index=20

**Theory behind dropout regularization:**    <br />
<img src="hidden\photo1.png" alt="This image is a representation of the simple neural network" style="width: 400px;"/>  <br />
<img src="hidden\photo2.png" alt="This image is a representation of the simple neural network" style="width: 400px;"/>  <br />
<img src="hidden\photo3.png" alt="This image is a representation of the simple neural network" style="width: 400px;"/>  <br />
<img src="hidden\photo4.png" alt="This image is a representation of the simple neural network" style="width: 400px;"/>  <br />
<img src="hidden\photo5.png" alt="This image is a representation of the simple neural network" style="width: 400px;"/>  <br />

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Load the dataset

In [36]:
# Load the dataset into a dataframe called df. header=None because the dataset does not have header (column names)
df = pd.read_csv("./sonar_dataset.csv", header = None)
df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
150,0.0209,0.0278,0.0115,0.0445,0.0427,0.0766,0.1458,0.143,0.1894,0.1853,...,0.0096,0.0014,0.0049,0.0039,0.0029,0.0078,0.0047,0.0021,0.0011,M
166,0.0411,0.0277,0.0604,0.0525,0.0489,0.0385,0.0611,0.1117,0.1237,0.23,...,0.0217,0.0038,0.0019,0.0065,0.0132,0.0108,0.005,0.0085,0.0044,M
121,0.0162,0.0041,0.0239,0.0441,0.063,0.0921,0.1368,0.1078,0.1552,0.1779,...,0.0173,0.0135,0.0114,0.0062,0.0157,0.0088,0.0036,0.0053,0.003,M
174,0.0191,0.0173,0.0291,0.0301,0.0463,0.069,0.0576,0.1103,0.2423,0.3134,...,0.004,0.0136,0.0137,0.0172,0.0132,0.011,0.0122,0.0114,0.0068,M
179,0.0394,0.042,0.0446,0.0551,0.0597,0.1416,0.0956,0.0802,0.1618,0.2558,...,0.0146,0.004,0.0114,0.0032,0.0062,0.0101,0.0068,0.0053,0.0087,M


# Data Exploration

In [37]:
print('The dataset consists of ' + str(df.shape[0]) + ' samples, each sample consists of ' + str(df.shape[1]-1) + ' features and 1 output')

The dataset consists of 208 samples, each sample consists of 60 features and 1 output


In [38]:
# Check if any column/feature of df dataframe containing null (NA/NaN/space/empty entry)
df.isna().sum()

# Insight: Since no column/feature contains null, we dont need to do something to process the null

0     0
1     0
2     0
3     0
4     0
     ..
56    0
57    0
58    0
59    0
60    0
Length: 61, dtype: int64

In [39]:
# Print the column names of df dataframe
print('The column names of df dataframe:\n', df.columns)

# Insight: Column name 0 to 59 are the features, column name 60 is the ground truth

The column names of df dataframe:
 Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59, 60],
      dtype='int64')


In [40]:
# Print the number of rock (R) and metal cylinder (M) classified in the dataset
print('In the dataset, the number of samples classified as:')
print('Metal cylinder (M): ', df[60].value_counts()[0])
print('Rock (R): ', df[60].value_counts()[1])

In the dataset, the number of samples classified as:
Metal cylinder (M):  111
Rock (R):  97


# Data Preprocessing

## Split the features and ground truth of the dataset

In [41]:
# Split the features and ground truth of the dataset into separate variables
X = df.drop(60, axis='columns') # Variable X only contains the features 
Y = df[60]  # Variable Y only contains the ground truth

In [42]:
# Show the first 5 samples of X variable
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0125,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0033,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0241,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0156,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094


In [43]:
# Show the first 5 samples of Y variable
Y.head()

0    R
1    R
2    R
3    R
4    R
Name: 60, dtype: object

## Convert Y variable from object-type (text data) into number type

Since the unique values of the ground truth is binary (0 or 1), we can use get_dummies instead of using one-hot-encoding

In [44]:
# Convert ground truths in Y variable from text data into number data. drop_first = True is used to drop a column, which is redundant here (because the ground truth here is binary, hence only require 1 column which contains either 0 or 1), to ensure efficient model training.
Y = pd.get_dummies(Y,  dtype=int, drop_first=True)
Y.sample(5)

# Insight: Now in Y variable, 1 represents Rock (R) while 0 represents Metal cylinder (M)

Unnamed: 0,R
131,0
126,0
18,1
17,1
69,1


In [45]:
# Show the statistics of Y_variable
Y.value_counts()

R
0    111
1     97
Name: count, dtype: int64

## Split data into train and test sets

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=1)

print('There are ' + str(X_train.shape[0]) + ' samples in train set, while each sample consists of ' + str(X_train.shape[1]) + ' features.')

There are 156 samples in train set, while each sample consists of 60 features.


# Develop a neural network

The neural network built here is an Artificial Neural Network (1 input, 3 hidden, & 1 output layer)

## Develop a overfitted neural network

In [47]:
import tensorflow as tf
from tensorflow import keras

model_overfit = keras.Sequential([
    # The 1st hidden layer (2nd layer), consists of 60 neurons, with ReLU as the activation function. The input layer (1st layer) has input size of 60 in 1D array (60 input neurons) to accept 60 features of each sample (so that each input neuron process a feature of the sample)
    keras.layers.Dense(60, input_dim=60, activation = 'relu'),
    # The 2nd hidden layer (3rd layer), consists of 30 neurons, with ReLU as the activation function.
    keras.layers.Dense(30, input_dim=60, activation = 'relu'),
    # The 3rd hidden layer (4td layer), consists of 15 neurons, with ReLU as the activation function.
    keras.layers.Dense(15, input_dim=60, activation = 'relu'),
    # The output layer (5th layer), consists of 1 output neurons to provide binary classification score (only 1 output class [0 for M; 1 for R]), with sigmoid function as the activation function.
    keras.layers.Dense(1, input_dim=60, activation = 'sigmoid')
])

# Compile the neural network
model_overfit.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # binary cross entropy is used as the cost function because this neural network is used to solve the binary classification problem

# Train the neural network with 100 epochs. batch size is set as 8 to perform mini batch gradient descent (means in each iteration/epoch, I randomly select only 8 samples (out of all samples in the train set) and feed them to the neural network to calculate the cost function at that epoch before doing the backward propagation)
model_overfit.fit(X_train, Y_train, epochs=100, batch_size=8) # Set these parameters to purposely overfit the model (obtain training accuracy of 1)

Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5697 - loss: 0.6813
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5663 - loss: 0.6693 
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5224 - loss: 0.6612 
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4908 - loss: 0.6568 
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5170 - loss: 0.6408 
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6513 - loss: 0.5978 
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7151 - loss: 0.5466 
Epoch 8/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7919 - loss: 0.5746 
Epoch 9/100
[1m20/20[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x12591647500>

In [48]:
# Evaluate the performance of the overfitted model using test set
model_overfit.evaluate(X_test, Y_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7292 - loss: 1.2914  


[1.1294811964035034, 0.75]

In [49]:
# Make predictions using the overfitted model
Y_predicted_overfit = model_overfit.predict(X_test).reshape(-1)
print('The predicted results of the overfitted model:\n', Y_predicted_overfit[:10])
Y_predicted_rounded_overfit = np.round(Y_predicted_overfit)
print('\nThe rounded predicted results of the overfitted model:\n', Y_predicted_rounded_overfit[:10])
print('\nThe ground truth of the corresponding samples:\n', Y_test[:10])

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
The predicted results of the overfitted model:
 [1.11894245e-13 9.78408396e-01 9.40359056e-01 2.18618030e-08
 9.99988019e-01 9.99668956e-01 3.08465119e-02 9.99989927e-01
 1.02021724e-08 9.99994576e-01]

The rounded predicted results of the overfitted model:
 [0. 1. 1. 0. 1. 1. 0. 1. 0. 1.]

The ground truth of the corresponding samples:
      R
186  0
155  0
165  0
200  0
58   1
34   1
151  0
18   1
202  0
62   1


In [50]:
# Evaluate the performance of the overfitted model
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(Y_test,Y_predicted_rounded_overfit))

              precision    recall  f1-score   support

           0       0.72      0.85      0.78        27
           1       0.80      0.64      0.71        25

    accuracy                           0.75        52
   macro avg       0.76      0.75      0.75        52
weighted avg       0.76      0.75      0.75        52



## Develop a neural network with dropout layer (to avoid overfitting)

1) The usual practice is to put a dropout layer after a hidden layer.
2) Dropout() will randomly select the neurons on the hidden layer to drop, while total dropped neurons is based on the dropout factor (the parameter you input). Hence, you will get different results every time you run the script.
3) Using dropout will reduce the accuracy on training set (so that it's likely to reduce the difference between training accuracy and testing accuracy [which also means reduce overfitting OR improve generality])
3) Dropout is not like a sure short guaranteed way to see the improvement of model performance, but you will likely see the improvement. So it's about trial and error.
4) Mostly, dropout is being used in computer vision type of problems where your neural network is really big and complex (because it has too many layers and too many neurons in each of these layers) 
5) Note: Since the neural network built in this tutorial is a very simple neural network, so sometimes you might see its accuracy will be same or decrease with the dropout layer, and it is ok for this context.

In [51]:
import tensorflow as tf
from tensorflow import keras

# Create the structure of the neural netwok (with dropout layer)
model_wDropout = keras.Sequential([
    # The 1st hidden layer (2nd layer), consists of 60 neurons, with ReLU as the activation function. The input layer (1st layer) has input size of 60 in 1D array (60 input neurons) to accept 60 features of each sample (so that each input neuron process a feature of the sample)
    keras.layers.Dense(60, input_dim=60, activation = 'relu'),
    # The dropout layer. Drop 50% of neurons (the neurons selection to be dropped is random) in the 1st hidden layer. The usual practice is to put a dropout layer after a hidden layer.
    keras.layers.Dropout(0.5),
    # The 2nd hidden layer (3rd layer), consists of 30 neurons, with ReLU as the activation function.
    keras.layers.Dense(30, input_dim=60, activation = 'relu'),
    # The dropout layer. Drop 50% of neurons (the neurons selection to be dropped is random) in the 2nd hidden layer
    keras.layers.Dropout(0.5),
    # The 3rd hidden layer (4td layer), consists of 15 neurons, with ReLU as the activation function.
    keras.layers.Dense(15, input_dim=60, activation = 'relu'),
    # The dropout layer. Drop 50% of neurons (the neurons selection to be dropped is random) in the 3rd hidden layer
    keras.layers.Dropout(0.5),
    # The output layer (5th layer), consists of 1 output neurons to provide binary classification score (only 1 output class [0 for M; 1 for R]), with sigmoid function as the activation function.
    keras.layers.Dense(1, input_dim=60, activation = 'sigmoid')
])

# Compile the neural network
model_wDropout.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # binary cross entropy is used as the cost function because this neural network is used to solve the binary classification problem

# Train the neural network with 100 epochs. batch size is set as 8 to perform mini batch gradient descent (means in each iteration/epoch, I randomly select only 8 samples (out of all samples in the train set) and feed them to the neural network to calculate the cost function at that epoch before doing the backward propagation)
model_wDropout.fit(X_train, Y_train, epochs=100, batch_size=8)

Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4043 - loss: 0.7808
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5702 - loss: 0.6877 
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4938 - loss: 0.7174 
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5462 - loss: 0.6848 
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6386 - loss: 0.7025 
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5928 - loss: 0.6575 
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5484 - loss: 0.6903 
Epoch 8/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5664 - loss: 0.6653 
Epoch 9/100
[1m20/20[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x12596948800>

In [52]:
# Evaluate the performance of the neural network with dropout layer using test set
model_wDropout.evaluate(X_test,Y_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7524 - loss: 0.5143  


[0.47998905181884766, 0.7692307829856873]

In [53]:
# Make predictions using the model containing dropout layer
Y_predicted_wDropout = model_overfit.predict(X_test).reshape(-1)
print('The predicted results of the overfitted model:\n', Y_predicted_wDropout[:10])
Y_predicted_rounded_wDropout = np.round(Y_predicted_wDropout)
print('\nThe rounded predicted results of the overfitted model:\n', Y_predicted_rounded_wDropout[:10])
print('\nThe ground truth of the corresponding samples:\n', Y_test[:10])

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
The predicted results of the overfitted model:
 [1.11894245e-13 9.78408396e-01 9.40359056e-01 2.18618030e-08
 9.99988019e-01 9.99668956e-01 3.08465119e-02 9.99989927e-01
 1.02021724e-08 9.99994576e-01]

The rounded predicted results of the overfitted model:
 [0. 1. 1. 0. 1. 1. 0. 1. 0. 1.]

The ground truth of the corresponding samples:
      R
186  0
155  0
165  0
200  0
58   1
34   1
151  0
18   1
202  0
62   1


In [54]:
# Evaluate the performance of the model containing dropout layer
print(classification_report(Y_test,Y_predicted_rounded_wDropout))

              precision    recall  f1-score   support

           0       0.72      0.85      0.78        27
           1       0.80      0.64      0.71        25

    accuracy                           0.75        52
   macro avg       0.76      0.75      0.75        52
weighted avg       0.76      0.75      0.75        52

