# ANN Hyperparameter Optimization Trial

In [2]:
import keras
import tensorflow as tf
import time
import numpy as np
import pandas as pd
from joblib import dump, load
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import SGD, RMSprop, Adam
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.initializers import RandomNormal, RandomUniform, GlorotUniform, GlorotNormal, HeNormal
from keras.optimizers.schedules import ExponentialDecay


random.seed(46)
np.random.seed(46)
tf.random.set_seed(46)

In [5]:
# !pip install keras-tuner
from keras_tuner import RandomSearch, GridSearch, BayesianOptimization
from keras_tuner.engine.hyperparameters import HyperParameters

### Functions

In [6]:
def preprocess_data(filepath):
    data = pd.read_csv(filepath)
    scaler = StandardScaler()
    X = scaler.fit_transform(data.drop('Outcome', axis=1))
    y = data['Outcome'].values
    dump(scaler, 'scaler.joblib')
    return X, y

def prepare_datasets(X_train, X_val, y_train, y_val, batch_size=None):
    if batch_size is None:
        batch_size = len(X_train)
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=len(X_train)).batch(batch_size)
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
    val_dataset = val_dataset.batch(batch_size)
    return train_dataset, val_dataset

def plot_training_history(history, train_loss='loss', train_metric='accuracy', val_loss='val_loss', val_metric='val_accuracy'):

    #Loss
    plt.figure(figsize=(10, 5))
    plt.plot(history.history[train_loss], label='Training Loss')
    plt.plot(history.history[val_loss], label='Validation Loss')
    plt.title('Training and Validation Loss Over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # Metrics
    plt.figure(figsize=(10, 5))
    plt.plot(history.history[train_metric], label=f"Training: {train_metric}")
    plt.plot(history.history[val_metric], label=f"Validation: {val_metric}")
    plt.title(f'Training and Validation {train_metric} Over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel(f'train_metric')
    plt.legend()
    plt.show()

def get_best_epoch_details(history):
    val_losses = history.history['val_loss']
    min_val_loss_index = val_losses.index(min(val_losses))
    best_epoch = min_val_loss_index + 1

    epoch_details = {}
    for key in history.history.keys():
        epoch_details[key] = history.history[key][min_val_loss_index]

    epoch_details['best_epoch'] = best_epoch
    print(f"Best epoch details: {epoch_details}")

### Data Prep

In [7]:
X, y = preprocess_data('/content/diabetes.csv')

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_ds, val_ds = prepare_datasets(X_train, X_val, y_train, y_val, batch_size=32)

### Base Model
* The main model we might want to look back during hyperparameter optimization process.

In [8]:
base_model = Sequential([
    Input(shape=(train_ds.element_spec[0].shape[1],)),
    Dense(50, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid')])

optimizer = SGD(learning_rate=0.01, momentum=0.0)

base_model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=200, verbose=1, restore_best_weights=True)

base_model_history = base_model.fit(train_ds, epochs=1000, validation_data=val_ds, verbose=0, callbacks=early_stopping)

Epoch 235: early stopping
Restoring model weights from the end of the best epoch: 35.


In [9]:
get_best_epoch_details(base_model_history)

Best epoch details: {'accuracy': 0.7491856813430786, 'loss': 0.5173870921134949, 'val_accuracy': 0.7792207598686218, 'val_loss': 0.49695226550102234, 'best_epoch': 35}


## A. Init Of Weights & Biases

In [10]:
def try_initializers(train_ds, val_ds):

    initializers = {
        'RandomNormal': RandomNormal(),
        'RandomUniform': RandomUniform(),
        'GlorotUniform': GlorotUniform(),
        'GlorotNormal': GlorotNormal(),
        'HeNormal': HeNormal()
    }

    for name, initializer in initializers.items():
        print()
        print(f"Training model with {name} initialization...")

        start_time = time.time()

        model = Sequential([
            Input(shape=(train_ds.element_spec[0].shape[1],)),
            Dense(50, activation='relu', kernel_initializer=initializer, kernel_regularizer=l2(0.001)),
            BatchNormalization(),
            Dropout(0.5),
            Dense(1, activation='sigmoid', kernel_initializer=initializer)])
        
        model.compile(optimizer="SGD", loss="binary_crossentropy", metrics=["accuracy"])
        
        early_stopping = EarlyStopping(monitor='val_loss', patience=200, verbose=1, restore_best_weights=True)
        
        history = model.fit(train_ds, epochs=1000, validation_data=val_ds, verbose=0, callbacks=early_stopping)
        
        get_best_epoch_details(history)

        end_time = time.time()
        training_time = end_time - start_time
        print("Training time:", training_time, "seconds")

# func call
try_initializers(train_ds, val_ds)


Training model with RandomNormal initialization...
Epoch 229: early stopping
Restoring model weights from the end of the best epoch: 29.
Best epoch details: {'accuracy': 0.7785016298294067, 'loss': 0.46130287647247314, 'val_accuracy': 0.7792207598686218, 'val_loss': 0.5034934282302856, 'best_epoch': 29}
Training time: 14.411351680755615 seconds

Training model with RandomUniform initialization...
Epoch 243: early stopping
Restoring model weights from the end of the best epoch: 43.
Best epoch details: {'accuracy': 0.7931596040725708, 'loss': 0.43367868661880493, 'val_accuracy': 0.7727272510528564, 'val_loss': 0.5018975138664246, 'best_epoch': 43}
Training time: 14.589875936508179 seconds

Training model with GlorotUniform initialization...
Epoch 215: early stopping
Restoring model weights from the end of the best epoch: 15.
Best epoch details: {'accuracy': 0.7459283471107483, 'loss': 0.5508226156234741, 'val_accuracy': 0.7857142686843872, 'val_loss': 0.485345721244812, 'best_epoch': 15

## B. Layers, Units, Dropouts

In [12]:
# CREATING SEARCHING SPACE
def build_model(hp):
  model = Sequential()
  model.add(Input(shape=(train_ds.element_spec[0].shape[1],))) # We can add something like layers etc. with add() func.

  for i in range(hp.Int('num_layers', 1, 5)): # Search 1-5 in int type!
      model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=32))) 
      # Determine the combination of the number of neurons in the corresponding layer "i"
      model.add(BatchNormalization())
      model.add(Dropout(hp.Float('dropout_' + str(i), min_value=0.0, max_value=0.5, step=0.1)))
      # Determination of the percentage of neurons to be damped in the relevant "i" layer

  model.add(Dense(1, activation='sigmoid'))
  model.compile(optimizer="SGD", loss="binary_crossentropy", metrics=["accuracy"])
  return model

In [13]:
# RANDOM SEARCH TUNER
random_search_tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=20, # 20 different combinations will be tried in the parameter search.
    executions_per_trial=1, # How many different models will be trained in each combination selection attempt?
    overwrite=True) # IMPORTANT parameter!

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    verbose=1,
    restore_best_weights=True)

random_search_tuner.search(train_ds,
                           epochs=100,
                           validation_data=val_ds,
                           callbacks=[early_stopping])

Trial 20 Complete [00h 00m 13s]
val_loss: 0.4831380844116211

Best val_loss So Far: 0.4578189551830292
Total elapsed time: 00h 03m 58s


In [14]:
random_search_tuner.search_space_summary()

Search space summary
Default search space size: 11
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 5, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
dropout_0 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
dropout_1 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
units_2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
dropout_2 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
units_3 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'samp

In [15]:
random_search_tuner.results_summary()

Results summary
Results in ./untitled_project
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 02 summary
Hyperparameters:
num_layers: 3
units_0: 64
dropout_0: 0.30000000000000004
units_1: 416
dropout_1: 0.2
units_2: 512
dropout_2: 0.0
units_3: 192
dropout_3: 0.1
units_4: 224
dropout_4: 0.30000000000000004
Score: 0.4578189551830292

Trial 08 summary
Hyperparameters:
num_layers: 5
units_0: 160
dropout_0: 0.1
units_1: 160
dropout_1: 0.1
units_2: 384
dropout_2: 0.30000000000000004
units_3: 480
dropout_3: 0.0
units_4: 352
dropout_4: 0.0
Score: 0.4580150544643402

Trial 09 summary
Hyperparameters:
num_layers: 1
units_0: 448
dropout_0: 0.1
units_1: 448
dropout_1: 0.2
units_2: 480
dropout_2: 0.4
units_3: 352
dropout_3: 0.0
units_4: 32
dropout_4: 0.2
Score: 0.46267184615135193

Trial 03 summary
Hyperparameters:
num_layers: 5
units_0: 480
dropout_0: 0.1
units_1: 448
dropout_1: 0.4
units_2: 96
dropout_2: 0.0
units_3: 480
dropout_3: 0.0
units_4: 384
dropout_4: 0.0
Score: 

In [17]:
# BEST HYPERPARAMETERS
best_hps = random_search_tuner.get_best_hyperparameters(num_trials=1)[0]
# num_trials=1 : Returns 1 (odd-first) set of values from the best performing hyperparameter sets
print(f"Best Hyperparameters: {best_hps.values}")

Best Hyperparameters: {'num_layers': 3, 'units_0': 64, 'dropout_0': 0.30000000000000004, 'units_1': 416, 'dropout_1': 0.2, 'units_2': 512, 'dropout_2': 0.0, 'units_3': 192, 'dropout_3': 0.1, 'units_4': 224, 'dropout_4': 0.30000000000000004}


In [18]:
# BEST MODEL
best_model = random_search_tuner.get_best_models(num_models=1)[0]
best_model.summary()

In [19]:
# MODEL PERFORMANCE
loss, acc = best_model.evaluate(val_ds)
print(f"Validation loss: {loss}, Accuracy: {acc}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7775 - loss: 0.4625  
Validation loss: 0.4578189551830292, Accuracy: 0.7857142686843872


## C. All Together
<u>Here is the part where we discuss all of the previous parameter events + BatchSize, Activation Funcs, Learning Rate, Regularization.</u>

In [22]:
def build_model(hp):
    model = Sequential()
    model.add(Input(shape=(train_ds.element_spec[0].shape[1],)))

    # 1.Hidden layers, activation functions, l2, Dropout
    for i in range(hp.Int('num_layers', 1, 5)):

        model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=32),

                        activation=hp.Choice('activation_' + str(i), values=['relu', 'tanh', 'sigmoid']),

                        kernel_regularizer=l2(hp.Float('l2_' + str(i), min_value=0.0001, max_value=0.01, sampling='log'))))
        # sampling='log' => Scale these values (0.0001 to 0.01) with a logarithmic transformation

        model.add(BatchNormalization())
        model.add(Dropout(hp.Float('dropout_' + str(i), min_value=0.0, max_value=0.5, step=0.1)))

    model.add(Dense(1, activation='sigmoid'))

    # 2.Learning rate schedule
    initial_learning_rate = hp.Float('initial_learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')
    lr_schedule = ExponentialDecay(
        initial_learning_rate=initial_learning_rate,
        decay_steps=100,
        decay_rate=0.96,
        staircase=True
    )

    # 3. Optimizers
    optimizer_choice = hp.Choice('optimizer', values=['sgd', 'adam', "rmsprop"])
    
    if optimizer_choice == 'sgd':
        optimizer = SGD(
            learning_rate=lr_schedule,
            momentum=hp.Float('momentum', min_value=0.0, max_value=0.9, step=0.1)
        ) # momentum => It carries the gradient value from the previous iteration to the next iteration.
        
    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(
            learning_rate=lr_schedule,
            rho=hp.Float('rho', min_value=0.8, max_value=0.99, step=0.01),  # Decay rate for moving average of squared gradients
            epsilon=hp.Float('epsilon', min_value=1e-10, max_value=1e-8, step=1e-10),
            momentum=hp.Float('momentum', min_value=0.0, max_value=0.9, step=0.1)
        ) 
        # rmsprop => The learning rate is adjusted dynamically. It works by taking the moving average of the squares of the gradients in the past iteration.
        # Moving average of squares of Gradients => It is included in the denominator of the learning rate in the update function.
        # Larger gradient values make the LR smaller, while smaller gradient values change very little. This is how dynamism is achieved.
    elif optimizer_choice == 'adam':
        optimizer = Adam(
            learning_rate=lr_schedule,
            beta_1=hp.Float('beta1', min_value=0.85, max_value=0.99, step=0.01), 
            # beta1 => determines how much weight we give to momentum from the past
            beta_2=hp.Float('beta2', min_value=0.999, max_value=0.9999, step=0.0001),
            # This parameter determines the moving average of the squares of the gradients and
            # how much weight to give to the gradients in the previous iteration to adjust the LRs adaptively.
            epsilon=hp.Float('epsilon', min_value=1e-8, max_value=1e-7, step=1e-8) 
            # It refers to the mathematical correction values to be seen in the formulations.
        ) 
        # Adam Method => Rmsprop + Momentum
        # t both carries gradients (speed and direction information) from the past and uses the LR adaptive adjustment feature.


    # Model Compilation
    model.compile(optimizer = optimizer,
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    return model

In [23]:
# RANDOM SEARCH
random_search_tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=20,
    executions_per_trial=1,
    overwrite=True)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    verbose=1,
    restore_best_weights=True)

random_search_tuner.search(train_ds,
                           epochs=100,
                           validation_data=val_ds,
                           callbacks=[early_stopping])

Trial 20 Complete [00h 00m 23s]
val_loss: 0.9702298045158386

Best val_loss So Far: 0.5141968131065369
Total elapsed time: 00h 06m 07s


In [24]:
# BEST HYPERPARAMETERS
best_hps = random_search_tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"Best hyperparameters: {best_hps.values}")
dump(best_hps, 'best_hps.joblib')

Best hyperparameters: {'num_layers': 1, 'units_0': 224, 'activation_0': 'relu', 'l2_0': 0.00028636418591913084, 'dropout_0': 0.4, 'initial_learning_rate': 0.0012076098541628248, 'optimizer': 'adam', 'momentum': 0.30000000000000004, 'units_1': 64, 'activation_1': 'tanh', 'l2_1': 0.00012162727501515752, 'dropout_1': 0.0, 'beta1': 0.86, 'beta2': 0.9998, 'epsilon': 2e-08, 'rho': 0.8700000000000001, 'units_2': 416, 'activation_2': 'relu', 'l2_2': 0.002177828475108628, 'dropout_2': 0.4, 'units_3': 320, 'activation_3': 'sigmoid', 'l2_3': 0.00013978522409411077, 'dropout_3': 0.0, 'units_4': 256, 'activation_4': 'sigmoid', 'l2_4': 0.00536658240702523, 'dropout_4': 0.1}


['best_hps.joblib']

In [25]:
# BEST MODEL
best_model = random_search_tuner.get_best_models(num_models=1)[0]
best_model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))


In [27]:
loss, acc = best_model.evaluate(val_ds)
print(f"Validation set üzerinde loss: {loss}, Accuracy: {acc}")

 # SAVE THE BEST MODEL
best_model.save('tuned_model.keras')

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8014 - loss: 0.5094  
Validation set üzerinde loss: 0.5141968131065369, Accuracy: 0.7922077775001526


## D. Retrain Model for Entire Dataset

In [28]:
# Load best hps, dataset
X, y = preprocess_data('/content/diabetes.csv')

dataset = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(buffer_size=len(X)).batch(len(X))

In [29]:
# Model Build
best_hps = load('best_hps.joblib')

final_tuned_model = build_model(best_hps)

In [30]:
 # TRAINING
 early_stopping = EarlyStopping(
    monitor='loss',
    patience=5,
    verbose=1,
    restore_best_weights=True)

model_checkpoint = ModelCheckpoint(
    'final_tuned_model.keras',
    monitor='loss',
    verbose=0,
    save_best_only=True)

final_history = final_tuned_model.fit(dataset,
            epochs=500,# 100 would be better for this case.
            verbose=1,
            callbacks=[early_stopping, model_checkpoint])

Epoch 1/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.4518 - loss: 1.0131
Epoch 2/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.5104 - loss: 0.8982
Epoch 3/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.5417 - loss: 0.8590
Epoch 4/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.5560 - loss: 0.8280
Epoch 5/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5951 - loss: 0.7325
Epoch 6/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.6458 - loss: 0.6919
Epoch 7/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.6549 - loss: 0.6819
Epoch 8/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.6758 - loss: 0.6226
Epoch 9/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [31]:
def get_best_epoch_details(history):
    val_losses = history.history['loss']
    min_val_loss_index = val_losses.index(min(val_losses))
    best_epoch = min_val_loss_index + 1

    epoch_details = {}
    for key in history.history.keys():
        epoch_details[key] = history.history[key][min_val_loss_index]

    epoch_details['best_epoch'] = best_epoch
    print(f"Best epoch details: {epoch_details}")

get_best_epoch_details(final_history)

Best epoch details: {'accuracy': 0.7591145634651184, 'loss': 0.521384596824646, 'best_epoch': 22}


In [32]:
# PREDICTION
diabetes_data = pd.read_csv('/content/diabetes.csv')

scaler = load('scaler.joblib')

loaded_final_tuned_model = load_model("/content/final_tuned_model.keras", compile=False)

In [33]:
def random_samples(scaler, data, num_samples=100):
    X = data.drop('Outcome', axis=1)
    X_scaled = scaler.fit_transform(X)
    new_dataset = tf.data.Dataset.from_tensor_slices((X_scaled)).batch(len(X))
    return new_dataset

random_samples = random_samples(scaler, diabetes_data)

loaded_final_tuned_model.predict(random_samples)[:10]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


array([[0.456737  ],
       [0.24981508],
       [0.465028  ],
       [0.2488339 ],
       [0.5667088 ],
       [0.33804178],
       [0.27141044],
       [0.46307394],
       [0.4860419 ],
       [0.2637618 ]], dtype=float32)

***Suggestion => When we have a large number of candidate hyperparameters, it may be a reasonable approach to first pass them through random search and then enter the hyperparameter values they bring to us into grid search by adding certain numbers around them.***