# Hyperparameter Optimization

In [2]:
import numpy as np
from scipy.stats.distributions import expon, uniform, randint
from sklearn.model_selection import train_test_split, ParameterSampler
%matplotlib inline
import matplotlib.pyplot as plt

import wandb
from wandb.keras import WandbCallback

from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.utils import to_categorical

Using TensorFlow backend.


Let's define a couple of helper functions:

In [None]:
def print_dict(d):
    """Print every key/value pair of ``d`` on its own line.

    Each line starts with two spaces, followed by the key right-aligned
    in a 20-character column, a colon, and the value.
    """
    row_template = '  {:>20}: {}'
    for entry in d.items():
        print(row_template.format(*entry))
        
def print_header(s):
    """Print ``s`` as a banner.

    Output is a blank line, then the text padded with two spaces on each
    side, framed above and below by a row of '=' characters that is four
    characters longer than ``s``.
    """
    rule = '=' * (len(s) + 4)
    title = '  {}  '.format(s)
    print()
    for line in (rule, title, rule):
        print(line)

# Prepare Data

In [None]:
# Load CIFAR-10, convert the images to float32 and divide by 255.
(X_train_valid, y_train_valid), (X_test, y_test) = cifar10.load_data()
X_train_valid = X_train_valid.astype('float32') / 255.
X_test = X_test.astype('float32') / 255.

# Convert the class labels with to_categorical so they match the 10-way
# softmax output trained with categorical_crossentropy.
y_train_valid = to_categorical(y_train_valid)
y_test = to_categorical(y_test)

# Hold out 10% of the training data for validation.  A fixed random_state
# makes the split identical on every run, so validation accuracies from
# different hyperparameter sets (and different sessions) are comparable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, y_train_valid, test_size=0.10, random_state=99)

In [None]:
# Sanity check: report the feature/label array shapes of every split.
shape_report = (
    ('Train shapes: x = {}, y = {}', X_train, y_train),
    ('Valid shapes: x = {}, y = {}', X_valid, y_valid),
    ('Test  shapes: x = {}, y = {}', X_test, y_test),
)
for template, features, labels in shape_report:
    print(template.format(features.shape, labels.shape))

# Model Creation
Make a function which accepts a config object containing your hyperparameters and returns a compiled model.

In [None]:
def build_compile(config):
    """Build and compile the fixed-depth CNN described by ``config``.

    Architecture: conv -> pool, (conv, conv) -> pool, (conv, conv) -> pool,
    flatten, one hidden dense layer with dropout, and a 10-way softmax.

    ``config`` must expose these attributes: ``conv1_num_filters`` ..
    ``conv5_num_filters``, ``conv_filter_size``, ``activation``,
    ``dense1_size``, ``dropout`` and ``learn_rate``.  The input shape is
    taken from the notebook-level ``X_train`` array.

    Returns the compiled model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    def conv(num_filters, **extra):
        # All conv layers share kernel size, activation and 'same' padding.
        return Conv2D(num_filters,
                      config.conv_filter_size,
                      activation=config.activation,
                      padding='same',
                      **extra)

    def pool():
        return MaxPooling2D(pool_size=(2, 2))

    layer_stack = [
        # first convolution / pooling set (carries the input shape)
        conv(config.conv1_num_filters, input_shape=X_train.shape[1:]),
        pool(),
        # second convolution / pooling set
        conv(config.conv2_num_filters),
        conv(config.conv3_num_filters),
        pool(),
        # third convolution / pooling set
        conv(config.conv4_num_filters),
        conv(config.conv5_num_filters),
        pool(),
        # classifier head
        Flatten(),
        Dense(config.dense1_size, activation=config.activation),
        Dropout(config.dropout),
        Dense(10, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=config.learn_rate),
                  metrics=['accuracy'])
    return model

# Hyperparameter Selection
Define the legal ranges for your hyperparameters and use scikit-learn's `ParameterSampler` to sample hyperparameter sets.

In [3]:
# Search space handed to ParameterSampler: each entry is either a list of
# candidate values or a scipy.stats distribution.
hp_ranges = {}

# Architecture: filters per conv layer and width of the hidden dense layer.
hp_ranges.update({
    'conv1_num_filters': [32, 64, 128],
    'conv2_num_filters': [32, 64, 128],
    'conv3_num_filters': [32, 64, 128],
    'conv4_num_filters': [32, 64, 128],
    'conv5_num_filters': [32, 64, 128],
    'dense1_size':       [32, 64, 128, 256, 512],
})

# Regularisation and optimisation settings.
hp_ranges.update({
    'dropout':           uniform,
    'learn_rate':        [0.1, 0.03, 0.001],
    'batch_size':        [8, 16, 32, 64, 128],
})

# Draw two hyperparameter sets with a fixed seed.
hp_sets = ParameterSampler(hp_ranges, n_iter=2, random_state=99)

In [None]:
# Preview each sampled hyperparameter set, separated by a blank line.
for i, hp_set in enumerate(hp_sets):
    print("\n" + "Hyperparameter Set {}:".format(i))
    print_dict(hp_set)

# Training

Static hyperparameters:

In [None]:
# Hyperparameters that are held fixed for every run of the search.
static_hyper_params = {
    'activation': 'relu',  # activation of the conv layers and the hidden dense layer
    'conv_filter_size': 3,  # kernel size passed to every Conv2D layer
    'num_epochs': 2,  # number of epochs for each call to model.fit
}

Loop over `hp_sets`:

In [None]:
# Best validation accuracy seen so far, and the set/index that produced it.
best_valid_acc = 0.0
best_hp_set = None
best_hp_ind = None

for hp_ind, hp_set in enumerate(hp_sets):
    # set up wandb
    # Label the run with this loop's own index (hp_ind), not a variable
    # left behind in the notebook namespace by another cell.
    print_header("Starting Training for Hyperparameter Set {}:".format(hp_ind))
    wandb.init()
    ## For short runs like this, wandb.monitor()
    # is just visual noise.  Reenable it for longer runs.
    # wandb.monitor()
    print_dict(hp_set)

    # allow_val_change=True lets each iteration overwrite the values the
    # previous iteration stored in wandb.config.
    wandb.config.update(static_hyper_params, allow_val_change=True)
    wandb.config.update(hp_set, allow_val_change=True)

    # build model
    model = build_compile(wandb.config)
    # summary() prints the table itself and returns None, so wrapping it
    # in print() would only add a stray "None" line.
    model.summary()
    wandb.config.num_model_parameters = model.count_params()

    # train model
    history = model.fit(
        X_train, y_train,
        batch_size=wandb.config.batch_size,
        epochs=wandb.config.num_epochs,
        verbose=1,
        validation_data=(X_valid, y_valid),
        callbacks=[WandbCallback()]
    )

    # track best model so far, judged by the final epoch's validation
    # accuracy.  Keras releases differ on the history key for the
    # 'accuracy' metric ('val_acc' vs 'val_accuracy'), so accept either.
    val_key = 'val_acc' if 'val_acc' in history.history else 'val_accuracy'
    valid_acc = history.history[val_key][-1]
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_hp_set = hp_set
        best_hp_ind = hp_ind

# Retrain Best Model on Full train+valid Data

In [None]:
print_header("Best Hyperparams were set {} with valid accuracy {}".format(best_hp_ind, best_valid_acc))
print_dict(best_hp_set)

# Retrain model on combined training and validation data
# wandb.config still holds the values of the last set trained in the
# search loop, so overwriting them with the best set needs
# allow_val_change=True (as the search loop itself passes).
# NOTE(review): there is no wandb.init() here, so this training appears to
# log into the most recently initialised run -- confirm that is intended.
wandb.config.update(best_hp_set, allow_val_change=True)
model = build_compile(wandb.config)
history = model.fit(
    X_train_valid, y_train_valid,
    batch_size=wandb.config.batch_size,
    epochs=wandb.config.num_epochs,
    verbose=1,
    callbacks=[WandbCallback()]
)

In [None]:
# Score the retrained model on the held-out test set and report the result.
test_metrics = model.evaluate(
    X_test, y_test, batch_size=wandb.config.batch_size)
loss, acc = test_metrics
print("Test loss: {}, test acc: {}".format(loss, acc))

### Inspect Results on WandB 
Go to https://app.wandb.ai/, then select your project name to see a summary of all your runs.

# Hyperparameter Gotchas
* It's easy to accidentally explode the size of your model.  In particular you get lots of parameters when:
  * You don't use much MaxPooling
  * You have a large first Dense layer after your Conv layers.
* As batch size goes up, learning rate can go up.  As batch size goes down, learning rate must go down.  Why?

## Exercise 1:
* Create a function, `build_compile_ex1`, which can create a CNN with a variable number of convolutional and dense layers using the hyperparameter ranges below.
  * Remember that you'll need to special case the first conv layer to set `input_shape`.
  * The hyperparameter `num_convs_per_max_pool` chooses how many conv layers should pass between each max pooling layer. 
    * You'll probably find Python's modulo operator (`%`) useful for this, e.g. `5 % 3 ==> 2; 6 % 3 ==> 0`.
* Use the hyperparameter sets in `hp_sets_ex1` as your hyperparameter samples.
* The number of filters can be the same for every conv layer, and the number of neurons should be the same for every dense layer.
* Include a `Dropout` layer after each `Dense` layer.
* Don't forget the `Flatten` layer before switching to `Dense`.

In [None]:
# Legal Hyperparameter Ranges
# Each entry is either a list of candidate values or a scipy.stats
# distribution; randint(low, high) draws integers from low up to high - 1.
hp_ranges_ex1 = {}

# Convolutional part of the network.
hp_ranges_ex1.update({
    'num_conv_filters':       [32, 64, 128],
    'num_conv_layers':        randint(2, 8),
    'num_convs_per_max_pool': randint(1, 3),
})

# Dense part of the network.
hp_ranges_ex1.update({
    'dense_size':             [32, 64, 128, 256, 512],
    'num_dense_layers':       randint(1, 3),
    'dropout':                uniform,
})

# Optimisation settings.
hp_ranges_ex1.update({
    'learn_rate':             [0.1, 0.03, 0.001],
    'batch_size':             [8, 16, 32, 64, 128],
})

# Draw two hyperparameter sets with a fixed seed.
hp_sets_ex1 = ParameterSampler(hp_ranges_ex1, n_iter=2, random_state=1234)

In [None]:
# Preview each sampled exercise hyperparameter set, separated by a blank line.
for i, hp_set in enumerate(hp_sets_ex1):
    print("\n" + "Hyperparameter Set {}:".format(i))
    print_dict(hp_set)

Define your `build_compile_ex1` function in the next cell:

In [5]:
# Scratch cell: prints 0, 1 and 2.  Note that once it has run, the
# notebook-level variable `i` is left set to 2.
for i in range(3):
    print(i)

0
1
2


In [None]:
def build_compile_ex1(config):
    """Build and compile a CNN with a configurable number of layers.

    ``config`` must expose these attributes:
      * ``num_conv_layers``: how many Conv2D layers to stack.
      * ``num_conv_filters``: filter count shared by every conv layer.
      * ``num_convs_per_max_pool``: a MaxPooling2D layer is inserted after
        every this-many conv layers.
      * ``num_dense_layers``: how many hidden Dense layers to stack.
      * ``dense_size``: width shared by every hidden dense layer.
      * ``dropout``: rate of the Dropout layer that follows each hidden
        dense layer.
      * ``learn_rate``, ``activation``, ``conv_filter_size``: as in
        ``build_compile``.

    The input shape is taken from the notebook-level ``X_train`` array.

    Returns the compiled model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    model = Sequential()

    num_conv_layers = int(config.num_conv_layers)
    convs_per_pool = int(config.num_convs_per_max_pool)
    num_dense_layers = int(config.num_dense_layers)

    for layer_idx in range(num_conv_layers):
        conv_kwargs = {'activation': config.activation, 'padding': 'same'}
        if layer_idx == 0:
            # Only the first layer of a Sequential model declares the input.
            conv_kwargs['input_shape'] = X_train.shape[1:]
        model.add(Conv2D(config.num_conv_filters,
                         config.conv_filter_size,
                         **conv_kwargs))

        # Pool after every `convs_per_pool`-th conv layer.
        if (layer_idx + 1) % convs_per_pool == 0:
            # padding='same' gives the same output size as the default for
            # even-sized inputs, but keeps a 1x1 feature map at 1x1 instead
            # of failing when a deep stack has already pooled it away.
            model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))

    model.add(Flatten())

    # Hidden dense layers, each followed by dropout.
    for _ in range(num_dense_layers):
        model.add(Dense(config.dense_size,
                        activation=config.activation))
        model.add(Dropout(config.dropout))

    model.add(Dense(10, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=config.learn_rate),
                  metrics=['accuracy'])

    return model

In [None]:
# Hyperparameters that are held fixed for every run of the search.
static_hyper_params = {
    'activation': 'relu',
    'conv_filter_size': 3,
    'num_epochs': 2,
}

# Best validation accuracy seen so far, and the set/index that produced it.
best_valid_acc = 0.0
best_hp_set = None
best_hp_ind = None

for hp_ind, hp_set in enumerate(hp_sets_ex1):
    # set up wandb
    # Label the run with this loop's own index (hp_ind), not a variable
    # left behind in the notebook namespace by another cell.
    print_header("Starting Training for Hyperparameter Set {}:".format(hp_ind))
    wandb.init()
    ## For short runs like this, wandb.monitor()
    # is just visual noise.  Reenable it for longer runs.
    # wandb.monitor()
    print_dict(hp_set)

    # allow_val_change=True lets each iteration overwrite the values the
    # previous iteration stored in wandb.config.
    wandb.config.update(static_hyper_params, allow_val_change=True)
    wandb.config.update(hp_set, allow_val_change=True)

    # build model
    model = build_compile_ex1(wandb.config)
    # summary() prints the table itself and returns None, so wrapping it
    # in print() would only add a stray "None" line.
    model.summary()
    wandb.config.num_model_parameters = model.count_params()

    # train model
    history = model.fit(
        X_train, y_train,
        batch_size=wandb.config.batch_size,
        epochs=wandb.config.num_epochs,
        verbose=1,
        validation_data=(X_valid, y_valid),
        callbacks=[WandbCallback()]
    )

    # track best model so far, judged by the final epoch's validation
    # accuracy.  Keras releases differ on the history key for the
    # 'accuracy' metric ('val_acc' vs 'val_accuracy'), so accept either.
    val_key = 'val_acc' if 'val_acc' in history.history else 'val_accuracy'
    valid_acc = history.history[val_key][-1]
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_hp_set = hp_set
        best_hp_ind = hp_ind

In [None]:
print_header("Best Hyperparams were set {} with valid accuracy {}".format(best_hp_ind, best_valid_acc))
print_dict(best_hp_set)

# Retrain model on combined training and validation data
# wandb.config still holds the values of the last set trained in the
# search loop, so overwriting them with the best set needs
# allow_val_change=True (as the search loop itself passes).
# NOTE(review): there is no wandb.init() here, so this training appears to
# log into the most recently initialised run -- confirm that is intended.
wandb.config.update(best_hp_set, allow_val_change=True)
model = build_compile_ex1(wandb.config)
history = model.fit(
    X_train_valid, y_train_valid,
    batch_size=wandb.config.batch_size,
    epochs=wandb.config.num_epochs,
    verbose=1,
    callbacks=[WandbCallback()]
)

In [None]:
# Score the retrained exercise model on the held-out test set.
test_metrics = model.evaluate(
    X_test, y_test, batch_size=wandb.config.batch_size)
loss, acc = test_metrics
print("Test loss: {}, test acc: {}".format(loss, acc))

## Exercise 2
* In practice, you don't conduct a hyperparameter search by wrapping many training runs in a for loop on a single machine.  
* Instead, you want to have a single machine which selects the hyperparameter sets, then sends them off to worker nodes which actually conduct the training.
* Multi-node training isn't hard to do, but it's out of scope for this 1-week class; too many IT hurdles.  In this exercise, though, we'll refactor our existing code to more closely approximate a real training setup.

### Instructions
* Refactor your existing code into a script rather than a notebook.
* The script should accept a series of keyword arguments containing all the hyperparameter values for a single run.  Check out the `argparse` python module.
* It should then combine these arguments into a Python dict representing a single hyperparameter set like the `hp_set` variable above.
* The script should then update the wandb.config object with the values from the input hyperparameter set and train a model using those values.  You don't need to save the final result anywhere, the `WandbCallback()` will take care of that for you.

## Exercise 3
*  Create a large number of hyperparameter sets.
*  For each hyperparameter set, print out the model summary and study the number of parameters that are produced.  Try to get a sense for what configurations produce large parameter counts.
*  If you have time, train models based on some of these hyperparameter sets and see which produce good results and which don't.