In [1]:
#https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

In [2]:
datafile = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'

In [3]:
!pip install scikeras



# Batch size and epochs

In [4]:
# Use scikit-learn to grid search the batch size and epochs
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier

# Function to create model, required for KerasClassifier
def create_model():
	# create model
	model = Sequential()
	model.add(Dense(12, input_shape=(8,), activation='relu'))
	model.add(Dense(1, activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

# load dataset
dataset = np.loadtxt(datafile, delimiter=",")

# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]

# create model
model = KerasClassifier(model=create_model, verbose=0)

####################################################
# define the grid search parameters
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
####################################################

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  pid = os.fork()
  pid = os.fork()
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best: 0.694010 using {'batch_size': 80, 'epochs': 100}
0.634115 (0.031948) with: {'batch_size': 10, 'epochs': 10}
0.661458 (0.009744) with: {'batch_size': 10, 'epochs': 50}
0.660156 (0.026107) with: {'batch_size': 10, 'epochs': 100}
0.578125 (0.088100) with: {'batch_size': 20, 'epochs': 10}
0.682292 (0.023073) with: {'batch_size': 20, 'epochs': 50}
0.679688 (0.003189) with: {'batch_size': 20, 'epochs': 100}
0.518229 (0.068875) with: {'batch_size': 40, 'epochs': 10}
0.664062 (0.019401) with: {'batch_size': 40, 'epochs': 50}
0.640625 (0.022097) with: {'batch_size': 40, 'epochs': 100}
0.528646 (0.088292) with: {'batch_size': 60, 'epochs': 10}
0.627604 (0.006639) with: {'batch_size': 60, 'epochs': 50}
0.639323 (0.033804) with: {'batch_size': 60, 'epochs': 100}
0.432292 (0.068579) with: {'batch_size': 80, 'epochs': 10}
0.634115 (0.010253) with: {'batch_size': 80, 'epochs': 50}
0.694010 (0.020505) with: {'batch_size': 80, 'epochs': 100}
0.368490 (0.022628) with: {'batch_size': 100, 'epochs':

# Optimizer

In [5]:
# Use scikit-learn to grid search the optimizer
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier

# Function to create model, required for KerasClassifier
def create_model():
	# create model
	model = Sequential()
	model.add(Dense(12, input_shape=(8,), activation='relu'))
	model.add(Dense(1, activation='sigmoid'))
  ####################################################
	# return model without compile
	return model
	####################################################

# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

# load dataset
dataset = np.loadtxt(datafile, delimiter=",")

# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]

# create model
model = KerasClassifier(model=create_model, loss="binary_crossentropy", epochs=100, batch_size=10, verbose=0)

####################################################
# define the grid search parameters
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
####################################################

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best: 0.703125 using {'optimizer': 'SGD'}
0.703125 (0.048159) with: {'optimizer': 'SGD'}
0.618490 (0.020752) with: {'optimizer': 'RMSprop'}
0.591146 (0.039879) with: {'optimizer': 'Adagrad'}
0.488281 (0.090941) with: {'optimizer': 'Adadelta'}
0.670573 (0.049445) with: {'optimizer': 'Adam'}
0.664062 (0.022326) with: {'optimizer': 'Adamax'}
0.692708 (0.015073) with: {'optimizer': 'Nadam'}


# Learning rate and momentum

In [6]:
# Use scikit-learn to grid search the learning rate and momentum
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from scikeras.wrappers import KerasClassifier

# Function to create model, required for KerasClassifier
def create_model():
	# create model
	model = Sequential()
	model.add(Dense(12, input_shape=(8,), activation='relu'))
	model.add(Dense(1, activation='sigmoid'))
  ####################################################
	return model
	####################################################

# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

# load dataset
dataset = np.loadtxt(datafile, delimiter=",")

# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]

# create model
model = KerasClassifier(model=create_model, loss="binary_crossentropy", optimizer="SGD", epochs=100, batch_size=10, verbose=0)

####################################################
# define the grid search parameters
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
param_grid = dict(optimizer__learning_rate=learn_rate, optimizer__momentum=momentum)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
####################################################

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  pid = os.fork()
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best: 0.678385 using {'optimizer__learning_rate': 0.001, 'optimizer__momentum': 0.2}
0.667969 (0.016573) with: {'optimizer__learning_rate': 0.001, 'optimizer__momentum': 0.0}
0.678385 (0.009744) with: {'optimizer__learning_rate': 0.001, 'optimizer__momentum': 0.2}
0.653646 (0.021236) with: {'optimizer__learning_rate': 0.001, 'optimizer__momentum': 0.4}
0.652344 (0.012758) with: {'optimizer__learning_rate': 0.001, 'optimizer__momentum': 0.6}
0.670573 (0.023939) with: {'optimizer__learning_rate': 0.001, 'optimizer__momentum': 0.8}
0.656250 (0.011500) with: {'optimizer__learning_rate': 0.001, 'optimizer__momentum': 0.9}
0.667969 (0.022999) with: {'optimizer__learning_rate': 0.01, 'optimizer__momentum': 0.0}
0.666667 (0.025780) with: {'optimizer__learning_rate': 0.01, 'optimizer__momentum': 0.2}
0.657552 (0.021236) with: {'optimizer__learning_rate': 0.01, 'optimizer__momentum': 0.4}
0.645833 (0.004872) with: {'optimizer__learning_rate': 0.01, 'optimizer__momentum': 0.6}
0.649740 (0.003683)

# Weight initialization

In [7]:
# Use scikit-learn to grid search the weight initialization
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier

# Function to create model, required for KerasClassifier
def create_model(init_mode='uniform'):
	# create model
	model = Sequential()
	model.add(Dense(12, input_shape=(8,), kernel_initializer=init_mode, activation='relu'))
	model.add(Dense(1, kernel_initializer=init_mode, activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

# load dataset
dataset = np.loadtxt(datafile, delimiter=",")

# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]

# create model
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0)

####################################################
# define the grid search parameters
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
param_grid = dict(model__init_mode=init_mode)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
####################################################

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best: 0.713542 using {'model__init_mode': 'normal'}
0.695312 (0.019918) with: {'model__init_mode': 'uniform'}
0.690104 (0.027866) with: {'model__init_mode': 'lecun_uniform'}
0.713542 (0.034401) with: {'model__init_mode': 'normal'}
0.651042 (0.001841) with: {'model__init_mode': 'zero'}
0.678385 (0.004872) with: {'model__init_mode': 'glorot_normal'}
0.697917 (0.018688) with: {'model__init_mode': 'glorot_uniform'}
0.670573 (0.014382) with: {'model__init_mode': 'he_normal'}
0.640625 (0.046983) with: {'model__init_mode': 'he_uniform'}


# Activation function

In [8]:
# Use scikit-learn to grid search the activation function
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier

# Function to create model, required for KerasClassifier
def create_model(activation='relu'):
	# create model
	model = Sequential()
	model.add(Dense(12, input_shape=(8,), kernel_initializer='uniform', activation=activation))
	model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

# load dataset
dataset = np.loadtxt(datafile, delimiter=",")

# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]

# create model
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0)

####################################################
# define the grid search parameters
activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
param_grid = dict(model__activation=activation)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
####################################################

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  pid = os.fork()
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best: 0.725260 using {'model__activation': 'softplus'}
0.670573 (0.018136) with: {'model__activation': 'softmax'}
0.725260 (0.024150) with: {'model__activation': 'softplus'}
0.666667 (0.019225) with: {'model__activation': 'softsign'}
0.713542 (0.015073) with: {'model__activation': 'relu'}
0.664062 (0.033603) with: {'model__activation': 'tanh'}
0.677083 (0.021236) with: {'model__activation': 'sigmoid'}
0.678385 (0.007366) with: {'model__activation': 'hard_sigmoid'}
0.710938 (0.017758) with: {'model__activation': 'linear'}


# Dropout rate

In [10]:
# Use scikit-learn to grid search the dropout rate
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.constraints import MaxNorm
from scikeras.wrappers import KerasClassifier

# Function to create model, required for KerasClassifier
def create_model(dropout_rate, weight_constraint):
	# create model
	model = Sequential()
	model.add(Dense(12, input_shape=(8,), kernel_initializer='uniform', activation='linear', kernel_constraint=MaxNorm(weight_constraint)))
	model.add(Dropout(dropout_rate))
	model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

# load dataset
dataset = np.loadtxt(datafile, delimiter=",")
print(dataset.dtype, dataset.shape)

# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]

# create model
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0)

####################################################
# define the grid search parameters
weight_constraint = [1.0, 2.0, 3.0, 4.0, 5.0]
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = dict(model__dropout_rate=dropout_rate, model__weight_constraint=weight_constraint)
#param_grid = dict(model__dropout_rate=dropout_rate)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
####################################################

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

float64 (768, 9)


  pid = os.fork()
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best: 0.725260 using {'model__dropout_rate': 0.1, 'model__weight_constraint': 4.0}
0.700521 (0.003683) with: {'model__dropout_rate': 0.0, 'model__weight_constraint': 1.0}
0.717448 (0.010253) with: {'model__dropout_rate': 0.0, 'model__weight_constraint': 2.0}
0.704427 (0.012890) with: {'model__dropout_rate': 0.0, 'model__weight_constraint': 3.0}
0.690104 (0.018688) with: {'model__dropout_rate': 0.0, 'model__weight_constraint': 4.0}
0.712240 (0.013279) with: {'model__dropout_rate': 0.0, 'model__weight_constraint': 5.0}
0.704427 (0.004872) with: {'model__dropout_rate': 0.1, 'model__weight_constraint': 1.0}
0.714844 (0.017758) with: {'model__dropout_rate': 0.1, 'model__weight_constraint': 2.0}
0.720052 (0.015733) with: {'model__dropout_rate': 0.1, 'model__weight_constraint': 3.0}
0.725260 (0.014731) with: {'model__dropout_rate': 0.1, 'model__weight_constraint': 4.0}
0.712240 (0.012890) with: {'model__dropout_rate': 0.1, 'model__weight_constraint': 5.0}
0.716146 (0.010253) with: {'model__dr

# Number of neurons

In [11]:
# Use scikit-learn to grid search the number of neurons
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.constraints import MaxNorm

# Function to create model, required for KerasClassifier
def create_model(neurons):
	# create model
	model = Sequential()
	model.add(Dense(neurons, input_shape=(8,), kernel_initializer='uniform', activation='linear', kernel_constraint=MaxNorm(4)))
	model.add(Dropout(0.2))
	model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

# load dataset
dataset = np.loadtxt(datafile, delimiter=",")

# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]

# create model
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0)

####################################################
# define the grid search parameters
neurons = [1, 5, 10, 15, 20, 25, 30]
param_grid = dict(model__neurons=neurons)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
####################################################

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  pid = os.fork()
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best: 0.709635 using {'model__neurons': 10}
0.692708 (0.008027) with: {'model__neurons': 1}
0.703125 (0.006379) with: {'model__neurons': 5}
0.709635 (0.015073) with: {'model__neurons': 10}
0.707031 (0.009568) with: {'model__neurons': 15}
0.703125 (0.012758) with: {'model__neurons': 20}
0.690104 (0.018688) with: {'model__neurons': 25}
0.696615 (0.004872) with: {'model__neurons': 30}


# Tips for Hyperparameter Optimization
This section lists some handy tips to consider when tuning hyperparameters of your neural network.

1. k-fold Cross Validation. You can see that the results from the examples in this post show some variance. A default cross-validation of 3 was used, but perhaps k=5 or k=10 would be more stable. Carefully choose your cross validation configuration to ensure your results are stable.
2. Review the Whole Grid. Do not just focus on the best result, review the whole grid of results and look for trends to support configuration decisions.
3. Parallelize. Use all your cores if you can, neural networks are slow to train and we often want to try a lot of different parameters. Consider spinning up a lot of AWS instances.
4. Use a Sample of Your Dataset. Because networks are slow to train, try training them on a smaller sample of your training dataset, just to get an idea of general directions of parameters rather than optimal configurations.
5. Start with Coarse Grids. Start with coarse-grained grids and zoom into finer grained grids once you can narrow the scope.
6. Do not Transfer Results. Results are generally problem specific. Try to avoid favorite configurations on each new problem that you see. It is unlikely that optimal results you discover on one problem will transfer to your next project. Instead look for broader trends like number of layers or relationships between parameters.
7. Reproducibility is a Problem. Although we set the seed for the random number generator in NumPy, the results are not 100% reproducible. There is more to reproducibility when grid searching wrapped Keras models than is presented in this post.