## optuna を TensorFlow で試してみる
- Reference
  - https://github.com/optuna/optuna/blob/master/examples/tensorflow_eager_simple.py

In [2]:
import pkg_resources
import tensorflow as tf
from tensorflow.keras.datasets import mnist
import optuna

In [3]:
if pkg_resources.parse_version(tf.__version__) < pkg_resources.parse_version("2.0.0"):
    raise RuntimeError("tensorflow>=2.0.0 is required for this example.")

N_TRAIN_EXAMPLES = 3000
N_VALID_EXAMPLES = 1000
BATCHSIZE = 128
CLASSES = 10
EPOCHS = 1

In [4]:
(x_train, y_train), (x_valid, y_valid) = mnist.load_data()
print(x_train.shape, y_train.shape)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
(60000, 28, 28) (60000,)


In [5]:
def create_model(trial):
    n_layers = trial.suggest_int("n_layers", 1, 3)
    weight_decay = trial.suggest_loguniform("weight_decay", 1e-10, 1e-3)
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Flatten())

    for i in range(n_layers):
        num_hidden = int(trial.suggest_loguniform("n_units_l{}".format(i), 4, 128))
        model.add(
            tf.keras.layers.Dense(
                num_hidden,
                activation="relu",
                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
            )
        )

    model.add(
        tf.keras.layers.Dense(CLASSES, kernel_regularizer=tf.keras.regularizers.l2(weight_decay))
    )

    return model

In [6]:
def create_optimizer(trial):
    kwargs = {}
    optimizer_options = ["RMSprop", "Adam", "SGD"]
    optimizer_selected = trial.suggest_categorical("optimizer", optimizer_options)

    if optimizer_selected == "RMSprop":
        kwargs["learning_rate"] = trial.suggest_loguniform("rmsprop_learning_rate", 1e-5, 1e-1)
        kwargs["decay"] = trial.suggest_uniform("rmsprop_decay", 0.85, 0.99)
        kwargs["momentum"] = trial.suggest_loguniform("rmsprop_momentum", 1e-5, 1e-1)

    elif optimizer_selected == "Adam":
        kwargs["learning_rate"] = trial.suggest_loguniform("adam_learning_rate", 1e-5, 1e-1)

    elif optimizer_selected == "SGD":
        kwargs["learning_rate"] = trial.suggest_loguniform("sgd_opt_learning_rate", 1e-5, 1e-1)
        kwargs["momentum"] = trial.suggest_loguniform("sgd_opt_momentum", 1e-5, 1e-1)

    optimizer = getattr(tf.optimizers, optimizer_selected)(**kwargs)

    return optimizer

In [7]:
def learn(model, optimizer, dataset, mode="eval"):
    accuracy = tf.metrics.Accuracy("accuracy", dtype=tf.float32)

    for batch, (images, labels) in enumerate(dataset):
        with tf.GradientTape() as tape:
            logits = model(images, training=(mode == "train"))
            loss_value = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
            )

            if mode == "eval":
                accuracy(
                    tf.argmax(logits, axis=1, output_type=tf.int64), tf.cast(labels, tf.int64)
                )
            else:
                grads = tape.gradient(loss_value, model.variables)
                optimizer.apply_gradients(zip(grads, model.variables))

    if mode == "eval":
        return accuracy

In [8]:
def get_mnist():
    (x_train, y_train), (x_valid, y_valid) = mnist.load_data()

    x_train = x_train.astype("float32") / 255
    x_valid = x_valid.astype("float32") / 255

    y_train = y_train.astype("int32")
    y_valid = y_valid.astype("int32")

    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_ds = train_ds.shuffle(60000).batch(BATCHSIZE).take(N_TRAIN_EXAMPLES)

    valid_ds = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
    valid_ds = valid_ds.shuffle(10000).batch(BATCHSIZE).take(N_VALID_EXAMPLES)

    return train_ds, valid_ds

In [9]:
def objective(trial):
    train_ds, valid_ds = get_mnist()

    model = create_model(trial)
    optimizer = create_optimizer(trial)

    with tf.device("/cpu:0"):
        for _ in range(EPOCHS):
            learn(model, optimizer, train_ds, "train")

        accuracy = learn(model, optimizer, valid_ds, "eval")

    return accuracy.result()

In [11]:
if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    #study.optimize(objective, n_trials=100)
    study.optimize(objective, n_trials=20)

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2020-06-21 20:13:35,209] Finished trial#0 with value: 0.44909998774528503 with parameters: {'n_layers': 2, 'weight_decay': 1.0306354678511479e-09, 'n_units_l0': 18.701334875543605, 'n_units_l1': 21.721065593915235, 'optimizer': 'Adam', 'adam_learning_rate': 3.8071199615243025e-05}. Best is trial#0 with value: 0.44909998774528503.
[I 2020-06-21 20:13:38,662] Finished trial#1 with value: 0.10970000177621841 with parameters: {'n_layers': 3, 'weight_decay': 7.382097783465894e-06, 'n_units_l0': 36.73214531831566, 'n_units_l1': 7.8062325271984605, 'n_units_l2': 56.52211476840823, 'optimizer': 'SGD', 'sgd_opt_learning_rate': 7.035948426789005e-05, 'sgd_opt_momentum': 0.00018728118078059335}. Best is trial#0 with value: 0.44909998774528503.
[I 2020-06-21 20:13:41,964] Finished trial#2 with value: 0.08990000188350677 with parameters: {'n_layers': 3, 'weight_decay': 3.5380798065151636e-08, 'n_units_l0': 13.574619272024263, 'n_units_l1': 8.967318830880219, 'n_units_l2': 35.33196774288219, 'opt

Number of finished trials:  20
Best trial:
  Value:  0.9211999773979187
  Params: 
    n_layers: 2
    weight_decay: 6.31133734459572e-08
    n_units_l0: 40.69982964409763
    n_units_l1: 72.42731745597784
    optimizer: SGD
    sgd_opt_learning_rate: 0.09202340122938332
    sgd_opt_momentum: 1.3862233175784708e-05
