# Smoother models through changing hyperparameters

What happens to smoothness if we train models with different hyperparameters? Specifically, we try decreasing the initialization scale (init scale), since doing so has yielded smoother models in the past. This notebook is an updated version of that earlier experiment, run to see how the init scale interacts with explicit regularization. Can we get smoother models just by varying the hyperparameters?

Best models:

- init scale = 0.01
- LR = 0.1 or 0.01

In [None]:
import os
import sys
# If we don't need CUDA, do this before importing TF
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook
import matplotlib.pyplot as plt
import seaborn as sns
import IPython
# Apply seaborn's default theme to the figures drawn in this notebook.
sns.set()

# The measure files loaded further down use paths relative to this logs directory.
os.chdir("/nfs/scistore12/chlgrp/vvolhejn/smooth/logs/")

In [None]:
# Enable IPython's autoreload extension so that edits to the smooth.* modules
# listed below are picked up without restarting the kernel.
%load_ext autoreload
%aimport smooth.config
%aimport smooth.datasets
%aimport smooth.model
%aimport smooth.analysis
%aimport smooth.callbacks
%aimport smooth.measures
%aimport smooth.util
# Mode 1: reload only the modules registered with %aimport above.
%autoreload 1

In [None]:
def load_measures(path):
    """Load a measures table and keep only finite, converged runs.

    Steps, in order:

    1. Read the feather file at ``path``.
    2. Drop rows whose ``loss_test`` is not finite.
    3. Drop rows whose ``actual_epochs`` equals their ``model.epochs``
       (these are reported as "not converged").
    4. Call ``smooth.analysis.remove_constant_columns`` on the table. Its
       return value is not used, so it presumably works in place -- TODO
       confirm.
    5. Add a ``kind`` label built from ``model.learning_rate`` and
       ``model.init_scale`` and sort by it.

    Args:
        path: Location of the feather file, passed to ``pd.read_feather``.

    Returns:
        The filtered ``DataFrame`` sorted by the ``kind`` column.
    """
    ms = pd.read_feather(path)

    bad_mask = ~np.isfinite(ms["loss_test"])
    print("Removing {} entries".format(bad_mask.sum()))
    ms = ms[~bad_mask]

    # Compare every run against its own epoch budget. Reading the budget from
    # the first row only would raise IndexError when no rows are left and
    # would misclassify runs if the budget differs between rows.
    unconverged_mask = ms["actual_epochs"] == ms["model.epochs"]
    print("Removing {} models which have not converged".format(unconverged_mask.sum()))
    # Take an explicit copy: the frame is modified below, and modifying a
    # boolean-filtered slice triggers pandas' SettingWithCopyWarning.
    ms = ms[~unconverged_mask].copy()

    smooth.analysis.remove_constant_columns(ms, verbose=True)

    # NOTE(review): if remove_constant_columns drops a constant learning-rate
    # or init-scale column, the lookups below will fail -- confirm.
    ms["kind"] = "LR: " + ms["model.learning_rate"].map(str) + ", IS: " + ms["model.init_scale"].map(str)
    ms = ms.sort_values("kind")

    print("Remaining:", len(ms))

    return ms

def should_plot_as_log(measure_name):
    """Tell whether a measure should be plotted on a log10 scale.

    A measure qualifies when its name contains "loss" or "weights_product"
    as a substring.
    """
    log_scale_markers = ("loss", "weights_product")
    return any(marker in measure_name for marker in log_scale_markers)

def plot_measure(ms, measure_name):
    """Draw one measure as a box plot with a swarm overlay, one row per "kind".

    Measures selected by ``should_plot_as_log`` are plotted as their log10,
    stored in a new ``log10_<measure_name>`` column. The work happens on a
    copy, so the caller's frame is left untouched.
    """
    frame = ms.copy()
    column = measure_name

    if should_plot_as_log(measure_name):
        column = "log10_{}".format(measure_name)
        frame[column] = np.log10(frame[measure_name])

    # Both layers share the same data and axes.
    axes_spec = dict(data=frame, x=column, y="kind")
    sns.boxplot(**axes_spec)
    sns.swarmplot(size=2, color=".3", linewidth=0, **axes_spec)

    plt.show()

def plot_all_measures(ms):
    """Print a per-kind count, then plot every tracked measure.

    The count printed for each "kind" group is the number of non-null values
    in that group's first column.
    """
    for kind, group in ms.groupby("kind"):
        non_null_counts = group.count()
        print(kind, non_null_counts.iloc[0])

    tracked_measures = (
        "loss_train",
        "loss_test",
        "gradient_norm_test",
        "weights_product",
        "path_length_f_test",
        "path_length_d_test",
        "actual_epochs",
    )
    for name in tracked_measures:
        plot_measure(ms, name)

In [None]:
# Load the measures stored under ./0407_finetune and plot every measure per kind.
ms = load_measures("./0407_finetune/measures.feather")
plot_all_measures(ms)

In [None]:
# Same analysis for the measures stored under ./0408-170502.
ms = load_measures("./0408-170502/measures.feather")
plot_all_measures(ms)

In [None]:
# Same analysis for the measures stored under ./0409-113610.
ms = load_measures("./0409-113610/measures.feather")
plot_all_measures(ms)

In [None]:
# Train a single shallow model on the "mnist12" dataset with one of the
# best-performing settings listed at the top (LR = 0.01, init scale = 0.01).
# No regularization arguments are passed in this cell.
dataset = smooth.datasets.from_params("mnist12")
model = smooth.model.train_shallow(
    dataset,
    learning_rate=0.01,
    init_scale=0.01,
    hidden_size=256,
    epochs=100,
    batch_size=64,
    verbose=1,
)

In [None]:
# Same hyperparameters as the unregularized run, now with
# gradient_norm_reg_coef=1e-2 and error_threshold=0.06.
# NOTE(review): error_threshold presumably stops training once the error falls
# below 0.06 -- confirm against smooth.model.train_shallow.
model = smooth.model.train_shallow(
    dataset,
    learning_rate=0.01,
    init_scale=0.01,
    hidden_size=256,
    epochs=100,
    batch_size=64,
    verbose=1,
    gradient_norm_reg_coef=1e-2,
    error_threshold=0.06,
)

In [None]:
# NOTE(review): this cell repeats the previous cell's call with identical
# arguments -- presumably a second run to compare results across training
# runs; confirm it is intentional.
model = smooth.model.train_shallow(
    dataset,
    learning_rate=0.01,
    init_scale=0.01,
    hidden_size=256,
    epochs=100,
    batch_size=64,
    verbose=1,
    gradient_norm_reg_coef=1e-2,
    error_threshold=0.06,
)

In [None]:
# Toy layer with hand-picked weights for a sanity check: one output unit with
# kernel [[2], [1]] and bias [0], i.e. y = 2*x1 + 1*x2.
layer = tf.keras.layers.Dense(1, weights=[np.array([[2], [1]]), np.array([0])])

In [None]:
# Forward pass on the point (3, 4); with the weights above the expected
# output is 2*3 + 1*4 + 0 = 10.
layer(np.array([[3, 4]]))

In [None]:
# Check smooth.measures.gradient_norm on the toy layer at the same point.
# NOTE(review): presumably this is the norm of the output's gradient w.r.t.
# the input, which for this linear layer would be sqrt(2**2 + 1**2) ~= 2.236
# -- confirm against the implementation.
smooth.measures.gradient_norm(layer, np.array([[3., 4.]], dtype=np.float32))