# 1080 models trained on MNIST

For these models, we try to check whether our measures are a predictor of generalization (which corresponds simply to the test error here, since the networks all achieve very low training error).

However, the test accuracy is also very similar across all models, so it is questionable whether these results are meaningful.

## Also,

This notebook also contains first experiments with segment-based measures.

In [None]:
import os
import sys
# If we don't need CUDA, do this before importing TF
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns

# Choose which GPU TensorFlow may use.
# NOTE(review): CUDA_VISIBLE_DEVICES is set to "-1" before TensorFlow is imported,
# so this list is presumably empty and the branch below is skipped — confirm
# if GPU use gets re-enabled.
gpus = tf.config.experimental.list_physical_devices('GPU')
# Restrict visibility to the second GPU only when a second GPU exists;
# indexing gpus[1] on a single-GPU machine would raise IndexError.
if len(gpus) > 1:
    tf.config.experimental.set_visible_devices([gpus[1]], 'GPU')

# Extend the import path so that the `smooth` package imported later can be found.
sys.path.append("/nfs/scistore12/chlgrp/vvolhejn/smooth")

# Relative paths used below (feather files, model directories) resolve against
# this directory.
os.chdir("/nfs/scistore12/chlgrp/vvolhejn/smooth/logs/0214_mnist_1080/")

In [None]:
# First table of measures. Each row is keyed by "model_dir", the third
# "/"-separated component of its "log_dir".
df = pd.read_feather("measures.feather")
df["model_dir"] = df["log_dir"].str.split("/").str.get(2)
# "epochs" and "log_dir" are no longer needed; "gradient_norm" and "l2" are
# dropped here because these metrics are present in df2.
df = df.drop(columns=["epochs", "log_dir", "gradient_norm", "l2"])

# Second table of measures, keyed the same way from "model_path".
df2 = pd.read_feather("measures3.feather")
df2["model_dir"] = df2["model_path"].str.split("/").str.get(2)
df2 = df2.drop(columns=["model_path"])

# Join both tables on the shared model directory.
df = pd.merge(df, df2, on="model_dir")

In [None]:
# Plot a 20-bin histogram for every column whose dtype is not "object".
plottable_columns = [name for name in df.columns if not (df[name].dtype == "object")]
for column_name in plottable_columns:
    plt.hist(df[column_name], bins=20)
    plt.title(column_name)
    plt.show()

In [None]:
def compare_training_speed(hparam):
    """
    Plot, for each distinct value of the column `hparam` in the global `df`,
    a density curve (no histogram) of the "actual_epochs" column, all on one
    figure, then show the figure.

    Values are processed in sorted order and each curve is labelled with
    the string form of the value.
    """
    for value in sorted(df[hparam].unique()):
        epochs_for_value = df.loc[df[hparam] == value, "actual_epochs"]
        sns.distplot(epochs_for_value, hist=False, label=str(value))
    plt.title("Epochs to convergence by {}".format(hparam))
    plt.show()

# One figure per hyperparameter, in this order.
for _hparam_name in ("batch_size", "init_scale", "learning_rate"):
    compare_training_speed(_hparam_name)

In [None]:
import smooth.analysis
import smooth.measures
import smooth.datasets
# Load MNIST through the project's helper.
# NOTE(review): a later cell runs `from smooth.datasets import mnist`, which
# rebinds the name `mnist` — confirm both refer to the same data.
mnist = smooth.datasets.get_mnist()

In [None]:
# Column groups handed to get_kendall_coefs: hyperparameter columns first,
# then the target column "val_accuracy", then the measure columns.
# NOTE(review): presumably this computes Kendall rank-correlation coefficients
# between each measure and the target — confirm against smooth.analysis.
hparam_columns = ["batch_size", "hidden_size", "init_scale", "learning_rate", "iteration"]
measure_columns = [
    "accuracy", "actual_epochs", "gradient_norm", "l2", "loss", "val_loss",
    "seg_total_variation", "seg_total_variation_derivative",
]
ken = smooth.analysis.get_kendall_coefs(df, hparam_columns, "val_accuracy", measure_columns)

In [None]:
# Display the coefficients cast to float32 and rounded to 3 decimals.
ken.astype("float32").round(3)

## 1D-based measures

In [None]:
def analyze_1d_measure(model, data, measure_f):
    """
    Show two diagnostic plots for a segment-based measure.

    `measure_f` is called as
    `measure_f(model, data, n_segments=..., n_samples_per_segment=...)`.

    1. Histogram (70 bins) of 500 evaluations with a single segment and
       100 samples per segment.
    2. Line plot of the measure with 30 segments while n_samples_per_segment
       runs over 10, 20, ..., 490.
    """
    # Part 1: repeated evaluations on one segment each.
    single_segment_values = [
        measure_f(model, data, n_segments=1, n_samples_per_segment=100)
        for _ in tqdm.notebook.tqdm(range(500))
    ]
    plt.hist(single_segment_values, bins=70)
    plt.title("Distribution of measure when sampling a single segment")
    plt.show()

    # Part 2: fixed number of segments, varying sampling density.
    sample_counts = list(range(10, 500, 10))
    values_by_count = [
        measure_f(model, data, n_segments=30, n_samples_per_segment=count)
        for count in tqdm.notebook.tqdm(sample_counts)
    ]
    plt.plot(sample_counts, values_by_count)
    plt.title("Distribution of measure when varying n_samples_per_segment")
    plt.show()

In [None]:
from smooth.datasets import mnist

# Load one trained model (path is relative to the current working directory)
# and run the two diagnostic plots on the MNIST test inputs with the
# segment total-variation measure.
_model = tf.keras.models.load_model("bs=512_e=20000_hs=501_is=3.0_i=1_lr=0.01/model.h5")
analyze_1d_measure(_model, mnist.x_test, smooth.measures.segments_total_variation)

In [None]:
from functools import partial
# Same analysis, with derivative=True bound into the measure function.
m2 = partial(smooth.measures.segments_total_variation, derivative=True)
analyze_1d_measure(_model, mnist.x_test, m2)

In [None]:
def get_measures(log_dir):
    """
    Load `<log_dir>/model.h5` and evaluate both segment-based measures on
    the MNIST test inputs (`mnist.x_test`) using 100 segments.

    Returns a dict with the keys "total_variation" and
    "total_variation_derivative". The derivative variant additionally passes
    n_samples_per_segment=100 and derivative=True.
    """
    model = tf.keras.models.load_model(os.path.join(log_dir, "model.h5"))
    # Evaluated in the same order as the dict keys below.
    variation = smooth.measures.segments_total_variation(
        model, mnist.x_test, n_segments=100
    )
    variation_derivative = smooth.measures.segments_total_variation(
        model, mnist.x_test,
        n_segments=100, n_samples_per_segment=100, derivative=True,
    )
    return {
        "total_variation": variation,
        "total_variation_derivative": variation_derivative,
    }

# get_measures(os.path.basename(df.iloc[0]["log_dir"]))

In [None]:
# Placeholder columns, filled in per model by the loop below.
df["total_variation"] = None
df["total_variation_derivative"] = None

# The data-loading cell deletes "log_dir" after deriving "model_dir" from it,
# so fall back to "model_dir" when "log_dir" is no longer present.
# NOTE(review): "model_dir" is the third "/"-component of the original path;
# presumably this equals its basename — confirm.
dir_column = "log_dir" if "log_dir" in df.columns else "model_dir"

for i in tqdm.notebook.tqdm(df.index):
    log_dir = os.path.basename(df[dir_column][i])
    measures = get_measures(log_dir)
    # .loc is indexed as [row label, column label]; with the two swapped the
    # assignment would enlarge the frame instead of filling the cell.
    df.loc[i, "total_variation"] = measures["total_variation"]
    df.loc[i, "total_variation_derivative"] = measures["total_variation_derivative"]
    # NOTE(review): only the first model is processed; this looks like a
    # leftover from a trial run — remove the break to process every row.
    break

In [None]:
# Inspect the first row of df.
df.iloc[0]

In [None]:
# Pairwise correlation of df2's numeric/boolean columns, rounded to 2 decimals
# and displayed with a background gradient computed over the whole table.
# Non-numeric columns (such as the string column "model_dir") are dropped
# explicitly: since pandas 2.0, DataFrame.corr() no longer skips them by
# default and raises on string data instead.
corr = df2.select_dtypes(include=["number", "bool"]).corr().round(2)
corr.style.background_gradient(axis=None)
# corr

In [None]:
# Rebind `_model` to a different trained model (path relative to the current
# working directory) for the experiments below.
_model = tf.keras.models.load_model("./bs=128_e=20000_hs=1002_is=0.3_i=0_lr=0.003/model.h5")
from smooth.datasets import mnist

In [None]:
def _total_variation(samples, batch=False):
    """
    Approximate the total variation of a function from evenly spaced samples
    of its values: the sum of the distances between consecutive samples.

    Scalar samples contribute the absolute value of each first difference;
    vector-valued samples contribute the l2 norm of each first difference.

    With `batch=True`, `samples` holds several sample sequences along axis 0
    and an array with one total variation per sequence is returned. Otherwise
    a single sequence is expected and a single number is returned.

    >>> _total_variation([1, 2, 3, 1])
    4.0
    >>> print("{:.3f}".format(_total_variation([[0, 0], [1, 1], [1, 2]])))
    2.414
    """
    # Internally always work on a batch; a lone sequence becomes a batch of one.
    batched = samples if batch else np.array([samples])

    steps = np.diff(batched, axis=1)
    if steps.ndim == 2:
        # Scalar samples: add a trailing axis so the norm below reduces to abs().
        steps = np.expand_dims(steps, axis=2)
    per_sequence = np.linalg.norm(steps, axis=2).sum(axis=1)

    if batch:
        return per_sequence
    assert len(per_sequence) == 1
    return per_sequence[0]


def _interpolate(a, b, n_samples):
    """
    Return `n_samples` evenly spaced points on the straight line from `a`
    to `b`, both endpoints included. `a` and `b` must have the same shape;
    the result has shape `(n_samples,) + a.shape`.

    >>> _interpolate(1, 2, 3).tolist()
    [1.0, 1.5, 2.0]
    >>> _interpolate([0, 3], [3, 0], 4).tolist()
    [[0.0, 3.0], [1.0, 2.0], [2.0, 1.0], [3.0, 0.0]]
    >>> _interpolate([[0, 2], [1, 1]], [[2, 0], [2, 2]], 3).tolist()
    [[[0.0, 2.0], [1.0, 1.0]], [[1.0, 1.0], [1.5, 1.5]], [[2.0, 0.0], [2.0, 2.0]]]
    """
    start = np.array(a)
    end = np.array(b)
    assert start.shape == end.shape
    weights = np.linspace(0, 1, n_samples)
    # Shape the weights as (n_samples, 1, ..., 1) so that they broadcast
    # against the endpoints, whatever their rank.
    weights = weights.reshape((-1,) + (1,) * start.ndim)
    return (1 - weights) * start + weights * end

def _segment_total_variation(
    model: tf.keras.Model, x1, x2, n_samples, derivative: bool
):
    """
    Compute the total variation of `model` along several segments at once.

    Segment i runs from `x1[i]` to `x2[i]`; `x1` and `x2` must have the same
    shape. Each segment is sampled at `n_samples` evenly spaced points and
    all points are pushed through the model in one flat batch.

    If `derivative` is false, the total variation of the model output
    (`model.predict`) is measured. If true, the total variation of the
    per-sample Jacobian of the output with respect to the input is measured
    instead, with each Jacobian flattened into a single vector.

    Returns the result of `_total_variation(..., batch=True)`: one value
    per segment.
    """
    x1 = np.array(x1)
    x2 = np.array(x2)
    assert x1.shape == x2.shape
    n_segments = len(x1)
    # `samples` has shape (n_samples, n_segments, ...); merge the first two
    # axes so the model sees an ordinary batch.
    samples = _interpolate(x1, x2, n_samples)
    samples_flat = np.reshape(samples, (n_samples * n_segments,) + samples.shape[2:])

    if not derivative:
        output_flat = model.predict(samples_flat)
    else:
        with tf.GradientTape() as g:
            x = tf.constant(samples_flat)
            g.watch(x)
            y = model(x)
        output_flat = g.batch_jacobian(y, x)
        # We just stretch the Jacobian into a single vector and take its total variation
        # (meaning we sum the Frobenius norms of the first difference)
        # Does this make any sense mathematically?
        output_flat = np.reshape(output_flat, (len(samples_flat), -1))

    # Undo the flattening: (n_samples, n_segments, ...), then move the segment
    # axis to the front because _total_variation differences along axis 1.
    output = np.reshape(output_flat, (n_samples, n_segments) + output_flat.shape[1:])
    output = np.swapaxes(output, 0, 1)
    return _total_variation(output, batch=True)

In [None]:
# _segment_total_variation(
#     _model,
#     [mnist.x_test[0], mnist.x_test[2], mnist.x_test[4]],
#     [mnist.x_test[1], mnist.x_test[3], mnist.x_test[5]],
#     100, False,
# )

# Two segments at once: x_test[0] -> x_test[1] and x_test[2] -> x_test[3],
# 100 samples per segment, measuring the model output (derivative=False).
_segment_total_variation(
    _model,
    [mnist.x_test[0],mnist.x_test[2]],
    [mnist.x_test[1],mnist.x_test[3]],
    100, False,
)

In [None]:
def _segment_total_variation0(
    model: tf.keras.Model, x1, x2, n_samples, derivative: bool
):
    """
    Single-segment version: total variation of `model` along the segment
    from `x1` to `x2`, sampled at `n_samples` evenly spaced points.

    If `derivative` is false, the model output (`model.predict`) is measured;
    otherwise the per-sample Jacobian of the output with respect to the
    input, flattened to one vector per sample.

    Returns the single value produced by `_total_variation`.
    """
    samples = _interpolate(x1, x2, n_samples)
    if not derivative:
        output = model.predict(samples)
    else:
        with tf.GradientTape() as g:
            x = tf.constant(samples)
            g.watch(x)
            y = model(x)
        output = g.batch_jacobian(y, x)
        # We just stretch the Jacobian into a single vector and take its total variation
        # (meaning we sum the Frobenius norms of the first difference)
        # Does this make any sense mathematically?
        output = np.reshape(output, (n_samples, -1))
    return _total_variation(output)

# Single segment x_test[2] -> x_test[3], 100 samples, derivative=False.
_segment_total_variation0(
    _model,
    mnist.x_test[2],
    mnist.x_test[3],
    100, False,
)

In [None]:
# Scratch check of integer-list (fancy) indexing: select elements 0 and 1.
a = np.arange(10, 20)
a[[0, 1]]

In [None]:
# Load CIFAR-10 through tf.keras.datasets.
# NOTE(review): `cifar` is not used anywhere in the visible part of the notebook.
cifar = tf.keras.datasets.cifar10.load_data()