# Increasing training set size

In [None]:
import pandas as pd
import scipy.stats

def read_csv(filename):
    """
    Load `filename` from the `../thesis_data` directory into a DataFrame,
    using the first column of the file as the index.
    """
    path = f"../thesis_data/{filename}"
    return pd.read_csv(path, index_col=0)

In [None]:
# Names of the measure columns that are correlated against `x_col`.
measure_names = [
    "gradient_norm_test",
    "path_length_f_test",
    "path_length_d_test",
    "weights_product",
]


def get_taus(ms, x_col):
    """
    Compute the Kendall rank correlation coefficient between `x_col` and each
    of the measures listed in `measure_names`.

    `ms` is a DataFrame containing `x_col` and every column in `measure_names`.
    Returns a Series indexed by measure name holding one coefficient each.
    """
    res = [
        scipy.stats.kendalltau(ms[x_col], ms[y_col]).correlation
        for y_col in measure_names
    ]
    return pd.Series(res, index=measure_names)


def get_taus_per_group(ms, groupby, x_col):
    """
    First, group `ms` using `groupby`. For each group, compute the Kendall rank
    correlation coefficient between `x_col` and each of the measures. This yields
    a coefficient for each group and each measure.

    Returns a DataFrame with one row per group and one column per measure.
    """
    # The per-group helper is `get_taus`; the previous call to the undefined
    # name `get_kendalls` raised a NameError as soon as a group was processed.
    return ms.groupby(groupby).apply(
        lambda df: get_taus(df, x_col)
    )

## One-dimensional

In [None]:
# Load the measurements for the neural networks and for the polynomials.
measures_nn = read_csv("increasing_training_set_size_1d.csv")
measures_poly = read_csv("increasing_training_set_size_1d_polynomials.csv")

# Remove the models which did not converge within the set number of epochs:
# only rows below the largest observed `actual_epochs` value are kept.
measures_nn = measures_nn.loc[
    measures_nn["actual_epochs"] < max(measures_nn["actual_epochs"])
]

# Each row of `taus_nn` and `taus_poly` corresponds to one dataset (RNG seed),
# and each column to one of the smoothness measures.
taus_nn, taus_poly = (
    get_taus_per_group(measures, groupby="dataset.seed", x_col="dataset.samples_train")
    for measures in (measures_nn, measures_poly)
)

In [None]:
# Mean and standard deviation of the per-dataset coefficients; after the
# transpose there is one row per measure.
taus_nn_summary = taus_nn.describe().loc[["mean", "std"]].transpose()
taus_poly_summary = taus_poly.describe().loc[["mean", "std"]].transpose()

# Place both summaries side by side under a labelled top-level column header.
pd.concat(
    [taus_nn_summary, taus_poly_summary],
    axis=1,
    keys=["Neural networks", "Polynomials"],
).round(2)

## n-dimensional

In [None]:
measures_nn = read_csv("increasing_training_set_size_nd.csv")

# Remove the models which did not converge within the set number of epochs:
# only rows below the largest observed `actual_epochs` value are kept.
measures_nn = measures_nn.loc[
    measures_nn["actual_epochs"] < max(measures_nn["actual_epochs"])
]

# One row of coefficients per value of `dataset.name`.
taus_nn = get_taus_per_group(
    measures_nn,
    groupby="dataset.name",
    x_col="dataset.samples_train",
)

In [None]:
# Mean and standard deviation of the per-group coefficients, one row per measure.
taus_nn_summary = taus_nn.describe().loc[["mean", "std"]].transpose()
taus_nn_summary.round(decimals=2)

## n-dimensional - initialization scale 1.0

In [None]:
measures_nn = read_csv("increasing_training_set_size_nd_init_scale_1.csv")

# Remove the models which did not converge within the set number of epochs:
# only rows below the largest observed `actual_epochs` value are kept.
measures_nn = measures_nn.loc[
    measures_nn["actual_epochs"] < max(measures_nn["actual_epochs"])
]

# One row of coefficients per value of `dataset.name`.
taus_nn = get_taus_per_group(
    measures_nn,
    groupby="dataset.name",
    x_col="dataset.samples_train",
)

In [None]:
# Mean and standard deviation of the per-group coefficients, one row per measure.
taus_nn_summary = taus_nn.describe().loc[["mean", "std"]].transpose()
taus_nn_summary.round(decimals=2)