# n-dimensional Gaussian process datasets 2

Fewer datasets, but a wider variety of training-set sizes.

Also contains first experiments using kernel ridge regression.

In [None]:
import os
import sys
# If we don't need CUDA, do this before importing TF
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Expose a single GPU to TensorFlow: the second one when there is more than
    # one, otherwise the only one. (Indexing gpus[1] unconditionally raised
    # IndexError when exactly one GPU was listed.)
    tf.config.experimental.set_visible_devices([gpus[min(1, len(gpus) - 1)]], 'GPU')

# Work from this run's log directory so that relative paths used below
# (e.g. "measures.feather") resolve against it.
os.chdir("/nfs/scistore12/chlgrp/vvolhejn/smooth/logs/0227_gp_nd/")

In [None]:
# Autoreload in mode 1: only the modules registered with %aimport below are
# re-imported before a cell runs, so edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%aimport smooth.datasets
%aimport smooth.model
%aimport smooth.analysis
%aimport smooth.callbacks
%aimport smooth.measures
%aimport smooth.util
%autoreload 1

In [None]:
# Load the measures table of this run.
ms_raw = pd.read_feather("measures.feather")

# Count the rows that have a non-null "error" entry, then keep only the rows
# without one.
if "error" in ms_raw.columns:
    print("Errors:", int(ms_raw["error"].notnull().sum()))
    ms_raw = ms_raw.loc[ms_raw["error"].isnull()]

# `ms` and `ms_raw` refer to the same frame from here on.
ms = ms_raw = smooth.analysis.expand_dataset_columns(ms_raw)

# Optional filter for diverged models (disabled):
# divergent_model_mask = (ms["loss"] == np.inf) | (~(ms["train_loss"] < 0.1))
# print("Divergent models:", len(ms[divergent_model_mask]))
# ms = ms.loc[~divergent_model_mask]

print("Remaining:", len(ms))
smooth.analysis.remove_constant_columns(ms, verbose=True)

# Keep only the last path component of each log directory.
ms.loc[:, "log_dir"] = ms["log_dir"].str.split("/").str[-1]

ms.head()

In [None]:
# Short list of columns to inspect; currently overridden so that every column
# of `ms` is plotted.
cols = ["actual_epochs", "train_loss", "test_loss"]
cols = ms.columns

# Total fraction of values cut off, split evenly between both tails.
trim = 0.0

for col in cols:
    column = ms[col]
    if column.dtype == "object":
        # Object-typed columns are not histogrammed.
        continue

    lower = column.quantile(trim / 2)
    upper = column.quantile(1 - trim / 2)
    data = column[column.between(lower, upper)]

    plt.hist(data, bins=20)
    plt.title(col)
    plt.show()

In [None]:
# Show the column labels of `ms`.
ms.columns

In [None]:
def get_optimal_path_length_f(dataset_name):
    """Return the mean absolute difference between the test targets of a dataset.

    The dataset is loaded with ``smooth.datasets.from_name(dataset_name)`` and
    its ``y_test`` is flattened. The result is

        sum over all ordered pairs (a, b) of |a - b|, divided by n ** 2

    where the pairs range over the flattened test targets (self-pairs
    included, contributing zero) and ``n = len(dataset.x_test)``.

    The sum is evaluated in O(m log m) from the sorted targets instead of
    looping over every pair: with ``y`` sorted ascending (0-based, length
    ``m``), ``y[k]`` is the larger element of ``k`` pairs and the smaller
    element of ``m - 1 - k`` pairs, so the sum over unordered pairs equals
    ``sum_k y[k] * (2 * k - m + 1)``; every unordered pair appears twice among
    the ordered pairs.
    """
    dataset = smooth.datasets.from_name(dataset_name)
    n = len(dataset.x_test)

    # Accumulate in float64 regardless of the dtype the dataset is stored in.
    y = np.sort(np.asarray(dataset.y_test, dtype=float).reshape(-1))
    m = len(y)
    weights = 2 * np.arange(m) - m + 1

    return 2 * np.dot(y, weights) / (n ** 2)

# One reference value per dataset seed (1 to 5).
optimal_lengths = {}
for seed in tqdm.notebook.tqdm(range(1, 6)):
    dataset_name = "gp-100-{}-1.0-77".format(seed)
    optimal_lengths[seed] = get_optimal_path_length_f(dataset_name)

In [None]:
# Show the value computed for each seed.
optimal_lengths

In [None]:
# Average of the per-seed values.
np.mean(list(optimal_lengths.values()))

In [None]:
def make_palette(values):
    """Map each value, in ascending order, to one colour of a cubehelix palette.

    The palette is generated with as many colours as ``values`` has entries,
    and the colours are paired with the sorted values in order. Returns a
    dict ``{value: colour}``.
    """
    ordered = sorted(values)
    colours = sns.cubehelix_palette(len(ordered))
    return dict(zip(ordered, colours))

In [None]:
ms1 = ms

measure_cols = ["gradient_norm", "path_length_d", "path_length_f"]

# One colour per hidden size, reused for every plot below.
hidden_size_palette = make_palette(ms1["hidden_size"].unique())

# One line plot per measure: training-set size on the x axis, one line per
# hidden size.
for measure in [*measure_cols, "l2", "train_loss", "test_loss"]:
    sns.lineplot(
        data=ms1,
        x="samples_train",
        y=measure,
        hue="hidden_size",
        palette=hidden_size_palette,
    )
    if measure == "path_length_f":
        # Horizontal reference line at the mean of the per-seed values in
        # `optimal_lengths`, spanning the full range of training-set sizes.
        ol = np.mean(list(optimal_lengths.values()))
        x_range = [ms1["samples_train"].min(), ms1["samples_train"].max()]
        plt.plot(x_range, [ol, ol])

    plt.show()

In [None]:
# Show the "dataset" column of `ms`.
ms["dataset"]

In [None]:
# Load one dataset per seed (1 to 5), keyed by seed.
datasets = {}
for seed in tqdm.notebook.tqdm(range(1, 6)):
    dataset_name = "gp-100-{}-1.0-1000".format(seed)
    datasets[seed] = smooth.datasets.from_name(dataset_name)

In [None]:
# Scratch check: build a small frame out of row dicts.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the rows are collected in a list and converted to a frame in one step.
rows = [
    {"a": 1, "b": " x"},
    {"a": 1, "b": " x"},
]
df = pd.DataFrame(rows)
df

In [None]:
# Mean squared error of `model`'s predictions on the test split of `dataset`.
# NOTE(review): `model` is not defined in any visible cell, and `dataset` is
# only assigned further down (inside the KRR sweep loop) - this looks like a
# leftover scratch cell that depends on earlier kernel state; confirm or remove.
np.mean(tf.losses.mean_squared_error(model.predict(dataset.x_test), dataset.y_test))

In [None]:
class KRRModel:
    """Thin wrapper that forwards ``predict`` calls to a wrapped regressor.

    ``predict`` additionally accepts a ``batch_size`` argument, which is
    ignored, so the wrapper can be called as ``predict(x, batch_size=...)``.
    """

    def __init__(self, krr):
        # The wrapped object; it only needs to provide ``predict(x)``.
        self.krr = krr

    def predict(self, x, batch_size=None):
        """Return ``self.krr.predict(x)``; ``batch_size`` has no effect."""
        del batch_size  # placeholder argument, intentionally unused
        predictions = self.krr.predict(x)
        return predictions


def measure_krr(krr, dataset):
    """Evaluate a fitted regressor on ``dataset``.

    Args:
        krr: object with a ``predict(x)`` method.
        dataset: object providing ``x_train``, ``y_train``, ``x_test`` and
            ``y_test``.

    Returns:
        A dict with the keys ``"train_loss"`` and ``"test_loss"`` (mean squared
        error on the respective split) and ``"path_length_f"`` (the value of
        ``smooth.measures.path_length`` for the wrapped regressor on the test
        inputs).
    """
    def _mse(predicted, target):
        return np.mean(tf.losses.mean_squared_error(predicted, target))

    measures = {}
    measures["train_loss"] = _mse(krr.predict(dataset.x_train), dataset.y_train)
    measures["test_loss"] = _mse(krr.predict(dataset.x_test), dataset.y_test)
    # The regressor is wrapped in KRRModel before being handed to path_length.
    measures["path_length_f"] = smooth.measures.path_length(
        KRRModel(krr), dataset.x_test
    )
    return measures

In [None]:
%%time

import sklearn.kernel_ridge

# One dict of measures per fitted model; turned into a DataFrame afterwards.
ms_krr_l = []

# Sweep settings that do not change between iterations.
alphas = [1, 0.01, 0.0001]
degrees = [1, 2, 3, 4, 5]
sample_counts = np.sort(ms["samples_train"].unique())

# Only the first seed is swept (note the [:1] slice).
for seed in tqdm.notebook.tqdm(range(1, 6)[:1], desc="seed"):
    for iteration in tqdm.notebook.tqdm(range(3), desc="iteration"):
        for alpha in tqdm.notebook.tqdm(alphas, leave=False, desc="alpha"):
            for degree in tqdm.notebook.tqdm(degrees, leave=False, desc="degree"):
                for samples in tqdm.notebook.tqdm(sample_counts, leave=False, desc="samples"):
                    # Polynomial-kernel ridge regression for this setting.
                    krr = sklearn.kernel_ridge.KernelRidge(
                        alpha=alpha, kernel="poly", degree=degree, coef0=1
                    )
                    dataset = datasets[seed].subset(samples, keep_test_set=True)
                    krr.fit(dataset.x_train, dataset.y_train)

                    # Measures first, then the settings that produced them.
                    m = {
                        **measure_krr(krr, dataset),
                        "seed": seed,
                        "alpha": alpha,
                        "degree": degree,
                        "samples_train": samples,
                        "iteration": iteration,
                    }
                    ms_krr_l.append(m)

In [None]:
# Collect the per-model measure dicts into one table.
ms_krr = pd.DataFrame(ms_krr_l)

# One colour per polynomial degree, reused for every plot below.
degree_palette = make_palette(ms_krr["degree"].unique())
log_scale_measures = ("train_loss", "test_loss")

# One plot per measure: colour encodes the degree, line style the alpha value.
for measure in ["train_loss", "test_loss", "path_length_f"]:
    ax = plt.subplot()
    sns.lineplot(
        data=ms_krr,
        x="samples_train",
        y=measure,
        hue="degree",
        style="alpha",
        palette=degree_palette,
    )
    if measure in log_scale_measures:
        # The two losses are shown on a logarithmic y axis.
        ax.set_yscale("log")
    plt.show()

In [None]:
# Rows of `ms` with hidden size 30 and seed 1.
ms.loc[(ms["hidden_size"] == 30) & (ms["seed"] == 1)]

In [None]:
# Rows of `ms_krr` with degree 5 and alpha 0.01.
ms_krr.loc[(ms_krr["degree"] == 5) & (ms_krr["alpha"] == 0.01)]

In [None]:
def plot_compare(ms_dict, max_samples=200,
                 measures=("train_loss", "test_loss", "path_length_f")):
    """Plot measures from several result tables against training-set size.

    Each frame in ``ms_dict`` is labelled with its key in a ``source`` column,
    the frames are concatenated in ascending key order, and one line plot per
    measure is drawn with one line per source.

    Args:
        ms_dict: mapping from a display label to a DataFrame that has a
            ``samples_train`` column and one column per plotted measure.
        max_samples: rows with ``samples_train`` above this value are left
            out of the plots.
        measures: names of the columns to plot, one figure each.

    Returns:
        None. The figures are shown with ``plt.show()``.

    The input frames are not modified: the ``source`` column is added to
    copies via ``DataFrame.assign``. Writing it into the passed frames would
    mutate the caller's data and, for frames obtained by ``.loc`` slicing,
    trigger pandas' SettingWithCopy warning.
    """
    labelled = [ms_dict[name].assign(source=name) for name in sorted(ms_dict)]
    ms_both = pd.concat(labelled)
    ms_both = ms_both.loc[ms_both["samples_train"] <= max_samples]

    for measure in measures:
        ax = plt.subplot()
        sns.lineplot(
            data=ms_both,
            x="samples_train",
            y=measure,
            hue="source",
        )
        if measure in ("train_loss", "test_loss"):
            # The two losses are shown on a logarithmic y axis.
            ax.set_yscale("log")
        plt.show()

# Neural-network runs with seed 1, one entry per hidden size. The size is
# zero-padded to three digits in the label.
nn_runs = {
    f"nn, hs={hidden_size:03d}": ms.loc[(ms["hidden_size"] == hidden_size) & (ms["seed"] == 1)]
    for hidden_size in (10, 30, 100)
}

# Kernel ridge regression runs with alpha = 0.0001, one entry per degree.
krr_runs = {
    f"krr, deg={degree}": ms_krr.loc[(ms_krr["degree"] == degree) & (ms_krr["alpha"] == 0.0001)]
    for degree in range(1, 6)
}

plot_compare({**nn_runs, **krr_runs})

In [None]:
# 20 logarithmically spaced values from 5 to 1000, rounded to integers.
# Presumably candidate training-set sizes - TODO confirm.
np.logspace(np.log10(5), np.log10(1000), 20).round().astype(int)

In [None]:
# Load the measures table from a sibling directory (path relative to the
# current working directory). This rebinds `df`.
df = pd.read_feather("../0228-162015/measures.feather")