# Further results with 1D Gaussian process datasets

We found reasonable hyperparameters earlier, so now let's train some models with a varying number of training samples and see how the measures trend.

We want to contrast this with the case of polynomial interpolation.

In [None]:
import os
import sys
# If we don't need CUDA, do this before importing TF
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns

# Restrict TensorFlow to a single GPU when any are visible. With
# CUDA_VISIBLE_DEVICES="-1" set above, no GPU is expected to be listed and
# this branch is skipped.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Prefer the second GPU as before, but fall back to the only one available
    # instead of raising IndexError on a single-GPU machine.
    selected_gpu = gpus[1] if len(gpus) > 1 else gpus[0]
    tf.config.experimental.set_visible_devices([selected_gpu], 'GPU')

# Root of the project checkout; both paths below are derived from it.
SMOOTH_ROOT = "/nfs/scistore12/chlgrp/vvolhejn/smooth"

# Make the project-local `smooth` package importable.
sys.path.append(SMOOTH_ROOT)

# Work inside this run's log directory so that relative paths such as
# "measures.feather" resolve against it.
os.chdir(os.path.join(SMOOTH_ROOT, "logs", "0225_gp1"))

In [None]:
# Load IPython's autoreload extension so that edits to the project modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Import each project submodule and register it for automatic reloading.
%aimport smooth.datasets
%aimport smooth.model
%aimport smooth.analysis
%aimport smooth.callbacks
%aimport smooth.measures
%aimport smooth.util
# Mode 1: before running code, reload only the modules registered via %aimport.
%autoreload 1

In [None]:
# Load the per-model measures table for this run and tidy it up.
ms = pd.read_feather("measures.feather")
# NOTE(review): the return value is ignored, so this presumably modifies `ms`
# in place -- confirm against smooth.analysis.
smooth.analysis.remove_constant_columns(ms)
ms = smooth.analysis.expand_dataset_columns(ms)
# Keep only the last path component of each log directory.
ms["log_dir"] = ms["log_dir"].str.split("/").str.get(-1)
# A bare `ms.head()` in the middle of a cell is evaluated but never rendered,
# so show the preview explicitly (`display` is provided by IPython).
display(ms.head())

# Boolean mask of models whose training loss is below the well-fit threshold;
# computed once and reused for every lengthscale.
well_fit = ms["train_loss"] < 0.01

print("Total models:", len(ms))
print("Well-fit models for each lengthscale:")
for lengthscale in [1.0, 0.3, 0.1]:
    print("    {}: {}".format(
        lengthscale,
        (well_fit & (ms["lengthscale"] == lengthscale)).sum()
    ))
# Restricting `ms` to well-fit models (otherwise it's easy to be smooth) is
# currently disabled:
# ms = ms[ms["test_loss"] < 0.01]

The models are unable to fit GPs with a lengthscale of 0.1.

In [None]:
# One 20-bin histogram per training-summary column; columns stored with
# object dtype are skipped.
summary_columns = ("actual_epochs", "train_loss", "test_loss")
for col in summary_columns:
    if ms[col].dtype != "object":
        plt.hist(ms[col], bins=20)
        plt.title(col)
        plt.show()

In [None]:
# Interpolation measures for every dataset that appears in `ms`. The keyword
# flags select the variant (test-set based / polynomial), judging by their
# names -- see smooth.analysis.get_interpolation_measures for the details.
dataset_names = ms["dataset"].unique()
msi_train = smooth.analysis.get_interpolation_measures(dataset_names)
msi_test = smooth.analysis.get_interpolation_measures(dataset_names, use_test_set=True)
msi_poly = smooth.analysis.get_interpolation_measures(dataset_names, use_polynomial=True)

In [None]:
def plot_dataset(data_sources, seed, lengthscale, measure):
    """Plot `measure` against training-set size, one line per data source.

    Args:
        data_sources: Mapping from a display name to a DataFrame that has
            "seed", "lengthscale" and "samples_train" columns. `measure` is
            read from the combined frame.
        seed: Value of the "seed" column to select.
        lengthscale: Value of the "lengthscale" column to select.
        measure: Name of the column plotted on the y axis.

    Raises:
        ValueError: If `data_sources` is empty.
    """
    if not data_sources:
        raise ValueError("data_sources must contain at least one data frame")

    frames = []
    for name, ms_cur in data_sources.items():
        selected = ms_cur[(ms_cur["seed"] == seed) & (ms_cur["lengthscale"] == lengthscale)]
        # assign() returns a new frame, so the caller's frames are never modified.
        frames.append(selected.sort_values("samples_train").assign(name=name))

    # Concatenate once instead of growing a frame inside the loop. The index
    # is not used by the plot, so it is rebuilt to avoid duplicate labels
    # coming from the different sources.
    ms_all = pd.concat(frames, sort=False, ignore_index=True)

    g = sns.relplot(
        x="samples_train", y=measure, kind="line",
        hue="name", data=ms_all,
    )
    g.fig.suptitle("seed={}, lengthscale={}".format(seed, lengthscale))

# Curves to compare: the two network widths taken from `ms`, plus the
# interpolation measures computed for the training and test sets.
data_sources = dict(
    relu1000=ms.loc[ms["hidden_size"] == 1000],
    relu100=ms.loc[ms["hidden_size"] == 100],
    train=msi_train,
    test=msi_test,
    # poly=msi_poly,
)

In [None]:
# For seeds 1-4 at lengthscale 0.3, plot both measures against training-set size.
for seed in range(1, 5):
    for measure in ("seg_total_variation", "seg_total_variation_derivative"):
        plot_dataset(data_sources, seed, 0.3, measure)

In [None]:
def plot_dataset_predictions(seed, lengthscale, color_by_hidden_size=False):
    """Overlay the predictions of every matching trained model on one plot.

    Selects the rows of the global `ms` that match `seed` and `lengthscale`,
    plots the test curve of the first matching row's dataset, then loads each
    row's saved model and draws its prediction on the test inputs as a
    semi-transparent line.

    Args:
        seed: Value of the "seed" column to select.
        lengthscale: Value of the "lengthscale" column to select.
        color_by_hidden_size: If True, colour each line by the model's hidden
            size (100 -> "C1", 1000 -> "C2", anything else -> "C3"). The
            default keeps the previous output, where every prediction is
            drawn in "C1".

    Raises:
        ValueError: If no row of `ms` matches `seed` and `lengthscale`.
    """
    ms1 = ms[(ms["seed"] == seed) & (ms["lengthscale"] == lengthscale)]
    if ms1.empty:
        raise ValueError(
            "no models found for seed={}, lengthscale={}".format(seed, lengthscale)
        )

    # NOTE(review): only the first row's dataset supplies the test curve; this
    # assumes all matching rows share the same test set -- confirm.
    dataset = smooth.datasets.GaussianProcessDataset.from_name(ms1.iloc[0]["dataset"])
    x = dataset.x_test

    ax = plt.subplot()
    ax.plot(x, dataset.y_test, color="C0")
    plt.title("seed={}, lengthscale={}".format(seed, lengthscale))
#     ax.scatter(dataset.x_train, dataset.y_train, color="C0")

    hidden_size_colors = {100: "C1", 1000: "C2"}
    # Fade each line in proportion to the number of models drawn.
    line_alpha = 1 / len(ms1)

    for _, row in ms1.iterrows():
        model = tf.keras.models.load_model(os.path.join(row["log_dir"], "model.h5"))
        y = model.predict(x)
        if color_by_hidden_size:
            # Unknown hidden sizes get a fallback colour instead of a KeyError.
            color = hidden_size_colors.get(row["hidden_size"], "C3")
        else:
            color = "C1"
        ax.plot(x, y, alpha=line_alpha, color=color)

    plt.show()


# Show the model predictions for seeds 1 and 2 at each lengthscale.
for lengthscale in (1.0, 0.3, 0.1):
    for seed in (1, 2):
        plot_dataset_predictions(seed=seed, lengthscale=lengthscale)

In [None]:
%%time
_dataset = smooth.datasets.GaussianProcessDataset(samples_train=100, lengthscale=0.3, seed=123, dim=100)

In [None]:
import GPy

# Sample a small GP dataset by hand with GPy.
# Seed NumPy's global RNG so the cell yields the same samples on every run.
# NOTE(review): assumes GPy's posterior sampling also draws from NumPy's
# global RNG -- confirm.
np.random.seed(123)

dim = 2
gp_model = GPy.models.GPRegression(
    # It seems the constructor needs at least 1 data point.
    np.array([[0] * dim]),
    np.array([[0]]),
    noise_var=0.001,
)
gp_model.kern.lengthscale = 0.3

# Test set: random inputs and one function sample drawn at those inputs.
samples_test = 100
x_test = np.random.randn(samples_test, dim)
y_test = gp_model.posterior_samples_f(x_test, size=1)[:, :, 0]

# Condition the model on the test set, then sample training targets at a
# subset of the test inputs (chosen by smooth.util.subsample_regularly).
gp_model.set_XY(x_test, y_test)
samples_train = 10
indices = smooth.util.subsample_regularly(samples_test, samples_train)
x_train = x_test[indices]
# NOTE(review): y_train is a fresh posterior draw, so it need not equal
# y_test[indices] exactly -- confirm this is intended.
y_train = gp_model.posterior_samples_f(x_train, size=1)[:, :, 0]
y_train