# Kernel ridge regression on GP datasets 1

The problem here is that the datasets are "too hard": for lengthscale 1.0, if we use the original GP to predict the test set given the training set, the predictive variance is 1.0, so we cannot expect any model to do better than that. In a follow-up analysis, we use models with `lengthscale == c * dim`.

In [None]:
import os
import sys
# If we don't need CUDA, do this before importing TF
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Restrict TensorFlow to a single GPU if any are visible.
# NOTE(review): CUDA_VISIBLE_DEVICES is set to "-1" above, so this list is
# presumably empty and the branch below is skipped -- confirm.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # NOTE(review): the index 1 is hard-coded, so this raises IndexError when
    # exactly one GPU is visible -- confirm the machine always exposes >= 2.
    tf.config.experimental.set_visible_devices([gpus[1]], 'GPU')

# Relative paths used below (e.g. "measures.feather") resolve against this directory.
os.chdir("/nfs/scistore12/chlgrp/vvolhejn/smooth/logs/0228_gp_krr/")

In [None]:
%load_ext autoreload
%aimport smooth.datasets
%aimport smooth.model
%aimport smooth.analysis
%aimport smooth.callbacks
%aimport smooth.measures
%aimport smooth.util
%autoreload 1

In [None]:
# Preview the raw measures table (path is relative to the working directory).
pd.read_feather("measures.feather")

In [None]:
# Load the measures table and prepare it for the analysis below.
ms = pd.read_feather("measures.feather")
# Presumably splits the "dataset" name into separate columns such as "dim"
# and "seed" (both are used later) -- verify in smooth.analysis.
ms = smooth.analysis.expand_dataset_columns(ms)
# NOTE(review): the return value is discarded, so this only changes `ms` if
# the helper works in place -- confirm.
smooth.analysis.remove_constant_columns(ms, verbose=True)
ms.head()

In [None]:
# Draw one histogram per non-object column, keeping only the values between
# the (trim / 2) and (1 - trim / 2) quantiles so the tails do not dominate.
trim = 0.1

for col in ms.columns:
    if ms[col].dtype != "object":
        lower_bound = ms[col].quantile(trim / 2)
        upper_bound = ms[col].quantile(1 - trim / 2)
        # `between` is inclusive on both ends.
        data = ms.loc[ms[col].between(lower_bound, upper_bound), col]

        plt.hist(data, bins=20)
        plt.title(col)
        plt.show()

In [None]:
def get_optimal_path_length_f(dataset_name):
    """Return the mean absolute difference over all pairs of test targets.

    The dataset is loaded by name, its ``y_test`` is flattened, and the sum
    of ``|a - b|`` over all ordered pairs ``(a, b)`` of target values is
    divided by ``len(x_test) ** 2``.

    Args:
        dataset_name: Dataset name passed to ``smooth.datasets.from_name``.

    Returns:
        The pairwise mean absolute difference as a float.
    """
    dataset = smooth.datasets.from_name(dataset_name)
    n = len(dataset.x_test)
    y = np.sort(np.asarray(dataset.y_test).reshape(-1))

    # With y sorted ascending, y[k] is the larger element of k unordered
    # pairs and the smaller element of (m - 1 - k) pairs, hence
    #     sum_{i < j} (y[j] - y[i]) = sum_k y[k] * (2 * k - m + 1).
    # This replaces the O(m^2) double loop with an O(m log m) sort.
    m = len(y)
    signed_counts = 2 * np.arange(m) - m + 1

    # Every unordered pair appears twice among the ordered pairs (a, b).
    return 2 * np.dot(y, signed_counts) / (n ** 2)


# Drop the last "-"-separated component of every dataset name and keep the
# distinct prefixes.
name_parts = ms["dataset"].str.split("-")
datasets = name_parts.str.slice(0, -1).str.join("-").unique()

# Evaluate each prefix once, with "-77" appended as the final name component.
optimal_lengths = {}
for dataset in tqdm.notebook.tqdm(datasets):
    full_name = "{}-77".format(dataset)
    optimal_lengths[dataset] = get_optimal_path_length_f(full_name)

optimal_lengths

In [None]:
# Display the prepared measures table.
ms

In [None]:
# For one fixed seed and alpha == 0.0001, plot every measure against the
# training-set size: one figure per (dim, measure), one line per degree.
seed = 2
measure_cols = ["train_loss", "test_loss", "path_length_f"]

for dim in sorted(ms["dim"].unique()):
    # The row selection does not depend on the measure, so build it once per dim.
    selected_rows = (ms["seed"] == seed) & (ms["dim"] == dim) & (ms["alpha"] == 0.0001)

    for measure in measure_cols:
        ms1 = ms[selected_rows]
        ax = plt.subplot()
        degree_palette = smooth.analysis.make_palette(ms1["degree"].unique())
        sns.lineplot(
            data=ms1,
            x="samples_train",
            y=measure,
            hue="degree",
            palette=degree_palette,
        )

        if measure == "path_length_f":
            # Horizontal reference line at the value precomputed for this dataset.
            ol = optimal_lengths["gp-{}-{}-1.0".format(dim, seed)]
            sample_range = [ms1["samples_train"].min(), ms1["samples_train"].max()]
            plt.plot(sample_range, [ol, ol])

        ax.set_xscale("log")
        if "loss" in measure:
            ax.set_yscale("log")
        plt.title("dim={}".format(dim))
        plt.show()

In [None]:
# Load one dataset per seed in 1..5, keyed by seed.
# NOTE(review): this rebinds `datasets`, which an earlier cell uses for the
# array of dataset-name prefixes, and leaves `seed` set to 5 afterwards.
datasets = {}
for seed in tqdm.notebook.tqdm(range(1, 6)):
    # Name fields are presumably gp-<dim>-<seed>-<lengthscale>-<samples> -- verify.
    datasets[seed] = smooth.datasets.from_name("gp-100-{}-1.0-1000".format(seed))

In [None]:
class KRRModel:
    """Adapter exposing ``predict(x, batch_size=None)`` for a wrapped regressor.

    The wrapped object only needs a ``predict(x)`` method.
    """

    def __init__(self, krr):
        """Keep a reference to the wrapped regressor ``krr``."""
        self.krr = krr

    def predict(self, x, batch_size=None):
        """Return ``self.krr.predict(x)``; ``batch_size`` is accepted but ignored."""
        del batch_size  # present only so callers may pass it
        predictions = self.krr.predict(x)
        return predictions


def measure_krr(krr, dataset):
    """Evaluate a regressor on a dataset.

    Args:
        krr: Object with a ``predict(x)`` method.
        dataset: Object providing ``x_train``, ``y_train``, ``x_test`` and
            ``y_test``.

    Returns:
        A dict with the keys ``"train_loss"`` and ``"test_loss"`` (mean of
        TensorFlow's squared-error loss between predictions and targets) and
        ``"path_length_f"`` (``smooth.measures.path_length`` of the wrapped
        regressor on ``x_test``).
    """
    def mean_loss(inputs, targets):
        # Average the squared-error loss of the predictions into one number.
        predictions = krr.predict(inputs)
        return np.mean(tf.losses.mean_squared_error(predictions, targets))

    measures = {}
    measures["train_loss"] = mean_loss(dataset.x_train, dataset.y_train)
    measures["test_loss"] = mean_loss(dataset.x_test, dataset.y_test)
    # path_length expects a model whose predict() accepts batch_size, hence the adapter.
    measures["path_length_f"] = smooth.measures.path_length(KRRModel(krr), dataset.x_test)
    return measures

In [None]:
%%time

import sklearn.kernel_ridge
import warnings
# A bare top-level `return` is a SyntaxError, so it prevented the whole cell
# (including the imports above) from executing. The sweep is now disabled
# explicitly instead; set the flag to True to re-run it. Everything lives
# inside the guard so that results already held in the kernel (ms_krr_l,
# dataset, ...) are left untouched while the sweep is disabled.
RUN_KRR_SWEEP = False

if RUN_KRR_SWEEP:
    ms_krr_l = []

    # 50 log-spaced training-set sizes between 10 and 1000, rounded to integers.
    samples_l = np.logspace(np.log10(10), np.log10(1000), 50).round().astype(int)
    seed = 1

    for dim in tqdm.notebook.tqdm([4, 8, 16], desc="dim"):
        for alpha in tqdm.notebook.tqdm([0.01, 0.0001, 1e-15], leave=False, desc="alpha"):
            # The third name field is set to `dim` here (1.0 in other cells).
            dataset0 = smooth.datasets.from_name("gp-{}-{}-{}-1000".format(dim, seed, dim))
            for degree in tqdm.notebook.tqdm([1, 2, 3, 4, 5], leave=False, desc="degree"):
                for samples in tqdm.notebook.tqdm(samples_l, leave=False, desc="samples"):
                    krr = sklearn.kernel_ridge.KernelRidge(
                        alpha=alpha,
                        kernel="poly",
                        degree=degree,
                        coef0=1,
                    )
                    # Shrink only the training set; the test set stays the same.
                    dataset = dataset0.subset(samples, keep_test_set=True)
                    # Fitting warnings are silenced deliberately for this sweep.
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        krr.fit(dataset.x_train, dataset.y_train)

                    # NOTE(review): smooth.train_kernel_models is not among the
                    # modules loaded via %aimport -- confirm it is importable
                    # as an attribute of `smooth`.
                    m = smooth.train_kernel_models.measure_krr(krr, dataset)
                    m.update(
                        dim=dim,
                        seed=seed,
                        alpha=alpha,
                        degree=degree,
                        samples_train=samples,
                    )
                    ms_krr_l.append(m)

In [None]:
# Baseline: test MSE of a constant predictor that always outputs the mean of
# the test targets.
# sklearn.metrics is imported here because the notebook otherwise only imports
# sklearn.kernel_ridge, which does not guarantee that the attribute exists.
import sklearn.metrics

y_pred = np.zeros_like(dataset.y_test) + np.mean(dataset.y_test)
sklearn.metrics.mean_squared_error(dataset.y_test, y_pred)

## Preliminary experiments with updated dataset

In [None]:
# Plot every measure of the KRR sweep against the training-set size: one
# facet per dim, one hue per degree, one line style per alpha.
# sklearn.metrics is imported here because the notebook otherwise only imports
# sklearn.kernel_ridge.
import sklearn.metrics

ms_krr = pd.DataFrame(ms_krr_l)

for measure in ["train_loss", "test_loss", "path_length_f_test", "path_length_f_train"]:
    grid = sns.relplot(
        data=ms_krr,
        x="samples_train",
        y=measure,
        hue="degree",
        style="alpha",
        col="dim",
        kind="line",
        palette=smooth.analysis.make_palette(ms_krr["degree"].unique()),
    )
    # NOTE(review): only the first facet is switched to log scale explicitly;
    # the other facets presumably follow through shared axes -- confirm.
    ax = grid.axes[0][0]
    ax.set_xscale('log')
    ax.set_yscale('log')
    if measure == "test_loss":
        # Constant-predictor baseline, drawn on the current axes.
        baseline = sklearn.metrics.mean_squared_error(dataset.y_test, y_pred)
        # Span the sample sizes of the data actually plotted (ms_krr) rather
        # than those of the unrelated `ms` table.
        sample_range = [ms_krr["samples_train"].min(), ms_krr["samples_train"].max()]
        plt.plot(sample_range, [baseline, baseline])

## The datasets are too hard

In [None]:
# For dims 2, 4, ..., 512 at lengthscale 1.0: condition the dataset's own GP
# model on the training set and record the mean of the second output of
# predict() on the test set (presumably the predictive variance -- verify).
vs = {}
lengthscale = 1.0
for dim in tqdm.notebook.tqdm([2 ** exponent for exponent in range(1, 10)]):
    dataset_name = "gp-{}-1-{}-1000".format(dim, lengthscale)
    d = smooth.datasets.from_name(dataset_name)
    d.gp_model.set_XY(d.x_train, d.y_train)
    prediction = d.gp_model.predict(d.x_test)
    vs[dim] = np.mean(prediction[1])
    print(vs[dim])