# n-dimensional Gaussian process datasets

A first exploration of high-dimensional Gaussian processes. We train many models on many GP datasets with varying dimensions, lengthscales and numbers of training samples.

We also have some preliminary results on whether smooth functions are learned. For each roughness measure, we compute the ratio of its value at 10 training samples to its value at 1000 training samples. If smooth functions are learned, we would expect the roughness measure to increase or stay roughly constant as the number of training samples grows, i.e. the ratio should be at most about 1.

In [None]:
import os
import sys
# If we don't need CUDA, do this before importing TF
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Restrict TensorFlow to a single GPU when any are visible.
# NOTE: CUDA_VISIBLE_DEVICES is set to "-1" before TensorFlow is imported, so
# this list is normally empty and the branch below is skipped.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Prefer the second device, as before, but fall back to the only device on
    # a single-GPU machine, where indexing gpus[1] would raise IndexError.
    preferred_gpu = gpus[min(1, len(gpus) - 1)]
    tf.config.experimental.set_visible_devices([preferred_gpu], 'GPU')

# Put the project checkout on the import path so the `smooth.*` modules used by
# later cells can be imported.
sys.path.append("/nfs/scistore12/chlgrp/vvolhejn/smooth")

# Work inside this experiment's log directory; relative paths such as
# "measures.feather" are resolved against it.
os.chdir("/nfs/scistore12/chlgrp/vvolhejn/smooth/logs/0226_gp_nd/")

In [None]:
# Enable IPython's autoreload extension so that edits to the project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Import each project module and mark it for automatic reloading.
%aimport smooth.datasets
%aimport smooth.model
%aimport smooth.analysis
%aimport smooth.callbacks
%aimport smooth.measures
%aimport smooth.util
# Mode 1: before executing a cell, reload only the modules marked with %aimport.
%autoreload 1

In [None]:
# Load the measurements table and filter it down to the models worth analysing.
ms_raw = pd.read_feather("measures.feather")

# Discard rows that recorded an error, reporting how many there were.
has_error = ms_raw["error"].notnull()
print("Errors:", int(has_error.sum()))
ms_raw = ms_raw.loc[~has_error]

ms_raw = smooth.analysis.expand_dataset_columns(ms_raw)

# A model is treated as divergent when its loss is infinite or its training
# loss is not below 0.1. Writing the second test as a negated `<` also flags
# NaN training losses, because NaN < 0.1 evaluates to False.
divergent_model_mask = (ms_raw["loss"] == np.inf) | ~(ms_raw["train_loss"] < 0.1)
print("Divergent models:", int(divergent_model_mask.sum()))

# Take an explicit copy: `ms` is modified below, and mutating a filtered
# selection of `ms_raw` directly triggers pandas' SettingWithCopyWarning.
ms = ms_raw.loc[~divergent_model_mask].copy()

print("Remaining:", len(ms))
# NOTE(review): the return value is discarded, so this relies on
# remove_constant_columns modifying `ms` in place — confirm against its
# implementation.
smooth.analysis.remove_constant_columns(ms, verbose=True)
# Keep only the last path component of each log directory.
ms["log_dir"] = ms["log_dir"].str.split("/").str.get(-1)

# Per-dimension survival rate of the divergence filter. `ms_raw` is the
# baseline here, so rows with errors are already excluded from "before".
for d in sorted(ms_raw["dim"].unique()):
    n_before = int((ms_raw["dim"] == d).sum())
    n_after = int((ms["dim"] == d).sum())
    print("For dim {}:\t{}/{}\t({:.0f}%) remain".format(d, n_after, n_before, n_after/n_before*100))

ms.head()

In [None]:
# Number of distinct hidden_size values among the remaining models
# (missing values are not counted).
ms.loc[:, "hidden_size"].nunique(dropna=True)

In [None]:
# Draw a bar chart of value counts for every column that has at most 20
# distinct values.
for col in ms.columns:
    if ms[col].nunique() > 20:
        continue
    # Pass the column explicitly as `x`: recent seaborn releases treat the
    # first positional argument as `data`, which would no longer produce
    # per-value counts for this column.
    sns.countplot(x=ms[col])
    plt.show()

In [None]:
# Select the gradient_norm values of the rows whose gradient_norm lies below a
# threshold.
# NOTE(review): the right-hand operand of `<` is missing, so this cell is a
# SyntaxError as written — fill in the intended threshold before running it.
ms.loc[ms["gradient_norm"] < ,"gradient_norm"]

In [None]:
# Plot a histogram of every non-object column with both tails trimmed, so that
# extreme values do not dominate the x-axis.
cols = ms.columns

# Total fraction of the distribution to cut away, split evenly between the
# lower and the upper tail.
trim = 0.1

for col in cols:
    if ms[col].dtype == "object":
        continue

    lower = ms[col].quantile(trim / 2)
    upper = ms[col].quantile(1 - trim / 2)
    # Strict comparisons: values equal to either quantile are dropped as well,
    # so a column with few distinct values may end up with little or no data.
    data = ms.loc[(ms[col] > lower) & (ms[col] < upper), col]

    plt.hist(data, bins=20)
    plt.title(col)
    plt.show()

In [None]:
# Sort by training-set size so that, within each hyperparameter group, the
# first row has the fewest training samples and the last row has the most.
ms1 = ms.sort_values("samples_train")
groups = ms1.groupby(["dim", "lengthscale", "hidden_size", "init_scale", "learning_rate"])

measure_cols = ["gradient_norm", "seg_total_variation", "seg_total_variation_derivative"]
# log10 of (measure at fewest samples / measure at most samples) per group.
# The measure columns are selected *before* aggregating so the lambda is never
# applied to unrelated columns (e.g. strings), where the division would fail.
# NOTE: a group with a single remaining row compares that row with itself.
ratios = groups[measure_cols].agg(lambda g: np.log10(g.iloc[0] / g.iloc[-1]))

ratios.describe()

In [None]:
# Show every remaining run of one specific configuration.
ms[
    ms["hidden_size"].eq(1000)
    & ms["init_scale"].eq(1)
    & ms["learning_rate"].eq(0.01)
    & ms["lengthscale"].eq(1.)
    & ms["dim"].eq(128)
]

In [None]:
# For each roughness measure, plot the distribution of its log10 ratio and
# print a few summary statistics.

# Histogram bin edges in log10 space; the same for every measure.
hist_bins = np.linspace(-2, 2, 30)

for measure in measure_cols:
    log_ratios = ratios[measure]

    plt.hist(log_ratios, bins=hist_bins)
    plt.title("log ratio of {}".format(measure))
    plt.show()

    # `ratios` holds log10 values, so thresholds on the raw ratio are compared
    # in log10 space. The denominator is the number of groups, including any
    # whose ratio is NaN.
    print("Percentage of cases where ratio < 1: {:.1f}%".format(
        (log_ratios < np.log10(1)).sum() / len(ratios) * 100
    ))
    print("Percentage of cases where ratio < 1.5: {:.1f}%".format(
        (log_ratios < np.log10(1.5)).sum() / len(ratios) * 100
    ))
    # Convert the 90th percentile back from log10 to a plain ratio.
    print("90th percentile: ratio is {:.2f}".format(
        10 ** log_ratios.quantile(0.9)
    ))

In [None]:
# Ten logarithmically spaced integers from 10 to 1000 (presumably the
# training-set sizes used for the datasets — confirm).
np.rint(np.logspace(np.log10(10), np.log10(1000), num=10)).astype(int)