## Setup

In [8]:
%run ../../data_processing.ipynb
%run ../utils/objects.ipynb

import time
import pickle

from collections import defaultdict
from sklearn.model_selection import GroupKFold

## Functions

In [2]:
def rec_dd():
    """Return a defaultdict whose missing keys are filled by calling rec_dd again.

    Because the default factory is this very function, the result nests to any
    depth on demand: ``d = rec_dd(); d['a']['b']['c'] = 1`` works without
    creating the intermediate levels by hand.
    """
    nested = defaultdict(rec_dd)
    return nested

In [3]:
def cv(data, kernel, splits=5):
    """Cross-validate a GaussianModel on `data` using grouped K-fold splits.

    Parameters
    ----------
    data
        Dataset object exposing `x`, `y`, `y_true_dist`, `group` and `binned`.
        `x`, `y` and `y_true_dist` are indexed with the integer index arrays
        yielded by `GroupKFold.split`, and `group` is passed as its `groups`.
    kernel
        Passed through unchanged to `GaussianModel`.
    splits : int, default 5
        Number of folds (`n_splits` of `GroupKFold`).

    Returns
    -------
    list
        One entry per fold, in fold order; each entry is whatever
        `GaussianModel.build_train_test` returns for that fold.
    """
    kf = GroupKFold(n_splits=splits)
    results = []
    # Binned targets are modelled with two classes, everything else with three.
    n_classes = 2 if data.binned == 'binned' else 3

    for train_index, test_index in kf.split(data.x, groups=data.group):
        # perf_counter() is monotonic, so the reported duration cannot be skewed
        # by system clock adjustments the way a time.time() difference can.
        start = time.perf_counter()
        x_train, x_test = data.x[train_index], data.x[test_index]
        # Training uses the labels in `y`; evaluation is done against the
        # corresponding rows of `y_true_dist`.
        y_train, y_test = data.y[train_index], data.y_true_dist[test_index]
        model = GaussianModel(x_train, y_train, kernel, n_classes)
        results.append(model.build_train_test(x_test, y_test))
        elapsed = time.perf_counter() - start
        print('Fold tested in (sec):', elapsed)
        # Drop the model explicitly before the next fold. Training time grows
        # roughly linearly with the number of models built since the kernel was
        # last restarted (suspected: GPFlow not releasing resources properly).
        # Deleting the model only mitigates this; it does not fully fix it.
        del model

    return results

In [4]:
def pickle_results(results, params):
    """Pickle `results` to ``../gp_results/<params>.pkl``.

    `params` is interpolated into the file name with `str.format`. The path is
    relative to the current working directory; the `gp_results` folder must
    already exist, and a file with the same name is overwritten.
    """
    target_path = '../gp_results/{}.pkl'.format(params)
    with open(target_path, 'wb') as out_file:
        pickle.dump(results, out_file)

## Datasets

In [5]:
# True prediction distribution for the aggregated, binned share data:
# the 'increase' and 'not_increase' columns of y_share_agg, in that order,
# taken out as a plain array (one row per row of y_share_agg).
y_share_agg_pred_dist = y_share_agg.loc[:, ['increase', 'not_increase']].values

In [6]:
# True prediction distribution for the individual, binned share data: each entry
# of ind_nocs is located in agg_nocs (first match) and receives the aggregated
# distribution row found at that position.
# NOTE(review): this is a plain list, whereas y_share_agg_pred_dist is an array;
# cv() indexes y_true_dist with an index array — confirm GaussianDataset converts it.
y_share_ind_pred_dist = [
    y_share_agg_pred_dist[agg_nocs.index(noc)]
    for noc in ind_nocs
]

In [9]:
# Only the binned share datasets are wrapped here; the absolute and non_binned
# datasets are intentionally left out even though they are created.
# Positional arguments are presumably (features, labels, true distribution,
# group ids, level tag, feature tag, binning tag) — confirm against GaussianDataset.

# Aggregated data, continuous and discrete features
agg_cont_binned = GaussianDataset(
    x_cont_agg, y_share_agg['binned_y'], y_share_agg_pred_dist, agg_nocs,
    'agg', 'cont', 'binned',
)
agg_disc_binned = GaussianDataset(
    x_disc_agg, y_share_agg['binned_y'], y_share_agg_pred_dist, agg_nocs,
    'agg', 'disc', 'binned',
)

# Individual data, continuous and discrete features
ind_cont_binned = GaussianDataset(
    x_cont_ind, y_share_bin_ind, y_share_ind_pred_dist, ind_nocs,
    'ind', 'cont', 'binned',
)
ind_disc_binned = GaussianDataset(
    x_disc_ind, y_share_bin_ind, y_share_ind_pred_dist, ind_nocs,
    'ind', 'disc', 'binned',
)

# All datasets, in the order they were built above
datasets = [
    agg_cont_binned,
    agg_disc_binned,
    ind_cont_binned,
    ind_disc_binned,
]