# Demo of nested cross validation with MLP Regressor

In [1]:
from sklearn.model_selection import KFold
from torch.utils.data import Subset

from cv import cross_validation, nested_gridsearch_cv
from data.dataset import CSVDataset
from model.mlp import MLP
from train import train_model

## Load dataset
This dataset is located at `/ml_scripts/data/random_1dfeature_datset.csv`. It is generated by adding Gaussian noise ($\mathcal{N}(0, 0.1)$) to points that follow the relationship $y = 1.5 x_1 + 0.5 x_2 - 0.5 x_3 - 1.5 x_4$. The features have dimension 4, and the target has dimension 1.

In [2]:
# Instantiate the project's CSVDataset with its default arguments.
# NOTE(review): presumably the defaults point at the random_1dfeature CSV
# mentioned in the markdown cell above — confirm against data/dataset.py.
dataset = CSVDataset()

## Demo 1 - train model

In [3]:
# Ordered hold-out split: the first 80 % of the samples form the training
# set, the remaining 20 % form the test set.
split_idx = int(0.8 * len(dataset))
train_dataset = Subset(dataset, range(split_idx))
test_dataset = Subset(dataset, range(split_idx, len(dataset)))

# Build the MLP regressor (4 input features -> 1 output).
model = MLP(input_dim=4, hidden_dim=10, output_dim=1, n_layers=2)

# Fit the model.
# `train_model` returns the best average loss on the validation set and can
# optionally save the best model (disabled here via `save_model=False`).
# Training hyperparameters such as the learning rate, the number of epochs,
# or the loss function are supplied through `hparams`.
score = train_model(
    train_dataset,
    test_dataset,
    model,
    hparams={"lr": 0.01, "num_epochs": 100},
    save_model=False,
)

Epoch 1/100, Validation Loss: 8.3048
Epoch 2/100, Validation Loss: 7.9426
Epoch 3/100, Validation Loss: 7.5767
Epoch 4/100, Validation Loss: 7.2002
Epoch 5/100, Validation Loss: 6.7140
Epoch 6/100, Validation Loss: 6.1803
Epoch 7/100, Validation Loss: 5.5514
Epoch 8/100, Validation Loss: 4.8691
Epoch 9/100, Validation Loss: 4.2107
Epoch 10/100, Validation Loss: 3.5329
Epoch 11/100, Validation Loss: 2.8951
Epoch 12/100, Validation Loss: 2.3548
Epoch 13/100, Validation Loss: 1.9265
Epoch 14/100, Validation Loss: 1.6043
Epoch 15/100, Validation Loss: 1.4003
Epoch 16/100, Validation Loss: 1.1178
Epoch 17/100, Validation Loss: 0.9644
Epoch 18/100, Validation Loss: 0.6980
Epoch 19/100, Validation Loss: 0.5949
Epoch 20/100, Validation Loss: 0.4618
Epoch 21/100, Validation Loss: 0.4412
Epoch 22/100, Validation Loss: 0.3048
Epoch 23/100, Validation Loss: 0.4433
Epoch 24/100, Validation Loss: 0.2136
Epoch 25/100, Validation Loss: 0.3342
Epoch 26/100, Validation Loss: 0.1519
Epoch 27/100, Validat

## Demo 2 - cross validation
For simplicity, a 5-fold cross validation with random splits is performed. The following
function should also support other types of cross validation, as long as `cv` provides a
`split` method that generates the training and validation data indices.

In [4]:
# Splitter for a shuffled 5-fold cross validation; the fixed `random_state`
# keeps the fold assignment reproducible between runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# One fixed configuration is evaluated across all folds:
#   - `model_hparams` are forwarded to the model,
#   - `train_hparams` are forwarded to the training routine.
cv_model_hparams = {"n_layers": 3}
cv_train_hparams = {"num_epochs": 30, "lr": 0.01}

# `cross_validation` returns a list holding the score of each fold.
scores = cross_validation(
    dataset,
    model_class=MLP,
    cv=kfold,
    model_hparams=cv_model_hparams,
    train_hparams=cv_train_hparams,
)

Epoch 1/30, Validation Loss: 6.6740
Epoch 2/30, Validation Loss: 6.5747
Epoch 3/30, Validation Loss: 6.4401
Epoch 4/30, Validation Loss: 6.2160
Epoch 5/30, Validation Loss: 5.8937
Epoch 6/30, Validation Loss: 5.4610
Epoch 7/30, Validation Loss: 4.9018
Epoch 8/30, Validation Loss: 4.2570
Epoch 9/30, Validation Loss: 3.5264


Epoch 10/30, Validation Loss: 2.7533
Epoch 11/30, Validation Loss: 2.0254
Epoch 12/30, Validation Loss: 1.3523
Epoch 13/30, Validation Loss: 0.7843
Epoch 14/30, Validation Loss: 0.5997
Epoch 15/30, Validation Loss: 0.6015
Epoch 16/30, Validation Loss: 0.3562
Epoch 17/30, Validation Loss: 0.3866
Epoch 18/30, Validation Loss: 0.1909
Epoch 19/30, Validation Loss: 0.2655
Epoch 20/30, Validation Loss: 0.1396
Epoch 21/30, Validation Loss: 0.1963
Epoch 22/30, Validation Loss: 0.1192
Epoch 23/30, Validation Loss: 0.2037
Epoch 24/30, Validation Loss: 0.1103
Epoch 25/30, Validation Loss: 0.2041
Epoch 26/30, Validation Loss: 0.1053
Epoch 27/30, Validation Loss: 0.1437
Epoch 28/30, Validation Loss: 0.1007
Epoch 29/30, Validation Loss: 0.1295
Epoch 30/30, Validation Loss: 0.0804
Epoch 1/30, Validation Loss: 9.3524
Epoch 2/30, Validation Loss: 9.2865
Epoch 3/30, Validation Loss: 9.1508
Epoch 4/30, Validation Loss: 8.9415
Epoch 5/30, Validation Loss: 8.6058
Epoch 6/30, Validation Loss: 8.1314
Epoch 7

In [5]:
# Display the per-fold scores returned by `cross_validation`
# (one entry for each of the 5 folds defined by `kfold`).
print(scores)

[0.08038657307624816, 0.1632515549659729, 0.09336517453193664, 0.06887156963348388, 0.26986472606658934]


## Demo 3 - nested cross validation

This example demonstrates nested cross validation. The inner cross validation
is a 5-fold cross validation, and the outer cross validation is a 3-fold cross validation.
A grid search is used to find the best hyperparameters in the inner CV:
the hyperparameters giving the lowest average validation loss are kept.
Once the best hyperparameters are found, the model is retrained and evaluated on the test data
of the outer CV. The function below chains these steps together and outputs the
scores, models, and best hyperparameters of each outer fold.




In [6]:
# Splitters for the nested scheme: 3 outer folds for evaluation and
# 5 inner folds for the hyperparameter search.
k_fold_outer = KFold(n_splits=3, shuffle=True, random_state=42)
k_fold_inner = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter grids explored by the nested grid-search CV.
# Each grid is a dictionary mapping a hyperparameter name to its candidate
# values; pass None for a grid that should not be explored.
# Here the search space covers `hidden_dim` (model) and `num_epochs`
# (training). Because this is a toy example, `num_epochs` is kept very small
# to avoid an excessively long running time.
hidden_dim_grid = {"hidden_dim": [5, 10]}
num_epochs_grid = {"num_epochs": [5, 10]}

# Run the nested grid-search cross validation.
scores, models, hparams = nested_gridsearch_cv(
    dataset,
    model_class=MLP,
    inner_cv=k_fold_inner,
    outer_cv=k_fold_outer,
    model_hparams_grid=hidden_dim_grid,
    train_hparams_grid=num_epochs_grid,
)

Outer Fold 0
Current hyperparameters:
Model hyperparameters: {'hidden_dim': 5}
Training hyperparameters: {'num_epochs': 5}
Epoch 1/5, Validation Loss: 5.1606
Epoch 2/5, Validation Loss: 5.1574
Epoch 3/5, Validation Loss: 5.1541
Epoch 4/5, Validation Loss: 5.1500
Epoch 5/5, Validation Loss: 5.1459
Epoch 1/5, Validation Loss: 14.6394
Epoch 2/5, Validation Loss: 14.6307
Epoch 3/5, Validation Loss: 14.6220
Epoch 4/5, Validation Loss: 14.6134
Epoch 5/5, Validation Loss: 14.6047
Epoch 1/5, Validation Loss: 15.0959
Epoch 2/5, Validation Loss: 15.0871
Epoch 3/5, Validation Loss: 15.0780
Epoch 4/5, Validation Loss: 15.0694
Epoch 5/5, Validation Loss: 15.0608
Epoch 1/5, Validation Loss: 13.3657
Epoch 2/5, Validation Loss: 13.3610
Epoch 3/5, Validation Loss: 13.3563
Epoch 4/5, Validation Loss: 13.3517
Epoch 5/5, Validation Loss: 13.3474
Epoch 1/5, Validation Loss: 19.5980
Epoch 2/5, Validation Loss: 19.5733
Epoch 3/5, Validation Loss: 19.5484
Epoch 4/5, Validation Loss: 19.5239
Epoch 5/5, Validat

In [7]:
# Display the scores returned by `nested_gridsearch_cv`
# (per the description above, one score for each outer fold).
print(scores)

[4.8758854585535385, 5.414452755089962, 5.383980028557055]
