# Some minimal hyperparameter tuning to find an MLP model that perfroms well on all the datasets used

In [1]:
import itertools
import os

import numpy as np
import pandas as pd
import torch
from opendataval.dataloader import DataFetcher
from opendataval.model import ClassifierMLP, RegressionMLP
from torch import nn
from tqdm import tqdm

from src.baseline.run_baselines_classification import DATA_DIR
from src.LossVal.run_LossVal import REGRESSION_DATASETS, CLASSIFICATION_DATASETS



In [4]:
# Define search space:
NUMBER_OF_HIDDEN_LAYERS = list(range(1, 6))     # 1, 2, 3, 4, 5
HIDDEN_LAYER_SIZE = list(range(10, 101, 10))    # 10, 20, 30, ..., 100
LEARNING_RATE = [0.001, 0.01, 0.1]
BATCH_SIZE = [32, 64, 128]
INNER_ACTIVATION_FUNCTION = [nn.ReLU(), nn.Tanh(), nn.Sigmoid()]

# Fixed hyperparameters:
EPOCHS = 5
DATASET_SIZE = 1000     # Number of samples to use from each dataset (as defined in the OpenDataVal paper)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
NR_REPETITIONS = 3
RESULTS_DIR = os.path.join(os.path.abspath("../../results/hyperparameter_tuning/"))

print("Device:", DEVICE)

Device: cuda


In [3]:
def r2_score(y_true, y_pred):
    # Ensure the tensors are of the same shape
    y_true = y_true.flatten()
    y_pred = y_pred.flatten()
    
    # Calculate the residual sum of squares (SS_res)
    ss_res = torch.sum((y_true - y_pred) ** 2)
    
    # Calculate the total sum of squares (SS_tot)
    y_true_mean = torch.mean(y_true)
    ss_tot = torch.sum((y_true - y_true_mean) ** 2)
    
    # Calculate R2 score
    r2 = 1 - ss_res / ss_tot
    
    return r2.item()

def accuracy(y_true, y_pred):
    # Ensure the tensors are of the same shape
    y_true = y_true.flatten()
    y_pred = y_pred.flatten()
    
    # Calculate the accuracy
    acc = torch.sum(y_true == y_pred) / len(y_true)
    
    return acc.item()

In [4]:
def build_and_evaluate_MLP(nr_hidden_layers, hidden_layer_size, learning_rate, batch_size, activation_function):
    # == Classification datasets ==
    accuracy_hist = []
    for dataset in CLASSIFICATION_DATASETS:
        # Load data
        fetcher = DataFetcher(dataset_name=dataset, cache_dir=DATA_DIR, force_download=False)
        fetcher = fetcher.split_dataset_by_count(1000, 0, 500)    # Use 1000 samples for training and 500 for testing
        x_trn, y_trn, _, _, x_test, y_test = fetcher.datapoints
        x_trn, x_test, y_trn, y_test = x_trn.to(DEVICE), x_test.to(DEVICE), y_trn.to(DEVICE), y_test.to(DEVICE)
        
        for i in range(NR_REPETITIONS):
            model = ClassifierMLP(
                input_dim=fetcher.covar_dim[0],
                num_classes=fetcher.label_dim[0],
                hidden_dim=hidden_layer_size,
                act_fn=activation_function,
                layers=nr_hidden_layers
            )
            model.to(DEVICE)
            
            # Train model
            model.fit(x_trn, y_trn, epochs=EPOCHS, batch_size=batch_size, lr=learning_rate)
            # Evaluate model
            y_pred = model.predict(x_test)
            accuracy_res = accuracy(y_test, y_pred)#.cpu().numpy()
            accuracy_hist.append(accuracy_res)
            # print(f"Accuracy on {dataset}: {accuracy_res}")

    # == Regression datasets ==
    r2_score_hist = []
    for dataset in REGRESSION_DATASETS:
        # Load data
        fetcher = DataFetcher(dataset_name=dataset, cache_dir=DATA_DIR, force_download=False)
        fetcher = fetcher.split_dataset_by_count(1000, 0, 500)    # Use 1000 samples for training and 500 for testing
        x_trn, y_trn, _, _, x_test, y_test = fetcher.datapoints
        x_trn, x_test, y_trn, y_test = x_trn.to(DEVICE), x_test.to(DEVICE), y_trn.to(DEVICE), y_test.to(DEVICE)
        
        for i in range(NR_REPETITIONS):
            model = RegressionMLP(
                input_dim=fetcher.covar_dim[0],
                num_classes=fetcher.label_dim[0],
                hidden_dim=hidden_layer_size,
                act_fn=activation_function,    
                layers=nr_hidden_layers,
            )
            model.to(DEVICE)
            
            # Train model
            model.fit(x_trn, y_trn, epochs=EPOCHS, batch_size=batch_size, lr=learning_rate)
            # Evaluate model on test data
            y_pred = model.predict(x_test)
            r2_score_res = r2_score(y_test, y_pred)#.cpu().numpy()
            r2_score_hist.append(r2_score_res)
            # print(f"R2 score on {dataset}: {r2_score_res}")
    
    average_accuracy = np.mean(accuracy_hist)
    average_r2_score = np.mean(r2_score_hist)
    
    # print(f"Average accuracy: {average_accuracy}")
    # print(f"Average R2 score: {average_r2_score}")
        
    return average_accuracy, average_r2_score 

In [5]:
# All configurations as list of tuples to try for hyperparameter tuning:
all_configs = list(itertools.product(NUMBER_OF_HIDDEN_LAYERS, HIDDEN_LAYER_SIZE, LEARNING_RATE, BATCH_SIZE, INNER_ACTIVATION_FUNCTION))

print("Number of configurations to try:", len(list(all_configs)))

Number of configurations to try: 1350


In [6]:
# Hyperparameter tuning
results = []
for config in tqdm(all_configs):
    average_accuracy, average_r2_score = build_and_evaluate_MLP(*config)
    results.append((config, average_accuracy, average_r2_score))

100%|██████████| 1350/1350 [2:31:45<00:00,  6.74s/it] 


In [7]:
result_list_to_dataframe = pd.DataFrame(results, columns=['config', 'accuracy', 'r2_score'])
result_list_to_dataframe.to_csv(RESULTS_DIR + 'hyperparameters_for_opendataval.csv', index=False)

In [51]:
result_list_to_dataframe = pd.read_csv(RESULTS_DIR + '/hyperparameters_for_opendataval.csv')

In [57]:
best_acc = best_acc.sort_values(["accuracy", "r2_score"], ascending=False)[:200]

In [58]:
best_r2 = result_list_to_dataframe.sort_values(["r2_score", "accuracy"], ascending=False)[:200]

In [60]:
print("Best configurations by accuracy:")
best_acc.head(5)

Best configurations by accuracy:


Unnamed: 0,config,accuracy,r2_score
801,"[3, 100, 0.1, 32, ReLU()]",0.453778,0.276318
804,"[3, 100, 0.1, 64, ReLU()]",0.446167,0.337212
777,"[3, 90, 0.1, 64, ReLU()]",0.423778,0.331787
747,"[3, 80, 0.1, 32, ReLU()]",0.419056,0.33899
807,"[3, 100, 0.1, 128, ReLU()]",0.377278,-0.678049


In [61]:
print("Best configurations by R2 score:")
best_r2.head(5)

Best configurations by R2 score:


Unnamed: 0,config,accuracy,r2_score
604,"(3, 30, 0.01, 32, Tanh())",0.0,0.698484
796,"(3, 100, 0.01, 64, Tanh())",0.0,0.690445
685,"(3, 60, 0.01, 32, Tanh())",0.0,0.690138
577,"(3, 20, 0.01, 32, Tanh())",0.0,0.68509
739,"(3, 80, 0.01, 32, Tanh())",0.0,0.684571
