In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from tqdm import tqdm
from copy import deepcopy
from ax import optimize
from ax.plot.contour import plot_contour
from ax.utils.notebook.plotting import init_notebook_plotting, render
from ax.plot.trace import optimization_trace_single_method

import json
import torch
mps_device = torch.device("mps")

## Simple Linear Dataset

Let's start with a ground-truth linear model that we know a linear model should be able to recover.

In [3]:
# generate a dataset
NROW = 1000
NCOL = 5

random.seed(1010)

data = np.random.rand(NROW, NCOL)
X = pd.DataFrame(data, columns=[f"col_{i}" for i in range(1, NCOL+1)])
y = X["col_1"] + 2 * X["col_2"] + 3 * X["col_3"] + 4 * X["col_4"] + 5 * X["col_5"] + np.random.uniform(0, 0.5, NROW)

In [4]:
# train-test split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1010)

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

#### Optimize/Train the Model on the Training Set (and report optimal hyperparams)

In [6]:
# code adapted from Mohammed's q2 optimization notebook
model_dict = {"RandomForestRegressor":{"model":RandomForestRegressor,
					"params":[{"name":"n_estimators", "type":"range", "bounds":[2,200]},
							{"name":"max_depth", "type":"range", "bounds":[1,10]},
							{"name":"min_samples_leaf", "type":"range", "bounds":[1,5]}]},
			   "ElasticNet":{ "model": ElasticNet,
					"params":[{"name": "alpha", "type": "range", "bounds":[0.001,1.0]},
							{"name": "l1_ratio", "type": "range", "bounds":[0.0,1.0]},
							{"name": "max_iter", "type": "range", "bounds":[200, 2000]},
							{"name": "selection", "type": "choice", "values":["cyclic", "random"]}
						]}
			}

def q2_baseline_models(estimator, X, y):
    baseline_models = []
    for loo_index in X.index:
        if hasattr(estimator, "random_state"):
            estimator.random_state = loo_index	
        baseline_models.append(deepcopy(estimator.fit(X=X.drop(loo_index).values, y=y.drop(loo_index).values)))
        
    return baseline_models

# This function computes q^2 (used as evaluation for the models)
def q2_score(estimator, X, y):
    models = q2_baseline_models(estimator, X, y)
    q2_means = []
    q2_preds = []
    
    for loo_index in X.index:
        q2_means.append(y.drop(loo_index).mean())
        q2_preds.append(models[loo_index].predict(np.array(X.iloc[loo_index]).reshape(1,-1))[0])

    q2 = 1 - np.sum((np.array(q2_preds) - np.array(y))**2) / np.sum((np.array(q2_means) - np.array(y))**2)
    
    return q2

def get_optimal_model(X, y, model_type, total_trials):
    best_parameters, best_values, experiment, model = optimize(
        parameters= model_dict[model_type]["params"],
        evaluation_function=lambda p: q2_score(model_dict[model_type]["model"](**p), X, y),
        minimize=False,
        total_trials=total_trials,
    )
    optimal_parameters = best_parameters 
    optimization_q2 = best_values[0]["objective"]

    return(optimal_parameters, optimization_q2)

In [7]:
# print the training q^2 on the training dataset
optimal_params, optimal_q2 = get_optimal_model(X_train, y_train, model_type = "ElasticNet", total_trials = 10)

[INFO 03-26 15:53:55] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter alpha. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 03-26 15:53:55] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter l1_ratio. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 03-26 15:53:55] ax.service.utils.instantiation: Inferred value type of ParameterType.INT for parameter max_iter. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 03-26 15:53:55] ax.service.utils.instantiation: Inferred value type of ParameterType.STRING for parameter selection. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in par

In [8]:
optimal_q2

0.9954762787725429

In [9]:
optimal_params

{'alpha': 0.001,
 'l1_ratio': 0.5135116582779522,
 'max_iter': 1925,
 'selection': 'cyclic'}

#### Evaluate the model on the true held-out data

In [10]:
# print the evaluation q^2 on the evaluation dataset
model_optimal = ElasticNet(alpha = optimal_params["alpha"],
                           l1_ratio= optimal_params["l1_ratio"],
                           max_iter = optimal_params["max_iter"],
                           selection = optimal_params["selection"])

In [12]:
# confirm that the model's coefficients recover the ground truth
model_optimal.coef_

array([1.01194055, 2.03002156, 2.9994982 , 3.9544421 , 4.94892631])

No surprise --- we found the ground truth coefficients, so it works out-of-sample.

In [11]:
q2_score(model_optimal, X_test, y_test)

0.9952759010180788

## Complex Dataset with Omitted Variables and Nonlinearities

Let's create a dataset with nonlinear relationships, omitted variables, and lots of noise --- making it more likely that the original model will overfit.

In [38]:
# generate a dataset
NROW = 1000
NCOL = 5

random.seed(1010)

data = np.random.rand(NROW, NCOL)
X = pd.DataFrame(data, columns=[f"col_{i}" for i in range(1, NCOL+1)])
y = X["col_1"] + 2 * X["col_2"] + 3 * X["col_3"]**3 + 4 * X["col_4"] * X["col_5"] + np.random.uniform(0, 2, NROW)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1010)

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# drop cols 1 and 5 so that we have omitted variables
X_train = X_train.drop(["col_1", "col_5"], axis = 1)
X_test = X_test.drop(["col_1", "col_5"], axis = 1)

In [39]:
optimal_params, optimal_q2 = get_optimal_model(X_train, y_train, model_type = "ElasticNet", total_trials = 10)

[INFO 03-26 16:04:43] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter alpha. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 03-26 16:04:43] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter l1_ratio. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 03-26 16:04:43] ax.service.utils.instantiation: Inferred value type of ParameterType.INT for parameter max_iter. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 03-26 16:04:43] ax.service.utils.instantiation: Inferred value type of ParameterType.STRING for parameter selection. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in par

In [40]:
optimal_q2

0.40186159840319036

In [41]:
optimal_params

{'alpha': 0.09307539881655244,
 'l1_ratio': 0.3760842071755143,
 'max_iter': 569,
 'selection': 'cyclic'}

In [42]:
# print the evaluation q^2 on the evaluation dataset
model_optimal = ElasticNet(alpha = optimal_params["alpha"],
                           l1_ratio= optimal_params["l1_ratio"],
                           max_iter = optimal_params["max_iter"],
                           selection = optimal_params["selection"])

The overall Q^2 is now much lower, but we're not seeing much of a bias.

In [43]:
q2_score(model_optimal, X_test, y_test)

0.40435696458863113