In [10]:
# Reload imported modules automatically before each cell executes, so edits to
# the local patch_gnn package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Experiment with hyperparameter tuning
Hyperparameters are tuned with [Optuna](https://optuna.org/).

In [28]:
from patch_gnn.data import load_ghesquire
import pandas as pd
from pyprojroot import here
import pickle as pkl
from patch_gnn.splitting import train_test_split
from jax import random
from patch_gnn.seqops import one_hot
from patch_gnn.unirep import unirep_reps
from patch_gnn.graph import graph_tensors
from patch_gnn.models import MPNN, DeepMPNN, DeepGAT
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score as evs
import matplotlib.pyplot as plt 
from sklearn.metrics import mean_squared_error as mse
import pickle as pkl
from patch_gnn.graph import met_position
import seaborn as sns
from jax.config import config
import jax.numpy as np
import optuna
from typing import Dict
# Turn on JAX's NaN debugging so a computation that produces NaN raises an error
# instead of silently propagating it into the loss values.
config.update("jax_debug_nans", True)

#### Simulate data for MPNN model and Random forest model

In [43]:
# Simulated inputs for smoke-testing the tuning helpers below; every array is
# random Gaussian noise, so the resulting losses carry no scientific meaning.
key = random.PRNGKey(490)
k1, k2, k3, k4 = random.split(key, num=4)

n_samples = 251  # number of simulated examples shared by every array below
n_node = 5
n_feature = 13
n_rf_feature = 630  # width of the flat feature matrix fed to the random forest
node_feature_shape = (n_node, n_feature)
n_adjacency = 1
num_training = 5

# Graph-style input for the MPNN: one (adjacency, node_features) pair per sample.
adjacency = random.normal(k1, (n_samples, n_node, n_node, n_adjacency))
node_features = random.normal(k2, (n_samples, n_node, n_feature))
X = (adjacency, node_features)

# Flat input for the random forest, plus a shared target column.
X_rf = random.normal(k4, (n_samples, n_rf_feature))
y = random.normal(k3, (n_samples, 1))

mpnn_model = MPNN(
    node_feature_shape=node_feature_shape,
    num_adjacency=n_adjacency,
    num_training_steps=num_training,
    optimizer_step_size=1e-5,
)


#### First, test evotune on MPNN-class models

In [44]:
# Build the MPNN instance that is handed to the tuning helper below.
mpnn_model = MPNN(
    node_feature_shape=node_feature_shape,
    num_adjacency=n_adjacency,
    num_training_steps=num_training,
    optimizer_step_size=1e-5,
)
    

In [50]:
def evotune_mpnn_class(model, X, y, num_training_steps_kwargs: Dict = {}, optimizer_step_size_kwargs: Dict = {}, n_trials: int = 10):
    """
    Tune hyperparameters of an MPNN-class model with Optuna.

    One can choose to tune num_training_steps (number of epochs) and/or
    optimizer_step_size (learning rate). Each trial samples the requested
    hyperparameters, refits the model on (X, y) and reports the entry of
    ``loss_history`` at index ``num_training_steps - 1``. The score is therefore
    computed on the same data the model was fitted on, not on held-out data.

    :param model: a callable model object. ``model(param_dict=...)`` must return an
                  object exposing ``fit(X, y)``, ``loss_history`` and
                  ``num_training_steps``.
    :param X: input data for MPNN class models
    :param y: outcome associated with input data, should be of shape (sample_size,),
              use np.squeeze(y) if not
    :param num_training_steps_kwargs: keyword arguments forwarded to
                                ``trial.suggest_int``, e.g.
                                num_training_steps_kwargs = {
                                "name": "num_training_steps",  # key used in the returned dict
                                "low": 10,   # one can change it
                                "high": 14,  # one can change it
                                "log": True  # one can change it
                            }
                            An empty dict (the default) or None means this
                            hyperparameter is not tuned.
    :param optimizer_step_size_kwargs: keyword arguments forwarded to
                                ``trial.suggest_float``, e.g.
                                optimizer_step_size_kwargs = {
                                "name": "optimizer_step_size",  # key used in the returned dict
                                "low": 1e-5,  # one can change the value
                                "high": 1e-2, # one can change the value
                            }
                            ``"log": True`` may be added to sample on a log scale.
                            An empty dict (the default) or None means this
                            hyperparameter is not tuned.
    :param n_trials: number of experiments for optuna to run, each experiment is
                     associated with one hyperparameter combination

    :return: ``study.best_trial.params`` -- the sampled hyperparameter combination
             with the lowest reported loss among the trials, keyed by the
             ``"name"`` entries of the kwargs dictionaries.
    :raises ValueError: if neither hyperparameter is requested for tuning.
    """
    # Truthiness (rather than len()) also treats None as "do not tune this one".
    if not num_training_steps_kwargs and not optimizer_step_size_kwargs:
        raise ValueError("The hyperparameters to optimize cannot be empty")

    def objective(trial):
        # Only the hyperparameters the caller asked for are sampled and overridden.
        param_dict = {}
        if num_training_steps_kwargs:
            param_dict["num_training_steps"] = trial.suggest_int(**num_training_steps_kwargs)
        if optimizer_step_size_kwargs:
            # suggest_float supersedes the deprecated suggest_uniform and accepts
            # the same name/low/high keyword arguments.
            param_dict["optimizer_step_size"] = trial.suggest_float(**optimizer_step_size_kwargs)

        print(f"The params that were optimized is {param_dict}")
        # the model object is callable and takes a dict argument to update the parameters
        mpnn_obj = model(param_dict=param_dict)

        loss_history = mpnn_obj.fit(X, y).loss_history
        print(f"num_training_step is {mpnn_obj.num_training_steps}")
        # Loss recorded at the last training step of this fit.
        return loss_history[mpnn_obj.num_training_steps - 1]

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_trial.params

In [52]:
# Search space for the number of training steps, sampled on a log scale.
num_training_steps_kwargs = {
    "name": "num_training_steps",  # label under which Optuna reports the parameter
    "low": 4,
    "high": 500,
    "log": True,
}
# Learning-rate tuning is switched off for this run: an empty dict tells
# evotune_mpnn_class to leave optimizer_step_size untouched. To tune it as well,
# pass e.g. {"name": "optimizer_step_size", "low": 1e-5, "high": 1e-2}.
optimizer_step_size_kwargs = {}
# NOTE(review): the evotune_mpnn_class docstring asks for y of shape (sample_size,),
# while the simulated y has shape (251, 1) -- confirm which shape MPNN.fit expects.
evotune_mpnn_class(mpnn_model, X, y, num_training_steps_kwargs, optimizer_step_size_kwargs)

[32m[I 2021-07-01 20:42:45,624][0m A new study created in memory with name: no-name-64b67250-b70b-4760-97bb-e731cf5f0529[0m


The params that were optimized is {'num_training_steps': 229}


  0%|          | 0/229 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:42:53,060][0m Trial 0 finished with value: 0.9240721464157104 and parameters: {'num_training_steps': 229}. Best is trial 0 with value: 0.9240721464157104.[0m


num_training_step is 229
The params that were optimized is {'num_training_steps': 89}


  0%|          | 0/89 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:42:56,151][0m Trial 1 finished with value: 1.0165961980819702 and parameters: {'num_training_steps': 89}. Best is trial 0 with value: 0.9240721464157104.[0m


num_training_step is 89
The params that were optimized is {'num_training_steps': 110}


  0%|          | 0/110 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:42:59,689][0m Trial 2 finished with value: 1.5875941514968872 and parameters: {'num_training_steps': 110}. Best is trial 0 with value: 0.9240721464157104.[0m


num_training_step is 110
The params that were optimized is {'num_training_steps': 8}


  0%|          | 0/8 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:43:00,529][0m Trial 3 finished with value: 1.4208685159683228 and parameters: {'num_training_steps': 8}. Best is trial 0 with value: 0.9240721464157104.[0m


num_training_step is 8
The params that were optimized is {'num_training_steps': 6}


  0%|          | 0/6 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:43:01,544][0m Trial 4 finished with value: 1.4959734678268433 and parameters: {'num_training_steps': 6}. Best is trial 0 with value: 0.9240721464157104.[0m


num_training_step is 6
The params that were optimized is {'num_training_steps': 348}


  0%|          | 0/348 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:43:13,087][0m Trial 5 finished with value: 0.91261225938797 and parameters: {'num_training_steps': 348}. Best is trial 5 with value: 0.91261225938797.[0m


num_training_step is 348
The params that were optimized is {'num_training_steps': 125}


  0%|          | 0/125 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:43:17,607][0m Trial 6 finished with value: 1.582926630973816 and parameters: {'num_training_steps': 125}. Best is trial 5 with value: 0.91261225938797.[0m


num_training_step is 125
The params that were optimized is {'num_training_steps': 15}


  0%|          | 0/15 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:43:18,646][0m Trial 7 finished with value: 1.6999303102493286 and parameters: {'num_training_steps': 15}. Best is trial 5 with value: 0.91261225938797.[0m


num_training_step is 15
The params that were optimized is {'num_training_steps': 161}


  0%|          | 0/161 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:43:24,243][0m Trial 8 finished with value: 1.4698224067687988 and parameters: {'num_training_steps': 161}. Best is trial 5 with value: 0.91261225938797.[0m


num_training_step is 161
The params that were optimized is {'num_training_steps': 32}


  0%|          | 0/32 [00:00<?, ?it/s]

[32m[I 2021-07-01 20:43:25,794][0m Trial 9 finished with value: 2.4718165397644043 and parameters: {'num_training_steps': 32}. Best is trial 5 with value: 0.91261225938797.[0m


num_training_step is 32


{'num_training_steps': 348}

#### Then, test evotune on random forest models

In [29]:
# Optuna search spaces for the random forest; both are integer ranges sampled on
# a log scale and are forwarded verbatim to trial.suggest_int by evotune_rf.
rf_max_depth_kwargs = dict(name="rf_max_depth", low=2, high=32, log=True)
rf_n_estimator_kwargs = dict(name="n_estimators", low=64, high=128, log=True)

def evotune_rf(X, y, rf_max_depth_kwargs: Dict = {}, rf_n_estimator_kwargs: Dict = {}, n_trials: int = 20):
    """
    Tune hyperparameters of a random forest regressor with Optuna.

    One can choose to tune max_depth and/or n_estimators (number of trees).
    Each trial fits a ``RandomForestRegressor`` on (X, y) and reports the mean
    squared error of its predictions on the same X, i.e. a training-set error.

    :param X: input data for random forest models
    :param y: outcome associated with input data, should be of shape (sample_size,),
              use np.squeeze(y) if not
    :param rf_max_depth_kwargs: keyword arguments forwarded to
                                ``trial.suggest_int``, e.g.
                                rf_max_depth_kwargs = {
                                "name": "rf_max_depth",  # key used in the returned dict
                                "low": 2,    # one can change the value
                                "high": 32,  # one can change the value
                                "log": True  # sample on a log scale
                            }
                            An empty dict (the default) or None means max_depth
                            is not tuned.
    :param rf_n_estimator_kwargs: keyword arguments forwarded to
                                ``trial.suggest_int``, e.g.
                                rf_n_estimator_kwargs = {
                                "name": "n_estimators",  # key used in the returned dict
                                "low": 64,   # one can change the value
                                "high": 128, # one can change the value
                                "log": True  # sample on a log scale
                            }
                            An empty dict (the default) or None means
                            n_estimators is not tuned.
    :param n_trials: number of experiments for optuna to run, each experiment is
                     associated with one hyperparameter combination

    :return: ``study.best_trial.params`` -- the sampled hyperparameter combination
             with the lowest reported error among the trials. Keys are the
             ``"name"`` entries of the kwargs dictionaries (e.g. ``"rf_max_depth"``),
             which are not necessarily the estimator's own keyword names.
    :raises ValueError: if neither hyperparameter is requested for tuning.
    """
    # Truthiness (rather than len()) also treats None as "do not tune this one".
    if not rf_max_depth_kwargs and not rf_n_estimator_kwargs:
        raise ValueError("The hyperparameters to optimize cannot be empty")

    # Imported locally so this function does not rely on an import that only
    # happens in a later notebook cell.
    from sklearn.metrics import mean_squared_error

    def objective(trial):
        # Only the hyperparameters the caller asked for are sampled and overridden.
        param_dict = {}
        if rf_max_depth_kwargs:
            param_dict["max_depth"] = trial.suggest_int(**rf_max_depth_kwargs)
        if rf_n_estimator_kwargs:
            param_dict["n_estimators"] = trial.suggest_int(**rf_n_estimator_kwargs)

        print(f"The params that were optimized {param_dict}")
        # A fresh estimator is built for every trial.
        rf_obj = RandomForestRegressor(oob_score=True, n_jobs=-1, **param_dict)
        rf_obj.fit(X, y)

        # Error on the data the forest was fitted on.
        return mean_squared_error(y, rf_obj.predict(X))

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_trial.params

In [30]:
# experiment with random forest tuning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [31]:

# Tune both random-forest hyperparameters on the simulated data; y is squeezed
# before being passed, as the evotune_rf docstring requests.
evotune_rf(
    X_rf,
    np.squeeze(y),
    rf_max_depth_kwargs=rf_max_depth_kwargs,
    rf_n_estimator_kwargs=rf_n_estimator_kwargs,
    n_trials=4,
)

[32m[I 2021-07-01 20:14:41,073][0m A new study created in memory with name: no-name-c68cd51d-0886-47d1-b9b0-0dec90ecf432[0m
[32m[I 2021-07-01 20:14:41,253][0m Trial 0 finished with value: 0.5664913199420887 and parameters: {'rf_max_depth': 3, 'n_estimators': 87}. Best is trial 0 with value: 0.5664913199420887.[0m


The params that were optimized {'max_depth': 3, 'n_estimators': 87}
The params that were optimized {'max_depth': 21, 'n_estimators': 110}


[32m[I 2021-07-01 20:14:41,525][0m Trial 1 finished with value: 0.13180692704174113 and parameters: {'rf_max_depth': 21, 'n_estimators': 110}. Best is trial 1 with value: 0.13180692704174113.[0m


The params that were optimized {'max_depth': 6, 'n_estimators': 126}


[32m[I 2021-07-01 20:14:42,085][0m Trial 2 finished with value: 0.23002004743304078 and parameters: {'rf_max_depth': 6, 'n_estimators': 126}. Best is trial 1 with value: 0.13180692704174113.[0m


The params that were optimized {'max_depth': 15, 'n_estimators': 93}


[32m[I 2021-07-01 20:14:42,344][0m Trial 3 finished with value: 0.13942848849655806 and parameters: {'rf_max_depth': 15, 'n_estimators': 93}. Best is trial 1 with value: 0.13180692704174113.[0m


{'rf_max_depth': 21, 'n_estimators': 110}