In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import imblearn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import trange
import seaborn as sns
import os
import matplotlib.pyplot as plt
from io import BytesIO
import base64
import sklearn
import time
import datetime
import random

# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# deprecated the old names (later releases drop them entirely). Prefer the new
# name when this matplotlib ships it and fall back to the legacy name otherwise,
# so the cell runs without a deprecation warning or a missing-style error.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# Make float32 the default dtype for tensors created by torch from here on.
torch.set_default_dtype(torch.float32)

  plt.style.use("seaborn")


In [2]:
import pyhopper

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from loguru import logger

# Register log.txt as a loguru sink; each record is written as
# "<timestamp with milliseconds> | <message>". The call is the last expression
# of the cell, so its return value is what the notebook echoes as output.
logger.add("log.txt", format='{time:YYYY-MM-DD HH:mm:ss.SSS} | {message}')

1

In [4]:
from hypertab_benchmark_utils import *

# GLOBALS

In [6]:
# Name of the evaluation metric. NOTE(review): no later cell visible here reads
# GS_METRIC, and a star-imported module cannot see notebook globals — confirm
# whether this constant is still needed.
GS_METRIC = "balanced_accuracy"

In [7]:
# Seed handed to set_seed(...) at the start of each section and to the
# XGBoost / random-forest factory helpers.
SEED = 42

In [8]:
# Smoke-test switch. The recorded outputs below print "THIS IS TEST RUN" and
# schedule the search for 01:00 instead of TIME_BUDGET while this is True.
TEST_RUN = True

# The flag is exported as the string "True"/"False" through the environment.
# NOTE(review): this runs after hypertab_benchmark_utils was imported, so it only
# takes effect if the helpers read the variable at call time — confirm.
os.environ["HYPERTAB_TEST_RUN"] = str(TEST_RUN)

In [9]:
# Dataset identifier: passed to get_data(...) and used to name the metrics CSV.
DATA = "BreastCancer"
# Time budget string forwarded as `time=` to every pyhopper_best_params call.
TIME_BUDGET = "30m"
# Device forwarded to the torch-based model factories and searches.
DEVICE="cpu"

# Load data

In [10]:
# Seed before loading and splitting the data. set_seed is not defined in this
# notebook; presumably it comes from the hypertab_benchmark_utils star import —
# TODO confirm which RNGs it seeds.
set_seed(SEED)

In [11]:
# Load the complete (unsplit) features and labels for the dataset named by DATA.
# The underscore-prefixed names are only used until the initial split below.
_X, _y = get_data(DATA)

In [12]:
# Problem dimensions of the full dataset (the recorded output shows the helpers
# also print them). Both names are reassigned from the training split later on.
n_classes = get_n_classes(_X, _y)
n_features = get_n_features(_X, _y)

n_classes 2
n_features 30


In [13]:
# Per-class sample counts of the full dataset; the returned dict is displayed
# as the cell output.
get_each_class_counts(_X, _y)

class counts {0: 212, 1: 357}


{0: 212, 1: 357}

# Split

In [14]:
# Hold out a test set once; hyperparameter searches below only see the train part.
X_train, X_test, y_train, y_test = initial_split(_X, _y)

# (train, test) tuples, later handed to test_model as data=(X, y).
X = X_train, X_test
y = y_train, y_test

In [15]:
# Recompute the dimensions from the training split only; these values (not the
# full-dataset ones) are what the model factories below receive.
n_classes = get_n_classes(X_train, y_train)
n_features = get_n_features(X_train, y_train)

n_classes 2
n_features 30


In [16]:
# Per-class sample counts of the training split, displayed as the cell output.
get_each_class_counts(X_train, y_train)

class counts {0: 170, 1: 285}


{0: 170, 1: 285}

In [17]:
# Two training-set sizes derived from X_train: eval_train_max_size is used for
# the final test_model runs, train_max_size for the hyperparameter searches.
# How they are computed is internal to the helper — TODO confirm.
eval_train_max_size, train_max_size = get_eval_and_benchmark_size(X_train=X_train)

eval_max_size 455
train_max_size 341


# TRAIN MODELS

## Common hyperparams

In [18]:
class CommonHyperparams:
    """Search-space definitions shared by several model sections of this notebook.

    Each attribute is a pyhopper parameter object that the per-model
    ``param_grid`` dictionaries reference.
    """

    # Learning-rate candidates, one per decade.
    lr = pyhopper.choice([3e-5, 3e-4, 3e-3, 3e-2, 3e-1])

    # Integer batch size between 32 and 512.
    # NOTE(review): the two trailing positional arguments are presumably
    # init=32 and multiple_of=32 — confirm against the pyhopper.int signature.
    batch_size = pyhopper.int(32, 512, 32, 32)

    # Mask sizes are limited to values that do not exceed the feature count.
    ht_mask_size = pyhopper.choice(
        list(filter(lambda size: size <= n_features, [2, 5, 10, 20, 50, 90]))
    )

    ht_target_size = pyhopper.choice([5, 10, 20, 50])
    ht_mask_no = pyhopper.choice([50, 70, 100, 150, 200, 300])
    ht_epochs = pyhopper.choice([100, 200])
    

## Hypernetwork

In [19]:
# Re-seed at the start of the HyperTab section.
set_seed(SEED)

# Factory call: the returned callable is later invoked with hyperparameters as
# keyword arguments (network_hp_fn(**hp_best_params, verbose=True)).
network_hp_fn=get_parametrized_hypertab_fn(DEVICE=DEVICE, n_features=n_features, n_classes=n_classes)

#### Find hyperparams

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [20]:
# Search space for the hypernetwork, built from the shared CommonHyperparams
# parameter objects.
param_grid = dict(
    epochs=CommonHyperparams.ht_epochs,
    masks_no=CommonHyperparams.ht_mask_no,
    mask_size=CommonHyperparams.ht_mask_size,
    target_size=CommonHyperparams.ht_target_size,
    lr=CommonHyperparams.lr,
    batch_size=CommonHyperparams.batch_size,
)

# Tune on the training split only; the held-out test split is never passed here.
# The helper returns the selected parameters together with a search history.
hp_best_params, hp_history = pyhopper_best_params(
    model_fn=network_hp_fn,
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device=DEVICE,
    time=TIME_BUDGET,
)

# Display the selected hyperparameters as the cell output.
hp_best_params

THIS IS TEST RUN
| DEVICE: cpu
| model_fn network_hp_fn

pyhopper X.shape: (455, 30) y.shape: (455,) train_size: 341



  0%|                                                                                                                                                                                                                   | [00:00<?]


                                                                                                                                                                                                                                   




  0%|                                                                                                                                                                                                                   | [00:00<?]

Search is scheduled for 01:00 (m:s)
params {'epochs': 10, 'masks_no': 50, 'mask_size': 2, 'target_size': 5, 'lr': 3e-05, 'batch_size': 32}
iter 1 of 1 X_train shape torch.Size([341, 30])



Best f: 50 (out of 1 params): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| [02:50<00:00, 0.4 param/min]


Best f: 50 (out of 1 params): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| [02:50<00:00, 0.4 param/min]


Best f: 50 (out of 1 params): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| [02:50<00:00, 0.4 param/min]

Mode              : Best f : Steps : Time       
----------------  : ----   : ----  : ----       
Initial solution  : 50     : 1     : 02:50 (m:s)
----------------  : ----   : ----  : ----       
Total             : 50     : 1     : 02:50 (m:s)
BreastCancer_network_hp_fn_{'epochs': 10, 'masks_no': 50, 'mask_size': 2, 'target_size': 5, 'lr': 3e-05, 'batch_size': 32}





{'epochs': 10,
 'masks_no': 50,
 'mask_size': 2,
 'target_size': 5,
 'lr': 3e-05,
 'batch_size': 32}

In [21]:
# Redisplay the hyperparameters selected by the search in the previous cell.
hp_best_params

{'epochs': 10,
 'masks_no': 50,
 'mask_size': 2,
 'target_size': 5,
 'lr': 3e-05,
 'batch_size': 32}

#### Train using the best hyperparams

In [22]:
# Unpack the selected hyperparameters into module-level names.
# NOTE(review): no later cell reads these six names except a commented-out
# log_parameters call; the model below is built from hp_best_params directly.
epochs = hp_best_params['epochs']
masks_no = hp_best_params['masks_no']
mask_size = hp_best_params['mask_size']
target_size = hp_best_params['target_size']
batch_size = hp_best_params['batch_size']
lr = hp_best_params['lr']


# Evaluate the tuned hypernetwork on the (train, test) tuples; the recorded
# progress bar shows 10 steps for iters=10.
hyper_results = test_model(
    model_fn=network_hp_fn(**hp_best_params, verbose=True),
    data=(X, y),
    train_size=eval_train_max_size,
    iters=10
)

iter 1 of 1 X_train shape torch.Size([455, 30])



  0%|                                                                                                                                                                                                       | 0/10 [00:00<?, ?it/s]


 10%|███████████████████                                                                                                                                                                            | 1/10 [00:20<03:03, 20.38s/it]


 20%|██████████████████████████████████████▏                                                                                                                                                        | 2/10 [00:42<02:51, 21.50s/it]


 30%|█████████████████████████████████████████████████████████▎                                                                                                                                     | 3/10 [01:05<02:34, 22.07s/it]


 40%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                  | 4/10 [01:27<02:12, 22.04s/it]


 50%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 5/10 [01:48<01:48, 21.76s/it]


 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 6/10 [02:10<01:26, 21.71s/it]


 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 7/10 [02:31<01:04, 21.47s/it]


 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 8/10 [02:53<00:43, 21.75s/it]


 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                   | 9/10 [03:16<00:22, 22.23s/it]


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 22.45s/it]


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.98s/it]




In [23]:
# Print a "mean ~ std (max)" summary line for the hypernetwork results.
# NOTE(review): the recorded output shows a std of nan, which suggests the
# result frame held a single row in this test run — confirm.
print_mean_std_max(hyper_results, DATA)

metric balanced_accuracy
dataset_name BreastCancer
BreastCancer: 50.00 ~ nan (max: 50.00)


## NODE

In [24]:
# Re-seed at the start of the NODE section.
set_seed(SEED)

# Factory call: the returned callable is later invoked as node_fn(**node_best).
# X_train is passed in as well; what the factory does with it is not visible here.
node_fn=get_parametrized_node_fn(X_train=X_train, n_classes=n_classes, n_features=n_features, DEVICE=DEVICE)

#### Tune hyperparams

In [25]:
# 'layer_dim': hp.quniform('layer_dim', 100, 1200, 100),
# 'num_layers': hp.quniform('num_layers', 1, 4, 1),
# 'depth': hp.quniform('depth', 2, 7, 1)
                    
# NODE search space. Unlike the hypernetwork grid, batch_size here is a small
# explicit choice rather than CommonHyperparams.batch_size.
param_grid = {
    'layer_dim': pyhopper.int(64, 1024, power_of=2),
    'num_layers': pyhopper.int(1, 5),
    'depth': pyhopper.int(2, 7),
    'batch_size': pyhopper.choice([32, 64, 128]),
}

# Tune on the training split only. In the recorded run this cell was
# interrupted (CTRL+C), so node_best was never assigned there.
node_best, node_history = pyhopper_best_params(
    model_fn=node_fn,
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device=DEVICE,
    time=TIME_BUDGET
)

# Display the selected hyperparameters as the cell output.
node_best

THIS IS TEST RUN
| DEVICE: cpu
| model_fn node_fn

pyhopper X.shape: (455, 30) y.shape: (455,) train_size: 341



  0%|                                                                                                                                                                                                                   | [00:00<?]


                                                                                                                                                                                                                                   




  0%|                                                                                                                                                                                                                   | [00:00<?]

Search is scheduled for 01:00 (m:s)
params {'layer_dim': 256, 'num_layers': 3, 'depth': 4, 'batch_size': 32}
iter 1 of 1 X_train shape torch.Size([341, 30])


  warn("Data-aware initialization is performed on less than 1000 data points. This may cause instability."


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at /opt/conda/conda-bld/pytorch_1678402374358/work/torch/csrc/utils/python_arg_parser.cpp:1485.)
  exp_avg.mul_(beta1_adj).add_(1.0 - beta1_adj, d_p)


CTRL+C received. Will terminate once the currently running candidates finished




CTRL+C received. Will terminate once the currently running candidates finished


#### Use best hyperparams

In [None]:
%%capture --no-stdout

# Evaluate NODE with the tuned hyperparameters on the (train, test) tuples.
# Requires node_best from the search cell above.
node_results = test_model(
    model_fn=node_fn(**node_best),
    data=(X, y),
    train_size=eval_train_max_size,
    iters=10
)

In [None]:
# Print the summary line for the NODE results.
print_mean_std_max(node_results, DATA)

## Dropout Neural network

#### Find Hyperparams

### Dropout 1 layer

In [None]:
# Re-seed at the start of the 1-layer dropout network section.
set_seed(SEED)

# Factory call: the returned callable is later invoked as network_fn1(**params).
network_fn1=get_parametrized_dropout_net1(DEVICE=DEVICE, n_features=n_features, n_classes=n_classes)

In [None]:
# Search space for the 1-layer dropout network.
#
# The original literal listed "batch_size" twice. Python keeps only the last
# value for a repeated key, so CommonHyperparams.batch_size was silently
# discarded and the search always used choice([32, 64]). The dead entry is
# removed and the effective one is kept in the key's original position, which
# leaves the resulting dictionary (contents and key order) unchanged.
# NOTE(review): if the shared CommonHyperparams.batch_size range was the
# intended one, swap it in here deliberately.
param_grid = {
    "epochs": CommonHyperparams.ht_epochs,
    "lr": CommonHyperparams.lr,
    "batch_size": pyhopper.choice([32, 64]),
    "drop1": pyhopper.choice([0.1, 0.3, 0.5, 0.7], is_ordinal=True),
    "drop2": pyhopper.choice([0.1, 0.3, 0.5, 0.7], is_ordinal=True),
}

# Tune on the training split only; the held-out test split is never passed here.
nn_fn1_best_params, nn_fn1_history = pyhopper_best_params(
    model_fn=network_fn1,
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device=DEVICE,
    time=TIME_BUDGET,
)

# Display the selected hyperparameters as the cell output.
nn_fn1_best_params

In [None]:
# Evaluate the 1-layer dropout network with its tuned hyperparameters.
nn1_results = test_model(
    model_fn=network_fn1(**nn_fn1_best_params),
    data=(X, y),
    train_size=eval_train_max_size,
    iters=10
)

In [None]:
# Print the summary line for the 1-layer dropout network.
print_mean_std_max(nn1_results, DATA)

### Dropout 2 layers

In [None]:
# Re-seed at the start of the 2-layer dropout network section.
set_seed(SEED)

# Factory call: the returned callable is later invoked as network_fn2(**params).
network_fn2=get_parametrized_dropout_net2(DEVICE=DEVICE, n_features=n_features, n_classes=n_classes)

In [None]:
# Search space for the 2-layer dropout network: shared epochs / lr / batch_size
# plus three ordinal dropout-rate choices.
param_grid = {
                "epochs": CommonHyperparams.ht_epochs,
                "lr": CommonHyperparams.lr,
                "batch_size": CommonHyperparams.batch_size,
                "drop1": pyhopper.choice([0.1, 0.3, 0.5, 0.7], is_ordinal=True),
                "drop2": pyhopper.choice([0.1, 0.3, 0.5, 0.7], is_ordinal=True),
                "drop3": pyhopper.choice([0.1, 0.3, 0.5, 0.7], is_ordinal=True),
             }

# Tune on the training split only.
nn_fn2_best_params, nn_fn2_history = pyhopper_best_params(
    model_fn=network_fn2,
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device=DEVICE,
    time=TIME_BUDGET,
)
# Display the selected hyperparameters as the cell output.
nn_fn2_best_params

In [None]:
# Evaluate the 2-layer dropout network with its tuned hyperparameters.
nn2_results = test_model(
    model_fn=network_fn2(**nn_fn2_best_params),
    data=(X, y),
    train_size=eval_train_max_size,
    iters=10
)

In [None]:
# Print the summary line for the 2-layer dropout network.
print_mean_std_max(nn2_results, DATA)

### Dropout 3 layers

In [None]:
# Re-seed at the start of the 3-layer dropout network section.
set_seed(SEED)

# Factory call: the returned callable is later invoked as network_fn3(**params).
network_fn3=get_parametrized_dropout_net3(DEVICE=DEVICE, n_features=n_features, n_classes=n_classes)

In [None]:
# Search space for the 3-layer dropout network: shared epochs / lr / batch_size
# plus four ordinal dropout-rate choices.
param_grid = {
                "epochs": CommonHyperparams.ht_epochs,
                "lr": CommonHyperparams.lr,
                "batch_size": CommonHyperparams.batch_size,
                "drop1": pyhopper.choice([0.1, 0.3, 0.5, 0.7], is_ordinal=True),
                "drop2": pyhopper.choice([0.1, 0.3, 0.5, 0.7], is_ordinal=True),
                "drop3": pyhopper.choice([0.1, 0.3, 0.5, 0.7], is_ordinal=True),
                "drop4": pyhopper.choice([0.1, 0.3, 0.5, 0.7], is_ordinal=True),
             }

# Tune on the training split only.
nn_fn3_best_params, nn_fn3_history = pyhopper_best_params(
    model_fn=network_fn3,
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device=DEVICE,
    time=TIME_BUDGET,
)

# Display the selected hyperparameters as the cell output.
nn_fn3_best_params

In [None]:
# Evaluate the 3-layer dropout network with its tuned hyperparameters.
nn3_results = test_model(
    model_fn=network_fn3(**nn_fn3_best_params),
    data=(X, y),
    train_size=eval_train_max_size,
    iters=10
)

In [None]:
# Print the summary line for the 3-layer dropout network.
print_mean_std_max(nn3_results, DATA)

## Random Subspace

In [None]:
# Re-seed at the start of the random-subspace section.
set_seed(SEED)

# Factory call with no arguments; the returned callable is later invoked as
# get_bagged_fn(**bagged_best).
get_bagged_fn=get_parametrized_bagged_fn()

#### Tune hyperparams

In [None]:
# Random-subspace search space. The hidden-layer sizes reuse the hypernetwork's
# mask-size and target-size choices from CommonHyperparams.
# NOTE(review): presumably this keeps the base learners comparable in size to
# the hypernetwork's target networks — confirm.
param_grid = {
    "first_hidden_layer": CommonHyperparams.ht_mask_size,
    "second_hidden_layer": CommonHyperparams.ht_target_size,
    "batch_size": CommonHyperparams.batch_size,
    "learning_rate_init": CommonHyperparams.lr, 
    "max_iter": CommonHyperparams.ht_epochs
}

In [None]:
# Tune the random-subspace model on the training split, using the param_grid
# defined in the previous cell. The device is hard-coded to 'cpu' here instead
# of using DEVICE.
bagged_best, bagged_history = pyhopper_best_params(
    model_fn=get_bagged_fn, 
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device='cpu',
    time=TIME_BUDGET
)

#### Use best hyperparams

In [None]:
# Evaluate the random-subspace model with its tuned hyperparameters.
bagging_results = test_model(
    model_fn=get_bagged_fn(**bagged_best),
    data=(X, y),
    train_size=eval_train_max_size,
    iters=10
)

In [None]:
# Print the summary line for the random-subspace model.
print_mean_std_max(bagging_results, DATA)

## Ensembles

In [None]:
# Re-seed at the start of the ensembles section.
set_seed(SEED)

# Factory call with no arguments; the returned callable is later invoked as
# get_ensembles(**ensemble_best).
get_ensembles=get_parametrized_ensemble_fn()

#### Tune

In [None]:
# Ensemble search space: same keys as the random-subspace grid plus n_models,
# which reuses the hypernetwork's mask-count choices.
param_grid = {
    "n_models": CommonHyperparams.ht_mask_no,
    "first_hidden_layer": CommonHyperparams.ht_mask_size,
    "second_hidden_layer": CommonHyperparams.ht_target_size,
    "batch_size": CommonHyperparams.batch_size,
    "learning_rate_init": CommonHyperparams.lr, 
    "max_iter": CommonHyperparams.ht_epochs
}

In [None]:
# Tune the ensemble on the training split, using the param_grid defined in the
# previous cell. The device is hard-coded to 'cpu' here instead of using DEVICE.
ensemble_best, ensemble_history = pyhopper_best_params(
    model_fn=get_ensembles, 
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device='cpu',
    time=TIME_BUDGET
)

#### Use best

In [None]:
# Evaluate the ensemble with its tuned hyperparameters.
ensemble_results = test_model(
    model_fn=get_ensembles(**ensemble_best),
    data=(X, y),
    train_size=eval_train_max_size,
    iters=10
)

In [None]:
# Print the summary line for the ensemble.
print_mean_std_max(ensemble_results, DATA)

## XGBoost

In [None]:
# Re-seed at the start of the XGBoost section.
set_seed(SEED)

# Factory call; the seed is passed explicitly in addition to set_seed above.
get_xgboost = get_parametrized_xgboost_fn(seed=SEED)

#### Hyperparam tuning

In [None]:
# Stage 1 of the XGBoost search: tree-structure and learning-rate parameters.
param_grid = {
                'n_estimators': pyhopper.int(50, 3000, multiple_of=50),
                'max_depth': pyhopper.choice([2, 3, 5, 10, 15]),
                # log=True: the learning rate is searched on a logarithmic scale.
                'learning_rate': pyhopper.float(1e-5,1e-1, log=True),
                'min_child_weight': pyhopper.choice([1, 2, 4, 8, 16, 32]),
                'gamma': pyhopper.choice([0, 0.001, 0.1, 1]),
             }

# Tune on the training split only; the result is used as default_params for
# the second search stage in the next cell.
xgbt_best1, xgbt_history1 = pyhopper_best_params(
    model_fn=get_xgboost, 
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device='cpu',
    time=TIME_BUDGET
)

In [None]:
# Stage 2 of the XGBoost search: subsampling and regularisation parameters.
# NOTE(review): init=0 lies outside the declared [1e-5, 10] range and has no
# logarithm, yet both regularisation terms use log=True — confirm that
# pyhopper accepts this initial value.
param_grid = {
                'subsample': pyhopper.choice([0.5, 0.6, 0.7, 0.8, 0.9, 1]),
                'reg_lambda': pyhopper.float(1e-5, 10, init=0, log=True),
                'reg_alpha': pyhopper.float(1e-5, 10, init=0, log=True),
             }


# The stage-1 winners are passed as default_params.
# NOTE(review): presumably the helper holds them fixed while the stage-2
# parameters are tuned — confirm.
xgbt_best2, xgbt_history2 = pyhopper_best_params(
    model_fn=get_xgboost,
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device='cpu',
    time=TIME_BUDGET,
    default_params=xgbt_best1
)

#### Best Params

In [None]:
# Merge both search stages into one parameter dict; for any key present in
# both, the stage-2 value wins because it is unpacked last.
xgboost_best = {**xgbt_best1, **xgbt_best2}

In [None]:
# Evaluate XGBoost with the merged hyperparameters.
xgb_dframe = test_model(
    model_fn=get_xgboost(**xgboost_best),
    data=(X, y),
    train_size=eval_train_max_size,
    iters=10
)

In [None]:
# Print the summary line for XGBoost.
print_mean_std_max(xgb_dframe, DATA)

## Random forest

In [None]:
# Re-seed at the start of the random-forest section.
set_seed(SEED)

# Factory call; the seed is passed explicitly in addition to set_seed above.
get_rf = get_parametrized_rf_fn(seed=SEED)

#### Find hyperparams

In [None]:
# Random-forest search space. None is a legal candidate for both max_features
# and max_depth.
param_grid = {
    'n_estimators': pyhopper.int(50, 3000, multiple_of=50),
    'max_features': pyhopper.choice([None, 'sqrt', 0.2, 0.3, 0.5, 0.7]),
    'criterion' : pyhopper.choice(['gini', 'entropy']),
    'max_depth': pyhopper.choice([None, 2, 4, 8, 16]),
 }

# Tune on the training split only.
rf_best, rf_history = pyhopper_best_params(
    model_fn=get_rf,
    param_grid=param_grid,
    data=(X_train, y_train),
    train_size=train_max_size,
    DATA=DATA,
    device='cpu',
    time=TIME_BUDGET,
)

# Display the selected hyperparameters as the cell output.
rf_best

#### Use best params

In [None]:
# Evaluate the random forest with its tuned hyperparameters.
rf_dframe = test_model(
    model_fn=get_rf(**rf_best), 
    data=(X, y),
    train_size=eval_train_max_size,
    iters=10
)

In [None]:
# Print the summary line for the random forest.
print_mean_std_max(rf_dframe, DATA)

# Collect analytics

In [None]:
# Preview of the aggregation applied to every model below: per-"Class" mean,
# standard deviation and maximum of the "Metric" column.
hyper_results.groupby("Class")['Metric'].agg(['mean', 'std', 'max'])

In [None]:
# Maps a model label to its aggregated metrics frame; filled in the next cell.
d = {}

In [None]:
def _metric_summary(results):
    """Return the per-"Class" mean, std and max of the "Metric" column of ``results``."""
    return results.groupby("Class")['Metric'].agg(['mean', 'std', 'max'])


# One summary frame per model; the labels become the outer index level when
# the dict is concatenated in the next cell.
# NOTE(review): bagging_results and ensemble_results are computed earlier in
# this notebook but are not collected here — confirm whether that is intended.
d['Random forest'] = _metric_summary(rf_dframe)
d['Hypernet'] = _metric_summary(hyper_results)

d['Dropout_1'] = _metric_summary(nn1_results)
d['Dropout_2'] = _metric_summary(nn2_results)
d['Dropout_3'] = _metric_summary(nn3_results)
d['Node'] = _metric_summary(node_results)
d['XGBoost'] = _metric_summary(xgb_dframe)

In [None]:
# Stack the per-model summaries vertically; passing a dict makes its keys the
# outer level of the resulting row index.
all_models_df=pd.concat(d, axis=0)
all_models_df

In [None]:
# Fail fast if the Comet API key is not configured, without displaying it.
# The previous bare `os.environ['COMET_KEY']` expression was the last statement
# of the cell, so the notebook rendered the secret into its saved output.
# The same KeyError is still raised when the variable is missing.
if "COMET_KEY" not in os.environ:
    raise KeyError("COMET_KEY")

In [None]:
# Persist the combined metrics table as "<DATA>_metrics.csv" in the working
# directory (the index is written too, since to_csv's defaults are used).
all_models_df.to_csv(f"{DATA}_metrics.csv")

In [None]:
# Upload the metrics table to an experiment tracker.
# NOTE(review): Experiment is never imported explicitly in this notebook;
# presumably it is comet_ml.Experiment re-exported by the
# hypertab_benchmark_utils star import — confirm. The API key is read from the
# environment; .get() yields None rather than raising when it is unset.
exp = Experiment(os.environ.get("COMET_KEY"), 'hypernet-uci-tune')
# exp.log_parameters({"epochs": epochs, "mask_size": mask_size, "masks_no": masks_no, "data_size": data_size})
exp.add_tag(f"hypernet-tune2{DATA}")
exp.log_table(f"{DATA}_metrics.csv", all_models_df)

### Replace some data in existing

In [None]:
# tmp_df = pd.concat(d, axis=0)
# tmp_df = tmp_df.reset_index()
# tmp_df = tmp_df.rename(columns={tmp_df.columns[0]: DATA})

# tmp_df

In [None]:
# all_models_df = pd.read_csv(f"{DATA}_metrics.csv")
# all_models_df = all_models_df.rename(columns={all_models_df.columns[0]: DATA})
# all_models_df = all_models_df.drop(all_models_df[all_models_df.iloc[:, 0] == 'Hypernet'].index)
# all_models_df = all_models_df.drop(all_models_df[all_models_df.iloc[:, 0] == 'HypernetPCA'].index)

# all_models_df

In [None]:
# all_models_df = pd.concat([all_models_df, tmp_df])
# all_models_df

In [None]:
# Display the name of the metrics file that the next cell reads back.
f"{DATA}_metrics.csv"

In [None]:
# Reload the saved metrics, replacing the in-memory table of the same name.
# No index_col is given, so the index written by to_csv comes back as ordinary
# columns and the reloaded frame is not identical to the one saved above.
all_models_df = pd.read_csv(f"{DATA}_metrics.csv")

In [None]:
# Display-only: reset_index returns a new frame that is shown as the cell
# output; all_models_df itself is not modified.
all_models_df.reset_index()

In [None]:
# all_models_df = all_models_df.drop(all_models_df.columns[0], axis=1)
# all_models_df

In [None]:
# Alias, not a copy: tmp and all_models_df refer to the same DataFrame until
# tmp is rebound in the next cell.
tmp = all_models_df

In [None]:
# Name the first column after the dataset. rename returns a new frame, so
# all_models_df keeps its original column names.
# NOTE(review): presumably that first column holds the model labels written by
# to_csv — confirm.
tmp = tmp.rename(columns={tmp.columns[0]: DATA})

In [None]:
# Show only the rows whose "Class" column equals "balanced_accuracy".
tmp[tmp['Class'] == "balanced_accuracy"]