In [1]:
import torch

In [2]:
from comet_ml import Experiment, Optimizer

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import trange
import torch
import os
import matplotlib.pyplot as plt
from io import BytesIO
import base64
import sklearn
import time
import datetime
from hypertab_benchmark_utils import *

plt.style.use("seaborn")
torch.set_default_dtype(torch.float32)

  from .autonotebook import tqdm as notebook_tqdm
  plt.style.use("seaborn")


In [30]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, balanced_accuracy_score

In [10]:
from hypertab import HypernetworkPCA, TrainingModes, Hypernetwork
from hypertab.interfaces import HypernetworkSklearnInterface
# from ipynb.fs.defs.MNIST_benchmark import test_model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [11]:
from sklearn.metrics import classification_report, balanced_accuracy_score, accuracy_score

In [12]:
import pyhopper
import sklearn.datasets

In [13]:
from loguru import logger

logger.add("log.txt", format='{time:YYYY-MM-DD HH:mm:ss.SSS} | {message}')

1

In [14]:
DEVICE="cuda:0"

In [15]:
GS_METRIC = 'balanced_accuracy'

# Load data

In [16]:
TEST_RUN = False

# Preprocess

In [17]:
def train_test_split_tuple(X, y, train_size=None):
    if isinstance(X, tuple) and isinstance(y, tuple):
        X_train, X_test = X
        y_train, y_test = y
    else:    
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, stratify=y)
    
    return X_train, X_test, y_train, y_test

In [18]:
import numpy as np
np.random.seed(42)

def prepare_data(X, y, size=None):
    if isinstance(X, tuple) and isinstance(y, tuple):
        X_train, X_test = X
        y_train, y_test = y
    else:    
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=size, stratify=y)
    # X_train, y_train = imblearn.over_sampling.RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    X_train, X_test, y_train, y_test = [torch.from_numpy(x) for x in [X_train, X_test, y_train, y_test]]
    
    return X_train, X_test, y_train, y_test

## Benchmark

In [19]:
criterion = torch.nn.CrossEntropyLoss()

In [20]:
def _summarize_results(y_pred, y_score, y_test, labels):
    results = []
    for idx, label in enumerate(labels):
        y_pred_filt = y_pred[y_test==idx]
        y_test_filt = y_test[y_test==idx]
#         acc = (y_pred_filt==y_test_filt.numpy()).sum()/len(y_test_filt)*100
        acc = accuracy_score(y_test_filt, y_pred_filt)
        results.append({
            "Class": label,
            "Metric": acc
        })
        
#     acc = (y_pred==y_test.numpy()).sum()/len(y_test)*100 
    acc = accuracy_score(y_test, y_pred)
    results.append({
        "Class": "Total",
        "Metric": acc
    })
    
    results.append({
        "Class": "balanced_accuracy",
        "Metric": balanced_accuracy_score(y_test, torch.from_numpy(y_pred)).item()*100
    })
    
    try:
        results.append({
            "Class": "F1 score",
            "Metric": f1_score(y_test, torch.from_numpy(y_pred)).item()*100
        })
        results.append({
            "Class": "roc_auc",
            "Metric": roc_auc_score(y_test, torch.from_numpy(y_score[:, 1])).item()*100
        })
        results.append({
            "Class": "Precision",
            "Metric": precision_score(y_test, torch.from_numpy(y_pred)).item()*100
        })
        results.append({
            "Class": "Recall",
            "Metric": recall_score(y_test, torch.from_numpy(y_pred)).item()*100
        })
    except ValueError:
        pass
    return results

def test_model(model_fn, data, train_size, label_encoder=None, iters=10, as_numpy=False):
    if TEST_RUN:
        iters = 1
        
    if label_encoder is not None:
        labels = label_encoder.classes_
    else:
        labels = sorted(pd.unique(data[1][0] if isinstance(data[1], tuple) else data[1]))

    
    results = []
    
    X, y = data

    for i in range(iters):
        X_train, X_test, y_train, y_test = prepare_data(*data, train_size)
        print('iter', i+1, 'of', iters, 'X_train shape', X_train.shape)

        model = model_fn()

        if as_numpy:
            model.fit(X_train.numpy(), y_train.numpy());
        else:
            model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        y_score = model.predict_proba(X_test)
        results.extend(_summarize_results(y_pred, y_score, y_test, labels))

    dframe = pd.DataFrame.from_dict(results)
    # sns.violinplot(data=dframe[dframe["Class"]!="Loss"], y="Class", x="Metric", orient='h')
    return dframe

### Param search

In [21]:
def pyhopper_best_params(model_fn, param_grid, data, train_size, metric=GS_METRIC, time="30m", default_params={}):
    if TEST_RUN:
        time = 60
        if 'epochs' in param_grid:
            param_grid["epochs"] = pyhopper.choice([10])
    
    X, y = data
    print('pyhopper', X.shape, y.shape, train_size)
        
    def objective(params):
    #     print("Training...")
        print('params',params)
        model_results = test_model(
                        model_fn(
                            **default_params,
                            **params
                        ),
                        (X, y),
                        train_size,
                        None, 5)
        with open(f"params/{DATA}_{model_fn.__name__}_params.txt", "a") as f:
            f.write(str(params) + ", " + str(model_results[model_results["Class"]==metric]["Metric"].mean()) + "\n")
        return model_results[model_results["Class"]==metric]["Metric"].mean()

    from pyhopper.callbacks import History
    search = pyhopper.Search(param_grid)

    best_params = search.run(objective, "maximize", time, n_jobs=1, seeding_ratio=0.5)
    
    with open(f"{DATA}_{model_fn.__name__}_best_params.txt", "a") as f:
            f.write(str(best_params))
    
    print(f"{DATA}_{model_fn.__name__}_{best_params}")
    return best_params


In [22]:
d = {}

In [23]:
DATA = "Glass"

# TRAIN

In [24]:
import sklearn
import sklearn.datasets

if DATA == "BreastCancer":
    dataset = sklearn.datasets.load_breast_cancer()
    X = dataset['data']
    y = dataset['target']
elif DATA == "Connectionist":
    dataset = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data", header=None)
    X = dataset.values[:, :-1].astype(float)
    y = dataset.values[:, -1]
    y = LabelEncoder().fit_transform(y)
elif DATA == "Dermatology":
    dataset = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data", header=None, na_values="?").dropna()
    X = dataset.values[:, :-1].astype(float)
    y = dataset.values[:, -1].astype(int) - 1
elif DATA == "Glass":
    dataset = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data", header=None, na_values="?").dropna()
    X = dataset.values[:, :-1].astype(float)
    y = dataset.values[:, -1].astype(int)
    y = LabelEncoder().fit_transform(y).astype(int)
    
elif DATA == "Cleveland":
    dataset = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data", header=None, na_values="?").dropna()
    X = dataset.values[:, :-1].astype(float)
    y = dataset.values[:, -1].astype(int)
    y = LabelEncoder().fit_transform(y).astype(int)

elif DATA == "CNAE9":
    dataset = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00233/CNAE-9.data", header=None, na_values="?").dropna()
    X = dataset.values[:, 1:].astype(float)
    y = dataset.values[:, 0].astype(int)
    y = LabelEncoder().fit_transform(y).astype(int)

max_size = int(len(X)*0.7)
print(X.shape, len(np.unique(y)), max_size)

(214, 10) 6 149


In [25]:
n_classes = len(np.unique(y))
n_features = X.shape[1]
unique, counts = np.unique(y, return_counts=True)

print(dict(zip(unique, counts)))

print('n_classes', n_classes)
print('n_features', n_features)

{0: 70, 1: 76, 2: 17, 3: 13, 4: 9, 5: 29}
n_classes 6
n_features 10


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y, random_state=42)

eval_max_size = int(len(X_train))
train_max_size = int(len(X_train) * 0.75)
print('train_max_size', train_max_size)
print('eval_max_size', eval_max_size)

train_max_size 128
eval_max_size 171


In [27]:
X_train.shape

(171, 10)

## HyperTab

In [28]:
def hypertab_fn(epochs=150, masks_no=100, mask_size=100, target_size=100, lr=3e-4, batch_size=64, verbose=False):
    def _inner():
        hypernet = Hypernetwork(
                        target_architecture=[(mask_size, target_size), (target_size, n_classes)],
                        test_nodes=masks_no,
                        architecture=torch.nn.Sequential(torch.nn.Linear(n_features, 32), 
                            torch.nn.ReLU(),
                            torch.nn.Linear(32, 128),
                            torch.nn.ReLU(),
                            torch.nn.Dropout(),
                            torch.nn.Linear(128, 128),
                            torch.nn.ReLU(),
                        ),
                        mode=TrainingModes.CARTHESIAN,
                    ).to(DEVICE)    
        hypernet = hypernet.train()

        network = HypernetworkSklearnInterface(hypernet, device=DEVICE, epochs=epochs, batch_size=batch_size, verbose=verbose, lr=3e-4)
        return network
    return _inner

In [29]:
param_grid = {
    "epochs": pyhopper.choice([100, 150, 200, 300, 400, 500]),
    "masks_no": pyhopper.choice([10, 20, 50, 80, 100, 150, 200]),
    "mask_size": pyhopper.choice([2, 3, 5, 8]),
    "target_size": pyhopper.choice([5, 10, 20, 50]),
    "batch_size": pyhopper.choice([32, 64]),
}
#{'epochs': 100, 'masks_no': 10, 'mask_size': 2, 'target_size': 5, 'lr': 3e-05, 'batch_size': 32}
hp_best_params = pyhopper_best_params(
    get_parametrized_hypertab_fn(DEVICE=DEVICE, n_classes=n_classes, n_features=n_features), param_grid, data=(X_train, y_train), train_size=train_max_size, time="10m"
)
hp_best_params

pyhopper (171, 10) (171,) 128


  0%|          | [00:00<?]

Search is scheduled for 10:00 (m:s)
params {'epochs': 100, 'masks_no': 10, 'mask_size': 2, 'target_size': 5, 'batch_size': 32}
iter 1 of 5 X_train shape torch.Size([128, 10])
Remote process caught exception in objective function: 
Traceback (most recent call last):
  File "/home/MCB/wwydmanski/miniconda3/envs/torch/lib/python3.9/site-packages/pyhopper/parallel.py", line 227, in execute
    iter_or_result = objective_function(candidate, **kwargs)
  File "/tmp/ipykernel_4189342/436130538.py", line 13, in objective
    model_results = test_model(
  File "/tmp/ipykernel_4189342/3179501506.py", line 73, in test_model
    results.extend(_summarize_results(y_pred, y_score, y_test, labels))
  File "/tmp/ipykernel_4189342/3179501506.py", line 28, in _summarize_results
    "Metric": f1_score(y_test, torch.from_numpy(y_pred)).item()*100
NameError: name 'f1_score' is not defined



ValueError: Pyhopper - Remote process caught exception

In [25]:
epochs = hp_best_params['epochs']
masks_no = hp_best_params['masks_no']
mask_size = hp_best_params['mask_size']
target_size = hp_best_params['target_size']
lr = 3e-4
batch_size = hp_best_params['batch_size']

full_res = []
for i in range(10):
    model = get_parametrized_hypertab_fn(DEVICE=DEVICE, n_classes=n_classes, n_features=n_features)(
        epochs, 
        masks_no, 
        mask_size, 
        target_size, 
        lr,
        batch_size=batch_size,
        verbose=True)()

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)
    hyper_results = _summarize_results(y_pred, 
                                    y_score, 
                                    torch.from_numpy(y_test), 
                                    torch.from_numpy(np.unique(y)))
    hyper_results = pd.DataFrame(hyper_results)
    res = hyper_results[hyper_results["Class"]=="Total"].reset_index(drop=True)["Metric"]
    full_res.append(res)

full_res = pd.DataFrame(full_res)

100%|██████████| 200/200 [00:57<00:00,  3.48it/s]
100%|██████████| 200/200 [00:43<00:00,  4.62it/s]
100%|██████████| 200/200 [00:44<00:00,  4.53it/s]
100%|██████████| 200/200 [00:44<00:00,  4.50it/s]
100%|██████████| 200/200 [00:43<00:00,  4.64it/s]
100%|██████████| 200/200 [00:42<00:00,  4.75it/s]
100%|██████████| 200/200 [00:42<00:00,  4.66it/s]
100%|██████████| 200/200 [00:43<00:00,  4.59it/s]
100%|██████████| 200/200 [00:44<00:00,  4.54it/s]
100%|██████████| 200/200 [00:45<00:00,  4.42it/s]


In [30]:
with open("results/" + DATA + "_hypertab.txt", "w") as f:
    f.write(f"{eval_max_size}: {full_res.mean().values[0]:.3f} ~ {full_res.std().values[0]:.2f}, (max: {full_res.max().values[0]:.3f})")

## XGBoost

In [31]:
import xgboost
xgboost.__version__

'1.7.4'

In [32]:
def get_xgboost(**params):
    random_seed = np.random.randint(1024)
    def _inner():
        return xgboost.XGBClassifier(
            verbosity=0,
            random_state=random_seed,
            **params
        )
    return _inner    

In [33]:
param_grid = {
                'n_estimators': pyhopper.int(50, 3000, multiple_of=50, init=50),
                'max_depth': pyhopper.choice([2, 3, 5, 10, 15]),
                'learning_rate': pyhopper.float(1e-5,1e-1, log=True),
                'min_child_weight': pyhopper.choice([1, 2, 4, 8, 16, 32]),
                'gamma': pyhopper.choice([0, 0.001, 0.1, 1]),
             }

xgbc = get_xgboost()

xgbt_best1 = pyhopper_best_params(get_xgboost, param_grid, data=(X_train, y_train), train_size=train_max_size, time="10m")

pyhopper (166, 60) (166,) 124


  0%|          | [00:00<?]

Search is scheduled for 10.00 s
params {'n_estimators': 50, 'max_depth': 2, 'learning_rate': 0.0010000000000000002, 'min_child_weight': 1, 'gamma': 0}
iter 1 of 5 X_train shape torch.Size([124, 60])




iter 2 of 5 X_train shape torch.Size([124, 60])




iter 3 of 5 X_train shape torch.Size([124, 60])




iter 4 of 5 X_train shape torch.Size([124, 60])




iter 5 of 5 X_train shape torch.Size([124, 60])


Best f: 66.8 (out of 1 params): 100%|██████████| [00:39<00:00, 0.0 param/s]

Mode              : Best f : Steps : Time
----------------  : ----   : ----  : ----
Initial solution  : 66.77  : 1     : 40 s
----------------  : ----   : ----  : ----
Total             : 66.77  : 1     : 40 s
Connectionist_get_xgboost_{'n_estimators': 50, 'max_depth': 2, 'learning_rate': 0.0010000000000000002, 'min_child_weight': 1, 'gamma': 0}





In [34]:
xgbt_best1

{'n_estimators': 50,
 'max_depth': 2,
 'learning_rate': 0.0010000000000000002,
 'min_child_weight': 1,
 'gamma': 0}

In [35]:
full_res = []
for i in range(10):
    model = get_xgboost(**xgbt_best1)()

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)
    hyper_results = _summarize_results(y_pred, 
                                    y_score, 
                                    torch.from_numpy(y_test), 
                                    torch.from_numpy(np.unique(y)))
    hyper_results = pd.DataFrame(hyper_results)
    res = hyper_results[hyper_results["Class"]=="Total"].reset_index(drop=True)["Metric"]
    full_res.append(res)

full_res = pd.DataFrame(full_res)



In [36]:
f"{eval_max_size}: {full_res.mean().values[0]:.3f} ~ {full_res.std().values[0]:.2f}, (max: {full_res.max().values[0]:.3f})"

'166: 0.690 ~ 0.00, (max: 0.690)'

In [None]:
with open("results/" + DATA + "_xgboost.txt", "w") as f:
    f.write(f"{eval_max_size}: {full_res.mean().values[0]:.3f} ~ {full_res.std().values[0]:.2f}, (max: {full_res.max().values[0]:.3f})")