In [2]:
%load_ext autoreload
%autoreload 2

In [8]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import PredefinedSplit
import numpy as np
from self_learn import CustomLogisticRegression, BaymaxNet
from sklearn.model_selection import BaseCrossValidator
from scipy.stats import uniform, randint

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

from sklearn.metrics import accuracy_score

import utils

In [14]:
%store -r X_train
%store -r X_val
%store -r y_train
%store -r y_val

## For baseline logreg

In [7]:
estimator = CustomLogisticRegression(validation=False)

In [5]:
class CustomCV(BaseCrossValidator):
    def __init__(self, X_train, X_val):
        self.X_train = X_train
        self.X_val = X_val

    def split(self, X, y=None, groups=None):
        # Generate indices for the training and validation sets
        train_indices = np.arange(len(self.X_train))
        val_indices = np.arange(len(self.X_train), len(self.X_train) + len(self.X_val))
        
        yield train_indices, val_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        # Returns the number of splitting iterations in the cross-validator
        return 1

In [6]:
# Create the custom cross-validator
cv = CustomCV(X_train, X_val)

## Gridsearch

In [10]:
# Assuming X_train, X_val, y_train, y_val are your training and validation sets
# Concatenate your training and validation sets
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val), axis=0)

weights = utils.get_class_weights(len(y), 3, y)

In [15]:
X_train.shape

(20999, 20)

In [16]:
# Define the parameter grid for grid search
param_grid = {
    'learning_rate': [0.1,0.01,0.001],
    'num_epochs': [1000, 1500],
    'regularization': ['L1','L2'],
    'lambda_reg': [0.001, 0.01, 0.1],
    'gamma': [2.0, 3.0],
    'class_weights':[[0, 0, 0],[2.85176468, 7.36963696, 1.66483665]]
}

In [19]:
# log-uniform: understand as search over p = exp(x) by varying x
optimizer = RandomizedSearchCV(
    estimator=CustomLogisticRegression(validation=False),
    param_distributions={
        'learning_rate': [0.1,0.01,0.001],
        'num_epochs': [1000, 1500],
        'regularization': ['L1','L2'],
        'lambda_reg': [0.001, 0.01, 0.1],
        'gamma': [2.0, 3.0],
        'class_weights':[[0, 0, 0],[2.85176468, 7.36963696, 1.66483665]]
    },
    n_iter=50,
    cv=cv,
    random_state=0,
    refit=True,
)

# executes bayesian optimization
_ = optimizer.fit(X_train, y_train)

print("Random Search Best Params:", optimizer.best_params())
# model can be saved, used for predictions or scoring
print(optimizer.score(X_test, y_test))

Epoch 0, Loss: 29728.606900185805, Train Accuracy: 0.6883184913567313, Val Accuracy: None
Epoch 100, Loss: 12504.119467211307, Train Accuracy: 0.7656078860898138, Val Accuracy: None
Epoch 200, Loss: 11214.75576026576, Train Accuracy: 0.765226915567408, Val Accuracy: None
Epoch 300, Loss: 10684.403151987179, Train Accuracy: 0.7661793418734225, Val Accuracy: None
Epoch 400, Loss: 10382.644026816995, Train Accuracy: 0.7665126910805277, Val Accuracy: None
Epoch 500, Loss: 10182.915061295997, Train Accuracy: 0.7671317681794371, Val Accuracy: None
Epoch 600, Loss: 10033.175456839313, Train Accuracy: 0.7683223010619553, Val Accuracy: None
Epoch 700, Loss: 9917.19991755185, Train Accuracy: 0.7691318634220677, Val Accuracy: None
Epoch 800, Loss: 9824.016778417503, Train Accuracy: 0.7697033192056765, Val Accuracy: None
Epoch 900, Loss: 9745.622112741281, Train Accuracy: 0.7704176389351874, Val Accuracy: None
Epoch 0, Loss: 0.0009057172688226544, Train Accuracy: 0.37082718224677363, Val Accuracy:

In [13]:
def objective(params):
    # Create the RandomForestClassifier with the given hyperparameters
    clf = CustomLogisticRegression(**params)
    clf.fit(X_train.values, y_train.values)

    y_pred = clf.predict(X_val.values)
    loss = -accuracy_score(y_val, y_pred)

    return {'loss': loss, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.1)),
    'num_epochs': hp.choice('num_epochs', [1000, 1500]),
    'regularization': hp.choice('regularization', ['L1', 'L2']),
    'lambda_reg': hp.uniform('lambda_reg', 0.001, 0.1),
    'gamma': hp.uniform('gamma', 2.0, 3.0)
}

# Initialize Trials object to keep track of results
trials = Trials()

# Run the optimization
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print("Best hyperparameters:", best)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

job exception: 'NoneType' object has no attribute 'dot'



  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]


AttributeError: 'NoneType' object has no attribute 'dot'

## BaymaxNet

In [16]:
def objective(params):
    # Create the RandomForestClassifier with the given hyperparameters
    clf = RandomForestClassifier(**params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)
    loss = -accuracy_score(y_val, y_pred)

    return {'loss': loss, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.choice('n_estimators', range(100, 1000)),
    'max_depth': hp.choice('max_depth', range(5, 50)),
    'min_samples_split': hp.uniform('min_samples_split', 0.01, 0.1),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.01, 0.1),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'class_weight': hp.choice('class_weight', [None, 'balanced', 'balanced_subsample']),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
}

# Initialize Trials object to keep track of results
trials = Trials()

# Run the optimization
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print("Best hyperparameters:", best)


 60%|██████    | 60/100 [26:01<19:43, 29.60s/trial, best loss: -0.803763440860215] 

In [5]:
def objective(params):
    # Create the RandomForestClassifier with the given hyperparameters
    clf = LogisticRegression(**params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)
    loss = -accuracy_score(y_val, y_pred)

    return {'loss': loss, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'C': hp.loguniform('C', np.log(1e-1), np.log(1e1)),  # Inverse of regularization strength
    'solver': hp.choice('solver', ['saga']),  # Algorithm to use in the optimization problem
    'max_iter': hp.choice('max_iter', range(100, 1000)),  # Maximum number of iterations taken for the solvers to converge
    'penalty': hp.choice('penalty', ['l1', 'l2', 'elasticnet']),  # Used to specify the norm used in the penalization
    'l1_ratio': hp.uniform('l1_ratio', 0, 1),  # The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1
    'class_weight': hp.choice('class_weight', [None, 'balanced']),  # Weights associated with classes
}

# Initialize Trials object to keep track of results
trials = Trials()

# Run the optimization
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print("Best hyperparameters:", best)


100%|██████████| 100/100 [13:26<00:00,  8.06s/trial, best loss: -0.8190860215053763]
Best hyperparameters: {'C': 6.898604855527303, 'class_weight': 0, 'l1_ratio': 0.7072602895829233, 'max_iter': 885, 'penalty': 0, 'solver': 0}


In [6]:
def objective(params):
    # Create the RandomForestClassifier with the given hyperparameters
    clf = XGBClassifier(**params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)
    loss = -accuracy_score(y_val, y_pred)

    return {'loss': loss, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 500)),  # Number of gradient boosted trees
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.2, 0.01),  # Boosting learning rate
    'max_depth': hp.choice('max_depth', range(3, 14)),  # Maximum tree depth for base learners
    'min_child_weight': hp.choice('min_child_weight', range(1, 10)),  # Minimum sum of instance weight(hessian) needed in a child
    'gamma': hp.uniform('gamma', 0.0, 0.5),  # Minimum loss reduction required to make a further partition on a leaf node of the tree
    'subsample': hp.uniform('subsample', 0.5, 1.0),  # Subsample ratio of the training instance
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),  # Subsample ratio of columns when constructing each tree
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),  # L1 regularization term on weights
    'reg_lambda': hp.uniform('reg_lambda', 1.0, 4.0),  # L2 regularization term on weights
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1.0, 10.0),  # Balancing of positive and negative weights
    'max_delta_step': hp.choice('max_delta_step', range(1, 10)),  # Maximum delta step we allow each tree's weight estimation to be
    'objective': 'multi:softmax',  # Objective function for multiclass classification,
}
# Initialize Trials object to keep track of results
trials = Trials()

# Run the optimization
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print("Best hyperparameters:", best)


100%|██████████| 100/100 [04:15<00:00,  2.56s/trial, best loss: -0.810752688172043]
Best hyperparameters: {'colsample_bytree': 0.9385812500707504, 'gamma': 0.188570709579355, 'learning_rate': 0.17, 'max_delta_step': 5, 'max_depth': 9, 'min_child_weight': 7, 'n_estimators': 254, 'reg_alpha': 0.2791453005653505, 'reg_lambda': 2.9636012865585113, 'scale_pos_weight': 7.692974658470718, 'subsample': 0.9514650766399473}
