In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import PredefinedSplit
import numpy as np
from self_learn import LogisticRegression

import utils

In [3]:
%store -r X_train
%store -r X_val
%store -r y_train
%store -r y_val

In [4]:
class HyperparameterOptimizer:
    """Configure and run one hyperparameter search at a time.

    Choose a strategy with ``grid_search``, ``random_search`` or
    ``bayesian_search`` (each returns ``self`` so calls can be chained), run
    it with ``fit``, then read the outcome through ``best_params``,
    ``best_score`` and ``best_estimator``.

    Parameters
    ----------
    estimator : object
        Estimator forwarded unchanged to the underlying search object.
    scoring : str, default='accuracy'
        Scoring forwarded to the underlying search object.
    cv : object, optional
        Cross-validation argument forwarded to the underlying search object.

    Attributes
    ----------
    optimizer : object or None
        The most recently configured search object, or ``None`` until one of
        the ``*_search`` methods has been called.
    """

    def __init__(self, estimator, scoring='accuracy', cv=None):
        self.estimator = estimator
        self.scoring = scoring
        self.cv = cv
        # No search is configured yet; the *_search methods replace this.
        self.optimizer = None

    def _require_optimizer(self):
        """Return the configured search object or fail with a clear message.

        Raises
        ------
        AttributeError
            If no ``*_search`` method has been called yet. The exception type
            matches what an unset ``optimizer`` attribute used to produce.
        """
        search = getattr(self, 'optimizer', None)
        if search is None:
            raise AttributeError(
                "No search has been configured: call grid_search(), "
                "random_search() or bayesian_search() first."
            )
        return search

    def grid_search(self, param_grid):
        """Configure an exhaustive ``GridSearchCV`` over ``param_grid``.

        Returns
        -------
        HyperparameterOptimizer
            ``self``, to allow chaining with ``fit``.
        """
        self.optimizer = GridSearchCV(
            estimator=self.estimator,
            param_grid=param_grid,
            scoring=self.scoring,
            cv=self.cv
        )
        return self

    def random_search(self, param_distributions, n_iter=10, random_state=None):
        """Configure a ``RandomizedSearchCV`` drawing ``n_iter`` candidates.

        Parameters
        ----------
        param_distributions : dict
            Sampling space forwarded to ``RandomizedSearchCV``.
        n_iter : int, default=10
            Number of sampled candidates.
        random_state : optional
            Seed forwarded to ``RandomizedSearchCV``; pass a fixed value to
            make the sampled candidates repeatable between runs.

        Returns
        -------
        HyperparameterOptimizer
            ``self``, to allow chaining with ``fit``.
        """
        self.optimizer = RandomizedSearchCV(
            estimator=self.estimator,
            param_distributions=param_distributions,
            n_iter=n_iter,
            scoring=self.scoring,
            cv=self.cv,
            random_state=random_state
        )
        return self

    def bayesian_search(self, search_spaces, n_iter=50, n_points=1,
                        random_state=None):
        """Configure a ``BayesSearchCV`` over ``search_spaces``.

        Parameters
        ----------
        search_spaces : dict
            Search space forwarded to ``BayesSearchCV``.
        n_iter : int, default=50
            Forwarded to ``BayesSearchCV`` as ``n_iter``.
        n_points : int, default=1
            Forwarded to ``BayesSearchCV`` as ``n_points``.
        random_state : optional
            Seed forwarded to ``BayesSearchCV`` for repeatable runs.

        Returns
        -------
        HyperparameterOptimizer
            ``self``, to allow chaining with ``fit``.
        """
        self.optimizer = BayesSearchCV(
            estimator=self.estimator,
            search_spaces=search_spaces,
            n_iter=n_iter,
            scoring=self.scoring,
            cv=self.cv,
            n_points=n_points,
            random_state=random_state
        )
        return self

    def fit(self, X_train, y_train):
        """Run the configured search on the given data and return ``self``.

        Raises
        ------
        AttributeError
            If no search has been configured yet.
        """
        self._require_optimizer().fit(X_train, y_train)
        return self

    def best_params(self):
        """Return ``best_params_`` of the configured search object."""
        return self._require_optimizer().best_params_

    def best_score(self):
        """Return ``best_score_`` of the configured search object."""
        return self._require_optimizer().best_score_

    def best_estimator(self):
        """Return ``best_estimator_`` of the configured search object."""
        return self._require_optimizer().best_estimator_


In [10]:
# Base estimator whose hyperparameters are tuned below. LogisticRegression is
# the class imported from the self_learn module, not scikit-learn's.
estimator = LogisticRegression()

## Grid search

In [11]:
# Stack the training rows first and the validation rows second so that one
# (X, y) pair can be handed to the search objects.
X = np.concatenate([X_train, X_val])
y = np.concatenate([y_train, y_val])

# Fold labels in the same row order as X: -1 keeps a row out of every test
# fold (training rows), 0 puts it in the single test fold (validation rows).
test_fold = [-1] * len(X_train)
test_fold.extend([0] * len(X_val))

# Cross-validator that yields exactly the split described by test_fold.
ps = PredefinedSplit(test_fold=test_fold)

# Class weights from the project helper.
# NOTE(review): the literal 3 is presumably the number of classes — confirm
# against utils.get_class_weights.
weights = utils.get_class_weights(len(y), 3, y)

In [15]:
# Shape of the combined training + validation array.
X.shape

(24563, 40)

In [13]:
# Wrap the estimator; candidates are scored on the predefined split `ps`.
optimizer = HyperparameterOptimizer(estimator=estimator, cv=ps)

# Candidate values per hyperparameter; the grid search tries every combination.
param_grid = {
    'learning_rate': [3e-2, 3e-3, 3e-4],
    'num_epochs': [2000, 2500],
    'regularization': ['L1', 'L2'],
    'lambda_reg': [0.001, 0.01, 0.1],
    'gamma': [2.0, 3.0],
    'class_weights': [
        [0.42411666, 8.17534247, 1.923672],
        [0.42411666, 8.5, 2.0],
    ],
}

# NOTE(review): the saved output of this cell shows every fit failing inside
# LogisticRegression.fit (predict is called with None as the validation
# data) — the estimator needs fixing before this search can succeed.
optimizer.grid_search(param_grid)
optimizer.fit(X, y)
print("Grid Search Best Params:", optimizer.best_params())

ValueError: 
All the 144 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
144 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/microbot/miniconda/envs/eda/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/microbot/Masters/IFT6390_FundamentalsOfML/Kaggle_competition/self_learn.py", line 104, in fit
    val_accuracy = np.mean(y_val == self.predict(X_val)) if self.validation else None
                                    ^^^^^^^^^^^^^^^^^^^
  File "/Users/microbot/Masters/IFT6390_FundamentalsOfML/Kaggle_competition/self_learn.py", line 143, in predict
    logits = X.dot(self.weights) + self.bias
             ^^^^^
AttributeError: 'NoneType' object has no attribute 'dot'


In [None]:
# Randomized search: evaluate 10 combinations sampled from param_grid.
# No seed is supplied, so the sampled candidates can differ between runs.
optimizer.random_search(param_distributions=param_grid, n_iter=10)
optimizer.fit(X, y)
print("Random Search Best Params:", optimizer.best_params())

## Bayesian optimization

In [None]:
# Search space for Bayesian optimization of the custom LogisticRegression.
# Parameter names and bounds mirror the grid-search `param_grid`; SVM-style
# entries such as C and kernel are not searched because they are not among
# the hyperparameters this estimator is tuned with in this notebook.
search_spaces = {
    'learning_rate': Real(3e-4, 3e-2, prior='log-uniform'),
    'num_epochs': Integer(2000, 2500),
    'regularization': Categorical(['L1', 'L2']),
    'lambda_reg': Real(0.001, 0.1, prior='log-uniform'),
    'gamma': Real(2.0, 3.0),
}
# NOTE(review): class_weights is left at the estimator's default here because
# its candidates are lists; add it once a suitable encoding is chosen.

# Perform Bayesian Optimization
optimizer.bayesian_search(search_spaces, n_iter=50).fit(X, y)
print("Bayesian Optimization Best Params:", optimizer.best_params())

In [None]:
# Define the hyperparameter grid for Logistic Regression
param_grid = {
    'learning_rate': [3e-3,3e-4],
    'num_epochs': [2000],
    'regularization': ['L2'],
    'gamma': [2.0, 3.0],
    'class_weights':[[0.42411666, 8.17534247, 1.923672],[0.42411666, 8.5, 2.0]],
    'alpha':[[0,0,0],[0.15, 0.90, 0.6],[0.20, 0.95, 0.6]]
}

In [None]:
# Collectors for the winning hyperparameters and the refitted model.
best_params_list, best_models = [], []

# Fresh, untuned estimator for this search.
log_reg = LogisticRegression()

# Exhaustive search over param_grid with 5-fold cross-validation, using all
# available cores (n_jobs=-1) and progress messages (verbose=1).
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Only the training data is used here; the folds come from cv=5.
grid_search.fit(X_train, y_train)

# Record the winner of this search.
best_params_list.append(grid_search.best_params_)
best_models.append(grid_search.best_estimator_)

# Report the outcome.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}\n")
