# LIBRARIES

In [1]:
# custom imports
from constants import RANDOM_STATE
from functions import geometric_mean_score, roughly_balanced_bagging

# imbalanced-learn imports
from imblearn import FunctionSampler
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, RandomOverSampler, SMOTE, SVMSMOTE 
from imblearn.under_sampling import ClusterCentroids, NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline

# joblib-related imports
from joblib import dump

# scikit-learn imports
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, QuantileTransformer, RobustScaler, StandardScaler 
from sklearn.svm import SVC

# scikit-optimize imports
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# standard Python imports
import warnings
import unidecode
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# additional settings
%matplotlib inline
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

# READ FILES

Read the training data from the CSV files

In [2]:
X_train = pd.read_csv('../dataset/X_train.csv', index_col=0)
y_train = pd.read_csv('../dataset/y_train.csv', index_col=0)

# MODELING

A pipeline for data normalization and balancing is used to prepare imbalanced datasets, improving model performance on minority classes:

In [3]:
def create_pipeline(model, normalizer=True, imputation=True, balancer=True):

    steps = []
    
    if normalizer:
        steps.append(('normalization', None))
        
    if imputation:
        steps.append(('imputation', None))
                
    if balancer:
        steps.append(('balance', None))
       
    steps.append((model.__class__.__name__.lower(), model))    
   
    return Pipeline(steps)

- These are the normalization techniques evaluated in this pipeline:

In [4]:
normalization = [
    MaxAbsScaler(),
    MinMaxScaler(), 
    QuantileTransformer(output_distribution='normal'),
    RobustScaler(), 
    StandardScaler()
]

- These are the imputation techniques evaluated in this pipeline:

In [5]:
imputation = [
    SimpleImputer(strategy='mean'),  # Fill with mean
    SimpleImputer(strategy='median'),  # Fill with median
    IterativeImputer(max_iter=10, random_state=RANDOM_STATE),  # Iterative imputation
    KNNImputer(n_neighbors=3, weights='uniform'),  # K-nearest neighbors-based imputation
    KNNImputer(n_neighbors=5, weights='uniform'),  # K-nearest neighbors-based imputation
    KNNImputer(n_neighbors=7, weights='uniform'),  # K-nearest neighbors-based imputation
]

- These are the balancing techniques evaluated in this pipeline:

In [6]:
balancing = [
    ADASYN(random_state=RANDOM_STATE), 
    BorderlineSMOTE(random_state=RANDOM_STATE),
    ClusterCentroids(random_state=RANDOM_STATE),
    NearMiss(),
    RandomOverSampler(random_state=RANDOM_STATE),
    RandomUnderSampler(random_state=RANDOM_STATE),
    SMOTE(random_state=RANDOM_STATE), 
    SMOTEENN(random_state=RANDOM_STATE),
    SMOTETomek(random_state=RANDOM_STATE),
    SVMSMOTE(random_state=RANDOM_STATE)
]

We will examine ensemble techniques and some classification models in a pipeline.

In [7]:
# Define common settings
common_settings = {
    'normalization': Categorical(normalization),
    'imputation': Categorical(imputation),
    'balance': Categorical(balancing),
    'balance__sampling_strategy': Real(0.5, 1.0),
}

# Define specific settings for each model
models = {      
    'LR': {
        'model': create_pipeline(LogisticRegression(random_state=RANDOM_STATE)),
        'space': {
            'logisticregression__C': Real(1e-3, 1e+3),
            'logisticregression__max_iter': Integer(1e+2, 1e+4),
            'logisticregression__solver': Categorical(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
            'logisticregression__fit_intercept': Categorical([True, False]),
            'logisticregression__class_weight': Categorical(['balanced', None]),
            'logisticregression__l1_ratio': Real(0, 1),
            **common_settings
        },
    },
    'MLP': {
        'model': create_pipeline(MLPClassifier(random_state=RANDOM_STATE)),
        'space': {
            'mlpclassifier__hidden_layer_sizes': Integer(2, 20),
            'mlpclassifier__activation': Categorical(['logistic', 'tanh', 'relu']),
            'mlpclassifier__max_iter': Integer(1e3, 1e5),
            'mlpclassifier__alpha': Real(1e-3, 1e0),
            'mlpclassifier__learning_rate': Categorical(['constant', 'adaptive']),
            'mlpclassifier__learning_rate_init': Real(1e-4, 1e-1),
            'mlpclassifier__momentum': Real(0.1, 0.9),
            'mlpclassifier__early_stopping': Categorical([True, False]),
            'mlpclassifier__validation_fraction': Real(0.25, 0.50),
            'mlpclassifier__beta_1': Real(0.8, 0.99),
            'mlpclassifier__beta_2': Real(0.8, 0.99),
            'mlpclassifier__epsilon': Real(1e-8, 1e-6),
            **common_settings
        },
    },
    'SVM': {
        'model': create_pipeline(SVC(probability=True, random_state=RANDOM_STATE)),
        'space': {
            'svc__C': Real(1e-3, 1e+3),
            'svc__gamma': Real(1e-4, 1e-1),
            'svc__kernel': Categorical(['linear', 'rbf', 'poly', 'sigmoid']),
            'svc__degree': Integer(1, 5),
            'svc__coef0': Real(0, 1),
            'svc__shrinking': Categorical([True, False]),
            'svc__class_weight': Categorical([None, 'balanced']),
            'svc__max_iter': Integer(1e3, 1e5),
            'svc__tol': Real(1e-6, 1e-2),
            **common_settings
        },
    },
    'RF': {
        'model': create_pipeline(RandomForestClassifier(random_state=RANDOM_STATE)),
        'space': {
            'randomforestclassifier__n_estimators': Integer(1e1, 1e3),
            'randomforestclassifier__max_depth': Integer(1, 20),
            'randomforestclassifier__criterion': Categorical(['gini', 'entropy']),
            'randomforestclassifier__min_samples_split': Integer(2, 10),
            'randomforestclassifier__min_samples_leaf': Integer(1, 10),
            'randomforestclassifier__max_features': Categorical(['sqrt', 'log2']),
            'randomforestclassifier__max_samples': Real(0.1, 1.0),
            'randomforestclassifier__class_weight': Categorical(['balanced', 'balanced_subsample']),
            **common_settings
        },
    },
    'GB': {
        'model': create_pipeline(GradientBoostingClassifier(random_state=RANDOM_STATE)),
        'space': {
            'gradientboostingclassifier__n_estimators': Integer(1e1, 1e3),
            'gradientboostingclassifier__learning_rate': Real(0.01, 1.0),
            'gradientboostingclassifier__max_depth': Integer(1, 10),
            'gradientboostingclassifier__min_samples_split': Real(0.01, 1.0),
            'gradientboostingclassifier__min_samples_leaf': Real(0.01, 0.5),
            'gradientboostingclassifier__max_features': Categorical(['auto', 'sqrt', 'log2']),
            **common_settings
        },
    },
    'EBB': {
        'model': create_pipeline(BalancedBaggingClassifier(
            sampler=RandomUnderSampler(random_state=RANDOM_STATE), 
            random_state=RANDOM_STATE), balancer=False),
        'space': {
            'balancedbaggingclassifier__n_estimators': Integer(1e1, 1e3),
            'balancedbaggingclassifier__max_samples': Real(0.1, 1.0),
            'balancedbaggingclassifier__max_features': Real(0.4, 1.0),
            'balancedbaggingclassifier__sampling_strategy': Real(0.5, 1.0),
            'normalization': Categorical(normalization),
            'imputation': Categorical(imputation),
        },
    },
    'OB': {
        'model': create_pipeline(BalancedBaggingClassifier(
            sampler=RandomOverSampler(random_state=RANDOM_STATE), 
            random_state=RANDOM_STATE), balancer=False),
        'space': {
            'balancedbaggingclassifier__n_estimators': Integer(1e1, 1e3),
            'balancedbaggingclassifier__max_samples': Real(0.1, 1.0),
            'balancedbaggingclassifier__max_features': Real(0.4, 1.0),
            'balancedbaggingclassifier__sampling_strategy': Real(0.5, 1.0),
            'normalization': Categorical(normalization),
            'imputation': Categorical(imputation),    
        },
    },
    'SB': {
        'model': create_pipeline(BalancedBaggingClassifier(
            sampler=SMOTE(random_state=RANDOM_STATE), 
            random_state=RANDOM_STATE), balancer=False),
        'space': {
            'balancedbaggingclassifier__n_estimators': Integer(1e1, 1e3),
            'balancedbaggingclassifier__max_samples': Real(0.1, 1.0),
            'balancedbaggingclassifier__max_features': Real(0.4, 1.0),
            'balancedbaggingclassifier__sampling_strategy': Real(0.5, 1.0),
            'normalization': Categorical(normalization),
            'imputation': Categorical(imputation),   
        },
    },
    'RRB': {
        'model': create_pipeline(BalancedBaggingClassifier(
            sampler=FunctionSampler(func=roughly_balanced_bagging, kw_args={"replace": True}),
            random_state=RANDOM_STATE), balancer=False),
        'space': {
            'balancedbaggingclassifier__n_estimators': Integer(1e1, 1e3),
            'balancedbaggingclassifier__max_samples': Real(0.1, 1.0),
            'balancedbaggingclassifier__max_features': Real(0.4, 1.0),
            'normalization': Categorical(normalization),
            'imputation': Categorical(imputation),
        },
    },
    'BRF': {
        'model': create_pipeline(BalancedRandomForestClassifier(
            random_state=RANDOM_STATE), balancer=False),
        'space': {
            'balancedrandomforestclassifier__n_estimators': Integer(1e1, 1e3),
            'balancedrandomforestclassifier__criterion': Categorical(['gini', 'entropy']),
            'balancedrandomforestclassifier__max_depth': Integer(2, 10),
            'balancedrandomforestclassifier__min_samples_split': Integer(2, 10),
            'balancedrandomforestclassifier__min_samples_leaf': Integer(1, 10),
            'balancedrandomforestclassifier__max_samples': Real(0.1, 1.0),
            'balancedrandomforestclassifier__max_features': Categorical(['sqrt', 'log2']),
            'balancedrandomforestclassifier__class_weight': Categorical(['balanced', 'balanced_subsample']),
            'balancedrandomforestclassifier__sampling_strategy': Real(0.5, 1.0),
            'normalization': Categorical(normalization),
            'imputation': Categorical(imputation),  
        },
    },  
}


We create a BayesSearchCV object tailored to optimize the hyperparameters of the respective model

In [8]:
opt = {
    model_name: BayesSearchCV(
        models[model_name]['model'],  # Model to be optimized
        models[model_name]['space'],  # Hyperparameter search space
        n_iter=20,  # Number of search iterations
        cv=RepeatedStratifiedKFold(random_state=RANDOM_STATE),  # Cross-validation strategy
        random_state=RANDOM_STATE,  # Random state for reproducibility
        scoring=make_scorer(geometric_mean_score)  # Custom metric for optimization
    )  # Create instances of BayesSearchCV for each model
    for model_name in models  # Iterate through model names in the 'models' dictionary
}

The code below optimizes machine learning models, records the best score during cross-validation, and saves the trained models to files.

In [9]:
for model_name, model_opt in opt.items():
    # Print the model name followed by a colon
    print(f'- {model_name}: ', end=' ')
    
    # Fit the model
    model_opt.fit(X_train, y_train)

    # Get the best score achieved by the model during cross-validation
    best_score = model_opt.best_score_

    # Print the best score
    print(f'{best_score:.4f}')

    # Generate the filename for saving the trained model
    model_filename = f'../models/{model_name}.joblib'

    # Save the best estimator to a file
    dump(model_opt.best_estimator_, model_filename)

- LR:  0.5830
- MLP:  0.5853
- SVM:  0.5985
- RF:  0.5831
- GB:  0.5800
- EBB:  0.5943
- OB:  0.4970
- SB:  0.5403
- RRB:  0.5923
- BRF:  0.5783
