# CONSTANTS

**RANDOM_STATE** is a constant that is typically used to initialize the random number generator in a library or algorithm that uses randomness in some aspect of its operation. This means that when the RANDOM_STATE is fixed to a certain value, the algorithm will generate the same random results every time it is run, which can be useful for reproducing results and ensuring result consistency.

In [None]:
# Seed passed as `random_state` to the oversampler, the seeded classifiers,
# the cross-validation splitter and the Bayesian search in this notebook.
RANDOM_STATE = 0

# LIBRARIES

In [None]:
import warnings
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import make_pipeline
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): the blanket filter above already ignores FutureWarning, so this
# line looks redundant; the blanket filter also hides any other warning the
# libraries emit during fitting - confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

# READ FILES

In [None]:
# Columns of X_train.csv that are left out of the feature matrix
excluded_columns = ['TRACK', 'TRUST']

# Training features: first retained column becomes the index
X_train = pd.read_csv(
    '../dataset/X_train.csv',
    index_col=0,
    usecols=lambda name: name not in excluded_columns,
)

# Training target: first column becomes the index
y_train = pd.read_csv('../dataset/y_train.csv', index_col=0)

# MODELING

## - ML Models 

Creates a machine learning pipeline that scales the input features, oversamples the training data with BorderlineSMOTE, and then applies the given model:

- MinMaxScaler: works by subtracting the minimum value in the feature from each observation, and then dividing the result by the range (i.e., the maximum value minus the minimum value). This process ensures that the values in the feature are transformed to a range between 0 and 1.
- BorderlineSMOTE: is an extension of the SMOTE algorithm, which is commonly used for addressing the class imbalance problem in machine learning.

In [None]:
def create_pipeline(model):
    """Wrap *model* in the shared scale -> oversample -> estimate pipeline.

    Args:
        model: Estimator used as the final step of the pipeline.

    Returns:
        The pipeline built by ``make_pipeline`` from three steps, in order:
        a ``MinMaxScaler``, a ``BorderlineSMOTE`` seeded with
        ``RANDOM_STATE``, and ``model``.
    """
    steps = (
        MinMaxScaler(),                              # feature scaling
        BorderlineSMOTE(random_state=RANDOM_STATE),  # oversampling
        model,                                       # final estimator
    )
    return make_pipeline(*steps)

We will explore several classification models in this analysis:

In [None]:
# Candidate classifiers, each wrapped in the shared preprocessing pipeline.
# The keys are reused to look up each model's hyperparameter search space and
# to name the file the tuned model is saved to.
# Every estimator that accepts a seed receives RANDOM_STATE; KNeighborsClassifier
# is constructed with its defaults.
models = {
    'SVM': create_pipeline(SVC(probability=True, random_state=RANDOM_STATE)),
    'RF': create_pipeline(RandomForestClassifier(random_state=RANDOM_STATE)),
    'LR': create_pipeline(LogisticRegression(random_state=RANDOM_STATE)),
    'MLP': create_pipeline(MLPClassifier(random_state=RANDOM_STATE)),
    'KNN': create_pipeline(KNeighborsClassifier()),
}

For each model, we define a search space for its hyperparameters:

In [None]:
# Hyperparameter search space per model, keyed by the same names as `models`.
# Parameter names follow the `<step>__<param>` convention, where <step> is the
# lowercased class name that make_pipeline assigns to each pipeline step.
space = {

    'SVM': {
        'svc__C': Real(1e-2,1e+2),
        'svc__gamma': Real(1e-4,1e+1),
        'svc__kernel': Categorical(['linear','rbf','poly','sigmoid']),
        'svc__degree': Integer(1,5),
        'svc__coef0': Real(0,1),
        'svc__shrinking': Categorical([True,False]),
        'svc__class_weight': Categorical([None,'balanced']),
        'svc__max_iter': Integer(100,5000),
        'svc__tol': Real(1e-6,1e-2)
    },
    'RF': {
        'randomforestclassifier__n_estimators': Integer(200,800),
        'randomforestclassifier__criterion': Categorical(['gini','entropy']),
        'randomforestclassifier__max_depth': Integer(2,10),
        'randomforestclassifier__min_samples_split': Integer(2,10),
        'randomforestclassifier__min_samples_leaf': Integer(1,10),
        'randomforestclassifier__max_features': Categorical(['sqrt','log2']),
        'randomforestclassifier__class_weight': Categorical(['balanced','balanced_subsample'])
    },
    'LR': {
        'logisticregression__C': Real(1e-2,1e+2),
        'logisticregression__max_iter' : Integer(100,1000),
        'logisticregression__solver': Categorical(['newton-cg','lbfgs','liblinear','sag','saga']),
        'logisticregression__fit_intercept': Categorical([True,False]),
        'logisticregression__class_weight': Categorical(['balanced',None]),
        # NOTE(review): the penalty is never set here, so this dimension may
        # have no effect on the fitted model - confirm against the installed
        # scikit-learn version.
        'logisticregression__l1_ratio': Real(0,1)
    },
    'MLP': {
        'mlpclassifier__hidden_layer_sizes': Integer(2,16),
        'mlpclassifier__activation': Categorical(['logistic','tanh','relu']),
        'mlpclassifier__max_iter' : Integer(1000,5000),
        'mlpclassifier__alpha': Real(1e-3,1e0),
        'mlpclassifier__learning_rate': Categorical(['constant','adaptive']),
        'mlpclassifier__learning_rate_init': Real(1e-4,1e-1),
        'mlpclassifier__momentum': Real(0.1,0.9),
        'mlpclassifier__early_stopping': Categorical([True,False]),
        'mlpclassifier__validation_fraction': Real(0.1,0.3),
        'mlpclassifier__beta_1': Real(0.8,0.99),
        'mlpclassifier__beta_2': Real(0.8,0.99),
        'mlpclassifier__epsilon': Real(1e-8,1e-6)
    },
    # The 'KNN' pipeline is declared in `models`, so it needs an entry here;
    # without one the `space[model_name]` lookup raises KeyError.
    # The ranges are a starting point: the upper bound on n_neighbors assumes
    # each training fold holds at least that many samples - TODO confirm.
    'KNN': {
        'kneighborsclassifier__n_neighbors': Integer(1,15),
        'kneighborsclassifier__weights': Categorical(['uniform','distance']),
        'kneighborsclassifier__p': Integer(1,2)
    }
}

# The oversampling step is shared by every pipeline, so its hyperparameters
# are added to every model's search space.
for key in space:
    space[key]['borderlinesmote__sampling_strategy'] = Real(0.85,1.0)
    space[key]['borderlinesmote__k_neighbors'] = Integer(1,10)
    space[key]['borderlinesmote__m_neighbors'] = Integer(1,10)

For each model, we instantiate a BayesSearchCV object that optimizes that model's hyperparameters over its search space:

In [None]:
def _make_search(pipeline, search_space):
    """Return an unfitted BayesSearchCV for *pipeline* over *search_space*.

    Each search samples 20 parameter settings, scores them by ROC AUC and
    gets its own repeated stratified k-fold splitter seeded with RANDOM_STATE.
    """
    return BayesSearchCV(
        estimator=pipeline,
        search_spaces=search_space,
        n_iter=20,
        scoring='roc_auc',
        cv=RepeatedStratifiedKFold(random_state=RANDOM_STATE),
        random_state=RANDOM_STATE,
    )


# One hyperparameter search per model, keyed by model name
opt = {}
for model_name, model in models.items():
    opt[model_name] = _make_search(model, space[model_name])

In [None]:
# Directory that receives one serialized pipeline per model.
output_dir = Path('../models/ml')

# Create the directory before any fitting starts: otherwise a missing folder
# would only surface at dump() time, after a full search has already run.
output_dir.mkdir(parents=True, exist_ok=True)

# Run each hyperparameter search, report its result and persist the winner
for model_name, model_opt in opt.items():
    # Print the model name first so progress is visible while fitting runs
    print(model_name, end=': ')

    # Run the search on the training data
    model_opt.fit(X_train, y_train)

    # Best score achieved by the search during cross-validation
    best_score = model_opt.best_score_
    print(best_score)

    # Save the best estimator as <model name>.joblib
    model_filename = output_dir / f'{model_name}.joblib'
    dump(model_opt.best_estimator_, model_filename)