# LIBRARIES

In [1]:
# custom imports
from constants import RANDOM_STATE
from functions import best_features_set

# deap imports
from deap import creator, base, tools, algorithms

# imbalanced-learn imports
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# joblib-related imports
from joblib import dump

# scikit-learn imports
import sklearn
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, QuantileTransformer, RobustScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# scikit-optimize imports
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# standard Python imports
import ast
import csv
import os
import random
import re
import warnings
import multiprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# additional settings
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings emitted during model fitting below
# (e.g. convergence warnings) — confirm that is intended.
warnings.filterwarnings('ignore')
# Redundant with the blanket filter above (FutureWarning is already covered);
# kept as an explicit statement of intent.
warnings.simplefilter(action='ignore', category=FutureWarning)

# DIRECTORIES

In [2]:
# Output directories used by this notebook (paths are relative to the notebook location).
directory_evaluations = '../evaluations/'
directory_models = '../models/'
directory_plots = '../images/'

# exist_ok=True makes the cell idempotent and removes the race between the
# existence check and the creation that the explicit os.path.exists() test had.
for directory in [directory_evaluations,
                  directory_models,
                  directory_plots]:
    os.makedirs(directory, exist_ok=True)

# READ FILES

In [3]:
# Load the training features and the training target; in both CSV files the
# first column is used as the row index.
X_train, y_train = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../dataset/X_train.csv', '../dataset/y_train.csv')
)

# IMBALANCE RATIO

In [4]:
# Number of samples per class in the training target.
class_counts = y_train.value_counts()
n_minority = int(class_counts.min())
n_majority = int(class_counts.max())

# Minority/majority ratio rounded UP to one decimal (integer ceiling division,
# so no floating-point artefacts). imb_ratio is the lower bound of the
# 'balancing__sampling_strategy' search range, and imbalanced-learn raises a
# ValueError when a float sampling_strategy is below the ratio already present
# in the data; rounding to nearest could produce such an infeasible bound
# (e.g. a true ratio of 0.43 became 0.4).
imb_ratio = -(-10 * n_minority // n_majority) / 10
print(f'Imbalance ratio is {imb_ratio}')

Imbalance ratio of is 0.4


# MODELING

Generates a pipeline for machine learning models with optional data preprocessing steps such as normalization, imputation, and balancing.

In [5]:
def create_pipeline(model, use_normalization=True, use_imputation=True, use_balancing=True, use_pca=True):
    """Assemble a pipeline ending in ``model`` with optional preprocessing steps.

    Steps are added in this order when their flag is true:
    ``'normalization'`` (placeholder ``None``), ``'imputation'`` (a default
    ``KNNImputer``) and ``'balancing'`` (placeholder ``None``). The final step
    holds ``model`` and is named after its lower-cased class name.

    Parameters
    ----------
    model : estimator
        Final estimator of the pipeline.
    use_normalization, use_imputation, use_balancing : bool
        Whether to include the corresponding step.
    use_pca : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    Pipeline
        The assembled (unfitted) pipeline.
    """
    stages = []
    if use_normalization:
        # Placeholder: the concrete scaler is chosen later via the search space.
        stages.append(('normalization', None))
    if use_imputation:
        stages.append(('imputation', KNNImputer()))
    if use_balancing:
        # Placeholder: the concrete sampler is chosen later via the search space.
        stages.append(('balancing', None))

    estimator_step = model.__class__.__name__.lower()
    stages.append((estimator_step, model))
    return Pipeline(stages)

- These are the normalization methods evaluated in this pipeline:

In [6]:
# Candidate scalers for the 'normalization' pipeline step, each built with its
# default constructor arguments.
normalization_methods = [
    scaler_cls()
    for scaler_cls in (
        MaxAbsScaler,
        MinMaxScaler,
        QuantileTransformer,
        RobustScaler,
        StandardScaler,
    )
]

- These are the balancing methods evaluated in this pipeline:

In [7]:
# Candidate samplers for the 'balancing' pipeline step, all seeded with the
# shared RANDOM_STATE.
balancing_methods = [
    sampler_cls(random_state=RANDOM_STATE)
    for sampler_cls in (RandomOverSampler, RandomUnderSampler)
]

Defines a set of models along with their respective hyperparameter search spaces for hyperparameter optimization.

In [8]:
# Search dimensions shared by every model: which scaler and sampler to plug
# into the pipeline placeholders, plus the imputer and sampler hyperparameters.
# Keyword order is kept because it determines the order of the search dimensions.
common_settings = dict(
    normalization=Categorical(normalization_methods),
    imputation__n_neighbors=Integer(1, 10),
    imputation__weights=Categorical(['distance', 'uniform']),
    balancing=Categorical(balancing_methods),
    balancing__sampling_strategy=Real(imb_ratio, 1.0),
)

In [9]:
# scikit-learn 1.1 renamed the gradient-boosting logistic loss from 'deviance'
# to 'log_loss' and 1.3 removed the old name, so pick whichever spelling the
# installed version accepts. Only the leading "major.minor" digits are parsed,
# which also copes with suffixes such as '1.0rc1' or '1.4.dev0'.
_sklearn_major_minor = tuple(
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
gb_log_loss = 'log_loss' if _sklearn_major_minor >= (1, 1) else 'deviance'

# One entry per model: 'model' is the pipeline to tune and 'space' the search
# space. Hyperparameter keys are prefixed with the pipeline step name, which
# create_pipeline derives from the lower-cased estimator class name.
model_space = {

    # Decision tree
    'DT': {
        'model': create_pipeline(DecisionTreeClassifier(random_state=RANDOM_STATE)),
        'space': {
            'decisiontreeclassifier__max_depth': Integer(3, 5),
            'decisiontreeclassifier__min_samples_split': Integer(5, 10),
            'decisiontreeclassifier__min_samples_leaf': Integer(1, 5),
            'decisiontreeclassifier__criterion': Categorical(['gini', 'entropy']),
            'decisiontreeclassifier__ccp_alpha': Real(0.0, 0.1),
            **common_settings
        }
    },

    # Gradient boosting
    'GB': {
        'model': create_pipeline(GradientBoostingClassifier(random_state=RANDOM_STATE)),
        'space': {
            'gradientboostingclassifier__n_estimators': Integer(50, 500),
            'gradientboostingclassifier__learning_rate': Real(0.01, 1.0, 'log-uniform'),
            'gradientboostingclassifier__max_depth': Integer(1, 10),
            'gradientboostingclassifier__min_samples_split': Integer(2, 20),
            'gradientboostingclassifier__min_samples_leaf': Integer(1, 20),
            'gradientboostingclassifier__subsample': Real(0.5, 1.0, 'log-uniform'),
            # version-dependent name of the logistic loss (see gb_log_loss above)
            'gradientboostingclassifier__loss': Categorical([gb_log_loss, 'exponential']),
            **common_settings
        }
    },

    # Logistic regression
    'LR': {
        'model': create_pipeline(LogisticRegression(random_state=RANDOM_STATE)),
        'space': {
            'logisticregression__C': Real(0.0001, 1000, 'log-uniform'),
            'logisticregression__max_iter': Integer(200, 2000),
            'logisticregression__solver': Categorical(['liblinear', 'sag', 'saga']),
            'logisticregression__class_weight': Categorical(['balanced', None]),
            'logisticregression__tol': Real(0.0001, 0.001),
            **common_settings
        }
    },

    # Multi-layer perceptron
    'MLP': {
        'model': create_pipeline(MLPClassifier(random_state=RANDOM_STATE)),
        'space': {
            # a single integer is searched here, not a tuple of layer sizes
            'mlpclassifier__hidden_layer_sizes': Integer(2, 16),
            'mlpclassifier__activation': Categorical(['logistic','tanh', 'relu']),
            'mlpclassifier__learning_rate': Categorical(['constant', 'adaptive']),
            'mlpclassifier__learning_rate_init': Real(0.001, 0.1, 'log-uniform'),
            'mlpclassifier__max_iter': Integer(200, 2000),
            'mlpclassifier__solver': Categorical(['sgd', 'adam']),
            'mlpclassifier__momentum': Real(0.1, 0.9, 'log-uniform'),
            **common_settings
        }
    },

    # Gaussian naive Bayes: only the shared preprocessing dimensions are tuned
    'NB': {
        'model': create_pipeline(GaussianNB()),
        'space': {
            **common_settings
        }
    },

    # Random forest
    'RF': {
        'model': create_pipeline(RandomForestClassifier(random_state=RANDOM_STATE)),
        'space': {
            'randomforestclassifier__n_estimators': Integer(50,500),
            'randomforestclassifier__max_depth': Integer(3, 10),
            'randomforestclassifier__min_samples_split': Integer(2, 10),
            'randomforestclassifier__min_samples_leaf': Integer(1, 5),
            'randomforestclassifier__criterion': Categorical(['gini', 'entropy']),
            'randomforestclassifier__max_samples': Real(0.5, 1.0, 'log-uniform'),
            'randomforestclassifier__class_weight': Categorical(['balanced', 'balanced_subsample']),
            **common_settings
        }
    },

    # Support vector machine
    'SVM': {
        'model': create_pipeline(SVC(probability=True, random_state=RANDOM_STATE)),
        'space': {
            'svc__kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
            # NOTE(review): gamma and C span six orders of magnitude but are
            # sampled uniformly, unlike the other wide ranges in this file;
            # consider 'log-uniform' — left unchanged to keep results comparable.
            'svc__gamma': Real(0.001, 1000),
            'svc__degree': Integer(2, 5),
            'svc__coef0': Real(0.0001, 1, 'log-uniform'),
            'svc__C': Real(0.001, 1000),
            'svc__tol': Real(0.00001, 0.1, 'log-uniform'),
            'svc__max_iter': Integer(200, 2000),
            'svc__class_weight': Categorical([None, 'balanced']),
            **common_settings
        }
    }

}

Implements Bayesian optimization for hyperparameter tuning using scikit-optimize's `BayesSearchCV`.

In [10]:
def bayes_search(model, space, refit=True, n_iter=10, scoring='f1', n_jobs=1):
    """Build an unfitted ``BayesSearchCV`` for ``model`` over ``space``.

    Cross-validation uses ``RepeatedStratifiedKFold`` with its default number
    of splits and repeats; both the splitter and the search are seeded with
    ``RANDOM_STATE``.

    Parameters
    ----------
    model : estimator
        Estimator (here: a pipeline) whose hyperparameters are tuned.
    space : dict
        Search space mapping parameter names to skopt dimensions.
    refit : bool, default=True
        Whether to refit the best configuration on the full data after the search.
    n_iter : int, default=10
        Number of parameter settings sampled by the search.
    scoring : str or callable, default='f1'
        Metric optimised by the search.
    n_jobs : int, default=1
        Number of parallel jobs passed to ``BayesSearchCV``.

    Returns
    -------
    BayesSearchCV
        The configured search object; the caller is responsible for calling ``fit``.
    """
    return BayesSearchCV(
        model,
        space,
        n_iter=n_iter,
        refit=refit,
        cv=RepeatedStratifiedKFold(random_state=RANDOM_STATE),
        random_state=RANDOM_STATE,
        scoring=scoring,
        n_jobs=n_jobs,
    )

# FEATURES

In [11]:
# Columns of X_train used to fit every model below.
features = [
    'KATZ',
    'MNA-SF',
    'Hemoglobin',
    'Leukocyte',
    'Advanced Staging',
]

# SAVE MODELS

In [12]:
# Loop-invariant: every model is tuned on the same feature subset.
X_selected = X_train.loc[:, features]

# Tune each pipeline, report its best cross-validated score and persist the
# refitted best estimator.
for model_name, entry in model_space.items():
    model_opt = bayes_search(entry['model'], entry['space'])
    model_opt.fit(X_selected, y_train)
    print(f'{model_name}: {features} -> {model_opt.best_score_}')
    # directory_models already ends with a separator; os.path.join avoids the
    # doubled slash ('../models//DT.joblib') the f-string concatenation produced.
    model_path = os.path.join(directory_models, f'{model_name}.joblib')
    dump(model_opt.best_estimator_, model_path)

DT: ['KATZ', 'MNA-SF', 'Hemoglobin', 'Leukocyte', 'Advanced Staging'] -> 0.522645938311102
GB: ['KATZ', 'MNA-SF', 'Hemoglobin', 'Leukocyte', 'Advanced Staging'] -> 0.6120094968521406
LR: ['KATZ', 'MNA-SF', 'Hemoglobin', 'Leukocyte', 'Advanced Staging'] -> 0.6403565174076009
MLP: ['KATZ', 'MNA-SF', 'Hemoglobin', 'Leukocyte', 'Advanced Staging'] -> 0.6418989826822643
NB: ['KATZ', 'MNA-SF', 'Hemoglobin', 'Leukocyte', 'Advanced Staging'] -> 0.6356302161890397
RF: ['KATZ', 'MNA-SF', 'Hemoglobin', 'Leukocyte', 'Advanced Staging'] -> 0.6412872514482422
SVM: ['KATZ', 'MNA-SF', 'Hemoglobin', 'Leukocyte', 'Advanced Staging'] -> 0.500292261847424
