# Notebook 2 - Modelling and Optimization

In this notebook, I perform experimentation and evaluation of various classifiers, including tree-based bagging and boosting models, using stratified k-fold validation and hyperparameter tuning. The best models are saved alongside performance assessment on a holdout set.


In [3]:
!pip install bayesian_optimization --quiet
!pip install -q catboost

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [74]:
## Import the necessary libraries required for the task

## Data Manipulation and Visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-whitegrid')
import seaborn as sns

# Turning off warnings
import warnings
warnings.simplefilter('ignore')
## Various libraries for preprocessing, modeling, and evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier, AdaBoostClassifier,GradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier, Dataset
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, RandomizedSearchCV, StratifiedKFold
from lightgbm import LGBMClassifier

## Optimization Library
from bayes_opt import BayesianOptimization

## Utils
import os
import time
from joblib import dump

# Setting a maximum width for columns display in pandas dataframe
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

## Data Preparation for Modelling

In [9]:
## Load Processed Dataset
df = pd.read_csv("/content/drive/MyDrive/quantspark/proc_dataset.csv")
target_col = "CHURN"
df.head()

Unnamed: 0,TENURE_MONTHS,CONTRACT_LENGTH,ARR,SEATS,COMPETITOR_SEATS,DETECTEDSEATSCOUNT,PRODUCT_ONE,PRODUCT_TWO,PRODUCT_THREE,PRODUCT_FOUR,PRODUCT_FIVE,PRODUCT_SIX,PRODUCT_SEVEN,PRODUCT_EIGHT,PRODUCT_NINE,PRODUCT_TEN,PRODUCT_ELEVEN,PRODUCT_TWELVE,PRODUCT_THIRTEEN,PRODUCT_FOURTEEN,PRODUCT_FIFTEEN,PRODUCT_SIXTEEN,PRODUCT_SEVENTEEN,PRODUCT_EIGHTEEN,PRODUCT_NINETEEN,PRODUCT_TWENTY,PRODUCT_TWENTYONE,PRODUCT_TWENTYTWO,PRODUCT_TWENTYTHREE,LICENSINGSPECIALIST_CHANGE,SALESREP_CHANGE,UPSELLMANAGER_CHANGE,ACCOUNTMANAGER_CHANGE,CHURN_RISK_DISCUSSION,CUSTOMER_HEALTH_CHECK,CS_EVENT_ATTENDED,CROSS_SELL_RECENCY,SEATS_DOWNSELL_RECENCY,PRODUCT_DOWNSELL_RECENCY,SEATS_UPSELL_RECENCY,PCT_PRODUCT_TWO_ENABLED,PCT_PRODUCT_THREE_ENABLED,PCT_PRODUCT_FOUR_ENABLED,PCT_PRODUCT_FIVE_ENABLED,PCT_PRODUCT_SIX_ENABLED,PCT_PRODUCT_NINE_ENABLED,PCT_PRODUCT_TWO_BEST_PRACTICE,PCT_PRODUCT_THREE_BEST_PRACTICE,PCT_PRODUCT_FOUR_BEST_PRACTICE,CUSTOMER_BEHAVIOUR_ONE,CUSTOMER_BEHAVIOUR_TWO,CUSTOMER_BEHAVIOUR_THREE,CUSTOMER_BEHAVIOUR_FOUR,CUSTOMER_BEHAVIOUR_FIVE,CUSTOMER_BEHAVIOUR_SIX,CUSTOMER_BEHAVIOUR_SEVEN,CUSTOMER_BEHAVIOUR_EIGHT,CUSTOMER_BEHAVIOUR_NINE,CUSTOMER_BEHAVIOUR_TEN,CUSTOMER_BEHAVIOUR_ELEVEN,MAX_SUPPORT_CASE_DAYSTOCLOSE,MAX_SUPPORT_CASE_TIMETOFIRSTRESPONSE,AVG_SUPPORT_CASE_PRIORITY_SCORE,SUM_SEVERE_CASES,SUM_HIGH_CASES,SUM_MEDIUM_CASES,SUM_LOW_CASES,SUM_STANDARD_CASES,SUPPORT_CASE_NUMBEROFSLABREACHES,BACKLOG,CHURN,ACCOUNTING_MONTH_Month,ACCOUNTING_MONTH_Year,RENEWAL_MONTH_Month,RENEWAL_MONTHYear,CONTRACT_START_DATE_Day,CONTRACT_START_DATE_Month,CONTRACT_START_DATE_Year,REGION_ANZ,REGION_CAN,REGION_DACH,REGION_ME,REGION_RSA,REGION_RoE,REGION_SEA,REGION_UKI,REGION_USA,"SECTOR_Agriculture, Forestry & Fishing","SECTOR_Arts, Entertainment & Recreation",SECTOR_Construction,SECTOR_Education,SECTOR_Energy & Utilities,SECTOR_Finance,SECTOR_Government,SECTOR_Health & Social Care,SECTOR_Housing Associations,SECTOR_IT,SECTOR_Manufacturing,SECTOR_Media & Publishing,SECTOR_Mining & Extraction,SECTOR_None,SECTOR_Not For Profit,SECTOR_Other,SECTOR_Professional Services,SECTOR_Real Estate,SECTOR_Retail & Wholesale,SECTOR_Sci/Tech,SECTOR_Telecommunications,"SECTOR_Transportation, Storage & Delivery","SECTOR_Travel, Hospitality & Catering",col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,126,364,27241.17,330,0,1.015152,1,1,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,21,6,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,1,0,10,2021,12,2021,24,12,2020,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,127,364,27241.17,330,0,1.021212,1,1,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,22,7,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,1,0,11,2021,12,2021,24,12,2020,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,128,364,30349.76,350,0,0.94,1,1,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,1,0,12,2021,12,2022,24,12,2021,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,129,364,30349.76,350,0,0.96,1,1,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,1,0,0,0,0,0,0,0,0,0,0,116598,0,0,0,11525,362,4,14,1892,0,0,0,0.0,0,0,0,0,0,0,1,0,1,2022,12,2022,24,12,2021,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,130,364,30349.76,350,0,0.94,1,1,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,25,2,0,0,0,0,0,0,0,0,0,0,40704,0,0,0,3804,194,17,4,737,0,0,0,0.0,0,0,0,0,0,0,1,0,2,2022,12,2022,24,12,2021,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [10]:
## Seperate into data and label
X = df.drop(target_col, axis=1)
y = df[target_col]
print(X.shape, y.shape)

(100000, 117) (100000,)


#### HoldOut Set Importance

- One of the business problem areas is that the client is keen to ensure the model is well validated, so they can be confident in the model outputs. Having a holdout set is one of the ways to ensure unbiased model performance validation.
- High performance on a holdout set would indicate the robustness and generalisability of the model and hence help gain trust of the client

In [11]:
# Split the data into training/validation and hold-out sets
X_train_val, X_hold_out, y_train_val, y_hold_out = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [75]:
## Write train-validation and holdout datasets for reusability
# DATASET_WRITEPATH = "/content/drive/MyDrive/quantspark/datasets"
# X_train_val.to_csv(os.path.join(DATASET_WRITEPATH,"X_train_val.csv"), index=False)
# X_hold_out.to_csv(os.path.join(DATASET_WRITEPATH,"X_hold_out.csv"), index=False)
# y_train_val.to_csv(os.path.join(DATASET_WRITEPATH,"y_train_val.csv"), index=False)
# y_hold_out.to_csv(os.path.join(DATASET_WRITEPATH,"y_hold_out.csv"), index=False)

In [12]:
y_train_val.value_counts()

0    79464
1    536  
Name: CHURN, dtype: int64

In [13]:
y_hold_out.value_counts()

0    19866
1    134  
Name: CHURN, dtype: int64

# Modelling

### Stratified KFold with Feature Selection

In [21]:
# Function to perform K-fold cross-validation with feature selection
def kfold_cv_with_feature_selection(trn, target, estimator_name, estimator, n_splits=5, shuffle_val=False, featimp=False, top_k_features=10):
    featimp_list = []
    add = 0
    splits = n_splits

    # Initialize KFold object with or without shuffling
    if shuffle_val:
        kfold, scores = StratifiedKFold(n_splits=splits, shuffle=shuffle_val), list()
    else:
        kfold, scores = StratifiedKFold(n_splits=splits, shuffle=shuffle_val), list()

    # Iterating over each split, performing feature selection and then training the model
    for train, test in kfold.split(trn, target):
        # Splitting the data
        x_train, x_test = trn.values[train], trn.values[test]
        y_train, y_test = target.values[train], target.values[test]

        # Feature Selection using SelectKBest
        selector = SelectKBest(score_func=f_classif, k=top_k_features) #chi2
        x_train = selector.fit_transform(x_train, y_train)
        x_test = selector.transform(x_test)

        # Model training
        model = estimator
        model.fit(x_train, y_train)

        # Model prediction
        preds = model.predict(x_test)

        # Extracting and storing feature importance (if the flag is set)
        if featimp:
            selected_features = trn.columns[selector.get_support()]
            feature_importances = pd.DataFrame(model.feature_importances_,
                                              index=selected_features,
                                              columns=['importance'])
            featimp_list.append(feature_importances.values)

        # Calculate the ROC-AUC score
        score = roc_auc_score(y_test, preds)
        scores.append(score)

    # Calculating the average score
    avg_score = np.mean(scores)
    print(f"Average of ROC-AUC of model {estimator_name} is {avg_score} and per fold scores are {scores}")
    print(f"{estimator_name} Classification_report : \n{classification_report(y_test,preds)}")

    # Displaying the feature importance (if the flag is set)
    if featimp:
        for item in featimp_list:
            add += item
        df_cv = pd.DataFrame(add / len(featimp_list), index=selected_features, columns=["importance"]).sort_values('importance', ascending=False)
        display(df_cv)

    return avg_score

#### Stratified KFold
- I've ensure to use a stratified KFold cross validation technique to ensure an equal distribution of classes between training and validation sets. This is imperative in imbalanced classification problems. I've used a 10 Fold CV due to a large number of records in the dataset. I've also used feature selection to remove any irrelevant features.

#### Validation Metrics
 - Accuracy can be a misleading metric as it is possible to get a decent accuracy while having mostly incorrect predictions for the minority class.
 - Due to high imbalance in the data, I've not relied on one singular metric as the go-to metric. I've printout out a range of metrics including class-wise F1 scores, macro and weighted average metrics, along with ROC-AUC Scores.
 - Keeping a track of all these metrics will generally give a good idea about the performance of the model

### Baseline Model Performance

In [44]:
baseline_models = {
    "logreg" : LogisticRegression(),
    # "dummyclf" : DummyClassifier(strategy='most_frequent', random_state=42)
}

In [24]:
fs_model_scores = {}
for model_name, model in baseline_models.items():
    start_time = time.time()
    score = kfold_cv_with_feature_selection(X_train_val, y_train_val, model_name, model, n_splits=10, shuffle_val=False, top_k_features=90)
    end_time = time.time()
    elapsed_time = end_time - start_time
    fs_model_scores[model_name] = score
    print(f"Execution time for {model_name}: {elapsed_time:.2f} seconds")

Average of ROC-AUC of model logreg is 0.9957111239087911 and per fold scores are [0.9989933308166603, 0.9996854158802064, 0.9998112495281238, 0.999307914936454, 0.9811668577714385, 0.9894822459005695, 0.9991190536118802, 0.9994336773219229, 0.9997483010319658, 0.9903631922886894]
logreg Classification_report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7946
           1       0.90      0.98      0.94        54

    accuracy                           1.00      8000
   macro avg       0.95      0.99      0.97      8000
weighted avg       1.00      1.00      1.00      8000

Execution time for logreg: 68.31 seconds


### Baseline Model Performance on HoldOut Set

In [47]:
baselinemodel = baseline_models["logreg"]
baselinemodel.fit(X_train_val, y_train_val)

# Predict on the hold-out set
y_hold_out_pred = baselinemodel.predict(X_hold_out)
y_hold_out_pred_proba = baselinemodel.predict_proba(X_hold_out)[:, 1]  # probabilities for the positive class

# Calculate various performance metrics
roc_auc = roc_auc_score(y_hold_out, y_hold_out_pred_proba)
cls_report = classification_report(y_hold_out, y_hold_out_pred)

# Output the performance metrics
print(f"ROC-AUC on hold-out set: {roc_auc}")
print(f"Classification Report on hold-out set: \n{cls_report}")

ROC-AUC on hold-out set: 0.9998858020378326
Classification Report on hold-out set: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19866
           1       0.87      0.99      0.92       134

    accuracy                           1.00     20000
   macro avg       0.93      0.99      0.96     20000
weighted avg       1.00      1.00      1.00     20000



Due to the high class imbalance, I'm mainly interested in knowing the performance of the models on class 1 i.e. churn. The baseline logistic regression model obtained an F1 score of 0.92 on class 1. Let's see if we can improve upon that.

### Test Models with Default Configurations

In [25]:
test_models = {
    "rfclf" : RandomForestClassifier(n_jobs=-1),
    "lgbclf" : LGBMClassifier(verbose=-1),
    'xgbclf' : XGBClassifier(),
    'extratreesclf' : ExtraTreesClassifier(n_jobs=-1),
    'DTclf' : DecisionTreeClassifier(),
    "KNNclf" : KNeighborsClassifier(),
    "CBclf" : CatBoostClassifier(verbose=False),
    "GBclf":GradientBoostingClassifier(random_state=0),
}

In [27]:
fs_model_scores = {}
for model_name, model in test_models.items():
    start_time = time.time()
    score = kfold_cv_with_feature_selection(X_train_val, y_train_val, model_name, model, n_splits=10, shuffle_val=False, top_k_features=90)
    end_time = time.time()
    elapsed_time = end_time - start_time
    fs_model_scores[model_name] = score
    print(f"Execution time for {model_name}: {elapsed_time:.2f} seconds")

Average of ROC-AUC of model rfreg is 0.9999685400044107 and per fold scores are [0.9998741663520825, 1.0, 0.9999370831760414, 1.0, 1.0, 1.0, 0.9999370752579914, 0.9999370752579914, 1.0, 1.0]
rfreg Classification_report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7946
           1       1.00      1.00      1.00        54

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000

Execution time for rfreg: 62.00 seconds
Average of ROC-AUC of model lgb is 0.6670667951658389 and per fold scores are [0.6490903176943477, 0.9225149635201132, 0.8563513940231391, 0.6485525569159835, 0.9336749913769797, 0.49295242889504154, 0.5091334097752421, 0.6998023696991732, 0.4886735464384596, 0.46992197331990937]
lgb Classification_report : 
              precision    recall  f1-score   support

           0       0.99      0.94      0.97   

In [32]:
fs_model_scores

{'rfreg': 0.9999685400044107,
 'lgb': 0.6670667951658389,
 'xgb': 0.9747392821740117,
 'extratrees': 0.5287923903038889,
 'DTReg': 0.9972033471750346,
 'KNN Reg': 0.9260445941670181,
 'CB REg': 0.9952333478108949,
 'GradientBoostingClassifier': 0.991617736370588}

We find that Random Forest, Decision Tree, Catboost and Gradient Boosting models are the best performing models with impressive performance of ROC-AUC score of 0.99 on validation sets. Now let's test their performance on the holdout set.

In [50]:
for model,_ in test_models.items():
  if model in ["rfreg","DTReg","CB REg","GradientBoostingClassifier"]:
      print(f"Holdout Set Performance of model {model}")
      testmodel = test_models[model]
      testmodel.fit(X_train_val, y_train_val)

      # Predict on the hold-out set
      y_hold_out_pred = testmodel.predict(X_hold_out)
      y_hold_out_pred_proba = testmodel.predict_proba(X_hold_out)[:, 1]  # probabilities for the positive class

      # Calculate various performance metrics
      roc_auc = roc_auc_score(y_hold_out, y_hold_out_pred_proba)
      cls_report = classification_report(y_hold_out, y_hold_out_pred)

      # Output the performance metrics
      print(f"ROC-AUC on hold-out set: {roc_auc}")
      print(f"Classification Report on hold-out set: \n{cls_report}")

      print("="*50)

Holdout Set Performance of model rfreg
ROC-AUC on hold-out set: 0.9999554853338262
Classification Report on hold-out set: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19866
           1       0.98      0.99      0.99       134

    accuracy                           1.00     20000
   macro avg       0.99      1.00      0.99     20000
weighted avg       1.00      1.00      1.00     20000

Holdout Set Performance of model DTReg
ROC-AUC on hold-out set: 0.9961931508269585
Classification Report on hold-out set: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19866
           1       0.98      0.99      0.99       134

    accuracy                           1.00     20000
   macro avg       0.99      1.00      0.99     20000
weighted avg       1.00      1.00      1.00     20000

Holdout Set Performance of model CB REg
ROC-AUC on hold-out set: 0.9999819687428158
Classification R

The holdout set performance is also impressive for all these 4 models. I plan to optimize their parameters using Bayesian hyperparameter optimization and then use an ensembling technique such as Voting or Stacking to obtain a very reliable and robust model in-line with the client expectations.

## Hyperparameter Optimization
The objective here is to find the best parameters for the models to improve their performance. I have chosen Bayesian Optimization hyperparameter as it intelligently explores the hyperparameter space and converges more quickly than a grid or random search. It uses past evaluation results to inform which hyperparameters should be tried next.


In [51]:
# Define the evaluation function for Bayesian Optimization
def gbr_eval(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
    gbr = GradientBoostingClassifier(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        subsample=subsample,
        random_state=42
    )
    # Since cross_val_score expects a utility function (greater is better) rather than a cost function (lower is better),
    # we use `make_scorer` to negate the roc_auc_score, which follows the convention that higher scores are better.
    return cross_val_score(gbr, X_train_val, y_train_val, cv=3, scoring=make_scorer(roc_auc_score, needs_proba=True)).mean()

# Define the parameter space for Bayesian Optimization
param_space = {
    'n_estimators': (50, 500),
    'learning_rate': (0.005, 0.3),
    'max_depth': (3, 10),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10),
    'subsample': (0.5, 1)
}

# Initialize Bayesian Optimization
gbrBO = BayesianOptimization(gbr_eval, param_space, random_state=42)

# Run optimization
gbrBO.maximize(n_iter=8, init_points=5)
print(gbrBO.max)

|   iter    |  target   | learni... | max_depth | min_sa... | min_sa... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.8248   [0m | [0m0.1155   [0m | [0m9.655    [0m | [0m7.588    [0m | [0m6.789    [0m | [0m120.2    [0m | [0m0.578    [0m |
| [95m2        [0m | [95m1.0      [0m | [95m0.02213  [0m | [95m9.063    [0m | [95m6.41     [0m | [95m7.665    [0m | [95m59.26    [0m | [95m0.985    [0m |
| [0m3        [0m | [0m0.9439   [0m | [0m0.2506   [0m | [0m4.486    [0m | [0m2.636    [0m | [0m3.467    [0m | [0m186.9    [0m | [0m0.7624   [0m |
| [0m4        [0m | [0m0.4963   [0m | [0m0.1324   [0m | [0m5.039    [0m | [0m6.507    [0m | [0m3.116    [0m | [0m181.5    [0m | [0m0.6832   [0m |
| [0m5        [0m | [0m0.9943   [0m | [0m0.1395   [0m | [0m8.496    [0m | [0m2.797    [0m | [0m6.114    [0m | [0m316.6    [0m | [0m0.

In [None]:
def rfc_eval(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features):
    rfc = RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        max_features=max_features,
        random_state=42,
        class_weight='balanced'  # for handling imbalanced classes
    )
    return cross_val_score(rfc, X_train_val, y_train_val, cv=3, scoring='roc_auc').mean()

rfc_param_space = {
    'n_estimators': (50, 500),
    'max_depth': (5, 30),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20),
    'max_features': (0.1, 0.999),  # Fraction of features to consider at each split
}

rfcBO = BayesianOptimization(rfc_eval, rfc_param_space, random_state=42)
rfcBO.maximize(n_iter=8, init_points=5)
print(rfcBO.max)

|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m1.0      [0m | [0m14.36    [0m | [0m0.9547   [0m | [0m14.91    [0m | [0m12.78    [0m | [0m120.2    [0m |
| [95m2        [0m | [95m1.0      [0m | [95m8.9      [0m | [95m0.1522   [0m | [95m17.46    [0m | [95m12.82    [0m | [95m368.6    [0m |
| [95m3        [0m | [95m1.0      [0m | [95m5.515    [0m | [95m0.9719   [0m | [95m16.82    [0m | [95m5.822    [0m | [95m131.8    [0m |
| [95m4        [0m | [95m1.0      [0m | [95m9.585    [0m | [95m0.3735   [0m | [95m10.97    [0m | [95m9.775    [0m | [95m181.1    [0m |
| [95m5        [0m | [95m1.0      [0m | [95m20.3     [0m | [95m0.2254   [0m | [95m6.551    [0m | [95m8.595    [0m | [95m255.2    [0m |
| [0m6        [0m | [0m1.0      [0m | [0m17.48    [0m | [0m0.5526   [0m | [0m7.97

In [53]:
def dtc_eval(max_depth, min_samples_split, min_samples_leaf, max_features):
    dtc = DecisionTreeClassifier(
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        max_features=max_features,
        random_state=42,
        class_weight='balanced'  # assuming you have imbalanced classes
    )
    return cross_val_score(dtc, X_train_val, y_train_val, cv=3, scoring='roc_auc').mean()

dtc_param_space = {
    'max_depth': (3, 30),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20),
    'max_features': (0.1, 0.999),
}

dtcBO = BayesianOptimization(dtc_eval, dtc_param_space, random_state=42)
dtcBO.maximize(n_iter=8, init_points=5)
print(dtcBO.max)

|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.9999   [0m | [0m13.11    [0m | [0m0.9547   [0m | [0m14.91    [0m | [0m12.78    [0m |
| [0m2        [0m | [0m0.974    [0m | [0m7.213    [0m | [0m0.2402   [0m | [0m2.104    [0m | [0m17.59    [0m |
| [0m3        [0m | [0m0.9991   [0m | [0m19.23    [0m | [0m0.7366   [0m | [0m1.391    [0m | [0m19.46    [0m |
| [0m4        [0m | [0m0.9709   [0m | [0m25.48    [0m | [0m0.2909   [0m | [0m4.455    [0m | [0m5.301    [0m |
| [95m5        [0m | [95m1.0      [0m | [95m11.21    [0m | [95m0.5718   [0m | [95m9.207    [0m | [95m7.242    [0m |
| [95m6        [0m | [95m1.0      [0m | [95m10.68    [0m | [95m0.4957   [0m | [95m9.389    [0m | [95m8.559    [0m |
| [0m7        [0m | [0m1.0      [0m | [0m7.505    [0m | [0m0.5387   [0m | [0m17.42    [0m | [0m5.

In [54]:
def cbc_eval(n_estimators, learning_rate, depth, l2_leaf_reg, border_count):
    cbc = CatBoostClassifier(
        iterations=int(n_estimators),
        learning_rate=learning_rate,
        depth=int(depth),
        l2_leaf_reg=l2_leaf_reg,
        border_count=int(border_count),
        auto_class_weights='Balanced',
        verbose=False
    )
    return cross_val_score(cbc, X_train_val, y_train_val, cv=3, scoring='roc_auc').mean()

cbc_param_space = {
    'n_estimators': (50, 500),
    'learning_rate': (0.005, 0.3),
    'depth': (4, 10),
    'l2_leaf_reg': (1, 10),
    'border_count': (50, 255),
}

cbcBO = BayesianOptimization(cbc_eval, cbc_param_space, random_state=42)
cbcBO.maximize(n_iter=8, init_points=5)
print(cbcBO.max)

|   iter    |  target   | border... |   depth   | l2_lea... | learni... | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9999   [0m | [0m126.8    [0m | [0m9.704    [0m | [0m7.588    [0m | [0m0.1816   [0m | [0m120.2    [0m |
| [0m2        [0m | [0m0.9998   [0m | [0m81.98    [0m | [0m4.349    [0m | [0m8.796    [0m | [0m0.1823   [0m | [0m368.6    [0m |
| [0m3        [0m | [0m0.9995   [0m | [0m54.22    [0m | [0m9.819    [0m | [0m8.492    [0m | [0m0.06764  [0m | [0m131.8    [0m |
| [0m4        [0m | [0m0.9999   [0m | [0m87.6     [0m | [0m5.825    [0m | [0m5.723    [0m | [0m0.1324   [0m | [0m181.1    [0m |
| [0m5        [0m | [0m0.9999   [0m | [0m175.4    [0m | [0m4.837    [0m | [0m3.629    [0m | [0m0.1131   [0m | [0m255.2    [0m |
| [0m6        [0m | [0m0.9999   [0m | [0m89.18    [0m | [0m6.523    [0m | [0m6.306    [0m | [0m0.1831   [0

### Optimized Model Performance on Hold-out Set

In [67]:
def holdoutperformance(model_name, model, X_train, y_train, X_hold_out, y_hold_out):

    # Train the model on the training set
    model.fit(X_train, y_train)

    # Predict on the hold-out set
    y_pred = model.predict(X_hold_out)
    y_pred_proba = model.predict_proba(X_hold_out)[:, 1] if hasattr(model, "predict_proba") else None

    # Calculate ROC-AUC if probabilities are available
    roc_auc = roc_auc_score(y_hold_out, y_pred_proba) if y_pred_proba is not None else None

    # Output the performance metrics
    cls_report = classification_report(y_hold_out, y_pred)

    print(f"{model_name} - ROC-AUC on hold-out set: {roc_auc}")
    print(f"{model_name} - Classification Report on hold-out set: \n{cls_report}")

    # Save the model to disk
    model_filename = f'/content/drive/MyDrive/quantspark/models/{model_name}_model.joblib'
    dump(model, model_filename)

    print(f"Optimized model saved to {model_filename}")

    return model

In [68]:
best_gbrBO_params = gbrBO.max['params']
final_gbr = GradientBoostingClassifier(
    n_estimators=int(best_gbrBO_params['n_estimators']),
    learning_rate=best_gbrBO_params['learning_rate'],
    max_depth=int(best_gbrBO_params['max_depth']),
    min_samples_split=int(best_gbrBO_params['min_samples_split']),
    min_samples_leaf=int(best_gbrBO_params['min_samples_leaf']),
    subsample=best_gbrBO_params['subsample'],
    random_state=42
)
holdoutperformance("GBClf_opt",final_gbr, X_train_val, y_train_val, X_hold_out, y_hold_out)

GBClf_opt - ROC-AUC on hold-out set: 0.999983471347581
GBClf_opt - Classification Report on hold-out set: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19866
           1       0.98      1.00      0.99       134

    accuracy                           1.00     20000
   macro avg       0.99      1.00      0.99     20000
weighted avg       1.00      1.00      1.00     20000

Optimized model saved to /content/drive/MyDrive/quantspark/models/GBClf_opt_model.joblib


In [71]:
best_cbcBO_params = cbcBO.max['params']
final_cbc = CatBoostClassifier(
        iterations=int(best_cbcBO_params['n_estimators']),
        learning_rate=best_cbcBO_params['learning_rate'],
        depth=int(best_cbcBO_params['depth']),
        l2_leaf_reg=best_cbcBO_params["l2_leaf_reg"],
        border_count=int(best_cbcBO_params["border_count"]),
        auto_class_weights='Balanced',
        verbose=False
    )
holdoutperformance("CBClf_opt",final_cbc, X_train_val, y_train_val, X_hold_out, y_hold_out)

CBClf_opt - ROC-AUC on hold-out set: 0.9999526679498911
CBClf_opt - Classification Report on hold-out set: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19866
           1       0.97      1.00      0.99       134

    accuracy                           1.00     20000
   macro avg       0.99      1.00      0.99     20000
weighted avg       1.00      1.00      1.00     20000

Optimized model saved to /content/drive/MyDrive/quantspark/models/CBClf_opt_model.joblib


<catboost.core.CatBoostClassifier at 0x7e0076d16ad0>

In [72]:
best_dtcBO_params = dtcBO.max['params']
final_dtc = DecisionTreeClassifier(
    max_depth=int(best_dtcBO_params["max_depth"]),
    min_samples_split=int(best_dtcBO_params["min_samples_split"]),
    min_samples_leaf=int(best_dtcBO_params["min_samples_leaf"]),
    max_features=best_dtcBO_params["max_features"],
    random_state=42,
    class_weight='balanced'
)
holdoutperformance("DTClf_opt",final_dtc, X_train_val, y_train_val, X_hold_out, y_hold_out)

DTClf_opt - ROC-AUC on hold-out set: 0.9999453427516599
DTClf_opt - Classification Report on hold-out set: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19866
           1       0.97      1.00      0.99       134

    accuracy                           1.00     20000
   macro avg       0.99      1.00      0.99     20000
weighted avg       1.00      1.00      1.00     20000

Optimized model saved to /content/drive/MyDrive/quantspark/models/DTClf_opt_model.joblib


In [73]:
best_rfcBO_params = rfcBO.max['params']
final_rfc = RandomForestClassifier(
    n_estimators=int(best_rfcBO_params["n_estimators"]),
    max_depth=int(best_rfcBO_params["max_depth"]),
    min_samples_split=int(best_rfcBO_params["min_samples_split"]),
    min_samples_leaf=int(best_rfcBO_params["min_samples_leaf"]),
    max_features=best_rfcBO_params["max_features"],
    random_state=42,
    class_weight='balanced'
)
holdoutperformance("RFClf_opt",final_rfc, X_train_val, y_train_val, X_hold_out, y_hold_out)

RFClf_opt - ROC-AUC on hold-out set: 0.9999894817666425
RFClf_opt - Classification Report on hold-out set: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19866
           1       0.97      1.00      0.99       134

    accuracy                           1.00     20000
   macro avg       0.99      1.00      0.99     20000
weighted avg       1.00      1.00      1.00     20000

Optimized model saved to /content/drive/MyDrive/quantspark/models/RFClf_opt_model.joblib


I've obtained the best parameters of these 4 models and saved them into disk. In the next notebook, I've used ensembling methods to create a robust classifier in-line with client expectations of having a well-validated and trustworthy model.