# Kaggle Playground - Season 4 Episode 
## Binary Classification of Insurance Cross Selling

Competion link - https://www.kaggle.com/competitions/playground-series-s4e8

### Steps
- Import the necessary libraries, packages and modules
- Read the datsets as data framers

### Understand the problem

- class is the target variable
- It determines the class of a mushroom depending on the given variables

In [33]:
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import logging
import matplotlib.pyplot as plt
import numpy as np
import optuna
import os
import pandas as pd
import pickle
import seaborn as sns
import tensorflow as tf
import time
import xgboost as xgb

from imblearn.over_sampling import RandomOverSampler
from optuna.samplers import TPESampler
from scipy.stats import randint, uniform
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, HistGradientBoostingClassifier)
from sklearn.feature_selection import SelectFromModel  
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

sns.set()
%matplotlib inline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
start_time = time.time()

In [34]:
# Test to see if TensorFlow can utilize the GPU
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [35]:
# Check if TensorFlow can access the GPU
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    print(f"TensorFlow can access the GPU: {physical_devices}")
else:
    print("TensorFlow is not detecting the GPU.")

TensorFlow can access the GPU: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [36]:
# Load data
train_df = pd.read_csv('train.csv', index_col='id')
test_df = pd.read_csv('test.csv', index_col='id')

train_df.head()

logger.info(f"Train data load completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Train data load completed. Time elapsed: 9.24 seconds


In [37]:
test_df.head(2)

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3116945,8.64,x,,n,t,,,w,11.13,17.12,b,,w,u,w,t,g,,d,a
3116946,6.9,o,t,o,f,,c,y,1.27,10.75,,,n,,,f,f,,d,a


In [38]:
# # Checking for data imbalance in training dataframe

# temp = train_df['class'].value_counts()    # Counts the number of ones and zeroes in the column
# temp_df = pd.DataFrame({'class' : temp.index, 'values' : temp.values})    # Creating a data frame with value and their counts
# plt.figure(figsize = (4, 3))
# sns.barplot(x = 'class', y = 'values', data = temp_df)
# plt.show()

In [39]:
# plt.pie(temp_df['values'], labels = [0, 1])
# plt.show()

### Identify the target variable and features

- class is the target variable
- It determines the class of a mushroom depending on the given variables

### Remove duplicate rows

- Checked the sum of duplicated rows in train and test datasets
- No dupllicated rows in train dataframe

In [40]:
# train_df.duplicated().sum()

In [41]:
# test_df.duplicated().sum()

### Handling missing values

- Checked the missing values in column
- There are considerable amount of missing values in many columns

In [42]:
train_df.shape

(3116945, 21)

In [43]:
test_df.shape

(2077964, 20)

In [44]:
# train_df.info()

In [45]:
# test_df.info()

In [46]:
# Split the dataset into training and validation sets
raw_train_df, validation_df = train_test_split(train_df, train_size=0.75, random_state=1, stratify=train_df['class'])
logger.info(f"Train test split completed. Time elapsed: {time.time() - start_time:.2f} seconds")

raw_train_df.head(2)

INFO:__main__:Train test split completed. Time elapsed: 17.71 seconds


Unnamed: 0_level_0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
748879,p,2.45,x,g,n,f,a,,p,7.12,...,,,w,,,f,f,,d,u
589395,p,2.71,b,,k,f,a,,k,7.78,...,,,g,,w,f,f,k,g,u


In [47]:
raw_train_df.isnull().sum()

class                         0
cap-diameter                  3
cap-shape                    30
cap-surface              503295
cap-color                    10
does-bruise-or-bleed          6
gill-attachment          392685
gill-spacing             943507
gill-color                   49
stem-height                   0
stem-width                    0
stem-root               2067772
stem-surface            1486230
stem-color                   27
veil-type               2218034
veil-color              2055885
has-ring                     16
ring-type                 96564
spore-print-color       2137374
habitat                      36
season                        0
dtype: int64

In [48]:
test_df.isnull().sum()

cap-diameter                  7
cap-shape                    31
cap-surface              446904
cap-color                    13
does-bruise-or-bleed         10
gill-attachment          349821
gill-spacing             839595
gill-color                   49
stem-height                   1
stem-width                    0
stem-root               1838012
stem-surface            1321488
stem-color                   21
veil-type               1971545
veil-color              1826124
has-ring                     19
ring-type                 86195
spore-print-color       1899617
habitat                      25
season                        0
dtype: int64

In [49]:
# Check for missing values

missing_values = raw_train_df.isnull().sum()

# Calculate the percentage of missing values

missing_percentage = (missing_values / len(raw_train_df)) * 100
missing_percentage

class                    0.000000
cap-diameter             0.000128
cap-shape                0.001283
cap-surface             21.529421
cap-color                0.000428
does-bruise-or-bleed     0.000257
gill-attachment         16.797864
gill-spacing            40.360344
gill-color               0.002096
stem-height              0.000000
stem-width               0.000000
stem-root               88.452963
stem-surface            63.576375
stem-color               0.001155
veil-type               94.880712
veil-color              87.944474
has-ring                 0.000684
ring-type                4.130713
spore-print-color       91.430324
habitat                  0.001540
season                   0.000000
dtype: float64

In [50]:
# Check for missing values

missing_values = validation_df.isnull().sum()

# Calculate the percentage of missing values

missing_percentage = (missing_values / len(validation_df)) * 100
missing_percentage

class                    0.000000
cap-diameter             0.000128
cap-shape                0.001283
cap-surface             21.524645
cap-color                0.000257
does-bruise-or-bleed     0.000257
gill-attachment         16.843528
gill-spacing            40.414919
gill-color               0.001027
stem-height              0.000000
stem-width               0.000000
stem-root               88.452037
stem-surface            63.476324
stem-color               0.001412
veil-type               94.895263
veil-color              87.914460
has-ring                 0.001027
ring-type                4.147134
spore-print-color       91.410957
habitat                  0.001155
season                   0.000000
dtype: float64

In [51]:
# Check for missing values

missing_values_test = test_df.isnull().sum()

# Calculate the percentage of missing values

missing_percentage_test = (missing_values_test / len(test_df)) * 100
missing_percentage_test

cap-diameter             0.000337
cap-shape                0.001492
cap-surface             21.506821
cap-color                0.000626
does-bruise-or-bleed     0.000481
gill-attachment         16.834796
gill-spacing            40.404694
gill-color               0.002358
stem-height              0.000048
stem-width               0.000000
stem-root               88.452543
stem-surface            63.595327
stem-color               0.001011
veil-type               94.878689
veil-color              87.880445
has-ring                 0.000914
ring-type                4.148051
spore-print-color       91.417224
habitat                  0.001203
season                   0.000000
dtype: float64

### Data Preprocessing
- Droping columns with more than 50% missing values
- Using simple imputer
- Encoding Categorical Variables

In [52]:
# Drop columns with extremely high missing values
columns_to_drop = ['veil-type', 'spore-print-color', 'stem-root', 'veil-color', 'stem-surface']
raw_train_df.drop(columns=columns_to_drop, inplace=True)
validation_df.drop(columns=columns_to_drop, inplace=True)
test_df.drop(columns=columns_to_drop, inplace=True)

# # Check for missing values

# missing_values = train_df.isnull().sum()

# # Calculate the percentage of missing values

# missing_percentage = (missing_values / len(train_df)) * 100
# missing_percentage

In [53]:
# # Check for missing values

# missing_values_test = test_df.isnull().sum()

# # Calculate the percentage of missing values

# missing_percentage_test = (missing_values_test / len(test_df)) * 100
# missing_percentage_test

In [54]:
# Split dependent and independent variables

y_train = raw_train_df['class']
raw_train_df = raw_train_df.drop('class', axis=1)
y_val = validation_df['class']
validation_df = validation_df.drop('class', axis=1)

raw_train_df.head(2)

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
748879,2.45,x,g,n,f,a,,p,7.12,2.3,w,f,f,d,u
589395,2.71,b,,k,f,a,,k,7.78,2.33,g,f,f,g,u


In [55]:
validation_df.head(2)

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1706883,1.4,x,g,y,f,d,d,n,1.56,2.43,n,f,f,d,a
1554994,5.35,f,h,g,f,s,d,w,4.75,9.63,g,f,f,d,u


In [56]:
# Identify numerical and categorical columns
numerical_cols = raw_train_df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = raw_train_df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    print("Unique Value is each categorical column :")
    print(col, raw_train_df[col].nunique())

Unique Value is each categorical column :
cap-shape 61
Unique Value is each categorical column :
cap-surface 70
Unique Value is each categorical column :
cap-color 60
Unique Value is each categorical column :
does-bruise-or-bleed 24
Unique Value is each categorical column :
gill-attachment 69
Unique Value is each categorical column :
gill-spacing 44
Unique Value is each categorical column :
gill-color 54
Unique Value is each categorical column :
stem-color 47
Unique Value is each categorical column :
has-ring 21
Unique Value is each categorical column :
ring-type 33
Unique Value is each categorical column :
habitat 48
Unique Value is each categorical column :
season 4


In [None]:
# Define the imputer for numerical columns (median imputation)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define a function to apply LabelEncoder to each categorical column
def encode_labels(df, columns):
    df_encoded = df.copy()
    le_dict = {}
    for col in columns:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])
        le_dict[col] = le
    return df_encoded, le_dict

# Define the encoder for categorical columns using LabelEncoder
def encode_labels_pipeline(X):
    df = pd.DataFrame(X, columns=categorical_cols)
    df_encoded, le_dict = encode_labels(df, categorical_cols)
    return df_encoded.values

# Create a ColumnTransformer for numerical and categorical transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', FunctionTransformer(func=lambda X: encode_labels_pipeline(X), validate=False), categorical_cols)
    ],
    remainder='passthrough'
)

# Fit the preprocessor on the training data
preprocessor.fit(raw_train_df)

# Transform both the training, validation, and test data
train_df_transformed = preprocessor.transform(raw_train_df)
validation_df_transformed = preprocessor.transform(validation_df)
test_df_transformed = preprocessor.transform(test_df)

print("Transformed Training Data Shape:", train_df_transformed.shape)
print("Transformed Validation Data Shape:", validation_df_transformed.shape)
print("Transformed Test Data Shape:", test_df_transformed.shape)

logger.info(f"Missing values and categorical columns treatment completed. Time elapsed: {time.time() - start_time:.2f} seconds")

In [None]:
# Combine feature names
all_feature_names = numerical_cols.tolist() + categorical_cols.tolist()

# Convert to DataFrame
train_df_transformed = pd.DataFrame(train_df_transformed, columns=all_feature_names)
validation_df_transformed = pd.DataFrame(validation_df_transformed, columns=all_feature_names)
test_df_transformed = pd.DataFrame(test_df_transformed, columns=all_feature_names)

# # Check for missing values
# missing_values_train = train_df_transformed.isnull().sum()

# # Calculate the percentage of missing values
# missing_values_train = (missing_values_train / len(train_df_transformed)) * 100
# missing_values_train

In [None]:
# # Check for missing values

# missing_values_test = test_df_transformed.isnull().sum()

# # Calculate the percentage of missing values

# missing_percentage_test = (missing_values_test / len(test_df_transformed)) * 100
# missing_percentage_test

In [None]:
train_df_transformed.head(2)

In [None]:
validation_df_transformed.head(2)

In [None]:
test_df_transformed.head(2)

In [None]:
# Feature Selection
def feature_selection(X_train, y_train, model):
    with tf.device('/GPU:0'):
        selector = SelectFromModel(model, threshold='mean')
        selector.fit(X_train, y_train)
    
    return selector.transform(X_train), selector.get_support()

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Ridge Classifier': RidgeClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'LightGBM': lgb.LGBMClassifier()
}

# Define parameter grids for RandomizedSearchCV
param_grids = {
    'Logistic Regression': {
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
        'penalty': ['l2', 'none'],  # 'elasticnet' is not supported by 'lbfgs'
        'C': uniform(0.001, 1000),  # Inverse of regularization strength
        'max_iter': [100, 200, 300]
    },
    'Ridge Classifier': {
        'alpha': uniform(0.1, 10),  # Regularization strength
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'],
        'fit_intercept': [True, False],
        'max_iter': [100, 200, 300]
    },
    'Decision Tree': {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': randint(3, 20),
        'min_samples_split': uniform(0.01, 0.1),
        'min_samples_leaf': uniform(0.01, 0.1),
        'min_weight_fraction_leaf': uniform(0.0, 0.1),
        'max_features': ['auto', 'sqrt', 'log2', None],
        'max_leaf_nodes': randint(10, 50),
        'min_impurity_decrease': uniform(0.0, 0.1),
        'class_weight': [None, 'balanced']
    },
    'Bagging Classifier': {
        'base_estimator': [DecisionTreeClassifier(), None],  # Default is DecisionTreeClassifier
        'n_estimators': randint(10, 100),
        'max_samples': uniform(0.5, 0.5),
        'max_features': uniform(0.5, 0.5),
        'bootstrap': [True, False],
        'bootstrap_features': [True, False],
        'oob_score': [True, False],
        'n_jobs': [None, -1],
        'random_state': [42]
    },
    'Random Forest': {
        'n_estimators': randint(50, 300),
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, randint(3, 20)],
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'max_features': ['auto', 'sqrt', 'log2', uniform(0.5, 0.5)],
        'bootstrap': [True, False],
        'oob_score': [True, False],
        'n_jobs': [None, -1],
        'random_state': [42],
        'verbose': [0, 1],
        'warm_start': [True, False],
        'class_weight': [None, 'balanced']
    },
    'AdaBoost Classifier': {
        'base_estimator': [None, DecisionTreeClassifier(max_depth=1)],
        'n_estimators': randint(50, 300),
        'learning_rate': uniform(0.01, 1.0),
        'algorithm': ['SAMME', 'SAMME.R'],
        'random_state': [42]
    },
    'Gradient Boosting': {
        'n_estimators': randint(50, 300),
        'learning_rate': uniform(0.01, 0.5),
        'max_depth': randint(3, 10),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'max_features': ['sqrt', 'log2', None, uniform(0.1, 0.9)],
        'subsample': uniform(0.7, 0.3),
        'criterion': ['friedman_mse', 'mse', 'mae'],
        'random_state': [42],
        'verbose': [0, 1]
    },
    'SVC': {
        'C': uniform(0.001, 1000),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': randint(1, 5),
        'gamma': uniform(0.001, 1.0),
        'coef0': uniform(0.0, 1.0),
        'shrinking': [True, False],
        'probability': [True, False],
        'tol': uniform(1e-5, 1e-1),
        'cache_size': randint(100, 2000),
        'class_weight': [None, 'balanced'],
        'max_iter': [1000, 2000, -1],
        'random_state': [42]
    },
    'KNN': {
        'n_neighbors': randint(3, 20),
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2],
        'leaf_size': randint(20, 60),
        'n_jobs': [None, -1]
    },
    'XGBoost': {
        'n_estimators': randint(100, 300),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.1),
        'subsample': uniform(0.7, 0.9),
        'colsample_bytree': uniform(0.7, 0.9),
        'gamma': uniform(0, 0.2),
        'min_child_weight': randint(1, 10),
        'scale_pos_weight': uniform(1, 10),
        'reg_alpha': uniform(0, 0.2),
        'reg_lambda': uniform(0, 0.2),
        'objective': ['binary:logistic'],
        'random_state': [42]
    },
    'LightGBM': {
        'n_estimators': randint(100, 300),
        'learning_rate': uniform(0.01, 0.1),
        'num_leaves': randint(31, 127),
        'max_depth': randint(-1, 10),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'reg_alpha': uniform(0, 0.2),
        'reg_lambda': uniform(0, 0.2),
        'boosting_type': ['gbdt', 'dart', 'rf'],
        'metric': ['binary_logloss', 'auc'],
        'random_state': [42]
    }
}

# Define the objective function for Optuna
def objective(trial, model_name, model, param_grid, X_train, y_train, X_val, y_val):
    with tf.device('/GPU:0'):
        if model_name == 'XGBoost':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 300),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
                'subsample': trial.suggest_uniform('subsample', 0.7, 0.9),
                'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 0.9),
                'gamma': trial.suggest_uniform('gamma', 0, 0.2),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 0.2),
                'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 0.2),
                'objective': 'binary:logistic'
            }
        elif model_name == 'LightGBM':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 300),
                'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
                'num_leaves': trial.suggest_int('num_leaves', 31, 127),
                'max_depth': trial.suggest_int('max_depth', -1, 10),
                'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
                'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 0.2),
                'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 0.2),
                'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'rf']),
                'metric': trial.suggest_categorical('metric', ['binary_logloss', 'auc'])
            }

        model.set_params(**params)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        score = matthews_corrcoef(y_val, preds)
    
    return score

# Train and tune models
def train_and_tune_models(models, param_grids, X_train, y_train, X_val, y_val):
    with tf.device('/GPU:0'):
        best_models = {}
        for name, model in models.items():
            logger.info(f"Starting hyperparameter tuning for {name}")
            param_grid = param_grids[name]
            if name in ['XGBoost', 'LightGBM']:
                study = optuna.create_study(direction='maximize', sampler=TPESampler())
                study.optimize(lambda trial: objective(trial, name, model, param_grid, X_train, y_train, X_val, y_val), n_trials=50)
                best_params = study.best_params
                model.set_params(**best_params)
            else:
                random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=50, cv=5, verbose=1, n_jobs=-1, scoring='matthews_corrcoef')
                random_search.fit(X_train, y_train)
                best_params = random_search.best_params_
                model.set_params(**best_params)
            
            model.fit(X_train, y_train)
            preds = model.predict(X_val)
            score = matthews_corrcoef(y_val, preds)
            logger.info(f"{name} best score: {score:.4f} with params: {best_params}")
            best_models[name] = (model, best_params)
        
    return best_models

In [None]:
# # Encode the target variable
# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_val_encoded = label_encoder.transform(y_val)

In [None]:
# Best models
with tf.device('/GPU:0'):
    best_models = train_and_tune_models(models, param_grids, train_df_transformed, y_train, validation_df_transformed, y_val)

logger.info(f"Best models found. Time elapsed: {time.time() - start_time:.2f} seconds")

In [None]:
# INFO:__main__:Evaluating Logistic Regression# 
INFO:__main__:Logistic Regression - Best Params: {'C': 374.54111884736244, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}# 
INFO:__main__:Logistic Regression - MCC Score: 0.232# 7
INFO:__main__:Time elapsed: 5055.08 secon# ds
INFO:__main__:Evaluating Ridge Classif# ier
INFO:__main__:Ridge Classifier - Best Params: {'alpha': 3.845401188473625, 'fit_intercept': True, 'max_iter': 300, 'solver': 'choles# ky'}
INFO:__main__:Ridge Classifier - MCC Score: 0# .2332
INFO:__main__:Time elapsed: 5342.57 s# econds
INFO:__main__:Evaluating Decisi# on Tree
INFO:__main__:Decision Tree - Best Params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 17, 'max_features': 'log2', 'max_leaf_nodes': 17, 'min_impurity_decrease': 0.05986584841970366, 'min_samples_leaf': 0.025601864044243652, 'min_samples_split': 0.025599452033620268, 'min_weight_fraction_leaf': 0.005808361216819946, 'splitter': '# random'}
INFO:__main__:Decision Tree - MCC Scor# e: 0.0000
INFO:__main__:Time elapsed: 5536.# 35 seconds
INFO:__main__:Evaluating Bagging#  Classifier
INFO:__main__:Bagging Classifier - Best Params: {'base_estimator': DecisionTreeClassifier(), 'bootstrap': False, 'bootstrap_features': True, 'max_features': 0.5917173949330818, 'max_samples': 0.8898455001363846, 'n_estimators': 30, 'n_jobs': None, 'oob_score': False, 'random# _state': 42}
INFO:__main__:Bagging Classifier - MCC # Score: 0.9804
INFO:__main__:Time elapsed: 13# 779.59 seconds
INFO:__main__:Evaluatin# g Random Forest
INFO:__main__:Random Forest - Best Params: {'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 171, 'n_jobs': None, 'oob_score': True, 'random_state': 42, 'verbose': 0, 'wa# rm_start': True}
INFO:__main__:Random Forest - # MCC Score: 0.9818
INFO:__main__:Time elapsed# : 23825.43 seconds
INFO:__main__:Evaluating # AdaBoost Classifier
INFO:__main__:AdaBoost Classifier - Best Params: {'algorithm': 'SAMME', 'base_estimator': DecisionTreeClassifier(max_depth=1), 'learning_rate': 0.9607143064099162, 'n_estimators': 156,#  'random_state': 42}
INFO:__main__:AdaBoost Classifie# r - MCC Score: 0.5079
INFO:__main__:Time ela# psed: 54563.87 seconds
INFO:__main__:Evaluating Gradient Boosting

In [None]:
# Evaluate best models on test data
def evaluate_best_models(best_models, X_test, y_test):
    with tf.device('/GPU:0'):
        results = {}
        for name, (model, _) in best_models.items():
            preds = model.predict(X_test)
            accuracy = accuracy_score(y_test, preds)
            auc_score = roc_auc_score(y_test, preds)
            conf_matrix = confusion_matrix(y_test, preds)
            class_report = classification_report(y_test, preds)
            results[name] = {
                'accuracy': accuracy,
                'auc_score': auc_score,
                'conf_matrix': conf_matrix,
                'class_report': class_report
            }
            logger.info(f"Evaluation results for {name}:")
            logger.info(f"Accuracy: {accuracy:.4f}")
            logger.info(f"AUC Score: {auc_score:.4f}")
            logger.info(f"Confusion Matrix:\n{conf_matrix}")
            logger.info(f"Classification Report:\n{class_report}")
        
    return results

In [None]:
with tf.device('/GPU:0'):
    evaluation_results = evaluate_best_models(best_models, test_df_transformed, test_df['class'])

logger.info(f"Model Evaluation Completed. Time elapsed: {time.time() - start_time:.2f} seconds")

In [None]:
# Save the best models
def save_best_models(best_models, filename_prefix):
    for name, (model, params) in best_models.items():
        filename = f"{filename_prefix}_{name.replace(' ', '_')}.pkl"
        with open(filename, 'wb') as f:
            pickle.dump({'model': model, 'params': params}, f)
        logger.info(f"Saved {name} model to {filename}")

In [None]:
save_best_models(best_models, 'best_model')

logger.info(f"Total time elapsed: {time.time() - start_time:.2f} seconds")