# Kaggle Playground - Season 4 Episode 
## Binary Classification of Insurance Cross Selling

Competion link - https://www.kaggle.com/competitions/playground-series-s4e8

### Steps
- Import the necessary libraries, packages and modules
- Read the datsets as data framers

### Understand the problem

- class is the target variable
- It determines the class of a mushroom depending on the given variables

In [4]:
# Import the necessary libraries, packages and modules

import warnings
warnings.filterwarnings('ignore')

# import dtale    # Use of a web progrm to analysis the data deeply
import lightgbm as lgb
import logging
import matplotlib.pyplot as plt
import numpy as np
import optuna
import os
import pandas as pd
import pickle
import seaborn as sns
# import statsmodels.api as sm
import tensorflow as tf
import time
import xgboost as xgb
# import zipfile

from imblearn.over_sampling import RandomOverSampler
from optuna.samplers import TPESampler
#from pandas_profiling import ProfileReport
from scipy.stats import randint, uniform
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, HistGradientBoostingClassifier)
from sklearn.feature_selection import chi2, RFE, SelectKBest, SelectFromModel  
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# from skopt import BayesSearchCV
from xgboost import XGBClassifier

sns.set()
%matplotlib inline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
start_time = time.time()

In [5]:
# Test to see if TensorFlow can utilize the GPU
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [6]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

logger.info(f"Train data load completed. Time elapsed: {time.time() - start_time:.2f} seconds")

train_df.head(2)

INFO:__main__:Train data load completed. Time elapsed: 4.09 seconds


Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w


In [7]:
test_df.head(2)

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,3116945,8.64,x,,n,t,,,w,11.13,...,b,,w,u,w,t,g,,d,a
1,3116946,6.9,o,t,o,f,,c,y,1.27,...,,,n,,,f,f,,d,a


### Identify the target variable and features

- class is the target variable
- It determines the class of a mushroom depending on the given variables

### Remove duplicate rows

- Checked the sum of duplicated rows in train and test datasets
- No dupllicated rows in train dataframe

### Handling missing values

- Checked the missing values in column
- There are considerable amount of missing values in many columns

In [8]:
train_df.shape

(3116945, 22)

In [9]:
test_df.shape

(2077964, 21)

In [10]:
# Since we have only one data set, spliting it into train and test (validation)

train_df_split, validation_df = train_test_split(train_df, train_size = 0.75, random_state = 42, stratify = train_df['class'])
logger.info(f"Train test split completed. Time elapsed: {time.time() - start_time:.2f} seconds")
train_df_split.head(2)

INFO:__main__:Train test split completed. Time elapsed: 6.54 seconds


Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
2419543,2419543,e,7.1,s,s,o,f,d,d,b,...,,,w,,,f,f,,g,w
910367,910367,p,7.11,f,,y,f,x,c,n,...,,,w,,,f,f,,d,a


### Data Preprocessing
- Droping columns with more than 50% missing values
- Using simple imputer
- Encoding Categorical Variables

In [11]:
# Drop columns with extremely high missing values
columns_to_drop = ['id', 'veil-type', 'spore-print-color', 'stem-root', 'veil-color', 'stem-surface']
train_df_split.drop(columns=columns_to_drop, inplace=True)
validation_df.drop(columns=columns_to_drop, inplace=True)
test_df.drop(columns=columns_to_drop, inplace=True)

In [12]:
# Spliting dependent and independent valriable

y_train = train_df_split['class']
train_df_split = train_df_split.drop('class', axis = 1)

y_val = validation_df['class']
validation_df = validation_df.drop('class', axis = 1)

train_df_split.head(2)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
2419543,7.1,s,s,o,f,d,d,b,6.28,12.75,w,f,f,g,w
910367,7.11,f,,y,f,x,c,n,6.64,10.39,w,f,f,d,a


In [13]:
# Identify numerical and categorical columns
numerical_cols = train_df_split.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = train_df_split.select_dtypes(include=['object']).columns

for col in categorical_cols:
    print("Unique Value is each categorical column :")
    print(col, train_df[col].nunique())

Unique Value is each categorical column :
cap-shape 74
Unique Value is each categorical column :
cap-surface 83
Unique Value is each categorical column :
cap-color 78
Unique Value is each categorical column :
does-bruise-or-bleed 26
Unique Value is each categorical column :
gill-attachment 78
Unique Value is each categorical column :
gill-spacing 48
Unique Value is each categorical column :
gill-color 63
Unique Value is each categorical column :
stem-color 59
Unique Value is each categorical column :
has-ring 23
Unique Value is each categorical column :
ring-type 40
Unique Value is each categorical column :
habitat 52
Unique Value is each categorical column :
season 4


In [14]:
le = LabelEncoder()

# Define the imputer and scaler for numerical columns (median imputation and standard scaling)
def preprocess_numerical_data(X, numerical_cols):
    # Impute missing values with median
    imputer = SimpleImputer(strategy='median')
    X_numerical_imputed = imputer.fit_transform(X[numerical_cols])
    
    # Scale the numerical data
    scaler = StandardScaler()
    X_numerical_scaled = scaler.fit_transform(X_numerical_imputed)
    
    return pd.DataFrame(X_numerical_scaled, columns=numerical_cols)

# Define a function to apply LabelEncoder to each categorical column
def encode_labels(df, columns):
    df_encoded = df.copy()
    le_dict = {}
    for col in columns:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])
        le_dict[col] = le
    return df_encoded, le_dict

# Define the function to transform categorical columns using LabelEncoder
def encode_labels_transform(X, categorical_cols):
    df = pd.DataFrame(X, columns=categorical_cols)
    df_encoded, _ = encode_labels(df, categorical_cols)
    return df_encoded.values

# Define the function to preprocess the data
def preprocess_data(X, numerical_cols, categorical_cols):
    # Transform numerical columns
    num_transformed = preprocess_numerical_data(X, numerical_cols)
    
    # Transform categorical columns
    cat_transformed = encode_labels_transform(X, categorical_cols)
    
    # Combine transformed numerical and categorical columns
    X_transformed = pd.concat([num_transformed, pd.DataFrame(cat_transformed, columns=categorical_cols)], axis=1)
    return X_transformed

# Assuming you have your train and test dataframes
train_df_transformed = preprocess_data(train_df_split, numerical_cols, categorical_cols)
validation_df_transformed = preprocess_data(validation_df, numerical_cols, categorical_cols)
test_df_transformed = preprocess_data(test_df, numerical_cols, categorical_cols)

print("Transformed Training Data Shape:", train_df_transformed.shape)
print("Transformed Validation Data Shape:", validation_df_transformed.shape)
print("Transformed Test Data Shape:", test_df_transformed.shape)

logger.info(f"Missing values and categorical columns treatment completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Missing values and categorical columns treatment completed. Time elapsed: 15.85 seconds


Transformed Training Data Shape: (2337708, 15)
Transformed Validation Data Shape: (779237, 15)
Transformed Test Data Shape: (2077964, 15)


In [15]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on the training labels
label_encoder.fit(y_train)

# Transform the labels for training and validation datasets
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)

In [16]:
# Combine feature names
all_feature_names = numerical_cols.tolist() + categorical_cols.tolist()

# Convert to DataFrame
train_df_transformed = pd.DataFrame(train_df_transformed, columns=all_feature_names)
validation_df_transformed = pd.DataFrame(validation_df_transformed, columns=all_feature_names)
test_df_transformed = pd.DataFrame(test_df_transformed, columns=all_feature_names)

In [17]:
def optimize_memory_usage(df):
    start_mem_usage = df.memory_usage().sum() / 1024 ** 2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type.name in ['category', 'object']:
            raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

        c_min = df[col].min()
        c_max = df[col].max()
        
        if str(col_type)[:3] == 'int':
            
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
                
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
                
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
                
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        
        else:
        
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            
            else:
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    print(f'------ Memory usage before: {start_mem_usage:.2f} MB')
    print(f'------ Memory usage after: {end_mem_usage:.2f} MB')
    print(f'------ Reduced memory usage by {(100 * (start_mem_usage - end_mem_usage) / start_mem_usage):.1f}%')
    print('**********************' * 5)

    return df

In [18]:
train_df_transformed = optimize_memory_usage(train_df_transformed)
validation_df_transformed = optimize_memory_usage(validation_df_transformed)
test_df_transformed = optimize_memory_usage(test_df_transformed)

------ Memory usage before: 160.52 MB
------ Memory usage after: 40.13 MB
------ Reduced memory usage by 75.0%
**************************************************************************************************************
------ Memory usage before: 53.51 MB
------ Memory usage after: 13.38 MB
------ Reduced memory usage by 75.0%
**************************************************************************************************************
------ Memory usage before: 142.68 MB
------ Memory usage after: 35.67 MB
------ Reduced memory usage by 75.0%
**************************************************************************************************************


In [19]:
train_df_transformed.head(2)

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-color,has-ring,ring-type,habitat,season
0,0.169434,-0.025772,0.196899,53,62,57,8,36,21,23,42,4,13,21,3
1,0.171509,0.107544,-0.094604,40,73,69,8,61,18,40,42,4,13,18,0


In [20]:
validation_df_transformed.head(2)

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-color,has-ring,ring-type,habitat,season
0,-0.830078,0.042816,-1.05957,36,17,21,5,12,27,30,34,3,11,11,2
1,-0.218262,-0.242676,-0.580078,36,29,21,5,31,27,20,26,3,11,11,3


In [21]:
test_df_transformed.head(2)

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-color,has-ring,ring-type,habitat,season
0,0.498047,1.772461,0.737305,59,59,44,18,66,35,52,51,17,15,16,0
1,0.126709,-1.880859,-0.049194,50,53,45,5,66,17,54,38,6,14,16,0


In [28]:
# Feature Selection
def feature_selection(X_train, y_train, model):
    if hasattr(model, 'coef_') or hasattr(model, 'feature_importances_'):
        selector = SelectFromModel(model, threshold='mean')
        selector.fit(X_train, y_train)
        return selector.transform(X_train), selector.get_support()
    else:
        raise ValueError("Feature selection not supported for this model.")

def alternative_feature_selection(X_train, y_train):
    selector = SelectKBest(score_func=f_classif, k='all')
    X_train_selected = selector.fit_transform(X_train, y_train)
    return X_train_selected, selector.get_support()

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Ridge Classifier': RidgeClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVC': SVC(),
 #  'KNN': KNeighborsClassifier(),
    'XGBoost': xgb.XGBClassifier(tree_method='gpu_hist'),
    'LightGBM': lgb.LGBMClassifier(device='gpu')
}

# Define parameter grids for RandomizedSearchCV
param_grids = {
    'Logistic Regression': {
        'penalty': ['l2', 'l1', 'elasticnet', 'none'],
        'C': np.logspace(-4, 4, 20),
        'solver': ['lbfgs', 'liblinear', 'saga'],
        'l1_ratio': np.linspace(0, 1, 10)  # Added this line
    },
    'Ridge Classifier': {
        'alpha': uniform(0.1, 10),  # Regularization strength
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'],
        'fit_intercept': [True, False],
        'max_iter': [100, 200, 300],
        'tol': uniform(1e-4, 1e-2)
    },
    'Decision Tree': {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': randint(3, 20),
        'min_samples_split': uniform(0.01, 0.1),  # Random float between 0.01 and 0.1
        'min_samples_leaf': uniform(0.01, 0.1),  # Random float between 0.01 and 0.1
        'min_weight_fraction_leaf': uniform(0.0, 0.1),  # Random float between 0.0 and 0.1
        'max_features': ['auto', 'sqrt', 'log2', None],
        'max_leaf_nodes': randint(10, 50),
        'min_impurity_decrease': uniform(0.0, 0.1),
        'class_weight': [None, 'balanced']
    },
    'Bagging Classifier': {
        'estimator': [DecisionTreeClassifier(), None],  # Default is DecisionTreeClassifier
        'n_estimators': randint(10, 100),
        'max_samples': uniform(0.5, 1.0),  # Random float between 0.5 and 1.0
        'max_features': uniform(0.5, 1.0),  # Random float between 0.5 and 1.0
        'bootstrap': [True, False],
        'bootstrap_features': [True, False],
        'oob_score': [True, False],
        'n_jobs': [None, -1],
        'random_state': [42]
    },
    'Random Forest': {
        'n_estimators': randint(50, 300),
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, randint(3, 20)],
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'max_features': ['auto', 'sqrt', 'log2', uniform(0.5, 1.0)],
        'bootstrap': [True, False],
        'oob_score': [True, False],
        'n_jobs': [None, -1],
        'random_state': [42],
        'verbose': [0, 1],
        'warm_start': [True, False],
        'class_weight': [None, 'balanced']
    },
    'AdaBoost Classifier': {
        'base_estimator': [None, DecisionTreeClassifier(max_depth=1)],
        'n_estimators': randint(50, 300),
        'learning_rate': uniform(0.01, 1.0),
        'algorithm': ['SAMME', 'SAMME.R'],
        'random_state': [42]
    },
    'Gradient Boosting': {
        'n_estimators': randint(50, 300),
        'learning_rate': uniform(0.01, 0.5),
        'max_depth': randint(3, 10),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'max_features': ['sqrt', 'log2', None, uniform(0.1, 0.9)],
        'subsample': uniform(0.5, 1.0),
        'criterion': ['friedman_mse', 'squared_error', 'mae'],
        'random_state': [42],
        'verbose': [0, 1]
    },
    'SVC': {
        'C': uniform(0.1, 10),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': randint(2, 5),
        'gamma': ['scale', 'auto', uniform(0.001, 1)],
        'coef0': uniform(0, 10),
        'shrinking': [True, False],
        'probability': [True, False],
        'tol': uniform(1e-5, 1e-1),
        'cache_size': uniform(50, 500),
        'class_weight': [None, 'balanced'],
        'verbose': [True, False],
        'max_iter': [-1, 100, 200],
        'decision_function_shape': ['ovr', 'ovo'],
        'break_ties': [True, False],
        'random_state': [42]
    },
    'KNN': {
        'n_neighbors': randint(1, 30),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': randint(1, 3),
        'metric': ['minkowski', 'euclidean', 'manhattan', 'chebyshev', 'hamming'],
        'leaf_size': randint(10, 50),
        'n_jobs': [None, -1]
    },
    'XGBoost': {
        'n_estimators': randint(100, 300),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.3),
        'subsample': uniform(0.5, 1.0),
        'colsample_bytree': uniform(0.5, 1.0),
        'gamma': uniform(0, 0.5),
        'min_child_weight': randint(1, 10),
        'reg_alpha': uniform(0, 1),
        'reg_lambda': uniform(0, 1),
        'scale_pos_weight': uniform(1, 10),
        'max_delta_step': randint(0, 10),
        'colsample_bylevel': uniform(0.5, 1.0),
        'colsample_bynode': uniform(0.5, 1.0)
    },
    'LightGBM': {
        'num_leaves': randint(20, 100),
        'max_depth': randint(3, 15),
        'learning_rate': uniform(0.01, 0.3),
        'n_estimators': randint(100, 300),
        'min_child_samples': randint(10, 100),
        'min_split_gain': uniform(0, 0.5),
        'subsample': uniform(0.5, 1.0),
        'subsample_freq': randint(1, 10),
        'colsample_bytree': uniform(0.5, 1.0),
        'colsample_bylevel': uniform(0.5, 1.0),
        'reg_alpha': uniform(0, 1),
        'reg_lambda': uniform(0, 1),
        'scale_pos_weight': uniform(1, 10)
    }
}

# Optuna objective function for XGBoost and LightGBM with regularization
def objective(trial, model_name, model, X_train, y_train, X_val, y_val):
    if model_name == 'XGBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_uniform('gamma', 0, 0.5),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 1),
            'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 1, 10),
            'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
            'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 1.0),
            'colsample_bynode': trial.suggest_uniform('colsample_bynode', 0.5, 1.0)
        }
    elif model_name == 'LightGBM':
        params = {
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
            'min_split_gain': trial.suggest_uniform('min_split_gain', 0, 0.5),
            'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
            'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
            'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 1.0),
            'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 1),
            'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 1, 10)
        }
    model.set_params(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    score = matthews_corrcoef(y_val, preds)
    return score

# Hyperparameter tuning using RandomizedSearchCV
def tune_model(model_name, model, param_grid, X_train, y_train):
    search = RandomizedSearchCV(
        model, param_distributions=param_grid, n_iter=10, scoring='f1', cv=3, random_state=42
    )
    search.fit(X_train, y_train)
    print(f"{model_name} Best Parameters (Random Search): {search.best_params_}")  # Print best parameters
    return search.best_estimator_, search.best_params_

# Hyperparameter tuning using Optuna for XGBoost and LightGBM
def optuna_tune_model(model_name, model, X_train, y_train, X_val, y_val):
    study = optuna.create_study(direction='maximize', sampler=TPESampler())
    study.optimize(lambda trial: objective(trial, model_name, model, X_train, y_train, X_val, y_val), n_trials=50)
    print(f"{model_name} Best Parameters (Optuna): {study.best_params}")  # Print best parameters
    return study.best_params

# Train and evaluate models
def evaluate_models(X_train, y_train, X_val, y_val):
    best_models = {}
    for name, model in models.items():
        logger.info(f"Evaluating {name}")

        # Feature Selection
        if hasattr(model, 'coef_') or hasattr(model, 'feature_importances_'):
            logger.info(f"Performing feature selection for {name}")
            X_train_selected, support = feature_selection(X_train, y_train, model)
            X_val_selected = X_val.iloc[:, support]
        else:
            logger.info(f"Skipping feature selection for {name} due to lack of support")
            X_train_selected = X_train
            X_val_selected = X_val

        if name in ['XGBoost', 'LightGBM']:
            # Perform Random Search
            best_model, best_params_random = tune_model(name, model, param_grids[name], X_train_selected, y_train)
            train_preds_random = best_model.predict(X_train_selected)
            val_preds_random = best_model.predict(X_val_selected)
            train_score_random = matthews_corrcoef(y_train, train_preds_random)
            val_score_random = matthews_corrcoef(y_val, val_preds_random)
            logger.info(f"{name} (Random Search) - Train MCC Score: {train_score_random:.4f}")
            logger.info(f"{name} (Random Search) - Validation MCC Score: {val_score_random:.4f}")
            
            # Perform Bayesian Search (Optuna)
            params_optuna = optuna_tune_model(name, model, X_train_selected, y_train, X_val_selected, y_val)
            model.set_params(**params_optuna)
            model.fit(X_train_selected, y_train)
            train_preds_optuna = model.predict(X_train_selected)
            val_preds_optuna = model.predict(X_val_selected)
            train_score_optuna = matthews_corrcoef(y_train, train_preds_optuna)
            val_score_optuna = matthews_corrcoef(y_val, val_preds_optuna)
            logger.info(f"{name} (Optuna) - Train MCC Score: {train_score_optuna:.4f}")
            logger.info(f"{name} (Optuna) - Validation MCC Score: {val_score_optuna:.4f}")
            logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")
            
        else:
            # Hyperparameter tuning using RandomizedSearchCV
            best_model, best_params = tune_model(name, model, param_grids[name], X_train_selected, y_train)
            model = best_model
            train_preds = model.predict(X_train_selected)
            val_preds = model.predict(X_val_selected)
            train_score = matthews_corrcoef(y_train, train_preds)
            val_score = matthews_corrcoef(y_val, val_preds)
            logger.info(f"{name} - Train MCC Score: {train_score:.4f}")
            logger.info(f"{name} - Validation MCC Score: {val_score:.4f}")
            logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        best_models[name] = model

    return best_models

# Testing best models on the test set
def test_best_models(best_models, X_train, y_train, X_val, y_val):
    results = {}
    for name, model in best_models.items():
        # Training predictions and score
        train_preds = model.predict(X_train)
        train_score = matthews_corrcoef(y_train, train_preds)

        # Validation predictions and score
        val_preds = model.predict(X_val)
        val_score = matthews_corrcoef(y_val, val_preds)

        # Logging the scores
        logger.info(f"{name} - Train MCC Score: {train_score:.4f}")
        logger.info(f"{name} - Validation MCC Score: {val_score:.4f}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        # Storing the results
        results[name] = {
            'train_score': train_score,
            'val_score': val_score,
            #'test_score': test_score
        }
    return results

In [29]:
# Evaluate models
best_models = evaluate_models(train_df_transformed, y_train, validation_df_transformed, y_val)

logger.info(f"Model evaluation completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Evaluating Logistic Regression
INFO:__main__:Skipping feature selection for Logistic Regression due to lack of support


Logistic Regression Best Parameters (Random Search): {'solver': 'saga', 'penalty': 'elasticnet', 'l1_ratio': 0.1111111111111111, 'C': 0.08858667904100823}


INFO:__main__:Logistic Regression - Train MCC Score: 0.2333
INFO:__main__:Logistic Regression - Validation MCC Score: 0.1315
INFO:__main__:Time elapsed: 455.78 seconds
INFO:__main__:Evaluating Ridge Classifier
INFO:__main__:Skipping feature selection for Ridge Classifier due to lack of support


Ridge Classifier Best Parameters (Random Search): {'alpha': 3.845401188473625, 'fit_intercept': True, 'max_iter': 300, 'solver': 'cholesky', 'tol': 0.007896910002727693}


INFO:__main__:Ridge Classifier - Train MCC Score: 0.2337
INFO:__main__:Ridge Classifier - Validation MCC Score: 0.1350
INFO:__main__:Time elapsed: 482.59 seconds
INFO:__main__:Evaluating Decision Tree
INFO:__main__:Skipping feature selection for Decision Tree due to lack of support


Decision Tree Best Parameters (Random Search): {'class_weight': None, 'criterion': 'entropy', 'max_depth': 17, 'max_features': 'log2', 'max_leaf_nodes': 17, 'min_impurity_decrease': 0.05986584841970366, 'min_samples_leaf': 0.025601864044243652, 'min_samples_split': 0.025599452033620268, 'min_weight_fraction_leaf': 0.005808361216819946, 'splitter': 'random'}


INFO:__main__:Decision Tree - Train MCC Score: 0.0000
INFO:__main__:Decision Tree - Validation MCC Score: 0.0000
INFO:__main__:Time elapsed: 501.62 seconds
INFO:__main__:Evaluating Bagging Classifier
INFO:__main__:Skipping feature selection for Bagging Classifier due to lack of support


KeyboardInterrupt: 

In [None]:
best_models

In [None]:
results = test_best_models(best_models, train_df_transformed, y_train, validation_df_transformed, y_val)

In [None]:
# Initialize logging
logging.info("Starting cross-validation process")
logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

# Sort the models by their validation scores in descending order
sorted_test_models = sorted(results.items(), key=lambda item: item[1]['val_score'], reverse=True)

# Select the top 5 models based on validation scores
five_best_models = {model_name: scores for model_name, scores in sorted_test_models[:5]}

# Output the best five models based on validation scores
for model_name, scores in five_best_models.items():
    print(f"Model: {model_name}, Validation MCC Score: {scores['val_score']}, Train MCC Score: {scores['train_score']}")

# The best_five_test_models dictionary now contains the top five models based on validation data performance


In [None]:
# logger.info("Starting cross-validation process")
# logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

# # Dictionary of the five best models based on the provided results
# five_best_models = {
#     'Bagging Classifier': BaggingClassifier(base_estimator=None, bootstrap=False,
#                                             max_features=0.6039708314340944,
#                                             max_samples=0.7838501639099957, n_estimators=72, n_jobs=-1,
#                                             random_state=42),
#     'Random Forest': RandomForestClassifier(class_weight='balanced', max_features='log2',
#                                             min_samples_leaf=8, min_samples_split=8,
#                                             n_estimators=171, oob_score=True, random_state=42,
#                                             warm_start=True),
#     'XGBoost': XGBClassifier(base_score=None, booster=None, callbacks=None,
#                              colsample_bylevel=0.7324201991368923,
#                              colsample_bynode=0.6984067959652569,
#                              colsample_bytree=0.6585770133495129, device=None,
#                              early_stopping_rounds=None, enable_categorical=False,
#                              eval_metric=None, feature_types=None, gamma=0.3990553580730866,
#                              grow_policy=None, importance_type=None,
#                              interaction_constraints=None, learning_rate=0.23958788518583604,
#                              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
#                              max_delta_step=5, max_depth=9, max_leaves=None,
#                              min_child_weight=8, monotone_constraints=None,
#                              multi_strategy=None, n_estimators=202, n_jobs=None,
#                              num_parallel_tree=None, random_state=None),
#     'LightGBM': LGBMClassifier(colsample_bytree=0.7085181455028883, device='gpu',
#                                learning_rate=0.09554089088242068, max_depth=11,
#                                min_child_samples=26, min_split_gain=0.18636874352935062,
#                                n_estimators=233, num_leaves=117, reg_alpha=0.19897165336160308,
#                                reg_lambda=0.13427322639962427,
#                                scale_pos_weight=1.0239900845521561, subsample=0.595271788428719,
#                                subsample_freq=8),
#     'Gradient Boosting': GradientBoostingClassifier(criterion='squared_error',
#                                                     learning_rate=0.1009124836035503, max_depth=7,
#                                                     max_features='sqrt', min_samples_leaf=12,
#                                                     min_samples_split=13, n_estimators=138,
#                                                     random_state=42, subsample=0.645614570099021)
# }

In [None]:
test_df_sub = pd.read_csv('test.csv')
test_df_sub.head(2)

In [None]:
# Define the Train_ML function
def Train_ML(Model, X, y, test_data):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    train_scores = []
    val_scores = []
    test_predictions = []

    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
        # Handle indexing based on the type of X and y
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        else:
            X_train, X_val = X[train_index], X[val_index]
        
        if isinstance(y, pd.Series):
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        else:
            y_train, y_val = y[train_index], y[val_index]
        
        Model.fit(X_train, y_train)

        y_train_pred = Model.predict(X_train)
        train_mcc = matthews_corrcoef(y_train, y_train_pred)
        train_scores.append(train_mcc)

        y_val_pred = Model.predict(X_val)
        val_mcc = matthews_corrcoef(y_val, y_val_pred)
        val_scores.append(val_mcc)
        
        y_test_pred_proba = Model.predict(test_data)
        test_predictions.append(y_test_pred_proba)

        print(f"Fold {fold}: Train MCC = {train_mcc:.6f}, Validation MCC = {val_mcc:.6f}")
        logger.info(f"Fold {fold}: Train MCC = {train_mcc:.6f}, Validation MCC = {val_mcc:.6f}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

    mean_train_mcc = np.mean(train_scores)
    mean_val_mcc = np.mean(val_scores)

    print(f"\nMean Train MCC: {mean_train_mcc:.6f}")
    print(f"Mean Validation MCC: {mean_val_mcc:.6f}")
    logger.info(f"Mean Train MCC: {mean_train_mcc:.6f}, Mean Validation MCC: {mean_val_mcc:.6f}")
    logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

    return Model, test_predictions

In [None]:
# Perform cross-validation, fit on the entire training data, and predict for each model
for model_name, model in five_best_models.items():
    try:
        logger.info(f"Starting model training and cross-validation for {model_name}")
        trained_model, test_preds = Train_ML(model, train_df_transformed, y_train, test_df_transformed)

        # Averaging predictions across all folds
        final_test_preds = np.mean(test_preds, axis=0)
        final_test_preds_binary = (final_test_preds >= 0.5).astype(int)
        
        # Inverse transform the predictions to get the original class labels
        predictions = label_encoder.inverse_transform(final_test_preds_binary)

        # Log the prediction output
        logger.info(f"{model_name} - Test predictions done")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        # Output predictions to a CSV file
        output_df = pd.DataFrame({'id': test_df_sub['id'], 'class': predictions})
        output_df.to_csv(f'Submission_01A_Dropped_Simple_{model_name}.csv', index=False)
        logger.info(f"Generated output file for {model_name}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        logger.error(f"An error occurred with {model_name}: {e}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

# Print completion message
print("Predictions for all five models have been saved to individual CSV files.")
logger.info("Predictions for all five models have been saved to individual CSV files.")
logger.info(f"Total Time elapsed: {time.time() - start_time:.2f} seconds")

In [None]:
# Define MCC as the scoring metric
mcc_scorer = make_scorer(matthews_corrcoef)

# Perform cross-validation, fit on the entire training data, and predict for each model
predictions = {}
for model_name, model in five_best_models.items():
    try:
        logging.info(f"Performing cross-validation for {model_name}")
        scores = cross_val_score(model, train_df_transformed, y_train, cv=5, scoring=mcc_scorer, n_jobs=-1)
        logging.info(f"{model_name} - CV Scores: {scores}")
        logging.info(f"{model_name} - Mean CV Score: {np.mean(scores)}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        # Fit the model on the entire training data
        model.fit(train_df_transformed, y_train)
    
        # Predict the output for test_df_transformed
        preds = model.predict(test_df_transformed)

        # Inverse transform the predictions to get the original class labels
        predictions[model_name] = label_encoder.inverse_transform(preds)

        # Log the prediction output
        logging.info(f"{model_name} - Test predictions done for {model_name}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        # Output predictions to a CSV file
        output_df = pd.DataFrame({'id': test_df_sub['id'], 'class': predictions[model_name]})
        output_df.to_csv(f'Submission_01A(2)_Dropped_Simple_{model_name}.csv', index=False)
        print(output_df.head(2))
        logger.info(f"Generated output file - Time elapsed: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        logger.error(f"An error occurred with {model_name}: {e}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

# Print completion message
print("Predictions for all five models have been saved to individual CSV files.")
print("Predictions for all five models have been saved to individual CSV files.")
logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")