### Import Libraries

In [1]:
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import xgboost as xgb
# import copy
# import gc
# import math
# import pickle

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from sklearn.base import clone
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer
from scipy.stats import uniform, randint
from xgboost import XGBClassifier

import warnings 
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
start_time = time.time()

### Data Loading

In [2]:
train_df = pd.read_csv('train.csv', index_col='id')
test_df = pd.read_csv('test.csv', index_col='id')
# train_data['is_train'] = 1
# test_data['is_train'] = 0

# Combine train and test data for processing
# combined_df = pd.concat([train_data, test_data])
# del train_data, test_data

In [3]:
train_df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [4]:
test_df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11504798 entries, 0 to 11504797
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Gender                object 
 1   Age                   int64  
 2   Driving_License       int64  
 3   Region_Code           float64
 4   Previously_Insured    int64  
 5   Vehicle_Age           object 
 6   Vehicle_Damage        object 
 7   Annual_Premium        float64
 8   Policy_Sales_Channel  float64
 9   Vintage               int64  
 10  Response              int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 1.0+ GB


In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7669866 entries, 11504798 to 19174663
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Gender                object 
 1   Age                   int64  
 2   Driving_License       int64  
 3   Region_Code           float64
 4   Previously_Insured    int64  
 5   Vehicle_Age           object 
 6   Vehicle_Damage        object 
 7   Annual_Premium        float64
 8   Policy_Sales_Channel  float64
 9   Vintage               int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 643.7+ MB


In [7]:
column_names = train_df.columns.tolist()

for i in column_names:
    print(i, train_df[i].nunique(), 'unique values')

Gender 2 unique values
Age 66 unique values
Driving_License 2 unique values
Region_Code 54 unique values
Previously_Insured 2 unique values
Vehicle_Age 3 unique values
Vehicle_Damage 2 unique values
Annual_Premium 51728 unique values
Policy_Sales_Channel 152 unique values
Vintage 290 unique values
Response 2 unique values


### Split the data into train and validation

In [8]:
# Since we have only one data set, spliting it into train and test (validation)

raw_train_df, validation_df = train_test_split(train_df, train_size = 0.75, random_state = 1, stratify = train_df['Response'])

logger.info(f"Data split completed. Time elapsed: {time.time() - start_time:.2f} seconds")

raw_train_df.head(2)

INFO:__main__:Data split completed. Time elapsed: 33.87 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6400262,Female,26,1,28.0,0,< 1 Year,No,54497.0,26.0,234,0
8095698,Female,25,1,30.0,1,< 1 Year,No,38748.0,152.0,131,0


In [9]:
validation_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6517611,Male,44,1,28.0,0,1-2 Year,Yes,2630.0,157.0,91,0
1591313,Female,23,1,14.0,1,< 1 Year,No,35345.0,152.0,272,0


In [10]:
raw_train_df.shape

(8628598, 11)

In [11]:
validation_df.shape

(2876200, 11)

In [12]:
def transform_categorical_features(df):
    gender_map = {'Male': 0, 'Female': 1}
    vehicle_age_map = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    vehicle_damage_map = {'No': 0, 'Yes': 1}
    
    df['Gender'] = df['Gender'].map(gender_map)
    df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_map)
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage_map)
    
    return df

def adjust_data_types(df):
    df['Region_Code'] = df['Region_Code'].astype(int)
    df['Annual_Premium'] = df['Annual_Premium'].astype(int)
    df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype(int)
    
    return df

def create_additional_features(df):
    df['Prev_Insured_Annual_Premium'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Annual_Premium'].astype(str))[0]
    df['Prev_Insured_Vehicle_Age'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vehicle_Age'].astype(str))[0]
    df['Prev_Insured_Vehicle_Damage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vehicle_Damage'].astype(str))[0]
    df['Prev_Insured_Vintage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vintage'].astype(str))[0]
    
    return df

def optimize_memory_usage(df):
    start_mem_usage = df.memory_usage().sum() / 1024 ** 2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type.name in ['category', 'object']:
            raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

        c_min = df[col].min()
        c_max = df[col].max()
        
        if str(col_type)[:3] == 'int':
            
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
                
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
                
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
                
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        
        else:
        
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            
            else:
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    print(f'------ Memory usage before: {start_mem_usage:.2f} MB')
    print(f'------ Memory usage after: {end_mem_usage:.2f} MB')
    print(f'------ Reduced memory usage by {(100 * (start_mem_usage - end_mem_usage) / start_mem_usage):.1f}%')
    print('**********************' * 5)

    return df

def apply_scaling(df, scaler_type, columns):

    if scaler_type == 'S':
        scaler = StandardScaler() 
    
    elif scaler_type == 'M':
        scaler = MinMaxScaler()  
    
    elif scaler_type == 'R':
        scaler = RobustScaler()  
    
    elif scaler_type == 'A':
        scaler = MaxAbsScaler() 
    
    elif scaler_type == 'Q':
        scaler = QuantileTransformer(output_distribution='normal') 
    
    elif scaler_type == 'P':
        scaler = PowerTransformer() 
    
    else:
        raise ValueError("Invalid scaler type. Choose 'S' for StandardScaler, 'M' for MinMaxScaler, 'R' for RobustScaler, 'A' for MaxAbsScaler,'Q' for QuantileTransformer, or 'P' for PowerTransformer.")

    scaled_data = df.copy()

    for col in columns:
        scaled_data[col] = scaler.fit_transform(scaled_data[[col]])

    return scaled_data


In [13]:
raw_train_df = transform_categorical_features(raw_train_df)
validation_df = transform_categorical_features(validation_df)
test_df = transform_categorical_features(test_df)

raw_train_df = adjust_data_types(raw_train_df)
validation_df = adjust_data_types(validation_df)
test_df = adjust_data_types(test_df)

raw_train_df = create_additional_features(raw_train_df)
validation_df = create_additional_features(validation_df)
test_df = create_additional_features(test_df)

raw_train_df = optimize_memory_usage(raw_train_df)
validation_df = optimize_memory_usage(validation_df)
test_df = optimize_memory_usage(test_df)

------ Memory usage before: 954.55 MB
------ Memory usage after: 263.32 MB
------ Reduced memory usage by 72.4%
**************************************************************************************************************
------ Memory usage before: 318.18 MB
------ Memory usage after: 87.77 MB
------ Reduced memory usage by 72.4%
**************************************************************************************************************
------ Memory usage before: 789.97 MB
------ Memory usage after: 226.75 MB
------ Reduced memory usage by 71.3%
**************************************************************************************************************


In [14]:
raw_train_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Prev_Insured_Annual_Premium,Prev_Insured_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6400262,1,26,1,28,0,0,0,54497,26,234,0,0,0,0,0
8095698,1,25,1,30,1,0,0,38748,152,131,0,1,1,1,1


In [15]:
validation_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Prev_Insured_Annual_Premium,Prev_Insured_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6517611,0,44,1,28,0,1,1,2630,157,91,0,0,0,0,0
1591313,1,23,1,14,1,0,0,35345,152,272,0,1,1,1,1


In [16]:
test_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Prev_Insured_Annual_Premium,Prev_Insured_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11504798,1,20,1,47,0,0,0,2630,160,228,0,0,0,0
11504799,0,47,1,28,0,1,1,37483,124,123,1,1,1,1


In [17]:
raw_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8628598 entries, 6400262 to 8402201
Data columns (total 15 columns):
 #   Column                       Dtype
---  ------                       -----
 0   Gender                       int8 
 1   Age                          int8 
 2   Driving_License              int8 
 3   Region_Code                  int8 
 4   Previously_Insured           int8 
 5   Vehicle_Age                  int8 
 6   Vehicle_Damage               int8 
 7   Annual_Premium               int32
 8   Policy_Sales_Channel         int16
 9   Vintage                      int16
 10  Response                     int8 
 11  Prev_Insured_Annual_Premium  int32
 12  Prev_Insured_Vehicle_Age     int8 
 13  Prev_Insured_Vehicle_Damage  int8 
 14  Prev_Insured_Vintage         int16
dtypes: int16(3), int32(2), int8(10)
memory usage: 263.3 MB


In [18]:
raw_train_df.describe()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Prev_Insured_Annual_Premium,Prev_Insured_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage
count,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0
mean,0.4587254,38.389,0.9980113,26.41771,0.4630153,0.6032037,0.5027108,30461.89,112.4161,163.8887,0.1229973,16655.22,1.856109,1.468437,264.5105
std,0.4982935,14.99678,0.04455088,12.99227,0.4986303,0.5678678,0.4999927,16444.75,54.03797,79.97808,0.3284341,17908.24,1.168019,0.6439283,162.5876
min,0.0,20.0,0.0,0.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,24.0,1.0,15.0,0.0,0.0,0.0,25279.0,29.0,99.0,0.0,1816.0,1.0,1.0,120.0
50%,0.0,36.0,1.0,28.0,0.0,1.0,1.0,31826.0,151.0,166.0,0.0,10783.0,2.0,2.0,254.0
75%,1.0,49.0,1.0,35.0,1.0,1.0,1.0,39454.0,152.0,232.0,0.0,25926.0,3.0,2.0,395.0
max,1.0,85.0,1.0,52.0,1.0,2.0,1.0,540165.0,163.0,299.0,1.0,92980.0,5.0,3.0,579.0


In [19]:
scaler_type = 'S'
columns_to_scale_xgb = ['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage','Prev_Insured_Annual_Premium', 'Prev_Insured_Vintage']

# Trying with Standard Scaler

ssc_train_df = apply_scaling(raw_train_df, scaler_type, columns_to_scale_xgb)
ssc_validation_df = apply_scaling(validation_df, scaler_type, columns_to_scale_xgb)
ssc_test_df = apply_scaling(test_df, scaler_type, columns_to_scale_xgb)

logger.info(f"Data scaling completed. Time elapsed: {time.time() - start_time:.2f} seconds")

ssc_train_df.head(2)

INFO:__main__:Data scaling completed. Time elapsed: 106.46 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Prev_Insured_Annual_Premium,Prev_Insured_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6400262,1,-0.826111,1,28,0,0,0,1.461568,-1.599175,0.876632,0,-0.930031,0,0,-1.62688
8095698,1,-0.892792,1,30,1,0,0,0.503876,0.732519,-0.411221,0,-0.929975,1,1,-1.620729


In [20]:
ssc_validation_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Prev_Insured_Annual_Premium,Prev_Insured_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6517611,0,0.375931,1,28,0,1,1,-1.688222,0.824496,-0.911745,0,-0.939053,0,0,-1.664407
1591313,1,-1.025614,1,14,1,0,0,0.296346,0.731953,1.351211,0,-0.938996,1,1,-1.658269


In [21]:
ssc_test_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Prev_Insured_Annual_Premium,Prev_Insured_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11504798,1,-1.226132,1,47,0,0,0,-1.692555,0.880929,0.801411,-0.929854,0,0,-1.613145
11504799,0,0.573928,1,28,0,1,1,0.426701,0.21517,-0.511344,-0.929798,1,1,-1.607013


### Splitting dependent and independent variable

In [22]:
# Splitting dependent and independent variable

ssc_x_train = ssc_train_df.drop(['Response'], axis = 1)
ssc_y_train = ssc_train_df['Response']

ssc_x_val = ssc_validation_df.drop(['Response'], axis = 1)
ssc_y_val = ssc_validation_df['Response']

# Model training and evaluation

In [23]:
# Train the final model with the best hyperparameters from Bayesian Optimization

ssc_best_xgb_bayes = XGBClassifier(
    n_estimators = int(297.0),
    max_depth = int(10.0),
    learning_rate = 0.09963346243555755,
    subsample = 0.8302721292642807,
    colsample_bytree = 0.8010665164356681,
    gamma = 0.07140222781940667,
    min_child_weight = int(9.0),
    reg_alpha = 0.0022293880023777244,
    reg_lambda = 1.5050229982288488,
    scale_pos_weight = int(1.0),
    max_delta_step = int(6.0),
    colsample_bylevel = 0.9951929930551984,
    colsample_bynode = 0.9897401686365959,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
ssc_best_xgb_bayes.fit(ssc_x_train, ssc_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

ssc_y_pred_proba_bayes = ssc_best_xgb_bayes.predict_proba(ssc_x_val)[:, 1]
ssc_test_roc_auc_bayes = roc_auc_score(ssc_y_val, ssc_y_pred_proba_bayes)
logger.info(f"Test AUROC Score with Bayesian Optimization (SSC): {ssc_test_roc_auc_bayes}")


INFO:__main__:Final model training completed. Time elapsed: 287.38 seconds
INFO:__main__:Test AUROC Score with Bayesian Optimization (SSC): 0.8572648026440974


In [24]:
# Train the final model with the best hyperparameters from Random Search CV

ssc_best_xgb_random = XGBClassifier(
    n_estimators = int(289),
    max_depth = int(9),
    learning_rate = 0.09925589984899778,
    subsample = 0.8001040753990633,
    colsample_bytree = 0.960734415379823,
    gamma = 0.03731401177720717,
    min_child_weight = int(9),
    reg_alpha = 0.011005192452767676,
    reg_lambda = 1.4558703250838834,
    scale_pos_weight = int(2),
    max_delta_step = int(9),
    colsample_bylevel = 0.9266807513020847,
    colsample_bynode = 0.9742921180375436,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
ssc_best_xgb_random.fit(ssc_x_train, ssc_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

ssc_y_pred_proba_random = ssc_best_xgb_random.predict_proba(ssc_x_val)[:, 1]
ssc_test_roc_auc_random = roc_auc_score(ssc_y_val, ssc_y_pred_proba_random)
logger.info(f"Test AUROC Score with Random Search CV (SSC): {ssc_test_roc_auc_random}")

INFO:__main__:Final model training completed. Time elapsed: 282.73 seconds
INFO:__main__:Test AUROC Score with Random Search CV (SSC): 0.8539285771197189


In [25]:
scaler_type = 'M'
columns_to_scale_xgb = ['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage','Prev_Insured_Annual_Premium', 'Prev_Insured_Vintage']

# Trying with Standard Scaler

mmx_train_df = apply_scaling(raw_train_df, scaler_type, columns_to_scale_xgb)
mmx_validation_df = apply_scaling(validation_df, scaler_type, columns_to_scale_xgb)
mmx_test_df = apply_scaling(test_df, scaler_type, columns_to_scale_xgb)

logger.info(f"Data scaling completed. Time elapsed: {time.time() - start_time:.2f} seconds")

mmx_train_df.head(2)

INFO:__main__:Data scaling completed. Time elapsed: 296.31 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Prev_Insured_Annual_Premium,Prev_Insured_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6400262,1,0.092308,1,28,0,0,0,0.09649,0.154321,0.775087,0,0.0,0,0,0.0
8095698,1,0.076923,1,30,1,0,0,0.067192,0.932099,0.418685,0,1.1e-05,1,1,0.001727


In [26]:
# Splitting dependent and independent variable

mmx_x_train = mmx_train_df.drop(['Response'], axis = 1)
mmx_y_train = mmx_train_df['Response']

mmx_x_val = mmx_validation_df.drop(['Response'], axis = 1)
mmx_y_val = mmx_validation_df['Response']

In [27]:
# Train the final model with the best hyperparameters from Bayesian Optimization

mmx_best_xgb_bayes = XGBClassifier(
    n_estimators = int(297.0),
    max_depth = int(10.0),
    learning_rate = 0.09963346243555755,
    subsample = 0.8302721292642807,
    colsample_bytree = 0.8010665164356681,
    gamma = 0.07140222781940667,
    min_child_weight = int(9.0),
    reg_alpha = 0.0022293880023777244,
    reg_lambda = 1.5050229982288488,
    scale_pos_weight = int(1.0),
    max_delta_step = int(6.0),
    colsample_bylevel = 0.9951929930551984,
    colsample_bynode = 0.9897401686365959,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
mmx_best_xgb_bayes.fit(mmx_x_train, mmx_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

mmx_y_pred_proba_bayes = mmx_best_xgb_bayes.predict_proba(mmx_x_val)[:, 1]
mmx_test_roc_auc_bayes = roc_auc_score(mmx_y_val, mmx_y_pred_proba_bayes)
logger.info(f"Test AUROC Score with Bayesian Optimization (MMX): {mmx_test_roc_auc_bayes}")


INFO:__main__:Final model training completed. Time elapsed: 314.02 seconds
INFO:__main__:Test AUROC Score with Bayesian Optimization (MMX): 0.8691466639006543


In [28]:
# Train the final model with the best hyperparameters from Random Search CV

mmx_best_xgb_random = XGBClassifier(
    n_estimators = int(289),
    max_depth = int(9),
    learning_rate = 0.09925589984899778,
    subsample = 0.8001040753990633,
    colsample_bytree = 0.960734415379823,
    gamma = 0.03731401177720717,
    min_child_weight = int(9),
    reg_alpha = 0.011005192452767676,
    reg_lambda = 1.4558703250838834,
    scale_pos_weight = int(2),
    max_delta_step = int(9),
    colsample_bylevel = 0.9266807513020847,
    colsample_bynode = 0.9742921180375436,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
mmx_best_xgb_random.fit(mmx_x_train, mmx_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

mmx_y_pred_proba_random = mmx_best_xgb_random.predict_proba(mmx_x_val)[:, 1]
mmx_test_roc_auc_random = roc_auc_score(mmx_y_val, mmx_y_pred_proba_random)
logger.info(f"Test AUROC Score with Random Search CV (MMX): {mmx_test_roc_auc_random}")


INFO:__main__:Final model training completed. Time elapsed: 292.57 seconds
INFO:__main__:Test AUROC Score with Random Search CV (MMX): 0.865920936890239


In [29]:
scaler_type = 'R'
columns_to_scale_xgb = ['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage','Prev_Insured_Annual_Premium', 'Prev_Insured_Vintage']

# Trying with Standard Scaler

rbst_train_df = apply_scaling(raw_train_df, scaler_type, columns_to_scale_xgb)
rbst_validation_df = apply_scaling(validation_df, scaler_type, columns_to_scale_xgb)
rbst_test_df = apply_scaling(test_df, scaler_type, columns_to_scale_xgb)

logger.info(f"Data scaling completed. Time elapsed: {time.time() - start_time:.2f} seconds")

rbst_train_df.head(2)

INFO:__main__:Data scaling completed. Time elapsed: 310.52 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Prev_Insured_Annual_Premium,Prev_Insured_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6400262,1,-0.4,1,28,0,0,0,1.599365,-1.01626,0.511278,0,-0.447242,0,0,-0.923636
8095698,1,-0.44,1,30,1,0,0,0.488325,0.00813,-0.263158,0,-0.4472,1,1,-0.92


In [30]:
# Splitting dependent and independent variable

rbst_x_train = rbst_train_df.drop(['Response'], axis = 1)
rbst_y_train = rbst_train_df['Response']

rbst_x_val = rbst_validation_df.drop(['Response'], axis = 1)
rbst_y_val = rbst_validation_df['Response']

In [31]:
# Train the final model with the best hyperparameters from Bayesian Optimization

rbst_best_xgb_bayes = XGBClassifier(
    n_estimators = int(297.0),
    max_depth = int(10.0),
    learning_rate = 0.09963346243555755,
    subsample = 0.8302721292642807,
    colsample_bytree = 0.8010665164356681,
    gamma = 0.07140222781940667,
    min_child_weight = int(9.0),
    reg_alpha = 0.0022293880023777244,
    reg_lambda = 1.5050229982288488,
    scale_pos_weight = int(1.0),
    max_delta_step = int(6.0),
    colsample_bylevel = 0.9951929930551984,
    colsample_bynode = 0.9897401686365959,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
rbst_best_xgb_bayes.fit(rbst_x_train, rbst_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

rbst_y_pred_proba_bayes = rbst_best_xgb_bayes.predict_proba(rbst_x_val)[:, 1]
rbst_test_roc_auc_bayes = roc_auc_score(rbst_y_val, rbst_y_pred_proba_bayes)
logger.info(f"Test AUROC Score with Bayesian Optimization (RBST): {rbst_test_roc_auc_bayes}")


INFO:__main__:Final model training completed. Time elapsed: 303.95 seconds
INFO:__main__:Test AUROC Score with Bayesian Optimization (RBST): 0.8684009207540879


In [32]:
# Train the final model with the best hyperparameters from Random Search CV

rbst_best_xgb_random = XGBClassifier(
    n_estimators = int(289),
    max_depth = int(9),
    learning_rate = 0.09925589984899778,
    subsample = 0.8001040753990633,
    colsample_bytree = 0.960734415379823,
    gamma = 0.03731401177720717,
    min_child_weight = int(9),
    reg_alpha = 0.011005192452767676,
    reg_lambda = 1.4558703250838834,
    scale_pos_weight = int(2),
    max_delta_step = int(9),
    colsample_bylevel = 0.9266807513020847,
    colsample_bynode = 0.9742921180375436,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
rbst_best_xgb_random.fit(rbst_x_train, rbst_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

rbst_y_pred_proba_random = rbst_best_xgb_random.predict_proba(rbst_x_val)[:, 1]
rbst_test_roc_auc_random = roc_auc_score(rbst_y_val, rbst_y_pred_proba_random)
logger.info(f"Test AUROC Score with Random Search CV (RBST): {rbst_test_roc_auc_random}")


INFO:__main__:Final model training completed. Time elapsed: 302.74 seconds
INFO:__main__:Test AUROC Score with Random Search CV (RBST): 0.8657278073025539
