### Import Libraries

In [53]:
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import xgboost as xgb
# import copy
# import gc
# import math
# import pickle

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer
from scipy.stats import uniform, randint
from xgboost import XGBClassifier

import warnings 
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
start_time = time.time()

### Data Loading

In [2]:
train_df = pd.read_csv('train.csv', index_col='id')
test_df = pd.read_csv('test.csv', index_col='id')
# train_data['is_train'] = 1
# test_data['is_train'] = 0

# Combine train and test data for processing
# combined_df = pd.concat([train_data, test_data])
# del train_data, test_data

In [3]:
train_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1


In [4]:
test_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123


In [5]:
column_names = train_df.columns.tolist()

for i in column_names:
    print(i, train_df[i].nunique(), 'unique values')

Gender 2 unique values
Age 66 unique values
Driving_License 2 unique values
Region_Code 54 unique values
Previously_Insured 2 unique values
Vehicle_Age 3 unique values
Vehicle_Damage 2 unique values
Annual_Premium 51728 unique values
Policy_Sales_Channel 152 unique values
Vintage 290 unique values
Response 2 unique values


### Split the data into train and validation

In [6]:
# Since we have only one data set, spliting it into train and test (validation)

raw_train_df, validation_df = train_test_split(train_df, train_size = 0.75, random_state = 1, stratify = train_df['Response'])

logger.info(f"Data split completed. Time elapsed: {time.time() - start_time:.2f} seconds")

raw_train_df.head(2)

INFO:__main__:Data split completed. Time elapsed: 34.44 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6400262,Female,26,1,28.0,0,< 1 Year,No,54497.0,26.0,234,0
8095698,Female,25,1,30.0,1,< 1 Year,No,38748.0,152.0,131,0


In [7]:
validation_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6517611,Male,44,1,28.0,0,1-2 Year,Yes,2630.0,157.0,91,0
1591313,Female,23,1,14.0,1,< 1 Year,No,35345.0,152.0,272,0


In [8]:
raw_train_df.shape

(8628598, 11)

In [9]:
validation_df.shape

(2876200, 11)

In [10]:
def transform_categorical_features(df):
    gender_map = {'Male': 0, 'Female': 1}
    vehicle_age_map = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    vehicle_damage_map = {'No': 0, 'Yes': 1}
    
    df['Gender'] = df['Gender'].map(gender_map)
    df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_map)
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage_map)
    
    return df

def adjust_data_types(df):
    df['Region_Code'] = df['Region_Code'].astype(int)
    df['Annual_Premium'] = df['Annual_Premium'].astype(int)
    df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype(int)
    
    return df

def create_additional_features(df):
    df['Prev_Insured_Annual_Premium'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Annual_Premium'].astype(str))[0]
    df['Prev_Insured_Vehicle_Age'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vehicle_Age'].astype(str))[0]
    df['Prev_Insured_Vehicle_Damage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vehicle_Damage'].astype(str))[0]
    df['Prev_Insured_Vintage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vintage'].astype(str))[0]
    
    return df

def optimize_memory_usage(df):
    start_mem_usage = df.memory_usage().sum() / 1024 ** 2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type.name in ['category', 'object']:
            raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

        c_min = df[col].min()
        c_max = df[col].max()
        
        if str(col_type)[:3] == 'int':
            
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
                
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
                
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
                
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        
        else:
        
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            
            else:
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    print(f'------ Memory usage before: {start_mem_usage:.2f} MB')
    print(f'------ Memory usage after: {end_mem_usage:.2f} MB')
    print(f'------ Reduced memory usage by {(100 * (start_mem_usage - end_mem_usage) / start_mem_usage):.1f}%')
    print('**********************' * 5)

    return df

def apply_scaling(df, scaler_type, columns):

    if scaler_type == 'S':
        scaler = StandardScaler() 
    
    elif scaler_type == 'M':
        scaler = MinMaxScaler()  
    
    elif scaler_type == 'R':
        scaler = RobustScaler()  
    
    elif scaler_type == 'A':
        scaler = MaxAbsScaler() 
    
    elif scaler_type == 'Q':
        scaler = QuantileTransformer(output_distribution='normal') 
    
    elif scaler_type == 'P':
        scaler = PowerTransformer() 
    
    else:
        raise ValueError("Invalid scaler type. Choose 'S' for StandardScaler, 'M' for MinMaxScaler, 'R' for RobustScaler, 'A' for MaxAbsScaler,'Q' for QuantileTransformer, or 'P' for PowerTransformer.")

    scaled_data = df.copy()

    for col in columns:
        scaled_data[col] = scaler.fit_transform(scaled_data[[col]])

    return scaled_data


In [11]:
raw_train_df = transform_categorical_features(raw_train_df)
validation_df = transform_categorical_features(validation_df)
test_df = transform_categorical_features(test_df)

raw_train_df = adjust_data_types(raw_train_df)
validation_df = adjust_data_types(validation_df)
test_df = adjust_data_types(test_df)

# raw_train_df = create_additional_features(raw_train_df)
# validation_df = create_additional_features(validation_df)
# test_df = create_additional_features(test_df)

raw_train_df = optimize_memory_usage(raw_train_df)
validation_df = optimize_memory_usage(validation_df)
test_df = optimize_memory_usage(test_df)

------ Memory usage before: 691.23 MB
------ Memory usage after: 197.49 MB
------ Reduced memory usage by 71.4%
**************************************************************************************************************
------ Memory usage before: 230.41 MB
------ Memory usage after: 65.83 MB
------ Reduced memory usage by 71.4%
**************************************************************************************************************
------ Memory usage before: 555.91 MB
------ Memory usage after: 168.23 MB
------ Reduced memory usage by 69.7%
**************************************************************************************************************


In [12]:
raw_train_df.describe()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0,8628598.0
mean,0.4587254,38.389,0.9980113,26.41771,0.4630153,0.6032037,0.5027108,30461.89,112.4161,163.8887,0.1229973
std,0.4982935,14.99678,0.04455088,12.99227,0.4986303,0.5678678,0.4999927,16444.75,54.03797,79.97808,0.3284341
min,0.0,20.0,0.0,0.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,0.0,24.0,1.0,15.0,0.0,0.0,0.0,25279.0,29.0,99.0,0.0
50%,0.0,36.0,1.0,28.0,0.0,1.0,1.0,31826.0,151.0,166.0,0.0
75%,1.0,49.0,1.0,35.0,1.0,1.0,1.0,39454.0,152.0,232.0,0.0
max,1.0,85.0,1.0,52.0,1.0,2.0,1.0,540165.0,163.0,299.0,1.0


In [13]:
scaler_type = 'S'
columns_to_scale_xgb = ['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']

# Trying with Standard Scaler

ssc_train_df = apply_scaling(raw_train_df, scaler_type, columns_to_scale_xgb)
ssc_validation_df = apply_scaling(validation_df, scaler_type, columns_to_scale_xgb)
ssc_test_df = apply_scaling(test_df, scaler_type, columns_to_scale_xgb)

logger.info(f"Data scaling completed. Time elapsed: {time.time() - start_time:.2f} seconds")

ssc_train_df.head(2)

INFO:__main__:Data scaling completed. Time elapsed: 43.59 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6400262,1,-0.826111,1,28,0,0,0,1.461568,-1.599175,0.876632,0
8095698,1,-0.892792,1,30,1,0,0,0.503876,0.732519,-0.411221,0


In [14]:
ssc_validation_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6517611,0,0.375931,1,28,0,1,1,-1.688222,0.824496,-0.911745,0
1591313,1,-1.025614,1,14,1,0,0,0.296346,0.731953,1.351211,0


In [15]:
ssc_test_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11504798,1,-1.226132,1,47,0,0,0,-1.692555,0.880929,0.801411
11504799,0,0.573928,1,28,0,1,1,0.426701,0.21517,-0.511344


### Splitting dependent and independent variable

In [16]:
# Splitting dependent and independent variable

ssc_x_train = ssc_train_df.drop(['Response'], axis = 1)
ssc_y_train = ssc_train_df['Response']

ssc_x_val = ssc_validation_df.drop(['Response'], axis = 1)
ssc_y_val = ssc_validation_df['Response']

# Model training and evaluation

In [17]:
# Train the final model with the best hyperparameters from Bayesian Optimization

ssc_best_xgb_bayes = XGBClassifier(
    n_estimators = int(297.0),
    max_depth = int(10.0),
    learning_rate = 0.09963346243555755,
    subsample = 0.8302721292642807,
    colsample_bytree = 0.8010665164356681,
    gamma = 0.07140222781940667,
    min_child_weight = int(9.0),
    reg_alpha = 0.0022293880023777244,
    reg_lambda = 1.5050229982288488,
    scale_pos_weight = int(1.0),
    max_delta_step = int(6.0),
    colsample_bylevel = 0.9951929930551984,
    colsample_bynode = 0.9897401686365959,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
ssc_best_xgb_bayes.fit(ssc_x_train, ssc_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

ssc_y_pred_proba_bayes = ssc_best_xgb_bayes.predict_proba(ssc_x_val)[:, 1]
ssc_test_roc_auc_bayes = roc_auc_score(ssc_y_val, ssc_y_pred_proba_bayes)
logger.info(f"Test AUROC Score with Bayesian Optimization (SSC): {ssc_test_roc_auc_bayes}")


INFO:__main__:Final model training completed. Time elapsed: 213.36 seconds
INFO:__main__:Test AUROC Score with Bayesian Optimization (SSC): 0.8549994174560647


In [18]:
xgb_predict_ssc_bayes = ssc_best_xgb_bayes.predict_proba(ssc_test_df)[:,1]
submission7 = pd.DataFrame({'id' : ssc_test_df.index, 'Response' : xgb_predict_ssc_bayes})
submission7

Unnamed: 0,id,Response
0,11504798,0.013699
1,11504799,0.247635
2,11504800,0.265235
3,11504801,0.000159
4,11504802,0.027709
...,...,...
7669861,19174659,0.143669
7669862,19174660,0.000107
7669863,19174661,0.000365
7669864,19174662,0.633787


In [19]:
submission7.to_csv('submission7.csv', index=False)

In [20]:
# Train the final model with the best hyperparameters from Random Search CV

ssc_best_xgb_random = XGBClassifier(
    n_estimators = int(289),
    max_depth = int(9),
    learning_rate = 0.09925589984899778,
    subsample = 0.8001040753990633,
    colsample_bytree = 0.960734415379823,
    gamma = 0.03731401177720717,
    min_child_weight = int(9),
    reg_alpha = 0.011005192452767676,
    reg_lambda = 1.4558703250838834,
    scale_pos_weight = int(2),
    max_delta_step = int(9),
    colsample_bylevel = 0.9266807513020847,
    colsample_bynode = 0.9742921180375436,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
ssc_best_xgb_random.fit(ssc_x_train, ssc_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

ssc_y_pred_proba_random = ssc_best_xgb_random.predict_proba(ssc_x_val)[:, 1]
ssc_test_roc_auc_random = roc_auc_score(ssc_y_val, ssc_y_pred_proba_random)
logger.info(f"Test AUROC Score with Random Search CV (SSC): {ssc_test_roc_auc_random}")

INFO:__main__:Final model training completed. Time elapsed: 238.97 seconds
INFO:__main__:Test AUROC Score with Random Search CV (SSC): 0.8554677598355671


In [21]:
xgb_predict_ssc_random = ssc_best_xgb_random.predict_proba(ssc_test_df)[:,1]
submission8 = pd.DataFrame({'id' : ssc_test_df.index, 'Response' : xgb_predict_ssc_random})
submission8

Unnamed: 0,id,Response
0,11504798,0.022558
1,11504799,0.409528
2,11504800,0.401819
3,11504801,0.000267
4,11504802,0.045931
...,...,...
7669861,19174659,0.261597
7669862,19174660,0.000183
7669863,19174661,0.000639
7669864,19174662,0.769702


In [22]:
submission8.to_csv('submission8.csv', index=False)

In [23]:
scaler_type = 'M'
columns_to_scale_xgb = ['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']

# Trying with Standard Scaler

mmx_train_df = apply_scaling(raw_train_df, scaler_type, columns_to_scale_xgb)
mmx_validation_df = apply_scaling(validation_df, scaler_type, columns_to_scale_xgb)
mmx_test_df = apply_scaling(test_df, scaler_type, columns_to_scale_xgb)

logger.info(f"Data scaling completed. Time elapsed: {time.time() - start_time:.2f} seconds")

mmx_train_df.head(2)

INFO:__main__:Data scaling completed. Time elapsed: 292.00 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6400262,1,0.092308,1,28,0,0,0,0.09649,0.154321,0.775087,0
8095698,1,0.076923,1,30,1,0,0,0.067192,0.932099,0.418685,0


In [24]:
# Splitting dependent and independent variable

mmx_x_train = mmx_train_df.drop(['Response'], axis = 1)
mmx_y_train = mmx_train_df['Response']

mmx_x_val = mmx_validation_df.drop(['Response'], axis = 1)
mmx_y_val = mmx_validation_df['Response']

In [25]:
# Train the final model with the best hyperparameters from Bayesian Optimization

mmx_best_xgb_bayes = XGBClassifier(
    n_estimators = int(297.0),
    max_depth = int(10.0),
    learning_rate = 0.09963346243555755,
    subsample = 0.8302721292642807,
    colsample_bytree = 0.8010665164356681,
    gamma = 0.07140222781940667,
    min_child_weight = int(9.0),
    reg_alpha = 0.0022293880023777244,
    reg_lambda = 1.5050229982288488,
    scale_pos_weight = int(1.0),
    max_delta_step = int(6.0),
    colsample_bylevel = 0.9951929930551984,
    colsample_bynode = 0.9897401686365959,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
mmx_best_xgb_bayes.fit(mmx_x_train, mmx_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

mmx_y_pred_proba_bayes = mmx_best_xgb_bayes.predict_proba(mmx_x_val)[:, 1]
mmx_test_roc_auc_bayes = roc_auc_score(mmx_y_val, mmx_y_pred_proba_bayes)
logger.info(f"Test AUROC Score with Bayesian Optimization (MMX): {mmx_test_roc_auc_bayes}")


INFO:__main__:Final model training completed. Time elapsed: 236.66 seconds
INFO:__main__:Test AUROC Score with Bayesian Optimization (MMX): 0.8798916620062257


In [26]:
xgb_predict_mmx_bayes = mmx_best_xgb_bayes.predict_proba(mmx_test_df)[:,1]
submission9 = pd.DataFrame({'id' : mmx_test_df.index, 'Response' : xgb_predict_mmx_bayes})
submission9

Unnamed: 0,id,Response
0,11504798,0.005777
1,11504799,0.453198
2,11504800,0.235381
3,11504801,0.000083
4,11504802,0.045280
...,...,...
7669861,19174659,0.203576
7669862,19174660,0.000165
7669863,19174661,0.000295
7669864,19174662,0.570926


In [27]:
submission9.to_csv('submission9.csv', index=False)

In [28]:
# Train the final model with the best hyperparameters from Random Search CV

mmx_best_xgb_random = XGBClassifier(
    n_estimators = int(289),
    max_depth = int(9),
    learning_rate = 0.09925589984899778,
    subsample = 0.8001040753990633,
    colsample_bytree = 0.960734415379823,
    gamma = 0.03731401177720717,
    min_child_weight = int(9),
    reg_alpha = 0.011005192452767676,
    reg_lambda = 1.4558703250838834,
    scale_pos_weight = int(2),
    max_delta_step = int(9),
    colsample_bylevel = 0.9266807513020847,
    colsample_bynode = 0.9742921180375436,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
mmx_best_xgb_random.fit(mmx_x_train, mmx_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

mmx_y_pred_proba_random = mmx_best_xgb_random.predict_proba(mmx_x_val)[:, 1]
mmx_test_roc_auc_random = roc_auc_score(mmx_y_val, mmx_y_pred_proba_random)
logger.info(f"Test AUROC Score with Random Search CV (MMX): {mmx_test_roc_auc_random}")


INFO:__main__:Final model training completed. Time elapsed: 242.60 seconds
INFO:__main__:Test AUROC Score with Random Search CV (MMX): 0.879568307762447


In [37]:
xgb_predict_mmx_random = mmx_best_xgb_random.predict_proba(mmx_test_df)[:,1]
submission10 = pd.DataFrame({'id' : mmx_test_df.index, 'Response' : xgb_predict_mmx_random})
submission10

Unnamed: 0,id,Response
0,11504798,0.009194
1,11504799,0.631729
2,11504800,0.386482
3,11504801,0.000142
4,11504802,0.095885
...,...,...
7669861,19174659,0.347675
7669862,19174660,0.000294
7669863,19174661,0.000546
7669864,19174662,0.709610


In [38]:
submission10.to_csv('submission10.csv', index=False)

In [39]:
scaler_type = 'R'
columns_to_scale_xgb = ['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']

# Trying with Standard Scaler

rbst_train_df = apply_scaling(raw_train_df, scaler_type, columns_to_scale_xgb)
rbst_validation_df = apply_scaling(validation_df, scaler_type, columns_to_scale_xgb)
rbst_test_df = apply_scaling(test_df, scaler_type, columns_to_scale_xgb)

logger.info(f"Data scaling completed. Time elapsed: {time.time() - start_time:.2f} seconds")

rbst_train_df.head(2)

INFO:__main__:Data scaling completed. Time elapsed: 297.79 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6400262,1,-0.4,1,28,0,0,0,1.599365,-1.01626,0.511278,0
8095698,1,-0.44,1,30,1,0,0,0.488325,0.00813,-0.263158,0


In [40]:
# Splitting dependent and independent variable

rbst_x_train = rbst_train_df.drop(['Response'], axis = 1)
rbst_y_train = rbst_train_df['Response']

rbst_x_val = rbst_validation_df.drop(['Response'], axis = 1)
rbst_y_val = rbst_validation_df['Response']

In [41]:
# Train the final model with the best hyperparameters from Bayesian Optimization

rbst_best_xgb_bayes = XGBClassifier(
    n_estimators = int(297.0),
    max_depth = int(10.0),
    learning_rate = 0.09963346243555755,
    subsample = 0.8302721292642807,
    colsample_bytree = 0.8010665164356681,
    gamma = 0.07140222781940667,
    min_child_weight = int(9.0),
    reg_alpha = 0.0022293880023777244,
    reg_lambda = 1.5050229982288488,
    scale_pos_weight = int(1.0),
    max_delta_step = int(6.0),
    colsample_bylevel = 0.9951929930551984,
    colsample_bynode = 0.9897401686365959,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
rbst_best_xgb_bayes.fit(rbst_x_train, rbst_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

rbst_y_pred_proba_bayes = rbst_best_xgb_bayes.predict_proba(rbst_x_val)[:, 1]
rbst_test_roc_auc_bayes = roc_auc_score(rbst_y_val, rbst_y_pred_proba_bayes)
logger.info(f"Test AUROC Score with Bayesian Optimization (RBST): {rbst_test_roc_auc_bayes}")


INFO:__main__:Final model training completed. Time elapsed: 280.87 seconds
INFO:__main__:Test AUROC Score with Bayesian Optimization (RBST): 0.8797423715666235


In [43]:
xgb_predict_rbst_bayes = rbst_best_xgb_bayes.predict_proba(rbst_test_df)[:,1]
submission11 = pd.DataFrame({'id' : rbst_test_df.index, 'Response' : xgb_predict_rbst_bayes})
submission11

Unnamed: 0,id,Response
0,11504798,0.005777
1,11504799,0.453198
2,11504800,0.235381
3,11504801,0.000083
4,11504802,0.045280
...,...,...
7669861,19174659,0.203576
7669862,19174660,0.000165
7669863,19174661,0.000295
7669864,19174662,0.570926


In [44]:
submission11.to_csv('submission11.csv', index=False)

In [45]:
# Train the final model with the best hyperparameters from Random Search CV

rbst_best_xgb_random = XGBClassifier(
    n_estimators = int(289),
    max_depth = int(9),
    learning_rate = 0.09925589984899778,
    subsample = 0.8001040753990633,
    colsample_bytree = 0.960734415379823,
    gamma = 0.03731401177720717,
    min_child_weight = int(9),
    reg_alpha = 0.011005192452767676,
    reg_lambda = 1.4558703250838834,
    scale_pos_weight = int(2),
    max_delta_step = int(9),
    colsample_bylevel = 0.9266807513020847,
    colsample_bynode = 0.9742921180375436,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
rbst_best_xgb_random.fit(rbst_x_train, rbst_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

rbst_y_pred_proba_random = rbst_best_xgb_random.predict_proba(rbst_x_val)[:, 1]
rbst_test_roc_auc_random = roc_auc_score(rbst_y_val, rbst_y_pred_proba_random)
logger.info(f"Test AUROC Score with Random Search CV (RBST): {rbst_test_roc_auc_random}")


INFO:__main__:Final model training completed. Time elapsed: 237.87 seconds
INFO:__main__:Test AUROC Score with Random Search CV (RBST): 0.8794557726439091


In [46]:
xgb_predict_rbst_random = rbst_best_xgb_random.predict_proba(rbst_test_df)[:,1]
submission12 = pd.DataFrame({'id' : rbst_test_df.index, 'Response' : xgb_predict_rbst_random})
submission12

Unnamed: 0,id,Response
0,11504798,0.009194
1,11504799,0.631729
2,11504800,0.386482
3,11504801,0.000142
4,11504802,0.095885
...,...,...
7669861,19174659,0.347675
7669862,19174660,0.000294
7669863,19174661,0.000546
7669864,19174662,0.709610


In [55]:
submission12.to_csv('submission12.csv', index=False)

In [51]:
# Combine predictions from base models into a new dataset

X_meta_train_ssc = np.column_stack([ssc_y_pred_proba_bayes, ssc_y_pred_proba_random])
y_meta_train = ssc_y_val

In [54]:
# Train the meta-model

meta_model = LogisticRegression()

meta_model.fit(X_meta_train_ssc, y_meta_train)

# Meta-model predictions

meta_val_pred_ssc = meta_model.predict_proba(X_meta_train_ssc)[:, 1]

# Evaluate the meta-model

meta_auc_ssc = roc_auc_score(y_meta_train, meta_val_pred_ssc)
logger.info(f"Meta model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")
logger.info(f"Meta-model AUC: {meta_auc_ssc}")

INFO:__main__:Meta model training completed. Time elapsed: 12.62 seconds
INFO:__main__:Meta-model AUC: 0.8533904540318797


In [65]:
meta_test_ssc = np.column_stack([xgb_predict_ssc_bayes, xgb_predict_ssc_random])
meta_predict_ssc = meta_model.predict_proba(meta_test_ssc)[:, 1]
submission13 = pd.DataFrame({'id' : test_df.index, 'Response' : meta_predict_ssc})
submission13

Unnamed: 0,id,Response
0,11504798,0.021576
1,11504799,0.272708
2,11504800,0.223237
3,11504801,0.018398
4,11504802,0.025529
...,...,...
7669861,19174659,0.127055
7669862,19174660,0.018387
7669863,19174661,0.018450
7669864,19174662,0.517760


In [66]:
submission13.to_csv('submission13.csv', index=False)

In [63]:
X_meta_train_mmx = np.column_stack([mmx_y_pred_proba_bayes, mmx_y_pred_proba_random])
y_meta_train = ssc_y_val

In [64]:
# Train the meta-model

meta_model_mmx = LogisticRegression()

meta_model_mmx.fit(X_meta_train_mmx, y_meta_train)

# Meta-model predictions

meta_val_pred_mmx = meta_model.predict_proba(X_meta_train_mmx)[:, 1]

# Evaluate the meta-model

meta_auc_mmx = roc_auc_score(y_meta_train, meta_val_pred_mmx)
logger.info(f"Meta model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")
logger.info(f"Meta-model AUC: {meta_auc_mmx}")

INFO:__main__:Meta model training completed. Time elapsed: 1012.83 seconds
INFO:__main__:Meta-model AUC: 0.8757567994787498


In [67]:
meta_test_mmx = np.column_stack([xgb_predict_mmx_bayes, xgb_predict_mmx_random])
meta_predict_mmx = meta_model.predict_proba(meta_test_mmx)[:, 1]
submission14 = pd.DataFrame({'id' : test_df.index, 'Response' : meta_predict_mmx})
submission14

Unnamed: 0,id,Response
0,11504798,0.019576
1,11504799,0.493884
2,11504800,0.237676
3,11504801,0.018381
4,11504802,0.040799
...,...,...
7669861,19174659,0.202448
7669862,19174660,0.018403
7669863,19174661,0.018440
7669864,19174662,0.470237


In [68]:
submission14.to_csv('submission14.csv', index=False)

In [69]:
X_meta_train_rbst = np.column_stack([rbst_y_pred_proba_bayes, rbst_y_pred_proba_random])
y_meta_train = ssc_y_val

# Train the meta-model

meta_model_rbst = LogisticRegression()

meta_model_rbst.fit(X_meta_train_rbst, y_meta_train)

# Meta-model predictions

meta_val_pred_rbst = meta_model.predict_proba(X_meta_train_rbst)[:, 1]

# Evaluate the meta-model

meta_auc_rbst = roc_auc_score(y_meta_train, meta_val_pred_rbst)
logger.info(f"Meta model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")
logger.info(f"Meta-model AUC: {meta_auc_rbst}")

meta_test_rbst = np.column_stack([xgb_predict_rbst_bayes, xgb_predict_rbst_random])
meta_predict_rbst = meta_model.predict_proba(meta_test_rbst)[:, 1]
submission15 = pd.DataFrame({'id' : test_df.index, 'Response' : meta_predict_rbst})
submission15

INFO:__main__:Meta model training completed. Time elapsed: 1301.92 seconds
INFO:__main__:Meta-model AUC: 0.8757159050124658


Unnamed: 0,id,Response
0,11504798,0.019576
1,11504799,0.493884
2,11504800,0.237676
3,11504801,0.018381
4,11504802,0.040799
...,...,...
7669861,19174659,0.202448
7669862,19174660,0.018403
7669863,19174661,0.018440
7669864,19174662,0.470237


In [70]:
submission15.to_csv('submission15.csv', index=False)

In [83]:
X_meta_train_ssc_mmx = np.column_stack([ssc_y_pred_proba_bayes, ssc_y_pred_proba_random, 
                                        mmx_y_pred_proba_bayes, mmx_y_pred_proba_random])
y_meta_train = ssc_y_val

# Train the meta-model

meta_model_ssc_mmx = LogisticRegression()

meta_model_ssc_mmx.fit(X_meta_train_ssc_mmx, y_meta_train)

# Meta-model predictions

meta_val_pred_ssc_mmx = meta_model_ssc_mmx.predict_proba(X_meta_train_ssc_mmx)[:, 1]

# Evaluate the meta-model

meta_auc_ssc_mmx = roc_auc_score(y_meta_train, meta_val_pred_ssc_mmx)
logger.info(f"Meta model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")
logger.info(f"Meta-model AUC: {meta_auc_ssc_mmx}")

meta_test_ssc_mmx = np.column_stack([xgb_predict_ssc_bayes, xgb_predict_ssc_random, 
                                    xgb_predict_mmx_bayes, xgb_predict_mmx_random])
meta_predict_ssc_mmx = meta_model_ssc_mmx.predict_proba(meta_test_ssc_mmx)[:, 1]
submission16 = pd.DataFrame({'id' : test_df.index, 'Response' : meta_predict_ssc_mmx})
submission16

INFO:__main__:Meta model training completed. Time elapsed: 3458.48 seconds
INFO:__main__:Meta-model AUC: 0.8777435645385328


Unnamed: 0,id,Response
0,11504798,0.013342
1,11504799,0.543543
2,11504800,0.210561
3,11504801,0.012300
4,11504802,0.026980
...,...,...
7669861,19174659,0.172356
7669862,19174660,0.012313
7669863,19174661,0.012342
7669864,19174662,0.546891


In [84]:
submission16.to_csv('submission16.csv', index=False)

In [85]:
X_meta_train_rbst_mmx = np.column_stack([rbst_y_pred_proba_bayes, rbst_y_pred_proba_random, 
                                        mmx_y_pred_proba_bayes, mmx_y_pred_proba_random])
y_meta_train = rbst_y_val

# Train the meta-model

meta_model_rbst_mmx = LogisticRegression()

meta_model_rbst_mmx.fit(X_meta_train_rbst_mmx, y_meta_train)

# Meta-model predictions

meta_val_pred_rbst_mmx = meta_model_rbst_mmx.predict_proba(X_meta_train_rbst_mmx)[:, 1]

# Evaluate the meta-model

meta_auc_rbst_mmx = roc_auc_score(y_meta_train, meta_val_pred_rbst_mmx)
logger.info(f"Meta model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")
logger.info(f"Meta-model AUC: {meta_auc_rbst_mmx}")

meta_test_rbst_mmx = np.column_stack([xgb_predict_rbst_bayes, xgb_predict_rbst_random, 
                                    xgb_predict_mmx_bayes, xgb_predict_mmx_random])
meta_predict_rbst_mmx = meta_model_rbst_mmx.predict_proba(meta_test_rbst_mmx)[:, 1]
submission17 = pd.DataFrame({'id' : test_df.index, 'Response' : meta_predict_rbst_mmx})
submission17

INFO:__main__:Meta model training completed. Time elapsed: 3505.40 seconds
INFO:__main__:Meta-model AUC: 0.877885772276235


Unnamed: 0,id,Response
0,11504798,0.013517
1,11504799,0.539770
2,11504800,0.214026
3,11504801,0.012610
4,11504802,0.029031
...,...,...
7669861,19174659,0.174994
7669862,19174660,0.012626
7669863,19174661,0.012653
7669864,19174662,0.569829


In [86]:
submission17.to_csv('submission17.csv', index=False)

In [87]:
X_meta_train_rbst_ssc = np.column_stack([rbst_y_pred_proba_bayes, rbst_y_pred_proba_random, 
                                        ssc_y_pred_proba_bayes, ssc_y_pred_proba_random])
y_meta_train = rbst_y_val

# Train the meta-model

meta_model_rbst_ssc = LogisticRegression()

meta_model_rbst_ssc.fit(X_meta_train_rbst_ssc, y_meta_train)

# Meta-model predictions

meta_val_pred_rbst_ssc = meta_model_rbst_ssc.predict_proba(X_meta_train_rbst_ssc)[:, 1]

# Evaluate the meta-model

meta_auc_rbst_ssc = roc_auc_score(y_meta_train, meta_val_pred_rbst_ssc)
logger.info(f"Meta model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")
logger.info(f"Meta-model AUC: {meta_auc_rbst_ssc}")

meta_test_rbst_ssc = np.column_stack([xgb_predict_rbst_bayes, xgb_predict_rbst_random, 
                                    xgb_predict_ssc_bayes, xgb_predict_ssc_random])
meta_predict_rbst_ssc = meta_model_rbst_ssc.predict_proba(meta_test_rbst_ssc)[:, 1]
submission18 = pd.DataFrame({'id' : test_df.index, 'Response' : meta_predict_rbst_ssc})
submission18

INFO:__main__:Meta model training completed. Time elapsed: 3541.06 seconds
INFO:__main__:Meta-model AUC: 0.8776266366778683


Unnamed: 0,id,Response
0,11504798,0.013316
1,11504799,0.542335
2,11504800,0.211068
3,11504801,0.012274
4,11504802,0.026980
...,...,...
7669861,19174659,0.172517
7669862,19174660,0.012288
7669863,19174661,0.012316
7669864,19174662,0.546360


In [88]:
submission18.to_csv('submission18.csv', index=False)

In [89]:
X_meta_train_rbst_ssc_mmx = np.column_stack([rbst_y_pred_proba_bayes, rbst_y_pred_proba_random, 
                                        ssc_y_pred_proba_bayes, ssc_y_pred_proba_random, 
                                            mmx_y_pred_proba_bayes, mmx_y_pred_proba_random])
y_meta_train = rbst_y_val

# Train the meta-model

meta_model_rbst_ssc_mmx = LogisticRegression()

meta_model_rbst_ssc_mmx.fit(X_meta_train_rbst_ssc_mmx, y_meta_train)

# Meta-model predictions

meta_val_pred_rbst_ssc_mmx = meta_model_rbst_ssc_mmx.predict_proba(X_meta_train_rbst_ssc_mmx)[:, 1]

# Evaluate the meta-model

meta_auc_rbst_ssc_mmx = roc_auc_score(y_meta_train, meta_val_pred_rbst_ssc_mmx)
logger.info(f"Meta model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")
logger.info(f"Meta-model AUC: {meta_auc_rbst_ssc_mmx}")

meta_test_rbst_ssc_mmx = np.column_stack([xgb_predict_rbst_bayes, xgb_predict_rbst_random, 
                                    xgb_predict_ssc_bayes, xgb_predict_ssc_random,
                                         xgb_predict_mmx_bayes, xgb_predict_mmx_random])
meta_predict_rbst_ssc_mmx = meta_model_rbst_ssc_mmx.predict_proba(meta_test_rbst_ssc_mmx)[:, 1]
submission19 = pd.DataFrame({'id' : test_df.index, 'Response' : meta_predict_rbst_ssc_mmx})
submission19

INFO:__main__:Meta model training completed. Time elapsed: 3625.52 seconds
INFO:__main__:Meta-model AUC: 0.8777433188365189


Unnamed: 0,id,Response
0,11504798,0.013298
1,11504799,0.543571
2,11504800,0.210950
3,11504801,0.012262
4,11504802,0.026997
...,...,...
7669861,19174659,0.172769
7669862,19174660,0.012275
7669863,19174661,0.012304
7669864,19174662,0.545885


In [90]:
submission19.to_csv('submission19.csv', index=False)