### Import Libraries

In [1]:
import keras_tuner as kt
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
import tensorflow as tf
import time
import xgboost as xgb

from keras import Sequential, Model
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, BatchNormalization, Input, concatenate
from keras.metrics import AUC
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer
from tensorflow import keras
from xgboost import XGBClassifier

import warnings 
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
start_time = time.time()

### Data Loading

In [2]:
train_df = pd.read_csv('train.csv', index_col='id')
test_df = pd.read_csv('test.csv', index_col='id')

In [3]:
train_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1


In [4]:
test_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123


In [5]:
column_names = train_df.columns.tolist()

for i in column_names:
    print(i, train_df[i].nunique(), 'unique values')

Gender 2 unique values
Age 66 unique values
Driving_License 2 unique values
Region_Code 54 unique values
Previously_Insured 2 unique values
Vehicle_Age 3 unique values
Vehicle_Damage 2 unique values
Annual_Premium 51728 unique values
Policy_Sales_Channel 152 unique values
Vintage 290 unique values
Response 2 unique values


### Split the data into train and validation

In [6]:
# Since we have only one data set, spliting it into train and test (validation)

raw_train_df, validation_df = train_test_split(train_df, train_size = 0.75, random_state = 1, stratify = train_df['Response'])

logger.info(f"Data split completed. Time elapsed: {time.time() - start_time:.2f} seconds")

raw_train_df.head(2)

INFO:__main__:Data split completed. Time elapsed: 26.78 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6400262,Female,26,1,28.0,0,< 1 Year,No,54497.0,26.0,234,0
8095698,Female,25,1,30.0,1,< 1 Year,No,38748.0,152.0,131,0


In [7]:
validation_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6517611,Male,44,1,28.0,0,1-2 Year,Yes,2630.0,157.0,91,0
1591313,Female,23,1,14.0,1,< 1 Year,No,35345.0,152.0,272,0


In [8]:
raw_train_df.shape

(8628598, 11)

In [9]:
validation_df.shape

(2876200, 11)

In [10]:
def transform_categorical_features(df):
    gender_map = {'Male': 0, 'Female': 1}
    vehicle_age_map = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    vehicle_damage_map = {'No': 0, 'Yes': 1}
    
    df['Gender'] = df['Gender'].map(gender_map)
    df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_map)
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage_map)
    
    return df

def adjust_data_types(df):
    df['Region_Code'] = df['Region_Code'].astype(int)
    df['Annual_Premium'] = df['Annual_Premium'].astype(int)
    df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype(int)
    
    return df

def create_additional_features(df):
    df['Prev_Insured_Annual_Premium'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Annual_Premium'].astype(str))[0]
    df['Prev_Insured_Vehicle_Age'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vehicle_Age'].astype(str))[0]
    df['Prev_Insured_Vehicle_Damage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vehicle_Damage'].astype(str))[0]
    df['Prev_Insured_Vintage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vintage'].astype(str))[0]
    
    return df

def optimize_memory_usage(df):
    start_mem_usage = df.memory_usage().sum() / 1024 ** 2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type.name in ['category', 'object']:
            raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

        c_min = df[col].min()
        c_max = df[col].max()
        
        if str(col_type)[:3] == 'int':
            
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
                
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
                
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
                
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        
        else:
        
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            
            else:
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    print(f'------ Memory usage before: {start_mem_usage:.2f} MB')
    print(f'------ Memory usage after: {end_mem_usage:.2f} MB')
    print(f'------ Reduced memory usage by {(100 * (start_mem_usage - end_mem_usage) / start_mem_usage):.1f}%')
    print('**********************' * 5)

    return df

def apply_scaling(df, scaler_type, columns):

    if scaler_type == 'S':
        scaler = StandardScaler() 
    
    elif scaler_type == 'M':
        scaler = MinMaxScaler()  
    
    elif scaler_type == 'R':
        scaler = RobustScaler()  
    
    elif scaler_type == 'A':
        scaler = MaxAbsScaler() 
    
    elif scaler_type == 'Q':
        scaler = QuantileTransformer(output_distribution='normal') 
    
    elif scaler_type == 'P':
        scaler = PowerTransformer() 
    
    else:
        raise ValueError("Invalid scaler type. Choose 'S' for StandardScaler, 'M' for MinMaxScaler, 'R' for RobustScaler, 'A' for MaxAbsScaler,'Q' for QuantileTransformer, or 'P' for PowerTransformer.")

    scaled_data = df.copy()

    for col in columns:
        scaled_data[col] = scaler.fit_transform(scaled_data[[col]])

    return scaled_data


In [11]:
raw_xgb_train_df = raw_train_df.copy()
xgb_validation_df = validation_df.copy()
xgb_test_df = test_df.copy()

In [12]:
raw_xgb_train_df = transform_categorical_features(raw_xgb_train_df)
xgb_validation_df = transform_categorical_features(xgb_validation_df)
xgb_test_df = transform_categorical_features(xgb_test_df)

raw_xgb_train_df = adjust_data_types(raw_xgb_train_df)
xgb_validation_df = adjust_data_types(xgb_validation_df)
xgb_test_df = adjust_data_types(xgb_test_df)

# raw_train_df = create_additional_features(raw_train_df)
# validation_df = create_additional_features(validation_df)
# test_df = create_additional_features(test_df)

raw_xgb_train_df = optimize_memory_usage(raw_xgb_train_df)
xgb_validation_df = optimize_memory_usage(xgb_validation_df)
xgb_test_df = optimize_memory_usage(xgb_test_df)

------ Memory usage before: 691.23 MB
------ Memory usage after: 197.49 MB
------ Reduced memory usage by 71.4%
**************************************************************************************************************
------ Memory usage before: 230.41 MB
------ Memory usage after: 65.83 MB
------ Reduced memory usage by 71.4%
**************************************************************************************************************
------ Memory usage before: 555.91 MB
------ Memory usage after: 168.23 MB
------ Reduced memory usage by 69.7%
**************************************************************************************************************


In [13]:
raw_xgb_train_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6400262,1,26,1,28,0,0,0,54497,26,234,0
8095698,1,25,1,30,1,0,0,38748,152,131,0


In [14]:
scaler_type = 'M'
columns_to_scale_xgb = ['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']

# Trying with Standard Scaler

mmx_train_df = apply_scaling(raw_xgb_train_df, scaler_type, columns_to_scale_xgb)
mmx_validation_df = apply_scaling(xgb_validation_df, scaler_type, columns_to_scale_xgb)
mmx_test_df = apply_scaling(xgb_test_df, scaler_type, columns_to_scale_xgb)

logger.info(f"Data scaling completed. Time elapsed: {time.time() - start_time:.2f} seconds")

mmx_train_df.head(2)

INFO:__main__:Data scaling completed. Time elapsed: 32.67 seconds


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6400262,1,0.092308,1,28,0,0,0,0.09649,0.154321,0.775087,0
8095698,1,0.076923,1,30,1,0,0,0.067192,0.932099,0.418685,0


In [15]:
# Splitting dependent and independent variable

mmx_x_train = mmx_train_df.drop(['Response'], axis = 1)
mmx_y_train = mmx_train_df['Response']

mmx_x_val = mmx_validation_df.drop(['Response'], axis = 1)
mmx_y_val = mmx_validation_df['Response']

In [38]:
# Train the final model with the best hyperparameters from Bayesian Optimization

mmx_best_xgb_bayes = XGBClassifier(
    n_estimators = int(297.0),
    max_depth = int(10.0),
    learning_rate = 0.09963346243555755,
    subsample = 0.8302721292642807,
    colsample_bytree = 0.8010665164356681,
    gamma = 0.07140222781940667,
    min_child_weight = int(9.0),
    reg_alpha = 0.0022293880023777244,
    reg_lambda = 1.5050229982288488,
    scale_pos_weight = int(1.0),
    max_delta_step = int(6.0),
    colsample_bylevel = 0.9951929930551984,
    colsample_bynode = 0.9897401686365959,
    random_state = 42,
    use_label_encoder = False,
    eval_metric = 'logloss',
    n_jobs = -1
)

start_time = time.time()
mmx_best_xgb_bayes.fit(mmx_x_train, mmx_y_train)
logger.info(f"Final model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")

mmx_y_pred_proba_bayes = mmx_best_xgb_bayes.predict_proba(mmx_x_val)[:, 1]
mmx_test_roc_auc_bayes = roc_auc_score(mmx_y_val, mmx_y_pred_proba_bayes)
logger.info(f"Test AUROC Score with Bayesian Optimization (MMX): {mmx_test_roc_auc_bayes}")


INFO:__main__:Final model training completed. Time elapsed: 251.45 seconds
INFO:__main__:Test AUROC Score with Bayesian Optimization (MMX): 0.8798916620062257


In [17]:
# Proceeding with encoding
# Label encoder on gender column

train_df['Gender'] = train_df['Gender'].astype('category')
train_df['Gender'] = train_df['Gender'].cat.codes

test_df['Gender'] = test_df['Gender'].astype('category')
test_df['Gender'] = test_df['Gender'].cat.codes

train_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1


In [18]:
# Define the mapping for encoding

veh_age_mapping = {
    '< 1 Year': 0,
    '1-2 Year': 1,
    '> 2 Years': 2
}

# Encode the 'Vehicle_Age' column

train_df['Vehicle_Age'] = train_df['Vehicle_Age'].map(veh_age_mapping)
test_df['Vehicle_Age'] = test_df['Vehicle_Age'].map(veh_age_mapping)

train_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,21,1,35.0,0,1,Yes,65101.0,124.0,187,0
1,1,43,1,28.0,0,2,Yes,58911.0,26.0,288,1


In [19]:
# Encoding 'Vehicle_Damage' column - using label encoding

train_df['Vehicle_Damage'] = train_df['Vehicle_Damage'].astype('category')
train_df['Vehicle_Damage'] = train_df['Vehicle_Damage'].cat.codes

test_df['Vehicle_Damage'] = test_df['Vehicle_Damage'].astype('category')
test_df['Vehicle_Damage'] = test_df['Vehicle_Damage'].cat.codes

train_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,21,1,35.0,0,1,1,65101.0,124.0,187,0
1,1,43,1,28.0,0,2,1,58911.0,26.0,288,1


In [20]:
# Since we have only one data set, spliting it into train and test (validation)

raw_dnn_train_df, dnn_validation_df = train_test_split(train_df, train_size = 0.75, random_state = 1, stratify = train_df['Response'])
raw_dnn_train_df.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6400262,0,26,1,28.0,0,0,0,54497.0,26.0,234,0
8095698,0,25,1,30.0,1,0,0,38748.0,152.0,131,0


In [21]:
# Splitting dependent and independent variable

dnn_x_train = raw_dnn_train_df.drop(['Response'], axis = 1)
dnn_y_train = raw_dnn_train_df['Response']

dnn_x_val = dnn_validation_df.drop(['Response'], axis = 1)
dnn_y_val = dnn_validation_df['Response']

dnn_x_train.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6400262,0,26,1,28.0,0,0,0,54497.0,26.0,234
8095698,0,25,1,30.0,1,0,0,38748.0,152.0,131


In [22]:
# Using satandardisation technique

ssc = StandardScaler()
scaled_x_train = pd.DataFrame(ssc.fit_transform(dnn_x_train))
scaled_y_train = dnn_y_train
scaled_x_val = pd.DataFrame(ssc.fit_transform(dnn_x_val))
scaled_y_val = dnn_y_val

scaled_x_train.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.086257,-0.826111,0.04464,0.121787,-0.928574,-1.062226,-1.005436,1.461568,-1.599175,0.876632
1,-1.086257,-0.892792,0.04464,0.275725,1.07692,-1.062226,-1.005436,0.503876,0.732519,-0.411221


In [23]:
scaled_inputs = scaled_x_train.shape[1]
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10, min_delta = 0.0001, verbose = 1)

In [24]:
# Designing the Model
scaled_model = Sequential()

scaled_model.add(Dense(input_dim = scaled_inputs, activation = 'relu', units = 128))
scaled_model.add(BatchNormalization())
scaled_model.add(Dense(activation = 'relu', units = 128))
scaled_model.add(BatchNormalization())
scaled_model.add(Dense(activation = 'relu', units = 64))
scaled_model.add(BatchNormalization())
scaled_model.add(Dense(activation = 'relu', units = 32))
scaled_model.add(BatchNormalization())
scaled_model.add(Dense(activation = 'sigmoid', units = 1))

scaled_model.summary()

In [25]:
# Compiling the model

scaled_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = [AUC (name = 'auroc')])

# Training the model

history_scaled = scaled_model.fit(scaled_x_train, scaled_y_train, 
                                    validation_data = (scaled_x_val, scaled_y_val), 
                                    epochs = 100, 
                                    callbacks = [early_stopping])

Epoch 1/100
[1m269644/269644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m688s[0m 3ms/step - auroc: 0.8491 - loss: 0.2702 - val_auroc: 0.8590 - val_loss: 0.2655
Epoch 2/100
[1m269644/269644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m699s[0m 3ms/step - auroc: 0.8573 - loss: 0.2649 - val_auroc: 0.8601 - val_loss: 0.2635
Epoch 3/100
[1m269644/269644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m701s[0m 3ms/step - auroc: 0.8582 - loss: 0.2644 - val_auroc: 0.8603 - val_loss: 0.2636
Epoch 4/100
[1m269644/269644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m664s[0m 2ms/step - auroc: 0.8591 - loss: 0.2638 - val_auroc: 0.8609 - val_loss: 0.2640
Epoch 5/100
[1m269644/269644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m647s[0m 2ms/step - auroc: 0.8596 - loss: 0.2634 - val_auroc: 0.8612 - val_loss: 0.2638
Epoch 6/100
[1m269644/269644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m694s[0m 3ms/step - auroc: 0.8600 - loss: 0.2634 - val_auroc: 0.8617 - val_loss: 0.2639
Epoch 7/10

In [28]:
dnn_val_pred = scaled_model.predict(scaled_x_val).flatten()
dnn_val_pred

[1m89882/89882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 783us/step


array([2.0230600e-01, 1.9973275e-05, 6.2302606e-05, ..., 2.3997338e-04,
       3.0790985e-01, 4.4098188e-04], dtype=float32)

In [29]:
mmx_y_pred_proba_bayes

array([2.2329526e-01, 2.7593787e-04, 1.4222435e-04, ..., 1.2681331e-04,
       4.0064275e-01, 1.2459834e-04], dtype=float32)

In [30]:
# Combine predictions from base models into a new dataset

X_meta_train = np.column_stack([mmx_y_pred_proba_bayes, dnn_val_pred])
y_meta_train = scaled_y_val

In [31]:
# Train the meta-model

meta_model = LogisticRegression()
meta_model.fit(X_meta_train, y_meta_train)

# Meta-model predictions

meta_val_pred = meta_model.predict_proba(X_meta_train)[:, 1]

# Evaluate the meta-model

meta_auc = roc_auc_score(y_meta_train, meta_val_pred)
logger.info(f"Meta model training completed. Time elapsed: {time.time() - start_time:.2f} seconds")
logger.info(f"Meta-model AUC: {meta_auc}")

INFO:__main__:Meta model training completed. Time elapsed: 8702.64 seconds
INFO:__main__:Meta-model AUC: 0.8778154592690062


In [39]:
xgb_predict = mmx_best_xgb_bayes.predict_proba(mmx_test_df)[:,1]
submission1 = pd.DataFrame({'id' : mmx_test_df.index, 'Response' : xgb_predict})
submission1

Unnamed: 0,id,Response
0,11504798,0.005777
1,11504799,0.453198
2,11504800,0.235381
3,11504801,0.000083
4,11504802,0.045280
...,...,...
7669861,19174659,0.203576
7669862,19174660,0.000165
7669863,19174661,0.000295
7669864,19174662,0.570926


In [45]:
submission1.to_csv('submission1.csv', index=False)

In [43]:
dnn_test_df = pd.DataFrame(ssc.fit_transform(test_df))
dnn_predict = scaled_model.predict(dnn_test_df).flatten()
submission2 = pd.DataFrame({'id' : test_df.index, 'Response' : dnn_predict})
submission2

[1m239684/239684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 836us/step


Unnamed: 0,id,Response
0,11504798,0.013052
1,11504799,0.261340
2,11504800,0.265566
3,11504801,0.000028
4,11504802,0.044413
...,...,...
7669861,19174659,0.160382
7669862,19174660,0.000146
7669863,19174661,0.000646
7669864,19174662,0.460582


In [46]:
submission2.to_csv('submission2.csv', index=False)

In [44]:
meta_test = np.column_stack([xgb_predict, dnn_predict])
meta_predict = meta_model.predict_proba(meta_test)[:, 1]
submission3 = pd.DataFrame({'id' : test_df.index, 'Response' : meta_predict})
submission3

Unnamed: 0,id,Response
0,11504798,0.021847
1,11504799,0.445733
2,11504800,0.185769
3,11504801,0.020125
4,11504802,0.030810
...,...,...
7669861,19174659,0.111875
7669862,19174660,0.020144
7669863,19174661,0.020197
7669864,19174662,0.776262


In [47]:
submission3.to_csv('submission3.csv', index=False)