## Preparing Data for Model

In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import label_binarize

In [2]:
# Load the competition training table from the Kaggle input directory.
TRAIN_CSV_PATH = '/kaggle/input/playground-series-s4e7/train.csv'
train_raw = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Load the competition test table from the Kaggle input directory.
TEST_CSV_PATH = '/kaggle/input/playground-series-s4e7/test.csv'
test_raw = pd.read_csv(TEST_CSV_PATH)

In [4]:
# Print column names, dtypes and memory usage of the raw training table.
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


## Creating Numeric and Categorical Lists

#### Training Data

In [5]:
# Feature frame for training: the raw table without the row id and the target.
train = train_raw.drop(columns=['id', 'Response'])

In [6]:
# These two columns are stored as integers but are treated as categories,
# so they are one-hot encoded later instead of being scaled.
for flag_column in ('Driving_License', 'Previously_Insured'):
    train[flag_column] = train[flag_column].astype('category')

In [7]:
# Columns with an int/float dtype are handled as numeric features; every other
# column (object and category dtypes) is handled as categorical.
numeric_dtypes = [int, float]
numeric_columns = train.select_dtypes(include=numeric_dtypes).columns
categorical_columns = train.select_dtypes(exclude=numeric_dtypes).columns

#### Testing Data

In [8]:
# Feature frame for the competition test set: the raw table without the row id.
test = test_raw.drop(columns=['id'])

In [9]:
# Apply the same category casts to the test frame that were applied to `train`.
for flag_column in ('Driving_License', 'Previously_Insured'):
    test[flag_column] = test[flag_column].astype('category')

## Splitting into Training and Testing

In [10]:
# Target vector: the Response column of the raw training table.
y = train_raw['Response']

In [11]:
# Hold out 30% of the labelled rows for local validation. The split is
# stratified on the target and seeded so it is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    train,
    y,
    test_size=0.3,
    stratify=y,
    random_state=123,
)

## Scaling Data

#### Numeric

In [12]:
# Single scaler instance; it is fit twice below (first on xtrain, then on train).
scaler = StandardScaler()

In [13]:
# subtraining
# subtraining: fit the scaler on the sub-training rows only and overwrite the
# numeric columns of xtrain with the scaled values.
# NOTE: xtrain/xtest are modified in place, so re-running this cell would scale
# values that are already scaled.
xtrain_scaled = scaler.fit_transform(xtrain[numeric_columns])
xtrain[numeric_columns] = xtrain_scaled

# subtesting: transform only, reusing the statistics fitted on xtrain above
xtest_scaled = scaler.transform(xtest[numeric_columns])
xtest[numeric_columns] = xtest_scaled

In [14]:
# full training
# full training: refit the same scaler object on the full training frame; this
# replaces the statistics that were fitted on xtrain in the previous cell.
# NOTE: train/test are modified in place, so re-running this cell would scale
# values that are already scaled.
train_scaled = scaler.fit_transform(train[numeric_columns])
train[numeric_columns] = train_scaled

# full testing: transform only, using the statistics fitted on the full train
test_scaled = scaler.transform(test[numeric_columns])
test[numeric_columns] = test_scaled

#### Categorical

In [15]:
# One-hot encode the listed categorical features. Categories not seen during
# fit are ignored at transform time (handle_unknown='ignore').
#
# sparse_threshold=0 forces the column transformer to always return a dense
# array. The cells below wrap its output directly in pd.DataFrame, which only
# works for dense output; without this setting the result type depends on the
# density of the encoded matrix.
transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
     ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age',
      'Vehicle_Damage']),
    sparse_threshold=0)

Sub Training Set

In [16]:
# Fit the encoder on the sub-training rows and encode them; the encoded frame
# keeps xtrain's row index so it lines up in the concat below.
transformed_xtrain = transformer.fit_transform(xtrain)
transformed_df = pd.DataFrame(
    transformed_xtrain,
    index=xtrain.index,
    columns=transformer.get_feature_names_out(),
)

# Swap the raw categorical columns for their one-hot columns.
xtrain = pd.concat([xtrain.drop(columns=categorical_columns), transformed_df], axis=1)

Sub Testing Set

In [17]:
# Encode the hold-out rows with the encoder fitted on xtrain (no refit); the
# encoded frame keeps xtest's row index so it lines up in the concat below.
transformed_xtest = transformer.transform(xtest)
transformed_df = pd.DataFrame(
    transformed_xtest,
    index=xtest.index,
    columns=transformer.get_feature_names_out(),
)

# Swap the raw categorical columns for their one-hot columns.
xtest = pd.concat([xtest.drop(columns=categorical_columns), transformed_df], axis=1)

Full Training Set

In [18]:
# Refit the encoder on the full training frame and encode it; the encoded frame
# keeps train's row index so it lines up in the concat below.
transformed_X = transformer.fit_transform(train)
transformed_df = pd.DataFrame(
    transformed_X,
    index=train.index,
    columns=transformer.get_feature_names_out(),
)

# Model-ready training matrix: scaled numeric columns plus one-hot columns.
X_scaled = pd.concat([train.drop(columns=categorical_columns), transformed_df], axis=1)

Full Testing Set

In [19]:
# Encode the competition test frame with the encoder fitted on the full
# training frame (no refit); the encoded frame keeps test's row index.
transformed_test = transformer.transform(test)
transformed_df = pd.DataFrame(
    transformed_test,
    index=test.index,
    columns=transformer.get_feature_names_out(),
)

# Model-ready test matrix: scaled numeric columns plus one-hot columns.
X_test_scaled = pd.concat([test.drop(columns=categorical_columns), transformed_df], axis=1)

Checking the Scaled Data Sets

In [20]:
# Preview the first rows of the full training matrix.
X_scaled.head()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Driving_License_0,onehotencoder__Driving_License_1,onehotencoder__Previously_Insured_0,onehotencoder__Previously_Insured_1,onehotencoder__Vehicle_Age_1-2 Year,onehotencoder__Vehicle_Age_< 1 Year,onehotencoder__Vehicle_Age_> 2 Years,onehotencoder__Vehicle_Damage_No,onehotencoder__Vehicle_Damage_Yes
0,-1.15941,0.660528,2.105145,0.214202,0.288852,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.307897,0.121718,1.728962,-1.599414,1.551675,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,-0.892627,-0.955902,0.460756,0.732378,1.126566,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,-0.225669,-1.95655,-1.691389,0.806403,-1.099003,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.158974,-0.878929,0.090529,0.732378,1.626694,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [21]:
# Check column names and dtypes of the full training matrix.
X_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 16 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   Age                                   float64
 1   Region_Code                           float64
 2   Annual_Premium                        float64
 3   Policy_Sales_Channel                  float64
 4   Vintage                               float64
 5   onehotencoder__Gender_Female          float64
 6   onehotencoder__Gender_Male            float64
 7   onehotencoder__Driving_License_0      float64
 8   onehotencoder__Driving_License_1      float64
 9   onehotencoder__Previously_Insured_0   float64
 10  onehotencoder__Previously_Insured_1   float64
 11  onehotencoder__Vehicle_Age_1-2 Year   float64
 12  onehotencoder__Vehicle_Age_< 1 Year   float64
 13  onehotencoder__Vehicle_Age_> 2 Years  float64
 14  onehotencoder__Vehicle_Damage_No      float64
 15  onehotencoder

In [22]:
# Preview the first rows of the full test matrix.
X_test_scaled.head()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Driving_License_0,onehotencoder__Driving_License_1,onehotencoder__Previously_Insured_0,onehotencoder__Previously_Insured_1,onehotencoder__Vehicle_Age_1-2 Year,onehotencoder__Vehicle_Age_< 1 Year,onehotencoder__Vehicle_Age_> 2 Years,onehotencoder__Vehicle_Damage_No,onehotencoder__Vehicle_Damage_Yes
0,-1.226106,1.584203,-1.691389,0.880428,0.801483,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.57468,0.121718,0.426724,0.214202,-0.511353,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.57468,1.276311,-1.691389,-1.599414,1.339121,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-1.092714,1.584203,-0.362167,0.732378,-0.611378,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.841463,-0.571038,0.222041,0.214202,-0.198773,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [23]:
# Check column names and dtypes of the full test matrix.
X_test_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669866 entries, 0 to 7669865
Data columns (total 16 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   Age                                   float64
 1   Region_Code                           float64
 2   Annual_Premium                        float64
 3   Policy_Sales_Channel                  float64
 4   Vintage                               float64
 5   onehotencoder__Gender_Female          float64
 6   onehotencoder__Gender_Male            float64
 7   onehotencoder__Driving_License_0      float64
 8   onehotencoder__Driving_License_1      float64
 9   onehotencoder__Previously_Insured_0   float64
 10  onehotencoder__Previously_Insured_1   float64
 11  onehotencoder__Vehicle_Age_1-2 Year   float64
 12  onehotencoder__Vehicle_Age_< 1 Year   float64
 13  onehotencoder__Vehicle_Age_> 2 Years  float64
 14  onehotencoder__Vehicle_Damage_No      float64
 15  onehotencoder__

In [24]:
# Preview the first rows of the sub-training matrix.
xtrain.head()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Driving_License_0,onehotencoder__Driving_License_1,onehotencoder__Previously_Insured_0,onehotencoder__Previously_Insured_1,onehotencoder__Vehicle_Age_1-2 Year,onehotencoder__Vehicle_Age_< 1 Year,onehotencoder__Vehicle_Age_> 2 Years,onehotencoder__Vehicle_Damage_No,onehotencoder__Vehicle_Damage_Yes
2657020,-0.959534,-1.802515,-0.245764,0.732397,0.789091,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3894175,-0.759429,0.198812,0.189965,0.732397,1.164125,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
7485213,-0.225816,-1.725541,-1.692518,-1.599453,-1.711138,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4817060,-1.092937,-0.647903,0.122522,-1.599453,0.45156,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
8738434,0.441201,0.121838,0.867184,-1.599453,0.601573,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [25]:
# Check column names and dtypes of the sub-training matrix.
xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8053358 entries, 2657020 to 2779041
Data columns (total 16 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   Age                                   float64
 1   Region_Code                           float64
 2   Annual_Premium                        float64
 3   Policy_Sales_Channel                  float64
 4   Vintage                               float64
 5   onehotencoder__Gender_Female          float64
 6   onehotencoder__Gender_Male            float64
 7   onehotencoder__Driving_License_0      float64
 8   onehotencoder__Driving_License_1      float64
 9   onehotencoder__Previously_Insured_0   float64
 10  onehotencoder__Previously_Insured_1   float64
 11  onehotencoder__Vehicle_Age_1-2 Year   float64
 12  onehotencoder__Vehicle_Age_< 1 Year   float64
 13  onehotencoder__Vehicle_Age_> 2 Years  float64
 14  onehotencoder__Vehicle_Damage_No      float64
 15  onehotencoder_

In [26]:
# Preview the first rows of the hold-out matrix.
xtest.head()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Driving_License_0,onehotencoder__Driving_License_1,onehotencoder__Previously_Insured_0,onehotencoder__Previously_Insured_1,onehotencoder__Vehicle_Age_1-2 Year,onehotencoder__Vehicle_Age_< 1 Year,onehotencoder__Vehicle_Age_> 2 Years,onehotencoder__Vehicle_Damage_No,onehotencoder__Vehicle_Damage_Yes
4020808,2.04204,0.968554,1.42004,-1.599453,1.489155,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
7404271,0.107692,0.121838,1.075288,-1.599453,0.664079,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
6049075,-0.692727,0.121838,0.561414,0.732397,-1.161088,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1856884,-0.359219,-0.878826,-1.692518,0.214208,-0.411019,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
10713674,-0.759429,1.199476,0.182667,0.732397,0.576571,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [27]:
# Check column names and dtypes of the hold-out matrix.
xtest.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3451440 entries, 4020808 to 5998772
Data columns (total 16 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   Age                                   float64
 1   Region_Code                           float64
 2   Annual_Premium                        float64
 3   Policy_Sales_Channel                  float64
 4   Vintage                               float64
 5   onehotencoder__Gender_Female          float64
 6   onehotencoder__Gender_Male            float64
 7   onehotencoder__Driving_License_0      float64
 8   onehotencoder__Driving_License_1      float64
 9   onehotencoder__Previously_Insured_0   float64
 10  onehotencoder__Previously_Insured_1   float64
 11  onehotencoder__Vehicle_Age_1-2 Year   float64
 12  onehotencoder__Vehicle_Age_< 1 Year   float64
 13  onehotencoder__Vehicle_Age_> 2 Years  float64
 14  onehotencoder__Vehicle_Damage_No      float64
 15  onehotencoder_

## Changing Feature Names

In [28]:
# Rewrite bracket and comparison characters in the column names of all four
# model matrices. Each frame's columns are reassigned in place, and the
# substitutions are literal (regex=False), applied in the order listed.
dfs = [xtrain, xtest, X_scaled, X_test_scaled]

column_name_substitutions = {
    '[': '(',
    ']': ')',
    '<': 'less_than',
    '>': 'greater_than',
}

for df in dfs:
    for old_text, new_text in column_name_substitutions.items():
        df.columns = df.columns.str.replace(old_text, new_text, regex=False)

# Model Training

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

In [30]:
# Keyword arguments passed to XGBClassifier when the ensemble is built.
# presumably the result of a hyperparameter search — TODO confirm provenance
hyperparams_xgbm = dict(
    n_estimators=1427,
    learning_rate=0.045640623671124717,
    max_depth=15,
    gamma=2.1304843387906605,
    min_child_weight=17,
    reg_alpha=0.01490102698164227,
    reg_lambda=0.053395147804257845,
    subsample=0.8263124090290509,
    colsample_bytree=0.8685865103401375,
    colsample_bylevel=0.8566147867891335,
    colsample_bynode=0.8021382585492708,
    grow_policy='lossguide',
)

In [31]:
# Keyword arguments passed to lgb.LGBMClassifier when the ensemble is built.
# NOTE(review): bagging_fraction is set but bagging_freq is not. LightGBM's
# parameter docs describe bagging as enabled only when bagging_freq is
# non-zero — confirm whether row subsampling was intended to be active.
hyperparams_lgbm = dict(
    num_leaves=18,
    max_depth=9,
    learning_rate=0.1732080006922778,
    n_estimators=485,
    min_child_samples=12,
    colsample_bytree=0.6532730571679977,
    reg_alpha=0.2824691927675552,
    reg_lambda=0.8127125917275994,
    bagging_fraction=0.579537420550804,
)

In [32]:
# Keyword arguments passed to LogisticRegression when the ensemble is built.
hyperparams_lr = dict(
    penalty='l2',
    C=0.3255323004350203,
    solver='liblinear',
    max_iter=486,
    class_weight='balanced',
)

In [33]:
# Base learners for the ensemble. CatBoost logs one line per boosting iteration
# by default, which floods the cell output during this long fit; verbose=100
# reports progress every 100 iterations instead. This changes logging only.
cat = CatBoostClassifier(depth = 11, random_seed = 123, eval_metric = 'AUC', verbose = 100)
lgbm = lgb.LGBMClassifier(**hyperparams_lgbm)
xgbm = XGBClassifier(**hyperparams_xgbm)
lr = LogisticRegression(**hyperparams_lr)

# (name, estimator) pairs; the names are the labels shown in the [Voting] log.
estimators = [('LightGBM', lgbm), ('XGBoost', xgbm), ('Logistic', lr), ('CatBoost', cat)]

# Soft-voting ensemble fitted on the full training matrix; verbose=True prints
# the elapsed time for each estimator as it finishes.
vc = VotingClassifier(estimators = estimators, voting = 'soft', verbose = True)
vc.fit(X_scaled, y)

[LightGBM] [Info] Number of positive: 1415059, number of negative: 10089739
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.000198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 738
[LightGBM] [Info] Number of data points in the train set: 11504798, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122997 -> initscore=-1.964348
[LightGBM] [Info] Start training from score -1.964348
[Voting] ................. (1 of 4) Processing LightGBM, total= 5.8min
[Voting] .................. (2 of 4) Processing XGBoost, total=49.4min
[Voting] ................. (3 of 4) Processing Logistic, total= 1.8min
Learning rate set to 0.5
0:	total: 2.94s	remaining: 48m 54s
1:	total: 5.41s	remaining: 45m 2s
2:	total: 7.97s	remaining: 44m 8s
3:	total: 10.4s	remaining: 43m 21s
4:	total: 12.9s	remaining: 42m 52s
5:	total: 15.5s	remaining: 42m 44s
6:	total: 17.8s	remaining: 42m 10s
7:	total: 20.3s	re

In [34]:
# Class probabilities for the competition test matrix; keep column index 1,
# i.e. the probability of vc.classes_[1] (expected to be Response == 1 —
# verify against vc.classes_).
test_class_probabilities = vc.predict_proba(X_test_scaled)
y_test_prob = test_class_probabilities[:, 1]



# Submission

In [36]:
# Two-column submission frame: the original test ids paired with the
# predicted probability for each row.
submission = test_raw[['id']].assign(Response=y_test_prob)

In [None]:
# Write only the id and Response columns. index=False stops pandas from adding
# the DataFrame index as an extra unnamed first column in the submission file.
submission.to_csv('/kaggle/working/submission.csv', index=False)