In [61]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold,  StratifiedShuffleSplit
from sklearn.preprocessing import KBinsDiscretizer, RobustScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from tqdm.notebook import tqdm ,tnrange

In [49]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [50]:
train['source'] = 'train'
test['source'] = 'test'
data = pd.concat([train, test])

In [51]:
data

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,source
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1.0,train
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0.0,train
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1.0,train
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0.0,train
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,Female,26,1,37.0,1,< 1 Year,No,30867.0,152.0,56,,test
127033,508143,Female,38,1,28.0,0,1-2 Year,Yes,28700.0,122.0,165,,test
127034,508144,Male,21,1,46.0,1,< 1 Year,No,29802.0,152.0,74,,test
127035,508145,Male,71,1,28.0,1,1-2 Year,No,62875.0,26.0,265,,test


In [52]:
# Concatenating Features

data['Policy_Region'] = data['Policy_Sales_Channel'].astype(str) + '_'  + data['Region_Code'].astype(str)
data['Vehicle_License'] = data['Vehicle_Age'].astype(str) +  '_' + data['Driving_License'].astype(str)
data

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,source,Policy_Region,Vehicle_License
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1.0,train,26.0_28.0,> 2 Years_1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0.0,train,26.0_3.0,1-2 Year_1
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1.0,train,26.0_28.0,> 2 Years_1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0.0,train,152.0_11.0,< 1 Year_1
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0.0,train,152.0_41.0,< 1 Year_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,Female,26,1,37.0,1,< 1 Year,No,30867.0,152.0,56,,test,152.0_37.0,< 1 Year_1
127033,508143,Female,38,1,28.0,0,1-2 Year,Yes,28700.0,122.0,165,,test,122.0_28.0,1-2 Year_1
127034,508144,Male,21,1,46.0,1,< 1 Year,No,29802.0,152.0,74,,test,152.0_46.0,< 1 Year_1
127035,508145,Male,71,1,28.0,1,1-2 Year,No,62875.0,26.0,265,,test,26.0_28.0,1-2 Year_1


In [53]:
data

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,source,Policy_Region,Vehicle_License
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1.0,train,26.0_28.0,> 2 Years_1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0.0,train,26.0_3.0,1-2 Year_1
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1.0,train,26.0_28.0,> 2 Years_1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0.0,train,152.0_11.0,< 1 Year_1
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0.0,train,152.0_41.0,< 1 Year_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,Female,26,1,37.0,1,< 1 Year,No,30867.0,152.0,56,,test,152.0_37.0,< 1 Year_1
127033,508143,Female,38,1,28.0,0,1-2 Year,Yes,28700.0,122.0,165,,test,122.0_28.0,1-2 Year_1
127034,508144,Male,21,1,46.0,1,< 1 Year,No,29802.0,152.0,74,,test,152.0_46.0,< 1 Year_1
127035,508145,Male,71,1,28.0,1,1-2 Year,No,62875.0,26.0,265,,test,26.0_28.0,1-2 Year_1


In [54]:
cat_features = ['Gender','Driving_License','Region_Code','Previously_Insured',
                'Vehicle_Damage','Policy_Sales_Channel','Policy_Region',
                'Vehicle_Age','Vintage','Annual_Premium','Vehicle_License']
label = 'Response'
def categorical_encoding(data, cat_cols):
    label_dict = {}
    for col in cat_cols:
        le = LabelEncoder()
        le.fit(data[col].unique().tolist())
        data[col] = le.transform(data[col])
        label_dict[col] = le
    le = LabelEncoder()
    data[label] = le.fit_transform(data[[label]])
    label_dict[label] = le
    return data, label_dict
data, label_dict = categorical_encoding(data, cat_features)

  y = column_or_1d(y, warn=True)


In [55]:
premium_discretizer = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
data['Premium_Bins'] =premium_discretizer.fit_transform(data['Annual_Premium'].values.reshape(-1,1)).astype(int)

age_discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')

data['Age_Bins'] =age_discretizer.fit_transform(data['Age'].values.reshape(-1,1)).astype(int)



In [56]:
gender_counts = data['Gender'].value_counts().to_dict()
data['Gender_Count'] = data['Gender'].map(gender_counts)

vehicle_age_count = data['Vehicle_Age'].value_counts().to_dict()
data['Vehicle_Age_Count'] = data['Vehicle_Age'].map(vehicle_age_count)

region_code_count = data['Region_Code'].value_counts().to_dict()
data['Region_Code_Count'] = data['Region_Code'].map(region_code_count)
data

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,source,Policy_Region,Vehicle_License,Premium_Bins,Age_Bins,Gender_Count,Vehicle_Age_Count,Region_Code_Count
0,1,1,44,1,28,0,2,1,24158,24,207,1,train,1601,5,5,6,274325,21326,141937
1,2,1,76,1,3,0,0,0,17240,24,173,0,train,1603,1,3,9,274325,267015,12349
2,3,1,47,1,28,0,2,1,21998,24,17,1,train,1601,5,4,7,274325,21326,141937
3,4,1,21,1,11,1,1,0,12323,147,193,0,train,862,3,2,0,274325,219805,12328
4,5,0,29,1,41,1,1,0,11200,147,29,0,train,895,3,1,4,233821,219805,24400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,0,26,1,37,1,1,0,14571,147,46,42339,test,890,3,2,3,233821,219805,7343
127033,508143,0,38,1,28,0,0,1,12404,117,155,42338,test,383,1,2,5,233821,267015,141937
127034,508144,1,21,1,46,1,1,0,13506,147,64,42337,test,900,3,2,0,274325,219805,26357
127035,508145,1,71,1,28,1,0,0,43856,24,255,84699,test,1601,1,6,9,274325,267015,141937


In [57]:
data['Policy_per_Region'] = data.groupby('Region_Code')['Policy_Sales_Channel'].transform('nunique')

data['Annual_Premium_per_Region'] = data.groupby('Region_Code')['Annual_Premium'].transform('std').fillna(-1)

data['Nu_Unique_Region_Premium'] = data.groupby('Annual_Premium')['Region_Code'].transform('nunique')

data['Annual_Premium_per_Region_after'] = data['Annual_Premium_per_Region']%1

data['Annual_Premium_per_Region_before'] = data['Annual_Premium_per_Region'].astype(int)

data['Avg_Policy_Region_Age'] = data.groupby(['Policy_Region'])['Age'].transform('mean')

data['Avg_Policy_Region_Premium'] = data.groupby(['Policy_Region'])['Annual_Premium'].transform('mean') 

data['Avg_Region_Premium'] = data.groupby(['Region_Code'])['Annual_Premium'].transform('mean')

data['Nunq_Premium_Region'] = data.groupby(['Annual_Premium'])['Region_Code'].transform('nunique')

In [58]:
final_train = data[data['source']=='train']
target = final_train['Response']
final_train = final_train.drop(columns=['id', 'source', 'Response'])
final_test = data[data['source']=='test']
final_test_id = final_test['id']
final_test = final_test.drop(columns=['id', 'source', 'Response'])

In [64]:
# XGBoost Classifier
probs = np.zeros(shape=(len(final_test)))
scores = []
avg_loss = []

X_train, y_train = final_train, target
seeds = [1]

for seed in tnrange(len(seeds)):
    print(' ')
    print('#'*100)
    print('Seed', seeds[seed])
    sf = StratifiedShuffleSplit(n_splits=4, test_size=0.3, random_state=seed)
    for i, (idxT, idxV) in enumerate(sf.split(X_train, y_train)):
        print('Fold', i)
        print('Rows of Train= ', len(idxT), 'Rows of Holdout = ', len(idxV))
        clf = XGBClassifier(n_estimators=1000,
                           max_depth=6,
                           learning_rate=0.05,
                            subsample=0.9,
                            colsample_bytree=0.35,
                            objective='binary:logistic',
                            random_state=1)
        preds = clf.fit(X_train.iloc[idxT], y_train.iloc[idxT],
                       eval_set=[(X_train.iloc[idxV], y_train.iloc[idxV])],
                       verbose=100, eval_metric=['auc', 'logloss'],
                       early_stopping_rounds=40)
        probs_oof = clf.predict_proba(X_train.iloc[idxV])[:,1]
        probs += clf.predict_proba(final_test)[:,1]
        roc = roc_auc_score(y_train.iloc[idxV], probs_oof)
        scores.append(roc)
        avg_loss.append(clf.best_score)
        print("ROC_AUC= ", roc)
        print('#'*100)
        
print("Loss= {0:0.5f}, {1:0.5f}".format(np.array(avg_loss).mean(), np.array(avg_loss).std()))
print('%.6f (%.6f)' % (np.array(scores).mean(), np.array(scores).std()))


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

 
####################################################################################################
Seed 1
Fold 0
Rows of Train=  266776 Rows of Holdout =  114333
[0]	validation_0-auc:0.73262	validation_0-logloss:0.66374
Multiple eval metrics have been passed: 'validation_0-logloss' will be used for early stopping.

Will train until validation_0-logloss hasn't improved in 40 rounds.
[100]	validation_0-auc:0.85761	validation_0-logloss:0.27214
[200]	validation_0-auc:0.85964	validation_0-logloss:0.26377
[300]	validation_0-auc:0.85953	validation_0-logloss:0.26338
Stopping. Best iteration:
[303]	validation_0-auc:0.85952	validation_0-logloss:0.26337

ROC_AUC=  0.8595193710249233
####################################################################################################
Fold 1
Rows of Train=  266776 Rows of Holdout =  114333
[0]	validation_0-auc:0.73340	validation_0-logloss:0.66371
Multiple eval metrics have been passed: 'validation_0-logloss' will be used for early stopping.

Wil

In [65]:
sample = pd.read_csv("data/sample.csv")
sample['Response'] = probs/4
sample.to_csv('submission.csv',index =False)