In [6]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.model_selection import KFold, train_test_split

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the raw train / test splits from the local `data` directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [3]:
# Tag every row with its origin so the stacked frame can be split apart again later.
train = train.assign(source='train')
test = test.assign(source='test')

# Stack train on top of test; each part keeps its own original row labels.
df = pd.concat((train, test))

In [4]:
# Show the combined train + test frame (pandas truncates the repr to the first and last rows).
df

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,source
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1.0,train
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0.0,train
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1.0,train
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0.0,train
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,Female,26,1,37.0,1,< 1 Year,No,30867.0,152.0,56,,test
127033,508143,Female,38,1,28.0,0,1-2 Year,Yes,28700.0,122.0,165,,test
127034,508144,Male,21,1,46.0,1,< 1 Year,No,29802.0,152.0,74,,test
127035,508145,Male,71,1,28.0,1,1-2 Year,No,62875.0,26.0,265,,test


In [5]:
# Interaction features: the string forms of two columns joined with an underscore.
interaction_pairs = {
    'Policy_Region': ('Policy_Sales_Channel', 'Region_Code'),
    'Vehicle_License': ('Vehicle_Age', 'Driving_License'),
}
for combined_col, (left_col, right_col) in interaction_pairs.items():
    df[combined_col] = df[left_col].astype(str) + '_' + df[right_col].astype(str)

In [8]:
# Quantile-bin the premium into 8 ordinal buckets (fitted on the combined train + test frame).
premium_discretizer = KBinsDiscretizer(encode='ordinal', strategy='quantile', n_bins=8)
premium_column = df['Annual_Premium'].values.reshape(-1, 1)
df['Premium_bins'] = premium_discretizer.fit_transform(premium_column).astype(int)

# Same treatment for age, with 10 buckets.
age_discretizer = KBinsDiscretizer(encode='ordinal', strategy='quantile', n_bins=10)
age_column = df['Age'].values.reshape(-1, 1)
df['Age_bins'] = age_discretizer.fit_transform(age_column).astype(int)

In [9]:
# Frequency encoding: for each source column, add a column holding how many rows of the
# combined frame share that row's value. Dict order fixes the order of the new columns,
# which matters because the model is later fed positional `.values`.
count_features = {
    'Gender': 'Gender_Count',
    'Previously_Insured': 'Pre_Insured_Counts',
    'Vehicle_Age': 'vehicle_counts_age',
    'Vehicle_Damage': 'Vehicle_Damage_Count',
}
for source_col, count_col in count_features.items():
    value_frequencies = df[source_col].value_counts().to_dict()
    df[count_col] = df[source_col].map(value_frequencies)

In [10]:
def _group_stat(key, value, how):
    """Return `value` aggregated with `how` per `key` group, broadcast back to every row of `df`."""
    return df.groupby(key)[value].transform(how)


# Sales-channel statistics per region.
df['Policy_Per_Region'] = _group_stat('Region_Code', 'Policy_Sales_Channel', 'nunique')
df['Policy_Per_Region_Sum'] = _group_stat('Region_Code', 'Policy_Sales_Channel', 'sum')

# Rescale Vintage by 365 (presumably days -> years; TODO confirm the unit).
df['Vintage'] = df['Vintage'].div(365)

# Premium statistics per region / sales channel / bin.
df['Premium_Per_Region'] = _group_stat('Region_Code', 'Annual_Premium', 'sum')
df['Premium_Per_Policy'] = _group_stat('Policy_Sales_Channel', 'Annual_Premium', 'sum')
df['Policy_Per_Premium_Bin'] = _group_stat('Premium_bins', 'Policy_Sales_Channel', 'nunique')
df['Premium_Per_Age_Bin'] = _group_stat('Age_bins', 'Annual_Premium', 'mean')
df['Mean_Premium_Per_Region'] = _group_stat('Region_Code', 'Annual_Premium', 'mean')

In [11]:
# Hand-written integer codes for the three string-valued columns.
gender = {'Male': 0, 'Female': 1}
vehicle_age = {'> 2 Years': 2, '1-2 Year': 1, '< 1 Year': 0}
vehicle_damage = {'Yes': 1, 'No': 0}

df['Gender'] = df['Gender'].map(gender)
df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age)
df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage)

# To make CatBoost work, these float-coded columns are cast to integers.
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24, so use a vectorised
# `astype(int)` instead of applying `np.int` row by row.
df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype(int)
df['Region_Code'] = df['Region_Code'].astype(int)
df

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,...,Pre_Insured_Counts,vehicle_counts_age,Vehicle_Damage_Count,Policy_Per_Region,Policy_Per_Region_Sum,Premium_Per_Region,Premium_Per_Policy,Policy_Per_Premium_Bin,Premium_Per_Age_Bin,Mean_Premium_Per_Region
0,1,0,44,1,28,0,2,1,40454.0,26,...,275076,21326,256248,127.0,12380958.0,5.485848e+09,3.585804e+09,127.0,30906.815730,38649.878763
1,2,0,76,1,3,0,1,0,33536.0,26,...,275076,267015,251898,64.0,1489410.0,3.021238e+08,3.585804e+09,129.0,33703.683881,24465.445218
2,3,0,47,1,28,0,2,1,38294.0,26,...,275076,21326,256248,127.0,12380958.0,5.485848e+09,3.585804e+09,127.0,31462.573097,38649.878763
3,4,0,21,1,11,1,0,0,28619.0,152,...,233070,219805,251898,53.0,1585838.0,3.409331e+08,5.547025e+09,123.0,29522.814907,27655.183160
4,5,1,29,1,41,1,0,0,27496.0,152,...,233070,219805,251898,68.0,2915447.0,7.551551e+08,5.547025e+09,122.0,27333.200227,30948.979918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,1,26,1,37,1,0,0,30867.0,152,...,233070,219805,251898,52.0,974514.0,2.090564e+08,5.547025e+09,123.0,29789.954679,28470.157565
127033,508143,1,38,1,28,0,1,1,28700.0,122,...,275076,267015,256248,127.0,12380958.0,5.485848e+09,4.890773e+08,123.0,28809.333832,38649.878763
127034,508144,0,21,1,46,1,0,0,29802.0,152,...,233070,219805,251898,67.0,3046178.0,7.145954e+08,5.547025e+09,123.0,29522.814907,27112.165876
127035,508145,0,71,1,28,1,1,0,62875.0,26,...,233070,267015,251898,127.0,12380958.0,5.485848e+09,3.585804e+09,121.0,33703.683881,38649.878763


In [20]:
label = 'Response'


def categorical_encoding(data, cat_cols):
    """Label-encode the given categorical columns and the target column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    cat_cols : iterable of str
        Names of the columns to label-encode.

    Returns
    -------
    tuple
        ``(data, label_dict)`` where ``label_dict`` maps every encoded column name
        (each entry of ``cat_cols`` plus the module-level ``label``) to its fitted
        ``LabelEncoder``.

    Notes
    -----
    The function now operates on the ``data`` argument; previously the argument was
    ignored and the global ``df`` was encoded instead.
    """
    label_dict = {}
    # The target column is encoded last, after the feature columns.
    for col in list(cat_cols) + [label]:
        encoder = LabelEncoder()
        # Pass a 1-D Series: LabelEncoder expects a 1-D array of labels.
        # NOTE(review): the target is encoded over all rows, including any with a
        # missing label — confirm that is intended.
        data[col] = encoder.fit_transform(data[col])
        label_dict[col] = encoder
    return data, label_dict


df, label_dict = categorical_encoding(df, ['Policy_Region', 'Vehicle_License'])

In [12]:
# Columns treated as categorical features.
cat_cols = [
    'Gender',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Policy_Sales_Channel',
]

# Columns treated as continuous features.
num_cols = [
    'Age',
    'Vintage',
]

In [142]:
# A StandardScaler is instantiated here but not applied to any column in this cell.
sc = StandardScaler()

# Min-max scale the premium, fitted on the combined train + test frame.
mm = MinMaxScaler()
premium_scaled = mm.fit_transform(df[['Annual_Premium']])
df[['Annual_Premium']] = premium_scaled
df

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,source
0,1,0,44,1,28,0,2,1,0.070366,26,217,1.0,train
1,2,0,76,1,3,0,1,0,0.057496,26,183,0.0,train
2,3,0,47,1,28,0,2,1,0.066347,26,27,1.0,train
3,4,0,21,1,11,1,0,0,0.048348,152,203,0.0,train
4,5,1,29,1,41,1,0,0,0.046259,152,39,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,1,26,1,37,1,0,0,0.052531,152,56,,test
127033,508143,1,38,1,28,0,1,1,0.048499,122,165,,test
127034,508144,0,21,1,46,1,0,0,0.050549,152,74,,test
127035,508145,0,71,1,28,1,1,0,0.112076,26,265,,test


In [23]:
# Undo the earlier concat: recover the two parts via the `source` tag.
source_tag = df['source']
final_train = df[source_tag.eq('train')]
final_test = df[source_tag.eq('test')]

In [24]:
# Drop the bookkeeping columns from the training frame and make the target an integer.
# `np.int` was removed in NumPy 1.24, so cast with a vectorised `astype(int)`.
final_train = final_train.drop(columns=['id', 'source'])
final_train['Response'] = final_train['Response'].astype(int)

# Keep the test ids for the submission file, then drop every non-feature column.
test_id = final_test['id']
final_test = final_test.drop(columns=['id', 'source', 'Response'])

In [25]:
# Feature matrix (every column except the target) and target vector as NumPy arrays.
feature_frame = final_train.drop(columns=['Response'])
X = feature_frame.values
y = final_train['Response'].values

In [26]:
# Hold out 25% of the labelled rows for validation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=22,
    test_size=0.25,
)

In [27]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE on the training split only.
# `fit_sample` was removed from imbalanced-learn in 0.8; `fit_resample` is the supported name.
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Count with the array method rather than Python's built-in sum over every element.
print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

After OverSampling, the shape of train_X: (501534, 25)
After OverSampling, the shape of train_y: (501534,) 

After OverSampling, counts of label '1': 250767
After OverSampling, counts of label '0': 250767


In [28]:
# LightGBM classifier with hand-picked hyper-parameters, trained on the oversampled split.
# The former `gamma = 0.3` argument was dropped: `gamma` is an XGBoost parameter name that
# LightGBM does not recognise, so it had no effect on the model.
# NOTE(review): if a minimum split gain was intended, pass `min_split_gain = 0.3` instead
# (this would change the trained model).
bestLGB = LGBMClassifier(
    random_state = 22,
    max_depth = 7,
    n_estimators = 110,
    reg_lambda = 1.2,
    reg_alpha = 1.2,
    min_child_weight = 1,
    learning_rate = 0.15,
    colsample_bytree = 0.5,
)
bestLGB.fit(X_train_res, y_train_res)

# Class probabilities for the held-out validation split (one column per class).
y_pred = bestLGB.predict_proba(X_test)

In [37]:
# NOTE(review): `submission` is only created in a cell further down this notebook, so this
# line raises NameError on a fresh "Restart & Run All" — confirm the intended cell order.
submission['Response'].value_counts()

0.000730    49
0.000480    43
0.000468    37
0.000725    32
0.000975    31
            ..
0.000816     1
0.360996     1
0.582280     1
0.654356     1
0.185866     1
Name: Response, Length: 101960, dtype: int64

In [30]:
# Probability of the second class (column index 1) for every test row, taken as a column
# slice instead of looping over the prediction rows in Python.
Preds = bestLGB.predict_proba(final_test.values)[:, 1]

In [31]:
# Pair each test id with its predicted probability and write the submission file.
submission = pd.DataFrame({'id': test_id}).assign(Response=Preds)
submission.to_csv('final.csv', index=False)
submission.head()

Unnamed: 0,id,Response
0,381110,0.000912
1,381111,0.496496
2,381112,0.559974
3,381113,0.006912
4,381114,0.000866
