## Imports

In [4]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import RFECV
from lightgbm import LGBMRegressor

## Change the working directory to the data folder (`01_Data`)

In [21]:
# Make the '01_Data' folder that sits beside the current working directory the
# new working directory, so later cells can open data files by bare file name.
# `head` is the parent of the current directory; `tail` is the current folder's name.
head, tail = os.path.split(os.getcwd())
data_dir = os.path.join(head, '01_Data')
os.chdir(data_dir)

In [23]:
# List the entries of the current working directory to confirm the data files are visible.
os.listdir(os.curdir)

['insurance_train.csv',
 'cleaned_test.csv',
 'cleaned_data.csv',
 'insurance_test.csv',
 'cleaned_test.pkl',
 'cleaned_data.pkl']

## Reading data

In [24]:
# Load the cleaned data set from the pickle in the current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by a trusted step of this project.
cleaned_path = 'cleaned_data.pkl'
data = pd.read_pickle(cleaned_path)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,ID,Start_Date_Contract,Date_Last_Renewal,Date_Next_Renewal,Date_Of_Birth,Date_Of_DL_Issuance,Issurance_Broker_Agent_Channel,Years_Associates,Total_Policies_Entity,Max_Policy_Simultaneous_Force,...,Non_Continuation_Insurance_Flag,New_License,Car_Age_Cat,Ratio_Premium_Car_Value,Power_Wt_Ratio,Customer_Loyalty,New_Bhp_Risk,Years_Driving_At_Start_Date,Young_Driver,Young_Bhp_Risk
0,34429,2017-06-01,2017-06-01,2018-06-01,1996-08-29,2016-05-31,0,1,1,1,...,1,0,Old,0.044787,0.072115,1.0,0,1,1,75
1,5552,2016-09-19,2018-09-19,2019-09-19,1992-04-30,2010-08-03,0,3,1,1,...,0,0,Standard,0.019446,0.085837,1.7,0,6,0,0
2,47700,2003-01-08,2018-01-08,2019-01-08,1972-03-23,1998-02-01,1,20,2,2,...,0,0,Old,0.015644,0.076923,8.15,0,5,0,0
3,25425,2015-10-01,2018-10-01,2019-10-01,1946-03-05,1964-08-07,0,8,2,3,...,0,0,Standard,0.010544,0.070565,4.15,0,51,0,0
4,4727,2017-01-26,2018-01-26,2019-01-26,1973-04-25,1998-07-24,0,9,1,1,...,0,0,Old,0.02025,0.047004,3.8,0,19,0,0


In [26]:
# Display the column labels of the loaded frame (predictors and target columns together).
data.columns

Index(['ID', 'Start_Date_Contract', 'Date_Last_Renewal', 'Date_Next_Renewal',
       'Date_Of_Birth', 'Date_Of_DL_Issuance',
       'Issurance_Broker_Agent_Channel', 'Years_Associates',
       'Total_Policies_Entity', 'Max_Policy_Simultaneous_Force',
       'Max_Product_Simultaneous_Held', 'Policies_Terminated_Non_Payment',
       'Half_Yearly_Payment_Method', 'Premium_Amt_Current_Yr',
       'Total_Cost_Claims_Current_Yr', 'Total_Number_Claims_Current_Yr',
       'Total_Number_Claims_Entire_Duration',
       'Ratio_Claims_Total_Duration_Force',
       'Motorbikes_Vans_Cars_Agricultural', 'Rural_Urban_Flag',
       'Multiple_Drivers_Regular_Flag', 'Yr_Vehicle_Registration',
       'Vehicle_Power_HP', 'Cylinder_Capacity', 'Market_Value_EOY19',
       'Vehicle_Doors', 'Energy_Source', 'Vehicle_Wt_Kg', 'Loss_Cost',
       'Historically_Adjusted_Loss_Cost', 'Claim_Status', 'Age',
       'Years_Driving', 'Car_Age', 'Time_Since_Last_Renewal',
       'Non_Payment_Termination', 'Non_Continuati

In [27]:
# Work on an independent deep copy so edits to X never touch `data`.
X = data.copy(deep=True)

In [28]:
# Columns that must not be used as predictors: the row identifier plus the
# claim and loss columns ('Loss_Cost', 'Historically_Adjusted_Loss_Cost' and
# 'Claim_Status' are the ones extracted as targets into Y_reg / Y_class).
non_feature_cols = [
    'ID',
    'Total_Cost_Claims_Current_Yr',
    'Total_Number_Claims_Current_Yr',
    'Total_Number_Claims_Entire_Duration',
    'Ratio_Claims_Total_Duration_Force',
    'Loss_Cost',
    'Historically_Adjusted_Loss_Cost',
    'Claim_Status',
]
# Build X from `data` instead of from X itself: DataFrame.drop returns a new
# frame, so this cell can be re-run without raising KeyError on columns that a
# previous run already removed.
X = data.drop(columns=non_feature_cols)
X.columns

Index(['Start_Date_Contract', 'Date_Last_Renewal', 'Date_Next_Renewal',
       'Date_Of_Birth', 'Date_Of_DL_Issuance',
       'Issurance_Broker_Agent_Channel', 'Years_Associates',
       'Total_Policies_Entity', 'Max_Policy_Simultaneous_Force',
       'Max_Product_Simultaneous_Held', 'Policies_Terminated_Non_Payment',
       'Half_Yearly_Payment_Method', 'Premium_Amt_Current_Yr',
       'Motorbikes_Vans_Cars_Agricultural', 'Rural_Urban_Flag',
       'Multiple_Drivers_Regular_Flag', 'Yr_Vehicle_Registration',
       'Vehicle_Power_HP', 'Cylinder_Capacity', 'Market_Value_EOY19',
       'Vehicle_Doors', 'Energy_Source', 'Vehicle_Wt_Kg', 'Age',
       'Years_Driving', 'Car_Age', 'Time_Since_Last_Renewal',
       'Non_Payment_Termination', 'Non_Continuation_Insurance_Flag',
       'New_License', 'Car_Age_Cat', 'Ratio_Premium_Car_Value',
       'Power_Wt_Ratio', 'Customer_Loyalty', 'New_Bhp_Risk',
       'Years_Driving_At_Start_Date', 'Young_Driver', 'Young_Bhp_Risk'],
      dtype='object')

In [30]:
# Regression targets: the two loss-cost columns.
reg_target_cols = ['Loss_Cost', 'Historically_Adjusted_Loss_Cost']
# Classification target: the claim indicator, kept as a one-column DataFrame.
class_target_cols = ['Claim_Status']

Y_reg = data[reg_target_cols]
Y_class = data[class_target_cols]

## Fixing the X data

In [31]:
# Count missing values per feature column.
X.isnull().sum()

Start_Date_Contract                    0
Date_Last_Renewal                      0
Date_Next_Renewal                      0
Date_Of_Birth                          0
Date_Of_DL_Issuance                    0
Issurance_Broker_Agent_Channel         0
Years_Associates                       0
Total_Policies_Entity                  0
Max_Policy_Simultaneous_Force          0
Max_Product_Simultaneous_Held          0
Policies_Terminated_Non_Payment        0
Half_Yearly_Payment_Method             0
Premium_Amt_Current_Yr                 0
Motorbikes_Vans_Cars_Agricultural      0
Rural_Urban_Flag                       0
Multiple_Drivers_Regular_Flag          0
Yr_Vehicle_Registration                0
Vehicle_Power_HP                       0
Cylinder_Capacity                      0
Market_Value_EOY19                     0
Vehicle_Doors                          0
Energy_Source                        593
Vehicle_Wt_Kg                          0
Age                                    0
Years_Driving   

In [35]:
# Per-column missing-value counts for the full frame, keeping only the columns
# that actually contain missing values (counts are computed once).
data.isna().sum().loc[lambda counts: counts > 0]

Energy_Source                        593
Loss_Cost                          33300
Historically_Adjusted_Loss_Cost    33300
dtype: int64

In [39]:
# How are claims distributed among the rows whose 'Energy_Source' is missing?
energy_source_missing = data['Energy_Source'].isna()
data.loc[energy_source_missing, 'Claim_Status'].value_counts()

Claim_Status
0    564
1     29
Name: count, dtype: int64

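Rows with a missing `Energy_Source` (593 in total, 29 of them with a claim) are kept: we fill the missing values in `X['Energy_Source']` with a separate `'Other'` category.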

In [41]:
# Replace missing fuel-type entries with an explicit 'Other' label (in place on X).
X.loc[X['Energy_Source'].isna(), 'Energy_Source'] = 'Other'
# Verify: no NaN bucket should remain in the counts.
X['Energy_Source'].value_counts(dropna=False)

Energy_Source
D        23074
P        13784
Other      593
Name: count, dtype: int64

Create categorical variables

## Fixing Y

In [42]:
# Both loss-cost targets have the same number of missing entries; treat a
# missing loss cost as zero.
# NOTE(review): presumably these are policies without a claim - confirm against Claim_Status.
Y_reg = Y_reg.fillna(value=0)