In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model

## Importing Data

In [16]:
import os

# Switch the working directory to a 'Data' folder so the CSV below can be read
# by its bare file name. The target is built by dropping the last 20 characters
# of the current path and appending '/Data'.
# NOTE(review): the 20-character count is tied to the current folder name(s);
# moving or renaming the notebook silently breaks it — consider an explicit path.
# Guard: once we are already inside a folder named 'Data', a second run would
# strip another 20 characters and point at the wrong place, so skip it.
if os.path.basename(os.getcwd()) != 'Data':
    os.chdir(os.getcwd()[:-20] + '/Data')

In [19]:
# Load the training set; the relative file name is resolved against the
# current working directory.
insurance_data = pd.read_csv('insurance_train.csv')

In [20]:
# Preview the first rows; at this point the headers are the generic X.1 ... X.28.
insurance_data.head()

Unnamed: 0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8,X.9,X.10,...,X.19,X.20,X.21,X.22,X.23,X.24,X.25,X.26,X.27,X.28
0,34429,01/06/2017,01/06/2017,01/06/2018,29/08/1996,31/05/2016,0,1,1,1,...,3,0,0,1994,75,1360,12795.55,4,P,1040
1,5552,19/09/2016,19/09/2018,19/09/2019,30/04/1992,03/08/2010,0,3,1,1,...,3,0,0,2004,100,1896,15386.0,5,D,1165
2,47700,08/01/2003,08/01/2018,08/01/2019,23/03/1972,01/02/1998,1,20,2,2,...,3,0,0,2002,95,1390,16470.0,5,P,1235
3,25425,01/10/2015,01/10/2018,01/10/2019,05/03/1946,07/08/1964,0,8,2,3,...,3,0,0,2010,140,2497,27381.0,5,D,1984
4,4727,26/01/2017,26/01/2018,26/01/2019,25/04/1973,24/07/1998,0,9,1,1,...,3,0,0,1998,80,1969,36600.0,3,P,1702


## Data Manipulation

In [24]:
# Descriptive column names, based on the data set description. They are applied
# positionally, so the order must match the raw columns X.1 ... X.28.
# NOTE(review): 'Issurance_...' and 'Ration_...' look like typos of
# 'Insurance' / 'Ratio'; kept as-is because other cells may use these exact names.
final_column_name = [
    'ID',
    'Start_Date_Contract',
    'Date_Last_Renewal',
    'Date_Next_Renewal',
    'Date_Of_Birth',
    'Date_Of_DL_Issuance',
    'Issurance_Broker_Agent_Channel',
    'Years_Associates',
    'Total_Policies_Entity',
    'Max_Policy_Simultaneous_Force',
    'Max_Product_Simultaneous_Held',
    'Policies_Terminated_Non_Payment',
    'Half_Yearly_Payment_Method',
    'Premium_Amt_Current_Yr',
    'Total_Cost_Claims_Current_Yr',
    'Total_Number_Claims_Current_Yr',
    'Total_Number_Claims_Entire_Duration',
    'Ration_Claims_Total_Duration_Force',
    'Motorbikes_Vans_Cars_Agricultural',
    'Rural_Urban_Flag',
    'Multiple_Drivers_Regular_Flag',
    'Yr_Vehicle_Registration',
    'Vehicle_Power_HP',
    'Cylinder_Capacity',
    'Market_Value_EOY19',
    'Vehicle_Doors',
    'Energy_Source',
    'Vehicle_Wt_Kg',
]

# Rename only once. If this cell is re-run after the rename has already been
# applied, the frame's columns equal final_column_name; saving them again would
# overwrite old_column_names with the new names and lose the originals.
if list(insurance_data.columns) != final_column_name:
    # saving old column names
    old_column_names = list(insurance_data.columns)

    # updating columns (a length mismatch raises ValueError from pandas)
    insurance_data.columns = final_column_name

insurance_data.head()

Unnamed: 0,ID,Start_Date_Contract,Date_Last_Renewal,Date_Next_Renewal,Date_Of_Birth,Date_Of_DL_Issuance,Issurance_Broker_Agent_Channel,Years_Associates,Total_Policies_Entity,Max_Policy_Simultaneous_Force,...,Motorbikes_Vans_Cars_Agricultural,Rural_Urban_Flag,Multiple_Drivers_Regular_Flag,Yr_Vehicle_Registration,Vehicle_Power_HP,Cylinder_Capacity,Market_Value_EOY19,Vehicle_Doors,Energy_Source,Vehicle_Wt_Kg
0,34429,01/06/2017,01/06/2017,01/06/2018,29/08/1996,31/05/2016,0,1,1,1,...,3,0,0,1994,75,1360,12795.55,4,P,1040
1,5552,19/09/2016,19/09/2018,19/09/2019,30/04/1992,03/08/2010,0,3,1,1,...,3,0,0,2004,100,1896,15386.0,5,D,1165
2,47700,08/01/2003,08/01/2018,08/01/2019,23/03/1972,01/02/1998,1,20,2,2,...,3,0,0,2002,95,1390,16470.0,5,P,1235
3,25425,01/10/2015,01/10/2018,01/10/2019,05/03/1946,07/08/1964,0,8,2,3,...,3,0,0,2010,140,2497,27381.0,5,D,1984
4,4727,26/01/2017,26/01/2018,26/01/2019,25/04/1973,24/07/1998,0,9,1,1,...,3,0,0,1998,80,1969,36600.0,3,P,1702


In [25]:
# Inspect dtypes and non-null counts after the rename; the date columns are
# still plain strings (object dtype) at this stage.
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37451 entries, 0 to 37450
Data columns (total 28 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ID                                   37451 non-null  int64  
 1   Start_Date_Contract                  37451 non-null  object 
 2   Date_Last_Renewal                    37451 non-null  object 
 3   Date_Next_Renewal                    37451 non-null  object 
 4   Date_Of_Birth                        37451 non-null  object 
 5   Date_Of_DL_Issuance                  37451 non-null  object 
 6   Issurance_Broker_Agent_Channel       37451 non-null  int64  
 7   Years_Associates                     37451 non-null  int64  
 8   Total_Policies_Entity                37451 non-null  int64  
 9   Max_Policy_Simultaneous_Force        37451 non-null  int64  
 10  Max_Product_Simultaneous_Held        37451 non-null  int64  
 11  Policies_Terminated_Non_Paym

In [29]:
# Columns that hold full calendar dates as dd/mm/yyyy text and need to be
# parsed into datetimes.
date_columns = [
    'Start_Date_Contract',
    'Date_Last_Renewal', 
    'Date_Next_Renewal', 
    'Date_Of_Birth', 
    'Date_Of_DL_Issuance',
    # 'Yr_Vehicle_Registration' is intentionally left out: in the preview above
    # it holds a bare year (e.g. 1994), not a dd/mm/yyyy date.
]

# Show the list so the reader can see exactly which columns get converted.
display(date_columns)

['Start_Date_Contract',
 'Date_Last_Renewal',
 'Date_Next_Renewal',
 'Date_Of_Birth',
 'Date_Of_DL_Issuance']

In [34]:
# Parse every dd/mm/yyyy text column named in date_columns into datetime values.
# DataFrame.apply calls pd.to_datetime once per column and forwards `format`.
insurance_data[date_columns] = insurance_data[date_columns].apply(
    pd.to_datetime, format="%d/%m/%Y"
)

In [36]:
# Re-check the schema: the five date columns should now be datetime64[ns].
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37451 entries, 0 to 37450
Data columns (total 28 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   ID                                   37451 non-null  int64         
 1   Start_Date_Contract                  37451 non-null  datetime64[ns]
 2   Date_Last_Renewal                    37451 non-null  datetime64[ns]
 3   Date_Next_Renewal                    37451 non-null  datetime64[ns]
 4   Date_Of_Birth                        37451 non-null  datetime64[ns]
 5   Date_Of_DL_Issuance                  37451 non-null  datetime64[ns]
 6   Issurance_Broker_Agent_Channel       37451 non-null  int64         
 7   Years_Associates                     37451 non-null  int64         
 8   Total_Policies_Entity                37451 non-null  int64         
 9   Max_Policy_Simultaneous_Force        37451 non-null  int64         
 10  Max_Produc