<h1 style="font-size:40px; color:purple; text-align:center"><u>Auto Insurance - Model Training and Evaluation</u></h1>

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 30)

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [5]:
# Load the prepared dataset from the working directory, report its
# (rows, columns) shape, and preview the first rows.
df = pd.read_csv('finalized_insurance_data.csv')
print(df.shape)
df.head()

(8762, 34)


Unnamed: 0,Claim Amount,Monthly Premium Auto,Number of Policies,State Code_IA,State Code_KS,State Code_MO,State Code_NE,State Code_OK,Coverage_Basic,Coverage_Extended,Coverage_Premium,Education_Bachelor,Education_College,Education_Doctor,Education_High School or Below,...,EmploymentStatus_Unemployed,Location Code_Rural,Location Code_Suburban,Location Code_Urban,Policy_Corporate,Policy_Personal,Policy_Special,Claim Reason_Collision,Claim Reason_Hail,Claim Reason_Other,Claim Reason_Scratch/Dent,Vehicle Class_Four-Door Car,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car
0,276.351928,69,1,0,1,0,0,0,1,0,0,1,0,0,0,...,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1
1,697.95359,94,8,0,0,0,1,0,0,1,0,1,0,0,0,...,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0
2,1288.743165,108,2,0,0,0,0,1,0,0,1,1,0,0,0,...,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1
3,764.586183,106,7,0,0,1,0,0,1,0,0,1,0,0,0,...,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0
4,281.369258,73,1,0,1,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0


## ML Imports

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

### Function to calculate rmse

In [7]:
def rmse(ytrue, ypred):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are converted to float NumPy arrays before subtracting, so
    values are paired by position. Without the conversion, two pandas Series
    would be aligned on their index labels first, which silently yields NaN
    (or pairs the wrong rows) when the labels differ, e.g. a shuffled test
    split compared against predictions carrying a fresh default index.

    Parameters
    ----------
    ytrue : array-like
        Observed target values (list, NumPy array or pandas Series).
    ypred : array-like
        Predicted values, in the same order as ``ytrue``.

    Returns
    -------
    float
        Square root of the mean squared difference.
    """
    actual = np.asarray(ytrue, dtype=float)
    predicted = np.asarray(ypred, dtype=float)
    return np.sqrt(np.mean(np.square(actual - predicted)))

## Train test split

In [8]:
# Regression target: the 'Claim Amount' column of the loaded frame.
target = df['Claim Amount']

In [9]:
# Everything except the target column is used as model input.
features = df.drop('Claim Amount', axis=1)
# Shape of the full frame (target column still included).
df.shape

(8762, 34)

In [10]:
# Preview the feature matrix; 'Claim Amount' should no longer appear.
features.head()

Unnamed: 0,Monthly Premium Auto,Number of Policies,State Code_IA,State Code_KS,State Code_MO,State Code_NE,State Code_OK,Coverage_Basic,Coverage_Extended,Coverage_Premium,Education_Bachelor,Education_College,Education_Doctor,Education_High School or Below,Education_Master,...,EmploymentStatus_Unemployed,Location Code_Rural,Location Code_Suburban,Location Code_Urban,Policy_Corporate,Policy_Personal,Policy_Special,Claim Reason_Collision,Claim Reason_Hail,Claim Reason_Other,Claim Reason_Scratch/Dent,Vehicle Class_Four-Door Car,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car
0,69,1,0,1,0,0,0,1,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1
1,94,8,0,0,0,1,0,0,1,0,1,0,0,0,0,...,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0
2,108,2,0,0,0,0,1,0,0,1,1,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1
3,106,7,0,0,1,0,0,1,0,0,1,0,0,0,0,...,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0
4,73,1,0,1,0,0,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0


In [11]:
# First five target values, selected by position.
target.iloc[:5]

0     276.351928
1     697.953590
2    1288.743165
3     764.586183
4     281.369258
Name: Claim Amount, dtype: float64

In [39]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1234,
)

# Tab-separated summary of how many rows landed in each partition.
print('\t\tTrain \tTest')
print('Num of X in:', len(xtrain), len(xtest), sep='\t')
print('Num of Y in:', len(ytrain), len(ytest), sep='\t')

		Train 	Test
Num of X in:	7009	1753
Num of Y in:	7009	1753


In [41]:
# One pipeline per candidate regressor: standardise the features, then fit the
# estimator. Step names are derived by make_pipeline from the class names,
# which is what the hyper-parameter grid keys refer to.
pipelines = {
    name: make_pipeline(StandardScaler(), estimator)
    for name, estimator in [
        ('Lasso', Lasso(random_state=123)),
        ('Ridge', Ridge(random_state=123)),
        ('GB', GradientBoostingRegressor(random_state=123)),
        ('RF', RandomForestRegressor(random_state=123)),
        ('XGB', XGBRegressor(random_state=123)),
    ]
}

In [42]:
# Hyper-parameter grids, keyed by the same names as `pipelines`.
# Each grid key has the form '<pipeline step name>__<parameter name>'.
hypers = {
    # L1-regularised linear model.
    'Lasso': dict(
        lasso__alpha=[0.05, 0.1, 1, 10, 11, 15],
    ),

    # L2-regularised linear model.
    'Ridge': dict(
        ridge__alpha=[0.05, 0.1, 1, 10, 11, 15],
    ),

    # Gradient boosting (scikit-learn).
    'GB': dict(
        gradientboostingregressor__n_estimators=[40, 70, 110],
        gradientboostingregressor__learning_rate=[0.01, 0.03, 0.05],
        gradientboostingregressor__max_depth=[2, 3, 4],
        gradientboostingregressor__min_samples_leaf=[5, 2, 3],
    ),

    # Random forest.
    'RF': dict(
        randomforestregressor__n_estimators=[150, 200, 220],
        randomforestregressor__min_samples_leaf=[2, 3],
    ),

    # XGBoost.
    'XGB': dict(
        xgbregressor__n_estimators=[50, 70, 100],
        xgbregressor__learning_rate=[0.02, 0.03, 0.04],
        xgbregressor__max_depth=[2, 3],
        xgbregressor__reg_lambda=[1.5, 2, 2.5],
    ),
}

In [43]:
# Run a 10-fold cross-validated grid search for every pipeline, using all
# available cores, and keep each fitted search object under its model name.
fitted_models = {}

for name in pipelines:
    pipeline = pipelines[name]
    model = GridSearchCV(estimator=pipeline, param_grid=hypers[name], cv=10, n_jobs=-1)

    # fit() returns the search object itself, so it can be stored directly.
    fitted_models[name] = model.fit(xtrain, ytrain)
    print(name, 'has been fitted')

Lasso has been fitted
Ridge has been fitted
GB has been fitted
RF has been fitted


  if getattr(data, 'base', None) is not None and \


XGB has been fitted


In [44]:
# Best mean cross-validation score found by each grid search.
for name in fitted_models:
    model = fitted_models[name]
    print('{}: {}'.format(name, model.best_score_))

Lasso: 0.16988939040351722
Ridge: 0.16906752748245515
GB: 0.6887066585810236
RF: 0.6846613494984543
XGB: 0.6812377540629311


In [45]:
# Score every tuned model on the held-out test split.
for name, model in fitted_models.items():
    preds = model.predict(xtest)

    print('%s--------------------' % name)
    print('r2_score: \t%.2f' % r2_score(ytest, preds))
    print('rmse: \t\t%.2f' % rmse(ytest, preds))

Lasso--------------------
r2_score: 	0.14
rmse: 		644.81
Ridge--------------------
r2_score: 	0.14
rmse: 		644.14
GB--------------------
r2_score: 	0.66
rmse: 		407.33
RF--------------------
r2_score: 	0.63
rmse: 		420.35
XGB--------------------
r2_score: 	0.66
rmse: 		404.38


## Best model's params
**XGBoost performed best on the test set: it ties Gradient Boosting on R2-score (0.66) and has the lowest RMSE (404.38).**

In [46]:
# Fitted GridSearchCV object for the XGBoost pipeline (the whole search object,
# not only its best estimator).
xgb = fitted_models['XGB']

In [48]:
# Hyper-parameter combination selected by the grid search.
xgb.best_params_

{'xgbregressor__learning_rate': 0.04,
 'xgbregressor__max_depth': 3,
 'xgbregressor__n_estimators': 100,
 'xgbregressor__reg_lambda': 1.5}

In [51]:
import pickle

In [52]:
# Persist the fitted XGBoost grid-search object so it can be reloaded later.
with open('xgb_claim_predictor.pkl', 'wb') as f:
    f.write(pickle.dumps(xgb))

## Making a class to do all the work that we need to predict on unseen data

In [73]:
class Claim_Predictor:
    """Load a pickled model and use it to predict claim amounts.

    The model is read once at construction time. `predict_claim` optionally
    runs the raw input through `clean_data` before calling the model's
    `predict` method.
    """

    # Raw-data columns that are discarded before encoding.
    _DROPPED_COLUMNS = [
        'Country', 'State', 'Sales Channel', 'Effective To Date',
        'Months Since Last Claim', 'Months Since Policy Inception',
        'Number of Open Complaints', 'Income', 'Customer', 'Vehicle Size',
        'Policy Type', 'Marital Status', 'Gender', 'Response',
    ]

    # Per column: fine-grained labels that are folded into a coarser label.
    _CATEGORY_MERGES = {
        'Vehicle Class': {'Luxury Car': 'Sports Car', 'Luxury SUV': 'SUV'},
        'Policy': {
            '{} L{}'.format(kind, level): kind
            for kind in ('Special', 'Personal', 'Corporate')
            for level in (1, 2, 3)
        },
        'EmploymentStatus': {'Disabled': 'Medical Leave'},
    }

    def __init__(self, model_location):
        """Load the pickled model stored at `model_location`.

        SECURITY: unpickling can execute arbitrary code. Only pass paths to
        model files that come from a trusted source.
        """
        with open(model_location, 'rb') as f:
            self.model = pickle.load(f)

    def predict_claim(self, X_new, clean=True):
        """Predict claim amounts for `X_new`.

        Parameters
        ----------
        X_new : pandas.DataFrame
            Raw records when `clean` is True, otherwise a frame that is
            already in the form the model expects.
        clean : bool, default True
            Whether to run `clean_data` (and column alignment, when the model
            exposes its training columns) before predicting.

        Returns
        -------
        tuple
            `(X_used, predictions)`: the frame that was actually passed to
            the model and the model's predictions for it. Note that cleaning
            drops duplicate rows, so `X_used` may have fewer rows than the
            input.
        """
        if clean:
            X_new = self._align_to_model(self.clean_data(X_new))

        return X_new, self.model.predict(X_new)

    def _align_to_model(self, X):
        """Match `X`'s columns to the ones the model was fitted on, if known.

        One-hot encoding a new batch only creates columns for the categories
        present in that batch. When the loaded model exposes
        `feature_names_in_`, missing columns are added as all-zero columns,
        columns the model never saw are dropped, and the training column
        order is restored. Models without that attribute get `X` unchanged.
        """
        expected = getattr(self.model, 'feature_names_in_', None)
        if expected is None:
            return X
        return X.reindex(columns=list(expected), fill_value=0)

    def clean_data(self, data):
        '''
        Clean a raw data frame and one-hot encode its categorical columns.

        Steps: drop the unused raw columns, merge fine-grained category
        labels into coarser ones, drop duplicate rows, then one-hot encode
        the remaining text/categorical columns.

        The caller's frame is not modified.

        Return: Cleaned, encoded representation of the raw data.
        Raises: KeyError if an expected raw column is missing.
        '''
        # drop() returns a new frame, so the assignments below never touch
        # the frame the caller passed in.
        data = data.drop(columns=self._DROPPED_COLUMNS)

        # Assign the result back instead of using a chained
        # `data[col].replace(..., inplace=True)`, which is not guaranteed to
        # update the frame (it acts on a temporary under copy-on-write).
        for column, merges in self._CATEGORY_MERGES.items():
            data[column] = data[column].replace(merges)

        # Operate on the cleaned input itself, not on any global frame.
        data = data.drop_duplicates()

        # With no explicit column list, get_dummies encodes every
        # object/string/category column and leaves numeric columns as they are.
        return pd.get_dummies(data)

## Conclusion of the Notebook:
> We trained **different types of models** on the **Auto Insurance dataset** and found that **XGBoost Regression** achieved the best performance.
## Thank You!
**for going through the project.**