In [84]:
import numpy as np
import pandas as pd 
import scipy
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier as RFC, RandomForestRegressor as RFR
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from collections import defaultdict
from sklearn.metrics import classification_report
from pprint import pprint
import pickle
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [43]:
data = pd.read_csv('../Data/Compressed.csv')
data.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [50]:
X = data.drop(['target', 'insuranceclaim', 'charges'], axis = 1)
y_claim = data['insuranceclaim']
y_charge = data['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y_claim, test_size=0.33, random_state=4)

<h2> Predicting whether customer claims insurance </h2>

In [45]:
clf = RFC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9479638009049773

In [46]:
## Removing low correlation features
clf = RFC()
clf.fit(X_train.drop(['sex', 'region'], axis = 1), y_train)
clf.score(X_test.drop(['sex', 'region'], axis = 1), y_test)

0.9705882352941176

<h2> Predicting value of claim </h2>

In [47]:
X = data.drop(['target', 'insuranceclaim', 'charges'], axis = 1)
y = data['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [48]:
clf = RFR()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8221538717292104

In [49]:
## Removing low correlation features
clf = RFR()
clf.fit(X_train.drop(['sex', 'region'], axis = 1), y_train)
clf.score(X_test.drop(['sex', 'region'], axis = 1), y_test)

0.8323627363044895

In [56]:
## Grid search

In [66]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [67]:
%%time
rf = RFR()
# Random search of parameters, using 3 fold cross validation, 
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.5min finished


CPU times: user 2.2 s, sys: 68.9 ms, total: 2.26 s
Wall time: 4min 32s


In [70]:
pprint(rf_random.best_params_)

{'bootstrap': True,
 'max_depth': 70,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 400}


In [71]:
np.save('../Weights/rfrparam.npy', rf_random.best_params_)

In [73]:
## Removing low correlation features
clf = RFR(bootstrap = True,
 max_depth = 70,
 max_features = 'auto',
 min_samples_leaf = 4,
 min_samples_split = 10,
 n_estimators = 400)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9071074777671437

In [76]:
print("Mean squared error =", mean_squared_error(clf.predict(X_test), y_test))    

Mean squared error = 0.022606902856008054


In [77]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [80]:
%%time
y = data['insuranceclaim']
rf = RFC()

# Random search of parameters, using 3 fold cross validation, 
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.8min finished


CPU times: user 2.5 s, sys: 66.6 ms, total: 2.56 s
Wall time: 3min 49s


In [81]:
pprint(rf_random.best_params_)

{'bootstrap': False,
 'max_depth': 90,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 800}


In [83]:
## Removing low correlation features
X_train, X_test, y_train, y_test = train_test_split(X, y_claim, test_size=0.33, random_state=4)
clf = RFC(bootstrap = False,
 max_depth = 90,
 max_features = 'sqrt',
 min_samples_leaf = 1,
 min_samples_split = 5,
 n_estimators = 800)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9773755656108597

In [86]:
print(classification_report(clf.predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       181
           1       0.99      0.97      0.98       261

   micro avg       0.98      0.98      0.98       442
   macro avg       0.98      0.98      0.98       442
weighted avg       0.98      0.98      0.98       442



In [87]:
print("We are able to predict insurance claims with 98 % accuracy")

We are able to predict insurance claims with 98 % accuracy
