In [82]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor,BaggingRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score,train_test_split

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
df = pd.read_csv("https://raw.githubusercontent.com/yashdahekar/Insurance_Premium_Prediction/main/notebooks/data/insurance.csv")
df.sample(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
1184,23,female,28.5,1,yes,southeast,18328.24
596,42,female,29.5,2,no,southeast,7640.31
1103,58,male,36.1,0,no,southeast,11363.28
52,48,male,28.0,1,yes,southwest,23568.27
1008,25,male,25.0,2,no,northeast,23241.47
1129,19,female,18.6,0,no,southwest,1728.9
78,22,female,39.8,0,no,northeast,2755.02
861,38,female,28.0,3,no,southwest,7151.09
500,29,male,34.4,0,yes,southwest,36197.7
1248,18,female,39.8,0,no,southeast,1633.96


In [14]:
df['sex']=df['sex'].map({'male':0,'female':1})
df['smoker']=df['smoker'].map({'yes':1,'no':0})
df['region']=df['region'].map({'northwest':0, 'northeast':1,'southeast':2,'southwest':3})

# Drop duplicate rows
df.drop_duplicates(inplace=True)

In [15]:
X = df.drop('expenses', axis=1)
y = df['expenses']

In [44]:
train_score = []
test_score = []
cross_val = []
for state in range(40,50):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=state)
    
    model = LinearRegression()
    model.fit(X_train,y_train)
    train_score.append(model.score(X_train,y_train))
    test_score.append(model.score(X_test,y_test))
    cvs=(cross_val_score(model,X,y,cv=5,)).mean()
    cross_val.append(cvs)
df1=pd.DataFrame({'train acc':train_score,'test acc':test_score,'cvs':cross_val})

In [48]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=42)

print('LinearRegression')
lrmodel=LinearRegression()
lrmodel.fit(xtrain,ytrain)
print(lrmodel.score(xtrain,ytrain))
print(lrmodel.score(xtest,ytest))
print(cross_val_score(lrmodel,X,y,cv=5,).mean())

LinearRegression
0.7294764585037568
0.8060851445991865
0.7469845487559986


In [56]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train,y_train)
X_test_scaled = scaler.transform(X_test)

In [61]:


print('LinearRegression')
lrmodel=LinearRegression()
lrmodel.fit(X_train_scaled,ytrain)
# print(lrmodel.score(X_train_scaled,ytrain))
# print(lrmodel.score(X_test_scaled,ytest))
# print(cross_val_score(lrmodel,X,y,cv=5,).mean())
y_pred = lrmodel.predict(X_test_scaled)
print(r2_score(y_test,y_pred))

LinearRegression
0.8060851445991863


In [50]:
from sklearn.metrics import r2_score
print('SVR')

svrmodel=SVR()
svrmodel.fit(xtrain,ytrain)
ypredtrain1=svrmodel.predict(xtrain)
ypredtest1=svrmodel.predict(xtest)
print(r2_score(ytrain,ypredtrain1))
print(r2_score(ytest,ypredtest1))
print(cross_val_score(svrmodel,X,y,cv=5,).mean())

SVR
-0.10151261342106288
-0.13444640564948562
-0.10374370402776947


In [52]:
print("RandomForestRegressor:")
rfmodel=RandomForestRegressor(random_state=42)
rfmodel.fit(xtrain,ytrain)
ypredtrain2=rfmodel.predict(xtrain)
ypredtest2=rfmodel.predict(xtest)
print(r2_score(ytrain,ypredtrain2))
print(r2_score(ytest,ypredtest2))
print(cross_val_score(rfmodel,X,y,cv=5,).mean())


print("Hyperparametertuning")
from sklearn.model_selection import GridSearchCV
estimator=RandomForestRegressor(random_state=42)
param_grid={'n_estimators':[10,40,50,98,100,120,150]}
grid=GridSearchCV(estimator,param_grid,scoring="r2",cv=5)
grid.fit(xtrain,ytrain)
print(grid.best_params_)
rfmodel=RandomForestRegressor(random_state=42,n_estimators=120)
rfmodel.fit(xtrain,ytrain)
ypredtrain2=rfmodel.predict(xtrain)
ypredtest2=rfmodel.predict(xtest)
print(r2_score(ytrain,ypredtrain2))
print(r2_score(ytest,ypredtest2))
print(cross_val_score(rfmodel,X,y,cv=5,).mean())

RandomForestRegressor:
0.9738795992068785
0.8816702601099398
0.8365119070918684
Hyperparametertuning
{'n_estimators': 120}
0.9747098036172237
0.8819718448025101
0.8371418060948695


In [62]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [120]:
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    "SVR":SVR(),
    "adaboost":AdaBoostRegressor(),
    "Bagging":BaggingRegressor(),
    "gradBoosting":GradientBoostingRegressor(),
    "randomforest":RandomForestRegressor(),
    "Decision":DecisionTreeRegressor(),
    'xgb':XGBRegressor()
}

In [84]:
trained_model_list=[]
model_list=[]
r2_list=[]

In [85]:
list(models)

['LinearRegression',
 'Lasso',
 'Ridge',
 'Elasticnet',
 'SVR',
 'adaboost',
 'Bagging',
 'gradBoosting',
 'randomforest',
 'Decision',
 'xgb']

In [86]:
for i in range(len(models)):
    model = list(models.values())[i]
    print(model)

LinearRegression()
Lasso()
Ridge()
ElasticNet()
SVR()
AdaBoostRegressor()
BaggingRegressor()
GradientBoostingRegressor()
RandomForestRegressor()
DecisionTreeRegressor()
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [87]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    #this is a validation(test) score
    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 5969.340956281675
MAE: 4185.431622715441
R2 score 80.60851445991865


Lasso
Model Training Performance
RMSE: 5970.130010113579
MAE: 4186.125914511538
R2 score 80.60338761681963


Ridge
Model Training Performance
RMSE: 5983.998368642142
MAE: 4201.505579948953
R2 score 80.5131679361279


Elasticnet
Model Training Performance
RMSE: 10557.852271804284
MAE: 7860.478245606802
R2 score 39.33901312609023


SVR
Model Training Performance
RMSE: 14438.192803357986
MAE: 9249.88940527465
R2 score -13.444640564948562


adaboost
Model Training Performance
RMSE: 5060.728394564313
MAE: 4169.275575034788
R2 score 86.06251641509887


Bagging
Model Training Performance
RMSE: 5049.285995778239
MAE: 2780.1987425373136
R2 score 86.12547097073944


gradBoosting
Model Training Performance
RMSE: 4258.087690347157
MAE: 2526.4662682640155
R2 score 90.13294585228158


randomforest
Model Training Performance
RMSE: 4729.227309126136
MAE: 2662.647428731342
R2 score 87

In [101]:
gbc = GradientBoostingRegressor()
parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}

from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(gbc,parameters,cv=5)
cv.fit(X_train,y_train)
cv.best_params_

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}

In [97]:
gbc = GradientBoostingRegressor(learning_rate= 0.01, max_depth= 3, n_estimators= 500)
gbc.fit(X_train,y_train)
y_pred = gbc.predict(X_test)
print(r2_score(y_test,y_pred)*100)

90.11495169411076


In [104]:
hyperparameter_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth': [2, 3, 5, 10, 15],
    'learning_rate': [0.05, 0.1, 0.15, 0.20],
    'min_child_weight': [1, 2, 3, 4]
    }
xgb = XGBRegressor()
cv_xgb = GridSearchCV(xgb,hyperparameter_grid,cv=5)
cv_xgb.fit(X_train,y_train)


{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}

In [105]:
cv_xgb.best_params_

{'learning_rate': 0.05,
 'max_depth': 3,
 'min_child_weight': 2,
 'n_estimators': 100}

In [106]:
xgb_re = XGBRegressor(learning_rate= 0.05,max_depth= 3,min_child_weight= 2,n_estimators= 100)
xgb_re.fit(X_train,y_train)
y_pred =xgb_re.predict(X_test)
print(r2_score(y_test,y_pred)*100)

90.26077103145799


In [121]:
# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

In [122]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [124]:
rf3 = RandomForestRegressor()
rf_grid = GridSearchCV(estimator = rf3, 
                       param_grid = param_grid, 
                       cv = 5, 
                       verbose=2, 
                       n_jobs = -1)

rf_grid.fit(X_train,y_train)
rf_grid.best_params_

Fitting 5 folds for each of 108 candidates, totalling 540 fits


{'max_depth': 8, 'max_features': 0.6, 'max_samples': 0.75, 'n_estimators': 120}

In [130]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,test_size=0.2)
rf = RandomForestRegressor(max_depth= 8, max_features= 0.6, max_samples= 0.75, n_estimators= 120)
rf.fit(X_train,y_train)
y_pred =rf.predict(X_test)
print(r2_score(y_test,y_pred)*100)

89.182218057539
