# Problem Statement or Requirement:
A client’s requirement is, he wants to predict the insurance charges based on the several parameters. The Client has provided the dataset of the same.
As a data scientist, you must develop a model which will predict the insurance charges.


In [2]:
#import libraries
import pandas as pd
import numpy as np

In [3]:
data=pd.read_csv("insurance_pre.csv")

In [4]:
data

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [5]:
2.#basic info about the dataset (Total number of rows, columns)
data.shape

(1338, 6)

In [6]:
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges'], dtype='object')

In [7]:
data = pd.get_dummies(data,drop_first=True)
#dummies=One-Hot Encoding and  drop_first to remove repetetion 

In [8]:
data=data.astype(int)
#2024 update to show 0 and 1

In [9]:
data

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27,0,16884,0,1
1,18,33,1,1725,1,0
2,28,33,3,4449,1,0
3,33,22,0,21984,1,0
4,32,28,0,3866,1,0
...,...,...,...,...,...,...
1333,50,30,3,10600,1,0
1334,18,31,0,2205,0,0
1335,18,36,0,1629,0,0
1336,21,25,0,2007,0,0


In [10]:
data.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [11]:
# creating feature variables
#multiple inputs
indep=data[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
#one output
dep=data[['charges']]

In [None]:
## removed X_train,X_test,y_train,y_test   0.70 and 0.30 it will take care by GSV.

# GridSearchCV import libraries to all models

In [12]:
#import numpy as np
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.datasets import make_regression

# SVM : Super Vector Machine
#SVR -regression

In [None]:
##default regression:class sklearn.svm.SVR(*, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, 
#shrinking=True, cache_size=200, verbose=False, max_iter=-1)[source]
#document  https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [13]:
import warnings
from sklearn.exceptions import DataConversionWarning

# Suppress the specific warning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [14]:
# Define the SVR parameter grid  #model creation
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}
    #'gamma':['scale','auto']}
grid_svr= GridSearchCV(SVR(), param_grid,refit = True, verbose=3,n_jobs=-1)
#grid_svr.fit(X_train, y_train)
grid_svr.fit(indep,dep)

#verbose=3 (folds) 3=granular detail of each fitting process for individual folds in cross-validations, 
#refit =true best performance

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [12]:
#ypred_svr_grid=grid_svr.predict(X_test)
#y_pred predected output

In [17]:
#evaluation metric
#from sklearn.metrics import r2_score
#rscore_svr_grid=r2_score(y_test,ypred_svr_grid)


In [18]:
#print("R2 Score =", rscore_svr_grid, "The Best Parameters are: {}".format(grid_svr.best_params_))
#svr accuracy

In [15]:
#Table Format all the values
svr_result=grid_svr.cv_results_
svr_table=pd.DataFrame.from_dict(svr_result)
print(svr_table)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time param_C  \
0        0.596478      0.016154         0.060252        0.016154     0.1   
1        0.491923      0.177876         0.141595        0.006258     0.1   
2        0.243862      0.009643         0.047773        0.009938     0.1   
3        0.317678      0.019047         0.069794        0.007266     0.1   
4        0.249911      0.008302         0.033834        0.004878       1   
5        0.321151      0.008206         0.159987        0.017635       1   
6        0.257245      0.011046         0.045897        0.010041       1   
7        0.325723      0.012986         0.061793        0.005862       1   
8        0.299720      0.029936         0.039663        0.005023      10   
9        0.292823      0.011428         0.156068        0.009828      10   
10       0.288468      0.019289         0.047352        0.005060      10   
11       0.366098      0.021491         0.070331        0.005320      10   
12       0.6

# DecisionTree

In [None]:
# document
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

#friedman_mse (Mean Squared Error for Decision Trees)
#Poisson criterion can only be used if all target values are non-negative. 
#If the target has negatives, you'll get a ValueError

In [16]:
#model creation
#param_grid = {'criterion':['mse','mae','friedman_mse'],  
param_grid = {'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
'max_features': ['auto','sqrt','log2'],
'splitter':['best','random']}
grid_dt_regressor = GridSearchCV(DecisionTreeRegressor(), param_grid, refit = True, verbose= 3,n_jobs=-1)
# fitting the model for grid search
#grid_dt_regressor.fit(X_train, y_train)
grid_dt_regressor.fit(indep, dep)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


40 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skle

In [19]:
#ypred_dt_grid=grid_dt_regressor.predict(X_test)
#y_pred predected output

In [21]:
#evaluation metric
#dt_rscore_grid=r2_score(y_test,ypred_dt_grid)

In [17]:
print("The R_score value for best parameter {}:".format(grid_dt_regressor.best_params_))
#decision tree R value (accuracy)

The R_score value for best parameter {'criterion': 'poisson', 'max_features': 'sqrt', 'splitter': 'best'}:


In [18]:
#Table Format all the values
dt_result=grid_dt_regressor.cv_results_
dt_table=pd.DataFrame.from_dict(dt_result)
print(dt_table)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.001608      0.001969         0.000000        0.000000   
1        0.000000      0.000000         0.000000        0.000000   
2        0.075680      0.011697         0.012546        0.006274   
3        0.039210      0.029351         0.020038        0.004686   
4        0.019411      0.001646         0.006908        0.005641   
5        0.017598      0.003229         0.003201        0.004296   
6        0.003789      0.006808         0.000000        0.000000   
7        0.000000      0.000000         0.000000        0.000000   
8        0.019144      0.006161         0.002875        0.005750   
9        0.014831      0.000374         0.013451        0.006835   
10       0.019903      0.004839         0.003847        0.005864   
11       0.020329      0.004989         0.003215        0.003474   
12       0.000000      0.000000         0.000000        0.000000   
13       0.002503      0.005006         0.000000

# Random Forest

In [None]:
#document
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [19]:
#import RF model algarithm
#param_grid = {'criterion':['mse','mae'],
param_grid = {'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
'max_features': ['auto','sqrt','log2'],
'n_estimators':[10,50,100]}
reg_grid_rf = GridSearchCV(RandomForestRegressor(), param_grid, refit = True, verbose= 3,n_jobs=-1)
# fitting the model for grid search
#reg_grid_rf.fit(X_train, y_train)
reg_grid_rf.fit(indep, dep)
#model creation

Fitting 5 folds for each of 36 candidates, totalling 180 fits


60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
26 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skle

In [28]:
#y_pred_rf_grid=reg_grid_rf.predict(X_test)
#y_ore predicted score

In [29]:
#evaluation metric
#rf_rscore_grid=r2_score(y_test,y_pred_rf_grid)

In [20]:
print("The R_score value for best parameter {}:".format(reg_grid_rf.best_params_))

The R_score value for best parameter {'criterion': 'absolute_error', 'max_features': 'log2', 'n_estimators': 50}:


In [21]:
#Table Format all the values
rf_result=reg_grid_rf.cv_results_
rf_table=pd.DataFrame.from_dict(rf_result)
print(rf_table)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.006765      0.002603         0.000000    0.000000e+00   
1        0.003226      0.003408         0.000000    0.000000e+00   
2        0.001601      0.003203         0.000000    0.000000e+00   
3        0.148888      0.009686         0.015622    1.168008e-07   
4        0.685512      0.058244         0.033277    3.461252e-03   
5        1.325920      0.041752         0.052931    1.361883e-02   
6        0.147667      0.007735         0.015807    3.744531e-04   
7        0.698195      0.027175         0.037521    7.629204e-03   
8        1.367497      0.044838         0.047399    5.902709e-04   
9        0.003124      0.006248         0.000000    0.000000e+00   
10       0.003124      0.006248         0.000000    0.000000e+00   
11       0.005838      0.007180         0.000000    0.000000e+00   
12       0.353405      0.021347         0.013732    7.269024e-03   
13       1.695108      0.051298         0.034367

In [40]:
reg_grid_rf.best_score_

0.8307673549839858

In [None]:
#no r score here only table format

In [41]:
print("RF Score",reg_grid_rf.best_score_)
print("DT Score",grid_dt_regressor.best_score_)
print("SVR Score",grid_svr.best_score_)

RF Score 0.8307673549839858
DT Score 0.7004043721056169
SVR Score 0.535311353603007


In [42]:
#save best model
import pickle
filename="best_regression_model_grid.sav"
pickle.dump(reg_grid_rf,open(filename,'wb'))  #best r score grid regressor mentioned here
#reg_gb is regressor

In [43]:
#load model
loaded_model=pickle.load(open("best_regression_model_grid.sav",'rb'))

In [44]:
data.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [45]:
# get target using multi fixed inputs -future prediction
result=loaded_model.predict([[36,23,1,1,0]])
#[][] to fix value error
print("Charges",result)

Charges [7293.48]




In [None]:
#another way to validate

In [None]:
# get target using dynamic fixed inputs
#input pass dynamically
age_input=float(input("Age:"))
bmi_input=float(input("bmi"))
children_input=float(input("children "))
sex_male_input=int(input("sex_male 0 or 1:"))
smoker_yes_input=int(input("smoker_yes 0 or 1:"))

In [None]:
result=loaded_model.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Total Charges are= ",result)

# Linear and Multi Linear not supported (GSV)
# gamma should remove for better performance