In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
dataset=pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
dataset=pd.get_dummies(dataset,drop_first=True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
independent=dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [6]:
dependent=dataset[[ 'charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [7]:
#sklearn library used to split train and test model
from sklearn.model_selection import train_test_split
#Then call train_test_split function it is parameterised function
X_train,X_test,Y_train,Y_test=train_test_split(independent, dependent, test_size=0.30, random_state=0)

In [8]:
from sklearn.ensemble import RandomForestRegressor
regressor=RandomForestRegressor(n_estimators=100,random_state=0,criterion="mae",max_features="sqrt")
regressor=regressor.fit(X_train,Y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid={"criterion":['mse','mae'],'max_features':['auto','log2','sqrt'],'n_estimators':[10,100]}
grid=GridSearchCV(RandomForestRegressor(),param_grid,refit=True,verbose=3,n_jobs=-1)
#fitting the model for grid search
grid.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    7.0s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],

In [10]:
#print best parameter after tuning
#print(grid.best_params_)
re=grid.cv_results_
#print (re)
grid_predictions=grid.predict(X_test)
#print classification report
from sklearn.metrics import r2_score
r_score=r2_score(Y_test,grid_predictions)
print("The R_score value for best parameter{}:".format(grid.best_params_),r_score)

The R_score value for best parameter{'criterion': 'mse', 'max_features': 'sqrt', 'n_estimators': 100}: 0.8730075065009215


In [11]:
table=pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.032032,0.006269,0.004673,0.003523,mse,auto,10,"{'criterion': 'mse', 'max_features': 'auto', '...",0.781575,0.791989,0.80033,0.791298,0.007672,8
1,0.273345,0.016113,0.01921,0.004636,mse,auto,100,"{'criterion': 'mse', 'max_features': 'auto', '...",0.806096,0.802649,0.792395,0.80038,0.005819,5
2,0.034587,0.024998,0.00133,0.001881,mse,log2,10,"{'criterion': 'mse', 'max_features': 'log2', '...",0.816299,0.786229,0.789541,0.797356,0.013463,7
3,0.161001,0.005066,0.015757,0.000187,mse,log2,100,"{'criterion': 'mse', 'max_features': 'log2', '...",0.813264,0.808251,0.795198,0.805571,0.007615,3
4,0.0186,0.002174,0.0,0.0,mse,sqrt,10,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.782888,0.788031,0.79508,0.788666,0.004997,9
5,0.159132,0.010161,0.017461,0.002598,mse,sqrt,100,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.811848,0.813438,0.798846,0.808044,0.006536,1
6,0.124036,0.001929,0.001502,0.002124,mae,auto,10,"{'criterion': 'mae', 'max_features': 'auto', '...",0.790069,0.781513,0.791859,0.787813,0.004515,11
7,1.176412,0.00614,0.018312,0.007313,mae,auto,100,"{'criterion': 'mae', 'max_features': 'auto', '...",0.803273,0.793214,0.79953,0.798672,0.004151,6
8,0.08416,0.01128,0.00133,0.001881,mae,log2,10,"{'criterion': 'mae', 'max_features': 'log2', '...",0.793999,0.793686,0.777013,0.788233,0.007934,10
9,0.768095,0.031611,0.017023,0.003682,mae,log2,100,"{'criterion': 'mae', 'max_features': 'log2', '...",0.81459,0.811655,0.790917,0.805721,0.010536,2


In [12]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

Age:50
BMI:69
Children:4
Sex Male 0 or 1:0
Smoker Yes 0 or 1:1


In [13]:
Future_Prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,
smoker_yes_input]])
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[44771.3327659]
