In [1]:
#importing the Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

In [2]:
#read_csv function called to read the csv file and store to dataset
dataset = pd.read_csv("insurance_pre.csv")

In [5]:
#display values from dataset
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [7]:
#convert categarical data to numerical data using the function get_dummies, drop_first will delete the first column
dataset = pd.get_dummies(dataset, drop_first=True)

In [9]:
#display the values from dataset
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [11]:
#display columns from dataset
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [13]:
#splitting input values into independent as input
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

In [15]:
#display input values from varialbe independent
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,False,True
1,18,33.770,1,True,False
2,28,33.000,3,True,False
3,33,22.705,0,True,False
4,32,28.880,0,True,False
...,...,...,...,...,...
1333,50,30.970,3,True,False
1334,18,31.920,0,False,False
1335,18,36.850,0,False,False
1336,21,25.800,0,False,False


In [17]:
#splitting output values into variable dependent
dependent = dataset['charges']

In [19]:
#display the output values from dependent
dependent

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [21]:
#splitting the data into training and test sets using sklearn model selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size = 1/3, random_state =0)

In [23]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [25]:
#Procedure to import specific function RandomForestRegressor from sklearn.ensemble
from sklearn.ensemble import RandomForestRegressor

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid={'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],'max_features':['sqrt', 'log2'],
           'n_estimators':[10,50,80,100]}
grid=GridSearchCV(RandomForestRegressor(),param_grid,refit=True,verbose=3,n_jobs=-1)

#fitting the model for grid search
grid.fit(X_train,y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [35]:
# print best parameter after tuning 
#print(grid.best_params_) 
re=grid.cv_results_
#print(re)
grid_predictions = grid.predict(X_test) 
   
# print classification report 
from sklearn.metrics import r2_score
r_score=r2_score(y_test,grid_predictions)

print("The R_score value for best parameter {}:".format(grid.best_params_),r_score)

The R_score value for best parameter {'criterion': 'absolute_error', 'max_features': 'sqrt', 'n_estimators': 80}: 0.873981329437285


In [37]:
table = pd.DataFrame.from_dict(re)

In [39]:
#display table
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.031511,0.006989,0.003461,0.000936,squared_error,sqrt,10,"{'criterion': 'squared_error', 'max_features':...",0.794413,0.73412,0.833961,0.802574,0.752989,0.783611,0.035773,26
1,0.190528,0.005537,0.010435,0.003853,squared_error,sqrt,50,"{'criterion': 'squared_error', 'max_features':...",0.804265,0.757952,0.831883,0.831776,0.75239,0.795653,0.034597,23
2,0.266599,0.023128,0.00905,0.000113,squared_error,sqrt,80,"{'criterion': 'squared_error', 'max_features':...",0.798775,0.767356,0.833317,0.829699,0.762716,0.798373,0.029786,21
3,0.298914,0.005261,0.012344,0.001361,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.799335,0.77637,0.836469,0.836271,0.757874,0.801264,0.031531,8
4,0.030031,0.001136,0.002307,0.000137,squared_error,log2,10,"{'criterion': 'squared_error', 'max_features':...",0.795866,0.756298,0.840552,0.772567,0.748989,0.782854,0.033034,27
5,0.14817,0.00312,0.006533,0.000121,squared_error,log2,50,"{'criterion': 'squared_error', 'max_features':...",0.790852,0.771494,0.831736,0.827767,0.770028,0.798375,0.026682,20
6,0.24727,0.008284,0.011034,0.001288,squared_error,log2,80,"{'criterion': 'squared_error', 'max_features':...",0.80879,0.771132,0.839309,0.82822,0.765849,0.80266,0.029608,2
7,0.336509,0.027852,0.014928,0.003388,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.808718,0.765463,0.839545,0.832529,0.764142,0.802079,0.032109,4
8,0.031231,0.003463,0.002331,0.000236,friedman_mse,sqrt,10,"{'criterion': 'friedman_mse', 'max_features': ...",0.776405,0.72017,0.833884,0.810534,0.713674,0.770933,0.047784,32
9,0.173941,0.020366,0.009158,0.003596,friedman_mse,sqrt,50,"{'criterion': 'friedman_mse', 'max_features': ...",0.793629,0.776583,0.831305,0.834388,0.760649,0.799311,0.029318,18


In [41]:
age_input = float(input("Age:"))
bmi_input = float(input("BMI:"))
children_input = float(input("Children:"))
sex_male_input = int(input("Sex Male 0 or 1:"))
smoker_yes_input = int(input("Smoker Yes 0 or 1:"))

Age: 43
BMI: 35
Children: 0
Sex Male 0 or 1: 1
Smoker Yes 0 or 1: 0


In [43]:
Future_Prediction = grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]]) # change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[16598.70930688]
