In [1]:
#importing the Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

In [2]:
#read_csv function called to read the csv file and store to dataset
dataset = pd.read_csv("insurance_pre.csv")

In [3]:
#display values from dataset
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [7]:
#convert categarical data to numerical data using the function get_dummies, drop_first will delete the first column
dataset = pd.get_dummies(dataset, drop_first = True)

In [9]:
#display the values from dataset
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [11]:
#display columns from dataset
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [13]:
#splitting input values into independent as input
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

In [15]:
#display input values from varialbe independent
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,False,True
1,18,33.770,1,True,False
2,28,33.000,3,True,False
3,33,22.705,0,True,False
4,32,28.880,0,True,False
...,...,...,...,...,...
1333,50,30.970,3,True,False
1334,18,31.920,0,False,False
1335,18,36.850,0,False,False
1336,21,25.800,0,False,False


In [17]:
#splitting output values into variable dependent
dependent = dataset[['charges']]

In [19]:
#display the output values from dependent
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [33]:
#procedure to import GridSearchV and DecisionTreeRegressor Functions and save the model
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
param_grid = {'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'], 'max_features': ['sqrt', 'log2'], 'splitter':['best','random']}

grid = GridSearchCV(DecisionTreeRegressor(), param_grid, refit = True, verbose = 3, n_jobs = -1)

# fitting the model for grid search
grid.fit(independent, dependent)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [35]:
# print the best parameter after tuning
#print (grid.best_params_)
re=grid.cv_results_
print ("The R_score value for the best parameter {}:".format(grid.best_params_))

The R_score value for the best parameter {'criterion': 'poisson', 'max_features': 'sqrt', 'splitter': 'best'}:


In [37]:
table = pd.DataFrame.from_dict(re)

In [39]:
#display table
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008465,0.000233,0.004173,0.000182,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.716407,0.633636,0.760916,0.715074,0.66886,0.698978,0.043763,5
1,0.009602,0.002761,0.005465,0.002446,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.705458,0.599739,0.631739,0.654614,0.690365,0.656383,0.038424,10
2,0.011921,0.002624,0.006959,0.003236,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.759587,0.561668,0.727908,0.701641,0.726417,0.695444,0.069372,6
3,0.009079,0.0019,0.007739,0.003909,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.697836,0.529337,0.731724,0.667525,0.604345,0.646154,0.071895,13
4,0.008591,0.001037,0.004134,0.0003,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.766271,0.659141,0.632037,0.71579,0.724877,0.699623,0.048058,3
5,0.006674,0.001036,0.006765,0.003077,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.655873,0.627947,0.677744,0.693963,0.57127,0.645359,0.04316,14
6,0.007109,0.00081,0.008094,0.003493,friedman_mse,log2,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.713264,0.665074,0.584165,0.637635,0.674264,0.65488,0.042863,11
7,0.006717,0.001197,0.004431,0.001165,friedman_mse,log2,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.722422,0.511386,0.670858,0.558309,0.763225,0.64524,0.095925,15
8,0.029585,0.003515,0.004068,0.00062,absolute_error,sqrt,best,"{'criterion': 'absolute_error', 'max_features'...",0.724415,0.635261,0.645097,0.728009,0.719803,0.690517,0.0413,7
9,0.021673,0.002353,0.003794,0.000403,absolute_error,sqrt,random,"{'criterion': 'absolute_error', 'max_features'...",0.697354,0.650689,0.74542,0.665043,0.761885,0.704078,0.043521,2


In [41]:
age_input = float(input("Age:"))
bmi_input = float(input("BMI:"))
children_input = float(input("Children:"))
sex_male_input = int(input("Sex Male 0 or 1:"))
smoker_yes_input = int(input("Smoker Yes 0 or 1:"))

Age: 34
BMI: 45
Children: 2
Sex Male 0 or 1: 1
Smoker Yes 0 or 1: 1


In [43]:
Future_Prediction = grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]]) # change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[39241.442]


