In [1]:
#importing the Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

In [2]:
#read_csv function called to read the csv file and store to dataset
dataset = pd.read_csv("insurance_pre.csv")

In [5]:
#display values from dataset
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [7]:
#convert categarical data to numerical data using the function get_dummies, drop_first will delete the first column
dataset = pd.get_dummies(dataset, drop_first = True)

In [9]:
#display the values from dataset
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [11]:
#display columns from dataset
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [13]:
#splitting input values into independent as input
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

In [15]:
#display input values from varialbe independent
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,False,True
1,18,33.770,1,True,False
2,28,33.000,3,True,False
3,33,22.705,0,True,False
4,32,28.880,0,True,False
...,...,...,...,...,...
1333,50,30.970,3,True,False
1334,18,31.920,0,False,False
1335,18,36.850,0,False,False
1336,21,25.800,0,False,False


In [17]:
#splitting output values into variable dependent
dependent = dataset[['charges']]

In [19]:
#display the output values from dependent
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [21]:
#splitting the data into training and test sets using sklearn model selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size = 0.30, random_state =0)

In [40]:
#Standardisation, calculate mean and deviation
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [42]:
#procedure to import GridSearchV and DecisionTreeRegressor Functions and save the model
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
param_grid = {'criterion':['mse', 'mae', 'friedman_mse'], 'max_features': ['auto', 'sqrt', 'log2'], 'splitter':['best','random']}

grid = GridSearchCV(DecisionTreeRegressor(), param_grid, refit = True, verbose = 3, n_jobs = -1)

# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


70 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sutharsanurudrasingam/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/sutharsanurudrasingam/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/sutharsanurudrasingam/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/sutharsanurudrasingam/anaconda3/lib/python3.12/site-packages/sklearn

In [68]:
# print best parameter after tuning 
#print(grid.best_params_) 
re=grid.cv_results_
#print(re)
grid_predictions = grid.predict(X_test) 
   
# print classification report 
from sklearn.metrics import r2_score
r_score=r2_score(y_test,grid_predictions)

print("The R_score value for best parameter {}:".format(grid.best_params_),r_score)

The R_score value for best parameter {'criterion': 'friedman_mse', 'max_features': 'log2', 'splitter': 'best'}: 0.7582991131758102


In [70]:
table = pd.DataFrame.from_dict(re)

In [72]:
#display table
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001004,2e-05,0.0,0.0,mse,auto,best,"{'criterion': 'mse', 'max_features': 'auto', '...",,,,,,,,5
1,0.001126,0.000143,0.0,0.0,mse,auto,random,"{'criterion': 'mse', 'max_features': 'auto', '...",,,,,,,,5
2,0.001002,0.000195,0.0,0.0,mse,sqrt,best,"{'criterion': 'mse', 'max_features': 'sqrt', '...",,,,,,,,5
3,0.00082,0.000131,0.0,0.0,mse,sqrt,random,"{'criterion': 'mse', 'max_features': 'sqrt', '...",,,,,,,,5
4,0.000749,5.4e-05,0.0,0.0,mse,log2,best,"{'criterion': 'mse', 'max_features': 'log2', '...",,,,,,,,5
5,0.000639,0.000167,0.0,0.0,mse,log2,random,"{'criterion': 'mse', 'max_features': 'log2', '...",,,,,,,,5
6,0.000629,7e-05,0.0,0.0,mae,auto,best,"{'criterion': 'mae', 'max_features': 'auto', '...",,,,,,,,5
7,0.000555,4.1e-05,0.0,0.0,mae,auto,random,"{'criterion': 'mae', 'max_features': 'auto', '...",,,,,,,,5
8,0.000614,4.9e-05,0.0,0.0,mae,sqrt,best,"{'criterion': 'mae', 'max_features': 'sqrt', '...",,,,,,,,5
9,0.000509,3.4e-05,0.0,0.0,mae,sqrt,random,"{'criterion': 'mae', 'max_features': 'sqrt', '...",,,,,,,,5


In [74]:
age_input = float(input("Age:"))
bmi_input = float(input("BMI:"))
children_input = float(input("Children:"))
sex_male_input = int(input("Sex Male 0 or 1:"))
smoker_yes_input = int(input("Smoker Yes 0 or 1:"))

Age: 32
BMI: 43
Children: 2
Sex Male 0 or 1: 0
Smoker Yes 0 or 1: 1


In [76]:
Future_Prediction = grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]]) # change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[46661.4424]
