In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
insurance_data = pd.read_csv('/content/insurance.csv')
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## **Data preprocessing and analysis**

In [3]:
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
insurance_data.shape

(1338, 7)

In [5]:
insurance_data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
insurance_data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
insurance_data['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

# Replacing categorical data with numerical data

In [8]:
# [sex: male-1, female-0][smoker: yes-1, no-0][region: southeast-0, southwest-1, northeast-2, northwest-3]
insurance_data.replace({'sex':{'male':1,'female':0}, 'region':{'southeast':0, 'southwest':1, 'northwest':2, 'northeast':3}, 'smoker':{'yes':1,'no':0}}, inplace=True)
insurance_data.sample(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
699,23,0,39.27,2,0,0,3500.6123
29,31,1,36.3,2,1,1,38711.0
434,31,1,28.595,1,0,2,4243.59005
834,36,1,33.82,1,0,2,5377.4578
433,60,0,30.5,0,0,1,12638.195
399,18,0,38.17,0,0,0,1631.6683
294,25,1,26.8,3,0,1,3906.127
768,64,0,39.7,0,0,1,14319.031
1257,54,0,27.645,1,0,2,11305.93455
1286,28,0,17.29,0,0,3,3732.6251


# ***Spliting data into feature and target columns, test and train datasets***

In [9]:
# spliting data into feature and target
x = insurance_data.drop(['charges'], axis=1)
y = insurance_data['charges']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=17)

#  Training the Model 

In [11]:
model = LinearRegression()

In [12]:
model.fit(x_train, y_train)

# Model Evaluation

### ***Linear Regression***

In [13]:
prediction = model.predict(x_train)

In [14]:
# R squared error
score1 = metrics.r2_score(prediction, y_train)
# Mean Absolute Error
score2 = metrics.mean_absolute_error(prediction, y_train)

print("R squared error on training data : %s" % (score1))
print("Mean absolute error on training data is %s" % score2)

R squared error on training data : 0.661316808981369
Mean absolute error on training data is 4333.546841500637


In [15]:
prediction2 = model.predict(x_test)

In [16]:
# R squared error
score1te = metrics.r2_score(prediction2, y_test)
# Mean Absolute Error
score2te = metrics.mean_absolute_error(prediction2, y_test)

print("R squared error on test data : %s" % (score1te))
print("Mean absolute error on test data is %s" % score2te)

R squared error on test data : 0.7113119269067253
Mean absolute error on test data is 3927.9417552872897


# ***XGBRegressor***

In [17]:
model2 = XGBRegressor()

In [18]:
model2.fit(x_train, y_train)

In [19]:
prediction3 = model2.predict(x_train)

In [20]:
# R squared error
score3 = metrics.r2_score(prediction3, y_train)
# Mean Absolute Error
score4 = metrics.mean_absolute_error(prediction3, y_train)

print("R squared error on training data : %s" % (score3))
print("Mean absolute error on training data is %s" % score4)

R squared error on training data : 0.9958381547778764
Mean absolute error on training data is 435.488082363441


In [21]:
prediction4 = model2.predict(x_test)

In [22]:
# R squared error
score_1te = metrics.r2_score(prediction4, y_test)
# Mean Absolute Error
score_2te = metrics.mean_absolute_error(prediction4, y_test)

print("R squared error on test data : %s" % (score_1te))
print("Mean absolute error on test data is %s" % score_2te)

R squared error on test data : 0.7746672264667023
Mean absolute error on test data is 3012.1531794457646


# ***RandomForestRegression***

In [23]:
model3 = RandomForestRegressor()

In [24]:
model3.fit(x_train, y_train)

In [25]:
prediction5 = model3.predict(x_train)

In [26]:
# R squared error
score_3 = metrics.r2_score(prediction5, y_train)
# Mean Absolute Error
score_4 = metrics.mean_absolute_error(prediction5, y_train)

print("R squared error on training data : %s" % (score_3))
print("Mean absolute error on training data is %s" % score_4)

R squared error on training data : 0.9731116510033773
Mean absolute error on training data is 1077.9042983082752


In [27]:
prediction6 = model3.predict(x_test)

In [28]:
# R squared error
score_3te = metrics.r2_score(prediction6, y_test)
# Mean Absolute Error
score_4te = metrics.mean_absolute_error(prediction6, y_test)

print("R squared error on test data : %s" % (score_3te))
print("Mean absolute error on test data is %s" % score_4te)

R squared error on test data : 0.827287527462097
Mean absolute error on test data is 2443.965579814884


In [33]:
x = {'Linear Regression':[score1, score1te], 'XGBRegressor':[score3, score_1te], 'RandomForestRegressor': [score_3, score_3te], 'R squared error':['Train', 'Test']}
xdf = pd.DataFrame(x)
print(xdf)

   Linear Regression  XGBRegressor  RandomForestRegressor R squared error
0           0.661317      0.995838               0.973112           Train
1           0.711312      0.774667               0.827288            Test


In [35]:
xdf = xdf.set_index(['R squared error'])
print(xdf)

                 Linear Regression  XGBRegressor  RandomForestRegressor
R squared error                                                        
Train                     0.661317      0.995838               0.973112
Test                      0.711312      0.774667               0.827288
