# Case: Predicting the total payment of insurance based on many independent variables

In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('multi_insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# 1) Data Pre-processing: Checking for null values

In [83]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

There are no null values,

# 2) Checking the features which we actually need

Say we do not need features like region for this particular analysis.

So, we will drop them.

In [84]:
df=df.drop('region',axis=1)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


# 3) Converting categorical independent variables to continous(int)

We need to convert sex, smoker to numbers.

In [85]:
df['smoker'] = pd.get_dummies(data=df['smoker'], drop_first=True)
df['sex'] = pd.get_dummies(data=df['sex'], drop_first=True)
df

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,0,27.900,0,1,16884.92400
1,18,1,33.770,1,0,1725.55230
2,28,1,33.000,3,0,4449.46200
3,33,1,22.705,0,0,21984.47061
4,32,1,28.880,0,0,3866.85520
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830
1334,18,0,31.920,0,0,2205.98080
1335,18,0,36.850,0,0,1629.83350
1336,21,0,25.800,0,0,2007.94500


Thus, now Sex: Female('0'),male('1') and Smoker:Yes('1'),No('0')

# 4)Normalize the data

We normalize the data to bring all the variables to the same range.

In [86]:
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
d=sc.fit_transform(df)
norm_df = pd.DataFrame(d,columns=['age','sex','bmi','children','smoker','charges'])
norm_df

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,-1.438764,-1.010519,-0.453320,-0.908614,1.970587,0.298584
1,-1.509965,0.989591,0.509621,-0.078767,-0.507463,-0.953689
2,-0.797954,0.989591,0.383307,1.580926,-0.507463,-0.728675
3,-0.441948,0.989591,-1.305531,-0.908614,-0.507463,0.719843
4,-0.513149,0.989591,-0.292556,-0.908614,-0.507463,-0.776802
...,...,...,...,...,...,...
1333,0.768473,0.989591,0.050297,1.580926,-0.507463,-0.220551
1334,-1.509965,-1.010519,0.206139,-0.908614,-0.507463,-0.914002
1335,-1.509965,-1.010519,1.014878,-0.908614,-0.507463,-0.961596
1336,-1.296362,-1.010519,-0.797813,-0.908614,-0.507463,-0.930362


We need not normalize the dependent variable column: Charges. SO, we will delete the normalized column and replace it with original values.

In [87]:
# drop the normalized column:
norm_df=norm_df.drop(['charges'],axis=1)
norm_df

Unnamed: 0,age,sex,bmi,children,smoker
0,-1.438764,-1.010519,-0.453320,-0.908614,1.970587
1,-1.509965,0.989591,0.509621,-0.078767,-0.507463
2,-0.797954,0.989591,0.383307,1.580926,-0.507463
3,-0.441948,0.989591,-1.305531,-0.908614,-0.507463
4,-0.513149,0.989591,-0.292556,-0.908614,-0.507463
...,...,...,...,...,...
1333,0.768473,0.989591,0.050297,1.580926,-0.507463
1334,-1.509965,-1.010519,0.206139,-0.908614,-0.507463
1335,-1.509965,-1.010519,1.014878,-0.908614,-0.507463
1336,-1.296362,-1.010519,-0.797813,-0.908614,-0.507463


In [88]:
# append the original column:
original=df['charges']
df = norm_df.join(original)
df

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,-1.438764,-1.010519,-0.453320,-0.908614,1.970587,16884.92400
1,-1.509965,0.989591,0.509621,-0.078767,-0.507463,1725.55230
2,-0.797954,0.989591,0.383307,1.580926,-0.507463,4449.46200
3,-0.441948,0.989591,-1.305531,-0.908614,-0.507463,21984.47061
4,-0.513149,0.989591,-0.292556,-0.908614,-0.507463,3866.85520
...,...,...,...,...,...,...
1333,0.768473,0.989591,0.050297,1.580926,-0.507463,10600.54830
1334,-1.509965,-1.010519,0.206139,-0.908614,-0.507463,2205.98080
1335,-1.509965,-1.010519,1.014878,-0.908614,-0.507463,1629.83350
1336,-1.296362,-1.010519,-0.797813,-0.908614,-0.507463,2007.94500


In [89]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
charges     0
dtype: int64

# 6) Assign the features

In [90]:
x = df.iloc[:,:5]
y = df.iloc[:,-1:]

# 5) Spilt the records - 75 :25 - training:test ratio

In [91]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=2)

# 6)Train the model

In [92]:
from sklearn.linear_model import LinearRegression
mul_reg = LinearRegression()
mul_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [93]:
mul_reg.fit(x_train,y_train)
mul_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# 7) Test Model

In [94]:
y_pred = mul_reg.predict(x_test)
#pd.concat([y_pred, y_test], axis=1)
pred=list(y_pred)
actual=np.array(y_test)
actual=list(actual)

In [95]:
d=pd.DataFrame(
    {'Predicted charges': pred,
     'Actual charges': actual
    })

In [99]:
z=pd.concat([x_test,d.reindex(x_test.index)], axis=1)
z

Unnamed: 0,age,sex,bmi,children,smoker,Predicted charges,Actual charges
17,-1.153959,0.989591,-1.118520,-0.908614,-0.507463,[7733.09952715156],[5327.40025]
1091,1.124479,-1.010519,-0.136714,-0.908614,-0.507463,,
273,0.768473,0.989591,-0.526320,-0.078767,-0.507463,[38682.69010588796],[46151.1245]
270,-1.509965,0.989591,-0.212175,-0.078767,-0.507463,[13321.363897755602],[11848.141000000001]
874,0.341265,0.989591,-1.445789,1.580926,-0.507463,,
...,...,...,...,...,...,...,...
1034,1.551686,0.989591,1.265866,-0.908614,-0.507463,,
0,-1.438764,-1.010519,-0.453320,-0.908614,1.970587,[1491.4222046316027],[2395.17155]
784,-0.584350,-1.010519,-0.230220,-0.078767,-0.507463,,
462,1.622887,-1.010519,1.219113,0.751079,-0.507463,,


# 8) Forecast by Trained Model

In [105]:
unseen_pred =mul_reg.predict(np.array([[19,1,33,0,0]]))
unseen_pred

array([[143780.01189427]])

# 9)Estimate the cost:

In [107]:
from sklearn.metrics import mean_squared_error,r2_score
RMSE=np.sqrt(mean_squared_error(y_test,y_pred))
r_square=r2_score(y_test,y_pred)

print('The R-Square value is...',r_square)
print('The RMSE value is........',RMSE)

The R-Square value is... 0.7486163979240839
The RMSE value is........ 6159.554567242218


As R-square value is nearer to 1, it is not a very good model

In [108]:
df.corr()

Unnamed: 0,age,sex,bmi,children,smoker,charges
age,1.0,-0.020856,0.109272,0.042469,-0.025019,0.299008
sex,-0.020856,1.0,0.046371,0.017163,0.076185,0.057292
bmi,0.109272,0.046371,1.0,0.012759,0.00375,0.198341
children,0.042469,0.017163,0.012759,1.0,0.007673,0.067998
smoker,-0.025019,0.076185,0.00375,0.007673,1.0,0.787251
charges,0.299008,0.057292,0.198341,0.067998,0.787251,1.0


As we can see the correlation is very less between most of the variables.