# Medical insurence cost prediction
made by: Szabolcs Füle

Columns

age: age of primary beneficiary

sex: insurance contractor gender, female, male

bmi: Body mass index, providing an understanding of body, weights that are relatively high or low relative to height,
objective index of body weight (kg / m ^ 2) using the ratio of height to weight, ideally 18.5 to 24.9

children: Number of children covered by health insurance / Number of dependents

smoker: Smoking

region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

charges: Individual medical costs billed by health insurance

In [2]:
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

### EDA - DataFrame infos

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/szabolcsfule/medical_insurance_regression/master/insurance.csv')
# df = pd.read_csv('insurance.csv')
df.head(3) 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [354]:
df.shape

(1338, 7)

In [355]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [356]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


### EDA - Basic Analysis with Visualisation

Correlation

In [357]:
fig = px.imshow(df.corr(), text_auto=True, aspect="auto", color_continuous_scale='RdBu_r')
fig.update_layout(title_text='Correlation between columns')
fig.show()

Compare nums of females and males

In [358]:
sex_counts = df['sex'].value_counts()
df_sex = pd.DataFrame(sex_counts).reset_index()
df_sex = df_sex.rename(columns={"index": "sex", "sex": "count"})
df_sex


Unnamed: 0,sex,count
0,male,676
1,female,662


In [359]:
fig = px.pie(df_sex, values='count', names='sex', color='sex',
             color_discrete_map={'male':'lightblue','female':'pink'})
fig.update_layout(title_text='Count of females and males')
fig.show()

compare nums of smokers and non smokers

In [360]:
smoker_counts = df['smoker'].value_counts()
df_smoker = pd.DataFrame(smoker_counts).reset_index()
df_smoker = df_smoker.rename(columns={"index": "smoker", "smoker": "count"})
df_smoker

Unnamed: 0,smoker,count
0,no,1064
1,yes,274


In [361]:
fig = px.pie(df_smoker, values='count', names='smoker', color='smoker',
             color_discrete_map={'no':'lightgreen','yes':'gray'})
fig.update_layout(title_text='Count of smokers and non smokers')
fig.show()

Distribution of age

In [362]:
fig = px.histogram(df, x="age", color=df.sex, marginal="box", nbins=5, text_auto=True, 
    color_discrete_map={'male':'lightblue','female':'pink'})
fig.update_layout(bargap=0.05)
fig.update_layout(title_text='Distribution of ages')
fig.show()

In [363]:

hist_data = [df.age]
group_labels = ['Age'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels, show_hist=True)
fig.update_traces(nbinsx=5, autobinx=True, selector={'type':'histogram'})
fig.update_layout(title_text='Distribution of ages', bargap=0.05)
fig.show()

Distribution of charges

In [364]:
fig = px.histogram(df, x="charges", color=df.sex, marginal="box", nbins=50, text_auto=True, 
    color_discrete_map={'male':'lightblue','female':'pink'})
fig.update_layout(bargap=0.05)
fig.update_layout(title_text='Distribution of charges')
fig.show()

In [365]:
hist_data = [df.charges]
group_labels = ['Charges'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels, show_hist=True)
fig.update_traces(nbinsx=50, autobinx=True, selector={'type':'histogram'})
fig.update_layout(title_text='Distribution of charges', bargap=0.05)
fig.show()

Distribution of children

In [366]:
fig = px.histogram(df, x="children", marginal="box", nbins=6, text_auto=True)
fig.update_layout(bargap=0.05)
fig.update_layout(title_text='How many children do the customers have')
fig.show()

Distribution of BMI

In [367]:
fig = px.histogram(df, x="bmi", color='sex', marginal="box", nbins=50, text_auto=True, 
    color_discrete_map={'male':'lightblue','female':'pink'})
fig.update_layout(bargap=0.05)
fig.update_layout(title_text='Distribution of Body mass index')
fig.show()

In [368]:
fig = px.histogram(df, x="bmi", color='smoker', marginal="box", nbins=50, text_auto=True, 
    color_discrete_map={'yes':'gray','no':'lightgreen'})
fig.update_layout(bargap=0.05)
fig.update_layout(title_text='Distribution of Body mass index')
fig.show()

In [369]:
hist_data = [df.bmi]
group_labels = ['BMI'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels, show_hist=True)
fig.update_traces(nbinsx=50, autobinx=True, selector={'type':'histogram'})
fig.update_layout(title_text='Distribution of Body mass index', bargap=0.05)
fig.show()

### Data preprocessing

Encode categorical features

In [370]:
df.replace({'sex':{'male': 0, 'female':1}}, inplace=True)
df.replace({'smoker':{'no': 0, 'yes':1}}, inplace=True)

In [371]:
df['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [372]:
df.replace({'region':{'southeast': 0, 'southwest':1, 'northwest':2, 'northeast':3}}, inplace=True)

Split features and targets

In [373]:
target = pd.DataFrame(df['charges'])
target.head(2)

Unnamed: 0,charges
0,16884.924
1,1725.5523


In [374]:
features = df.drop(columns='charges', axis=1)
features.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.9,0,1,1
1,18,0,33.77,1,0,0


In [375]:
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=2)

In [376]:
print(features.shape, features_train.shape, features_test.shape)
print(target.shape, target_train.shape, target_test.shape)

(1338, 6) (1070, 6) (268, 6)
(1338, 1) (1070, 1) (268, 1)


### Model training

Linear regression

In [377]:
model = LinearRegression()

In [378]:
model.fit(features_train, target_train)

Model evaluation - prediction on training

In [379]:
training_data_prediction = model.predict(features_train)

In [380]:
r2_train = metrics.r2_score(target_train, training_data_prediction)
print(f'R squared value: {r2_train}')

R squared value: 0.7518195459072954


Model evaluation - prediction on test

In [381]:
test_data_prediction = model.predict(features_test)

In [382]:
r2_test = metrics.r2_score(target_test, test_data_prediction)
print(f'R squared value: {r2_test}')

R squared value: 0.7454471618659975


### Building predictive systems

In [383]:
input_data = (31,1,25.74,0,0,0)

# change tuple to np array
input_data = np.asarray(input_data)

# reshape array
input_data = input_data.reshape(1,-1)

# prediction
prediction = model.predict(input_data)
print(f'Expected: 3756.6216 USD')
print(f'Prediction on custom data: {prediction[0]} USD')

Expected: 3756.6216 USD
Prediction on custom data: [3632.02043291] USD



X does not have valid feature names, but LinearRegression was fitted with feature names

