# Medical Expenses Predictor for Insurance Company

In [1]:
# URL of the raw medical-charges CSV; it is downloaded to a local file in a later cell.
medical_dataset_url = 'https://raw.githubusercontent.com/JovianML/opendatasets/master/data/medical-charges.csv'

In [2]:
from urllib.request import urlretrieve

In [3]:
# Download the dataset and save it next to the notebook as 'medical.csv'.
urlretrieve(medical_dataset_url,'medical.csv')

In [4]:
import pandas as pd

In [5]:
# Load the downloaded CSV into a DataFrame used throughout the notebook.
medical_df = pd.read_csv('medical.csv')

In [6]:
# Render the DataFrame as the cell output.
medical_df

The dataset contains 1338 rows and 7 columns. Each row of the dataset contains information about one customer.

Our objective is to find a way to estimate the value in the "charges" column using the values in the other columns. If we can do so for the historical data, then we should be able to estimate charges for new customers too, simply by asking for information like their age, sex, BMI, number of children, smoking habits and region.

Let's check the data type for each column.

In [7]:
# Print each column's name, non-null count and dtype.
medical_df.info()

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
medical_df.describe()

## EDA and Visualization

In [9]:
!pip install plotly matplotlib seaborn --quiet

In [10]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [11]:
# Global plotting defaults for every matplotlib/seaborn figure below.
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
# 8-digit hex (RGBA) with alpha 00 -> fully transparent figure background.
# The 6-digit '#000000' is opaque black, which hides the dark tick and axis
# labels that are drawn outside the axes area.
matplotlib.rcParams['figure.facecolor'] = '#00000000'

### Age

In [12]:
# Histogram of customer ages with a box plot in the margin.
# nbins=47 requests 47 bins across the age axis.
fig = px.histogram(
    medical_df,
    x='age',
    nbins=47,
    marginal='box',
    title='Distribution of Age',
).update_layout(bargap=0.1)  # update_layout returns the same figure
fig.show()

### Body Mass Index

In [13]:
# Histogram of BMI values (drawn in red) with a box plot in the margin.
fig = px.histogram(
    medical_df,
    x='bmi',
    color_discrete_sequence=['red'],
    marginal='box',
    title='Distribution of BMI',
).update_layout(bargap=0.1)  # update_layout returns the same figure
fig.show()

### Charges

In [14]:
# Histogram of annual charges, split by smoking status, with marginal box plots.
fig = px.histogram(
    medical_df,
    x='charges',
    color='smoker',
    color_discrete_sequence=['green', 'grey'],
    marginal='box',
    title='Annual Medical Charges',
).update_layout(bargap=0.1)  # update_layout returns the same figure
fig.show()

### Sex

In [15]:
# Count of customers per sex, with bars coloured by smoking status.
# The title previously read 'Annual Medical Charges' (copied from the charges
# plot) although this chart shows counts by sex. The marginal box plot was
# dropped because a box plot over a categorical axis carries no information.
fig = px.histogram(medical_df, x='sex',
                   color='smoker',
                   color_discrete_sequence=['purple', 'grey'],
                   title='Distribution of Sex')
fig.update_layout(bargap=0.1)
fig.show()

### Region

In [16]:
# Count of customers per region, with bars coloured by smoking status.
# The title previously read 'Annual Medical Charges' (copied from the charges
# plot) although this chart shows counts by region. The marginal box plot was
# dropped because a box plot over a categorical axis carries no information.
fig = px.histogram(medical_df, x='region',
                   color='smoker',
                   color_discrete_sequence=['green', 'grey'],
                   title='Distribution of Region')
fig.update_layout(bargap=0.1)
fig.show()

### Smoker

In [17]:
# Count of customers per smoking status, with bars coloured by sex.
px.histogram(medical_df,x='smoker',color='sex',title='Smoker')

### Relationship between Age and Charges

In [18]:
# Scatter of charges against age, coloured by smoking status; sex is shown on hover.
fig = px.scatter(
    medical_df,
    x='age',
    y='charges',
    color='smoker',
    hover_data=['sex'],
    opacity=0.8,
    title='Age vs Charges',
).update_traces(marker_size=5)  # update_traces returns the same figure
fig.show()

### Relationship between BMI and Charges

In [19]:
# Scatter of charges against BMI, coloured by smoking status; sex is shown on hover.
fig = px.scatter(
    medical_df,
    x='bmi',
    y='charges',
    color='smoker',
    hover_data=['sex'],
    opacity=0.8,
    title='BMI vs Charges',
).update_traces(marker_size=5)  # update_traces returns the same figure
fig.show()

### Correlation

In [20]:
# Correlation coefficient between charges and age.
medical_df['charges'].corr(medical_df['age'])

In [21]:
# Correlation coefficient between charges and BMI.
medical_df['charges'].corr(medical_df['bmi'])

In [22]:
# 'smoker' holds the strings 'no'/'yes'; map them to 0/1 so a correlation
# with charges can be computed.
smoker_values = dict(no=0, yes=1)
smoker_numeric = medical_df['smoker'].map(smoker_values)
medical_df['charges'].corr(smoker_numeric)

### Heatmap for the Correlation

In [23]:
# Correlation matrix of the numeric columns only. Calling .corr() on the whole
# frame raises a ValueError in pandas >= 2.0 because 'sex', 'smoker' and
# 'region' hold strings; older versions silently dropped those columns, so
# selecting the numeric columns explicitly gives the same matrix everywhere.
sns.heatmap(medical_df.select_dtypes(include='number').corr(), cmap='Reds', annot=True)
plt.title('Correlation Matrix');

### Linear Regression using a Single Feature

In [24]:
# Keep only the rows for customers who do not smoke.
non_smoker_df = medical_df.loc[medical_df['smoker'] == 'no']

In [25]:
# Age vs charges for non-smokers only; small (s=15), semi-transparent markers.
plt.title('Age vs Charges')
sns.scatterplot(data=non_smoker_df,x='age',y='charges',alpha=0.7,s=15)

### Model

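We model charges as a linear function of age: $\text{charges} = w \times \text{age} + b$, where $w$ is the slope (weight) and $b$ is the intercept (bias).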

In [26]:
def estimate_charges(age,w,b):
    """Evaluate the linear model ``w * age + b``.

    Parameters
    ----------
    age : number or array-like supporting arithmetic
        Age value(s) to estimate charges for.
    w : number
        Slope (weight) applied to ``age``.
    b : number
        Intercept (bias) added to the scaled age.

    Returns
    -------
    The estimated charge(s), with the same shape as ``age``.
    """
    scaled_age = w * age
    return scaled_age + b

In [27]:
# First hand-picked guess for the slope and intercept, applied to the
# non-smokers' ages.
w, b = 50, 100
ages = non_smoker_df['age']
estimated_charges = estimate_charges(ages, w, b)

In [28]:
# Plot the estimated charges against age as a red line with circle markers.
plt.plot(ages,estimated_charges,'r-o');
plt.xlabel('Age');
plt.ylabel('Estimated Charges');

In [29]:
# Overlay the estimated line on the actual charges of non-smokers.
target = non_smoker_df.charges

plt.plot(ages, estimated_charges, 'r', alpha=0.9);
# s is the marker area; it was 0, which made every actual point invisible.
# 8 matches the marker size used by try_parameters.
plt.scatter(ages, target, s=8, alpha=0.8);
plt.xlabel('Age');
plt.ylabel('Charges')
plt.legend(['Estimate', 'Actual']);

In [30]:
def try_parameters(w,b):
    """Plot the line ``w * age + b`` over the non-smokers' actual charges.

    Reads the module-level ``non_smoker_df`` and draws on the current
    matplotlib figure; nothing is returned.

    Parameters
    ----------
    w : number
        Slope of the candidate line.
    b : number
        Intercept of the candidate line.
    """
    age_values = non_smoker_df['age']
    actual_charges = non_smoker_df['charges']
    line_charges = estimate_charges(age_values, w, b)

    # Red line for the estimate, small dots for the observed charges.
    plt.plot(age_values, line_charges, 'r', alpha=0.9)
    plt.scatter(age_values, actual_charges, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

In [31]:
# Visual check of the candidate line with w=60, b=200.
try_parameters(60,200)

In [32]:
# Visual check of a steeper line with a negative intercept (w=300, b=-4500).
try_parameters(300,-4500)

In [33]:
# Reuse the module-level estimates (computed earlier with w=50, b=100) as the
# predictions and display them.
predictions = estimated_charges
predictions

In [34]:
!pip install numpy --quiet

In [35]:
import numpy as np

In [36]:
def rmse(targets, predictions):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    targets : array-like supporting element-wise subtraction
        Observed values.
    predictions : array-like supporting element-wise subtraction
        Predicted values, compatible in shape with ``targets``.

    Returns
    -------
    The square root of the mean of the squared differences.
    """
    residuals = targets - predictions
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)

In [37]:
# Reset to the first hand-picked guess and plot it again.
w, b = 50, 100
try_parameters(w, b)

In [38]:
# Actual charges and the charges predicted by the current (w, b) for non-smokers.
targets = non_smoker_df.charges
predicted = estimate_charges(non_smoker_df['age'], w, b)

In [39]:
# RMSE of the current (w, b) line against the non-smokers' actual charges.
rmse(targets,predicted)

In [40]:
def try_parameters(w,b):
    """Plot the line ``w * age + b`` against actual charges and print its RMSE.

    Replaces the earlier definition of the same name; in addition to the plot
    it prints the RMSE loss of the candidate line. Reads the module-level
    ``non_smoker_df`` and returns nothing.

    Parameters
    ----------
    w : number
        Slope of the candidate line.
    b : number
        Intercept of the candidate line.
    """
    age_values = non_smoker_df['age']
    actual_charges = non_smoker_df['charges']
    predicted_charges = estimate_charges(age_values, w, b)

    # Red line for the prediction, small dots for the observed charges.
    plt.plot(age_values, predicted_charges, 'r', alpha=0.9)
    plt.scatter(age_values, actual_charges, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Prediction', 'Actual'])

    print("RMSE Loss:", rmse(actual_charges, predicted_charges))

In [41]:
# Plot and score another (w, b) pair.
try_parameters(267.248,-2901.4200)

In [42]:
!pip install scikit-learn --quiet

In [43]:
 from sklearn.linear_model import LinearRegression


In [44]:
# Create an unfitted linear regression model.
model = LinearRegression()

In [45]:
# Double brackets keep `inputs` two-dimensional (a one-column DataFrame);
# `targets` is the matching Series of charges.
inputs, targets = non_smoker_df[['age']], non_smoker_df['charges']
print('inputs.shape:', inputs.shape)
print('targets.shape', targets.shape)

In [46]:
# Fit the model on the current `inputs` and `targets`.
model.fit(inputs,targets)

In [47]:
# Predict charges for ages 23, 37 and 61. The model was fitted on a DataFrame
# with an 'age' column, so pass a DataFrame with the same column name; a bare
# ndarray makes scikit-learn warn that X has no valid feature names.
model.predict(pd.DataFrame({'age': [23, 37, 61]}))

In [48]:
# Learned coefficient(s), one per input column.
model.coef_

In [49]:
# Learned intercept term.
model.intercept_

In [50]:
# Encode the two binary categorical columns as 0/1 integer columns.
smoker_codes = dict(no=0, yes=1)
sex_codes = dict(female=0, male=1)
medical_df['smoker_code'] = medical_df['smoker'].map(smoker_codes)
medical_df['sex_code'] = medical_df['sex'].map(sex_codes)

In [51]:
from sklearn import preprocessing

# Fit a one-hot encoder on the region column (fit returns the encoder itself)
# and show the categories it found.
enc = preprocessing.OneHotEncoder().fit(medical_df[['region']])
enc.categories_

In [52]:
# Encode the region column and convert the result to a dense array, then display it.
one_hot = enc.transform(medical_df[['region']]).toarray()
one_hot

In [53]:
# Add one indicator column per region.
# NOTE(review): the hard-coded names must be in the same order as
# enc.categories_ — confirm against the output of the cell that fitted `enc`.
medical_df[['northeast', 'northwest', 'southeast', 'southwest']] = one_hot

In [54]:
# Display the DataFrame, now including the added code and region indicator columns.
medical_df

In [55]:
# Train a linear model on every feature: the numeric columns, the two binary
# codes and the four region indicators.
input_cols = (
    ['age', 'bmi', 'children', 'smoker_code', 'sex_code']
    + ['northeast', 'northwest', 'southeast', 'southwest']
)
inputs = medical_df[input_cols]
targets = medical_df['charges']

model = LinearRegression()
model.fit(inputs, targets)

# Score the model on the same rows it was trained on.
predictions = model.predict(inputs)
loss = rmse(targets, predictions)
print('Loss:', loss)

In [56]:
from sklearn.preprocessing import StandardScaler

In [57]:
# Fit a StandardScaler on the three numeric feature columns.
numeric_cols = ['age', 'bmi', 'children'] 
scaler = StandardScaler()
scaler.fit(medical_df[numeric_cols])

In [58]:
# Apply the fitted scaler to the numeric columns and display the result.
scaled_inputs = scaler.transform(medical_df[numeric_cols])
scaled_inputs

In [59]:
# Collect the already-encoded categorical columns as a plain array.
cat_cols = ['smoker_code', 'sex_code', 'northeast', 'northwest', 'southeast', 'southwest']
categorical_data = medical_df[cat_cols].values

In [60]:
# Fit a model on the standardized numeric columns plus the categorical columns
# so that the weights are on a comparable scale. Previously the table showed
# the coefficients of the model trained on the raw columns, leaving
# `scaled_inputs` and `categorical_data` unused and the ranking misleading.
# A separate name is used so the existing `model` and `inputs` stay untouched.
scaled_model = LinearRegression().fit(
    np.concatenate((scaled_inputs, categorical_data), axis=1),
    targets,
)

weights_df = pd.DataFrame({
    # Column order matches the concatenation above; the trailing 1 labels the
    # intercept row.
    'feature': np.append(numeric_cols + cat_cols, 1),
    'weight': np.append(scaled_model.coef_, scaled_model.intercept_)
})
weights_df.sort_values('weight', ascending=False)

In [61]:
from sklearn.model_selection import train_test_split

In [62]:
# Hold out 10% of the rows for testing. A fixed random_state makes the split,
# and therefore the reported losses, reproducible across kernel restarts.
inputs_train, inputs_test, targets_train, targets_test = train_test_split(
    inputs, targets, test_size=0.1, random_state=42
)

In [63]:
# Train on the training split only, then score on the held-out rows.
model = LinearRegression()
model.fit(inputs_train, targets_train)

predictions_test = model.predict(inputs_test)
loss = rmse(targets=targets_test, predictions=predictions_test)
print('Test Loss:', loss)

In [64]:
# Score the same model on the rows it was trained on, for comparison with the test loss.
predictions_train = model.predict(inputs_train)
loss = rmse(targets=targets_train, predictions=predictions_train)
print('Training Loss:', loss)