Import Packages and Load Data 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 


from google.colab import drive
# Make Google Drive visible to the Colab runtime under /content/drive.
drive.mount('/content/drive')

# Location of the dataset inside the mounted Drive; read_csv parses it with
# its default comma separator into a DataFrame.
path = "/content/drive/MyDrive/Resume Supporting Material /Personal Projects/Medical Costs Predictor /medicalcharges.txt"
data = pd.read_csv(filepath_or_buffer=path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Preview the first ten records as plain text.
print(data.head(10))

# A DataFrame's shape is the pair (number of rows, number of columns).
rows, columns = data.shape

print('# rows = {}, # columns = {}'.format(rows,columns))


In [None]:
# Print a structural summary of the frame (info() writes its report to stdout).
data.info()
# describe() is the last expression, so its summary-statistics table becomes
# the cell's displayed output.
data.describe()

Distribution of Charges 

In [None]:
# Histogram of the target variable: how are treatment charges distributed?
data['charges'].plot(kind='hist')
plt.title("Distribution of Charges")  # corrected the "Dsitribution" typo
plt.xlabel("Charges")
plt.ylabel("Frequency")
# show must be *called*: the bare name `plt.show` is a no-op expression that
# only echoes the function object as the cell output.
plt.show()

Correlation between Smoking and Treatment Cost 

In [None]:
# Split the patients into two groups according to the smoker flag.
smokers = data.loc[data['smoker'] == 'yes']
non_smokers = data.loc[data['smoker'] == 'no']

# Report the size of each group.
print("# Smokers = {} # Non Smokers = {}".format(len(smokers), len(non_smokers)))

# One wide figure holding two side-by-side panels (1 row x 2 columns).
fig = plt.figure(figsize=(12, 5))

# Left panel: charges of smokers.
ax = fig.add_subplot(1, 2, 1)
ax.hist(smokers['charges'])
ax.set_title('Distribution of charges for smokers')

# Right panel: charges of non-smokers.
ax = fig.add_subplot(1, 2, 2)
ax.hist(non_smokers['charges'])
ax.set_title('Distribution of charges for non-smokers')

Correlation between Age and Cost of Treatment 



In [None]:
# One point per patient: age on the x-axis, charges on the y-axis.
plt.scatter(data['age'], data['charges'])
plt.xlabel("Age")
plt.ylabel("Charges")
plt.title("Cost of treatment for different ages")
plt.show()

Correlation between Age, Smoking, and Cost of Treatment

In [None]:
# Overlay both groups on one age-vs-charges scatter: smokers in red first,
# then non-smokers in blue.
for group, colour in ((smokers, 'r'), (non_smokers, 'b')):
    plt.scatter(group['age'], group['charges'], color=colour)
plt.xlabel('Age')
plt.ylabel('Charges')
plt.title("Charges with age for smokers(red) and non- smokers (blue)")
plt.show()

Correlation between BMI and Treatment Cost 

In [None]:
# Histogram showing the distribution of BMI across all patients.
data["bmi"].plot(kind = 'hist')
plt.title("BMI distribution")
plt.xlabel("BMI")
plt.ylabel("Frequency")
plt.show()
print('')

# Categorize patients by BMI (healthy, overweight, obese) using the notebook's
# cut-offs 18 / 24 / 29. The intervals are half-open [lower, upper) so that a
# BMI of exactly 24 or 29 falls into exactly one group; with strict
# inequalities on both sides those patients belonged to no group at all.
# Patients with a BMI below 18 are not part of any of the three groups.
obese = data[data.bmi >= 29]
overweight = data[(data.bmi >= 24) & (data.bmi < 29)]
healthy = data[(data.bmi >= 18) & (data.bmi < 24)]
print('There are {} obese, {} overweight and {} healthy individuals.'.format(obese.shape[0], overweight.shape[0], healthy.shape[0]))
print('')

# Compare the three groups with respect to treatment cost.
# Passing the groups as one list makes matplotlib use common bins and draw the
# bars side by side, so no group is hidden behind another; the legend states
# which colour is which.
plt.hist(
    [obese.charges, overweight.charges, healthy.charges],
    color=['r', 'y', 'g'],
    label=['Obese', 'Overweight', 'Healthy'],
)
plt.title("Charges distribution")
plt.xlabel("Charges")
plt.ylabel("Frequency")
plt.legend()
plt.show()

In [None]:
# Correlate only the numeric columns. At this point in the notebook `sex` and
# `smoker` still hold strings, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 (older versions silently skipped them, so the
# resulting matrix is the same there).
corrMatrix = data.select_dtypes(include='number').corr()
sn.heatmap(corrMatrix, annot= True)
# show must be called; the bare name `plt.show` only echoes the function object.
plt.show()

Data Preprocessing

In [None]:
# Tally the missing entries in every column.
data.isna().sum()

In [None]:
# Discard every record that has at least one missing field, then re-count the
# missing entries per column to confirm none remain.
data = data.dropna(axis=0, how='any')
data.isna().sum()

In [None]:
# Drop the region feature.
# Reassigning (instead of inplace=True) and tolerating an already-absent column
# keeps this cell safe to re-run: the in-place version raised KeyError on a
# second execution because "region" had already been removed.
data = data.drop(columns="region", errors="ignore")
data.head()

Convert Categorical to Numerical

In [None]:
# Lookup table that turns the textual sex labels into integer codes.
gender = {'male':0, 'female':1}
# Overwrite the sex column with each label's code; a label that is not a key
# of the table raises KeyError.
data['sex'] = data['sex'].map(lambda label: gender[label])

In [None]:
# Display the first rows of the frame to inspect the result of the sex encoding.
data.head()

In [None]:
# Lookup table that turns the yes/no smoker flag into an integer code.
smoking = {'no':0, 'yes':1}
# Overwrite the smoker column with each flag's code; a value that is not a key
# of the table raises KeyError.
data['smoker'] = data['smoker'].map(lambda flag: smoking[flag])
data.head()

In [None]:
# Max-scaling: divide every column by its own largest value.
# Every column of `data` is rescaled, including the target column.
data_max = data.max(axis=0)
data = data.div(data_max, axis='columns')
data.describe()

Model Training and Testing 

In [None]:
# Inputs: every column except the last one (charges), as a NumPy array.
x = data.iloc[:, :-1].values
# Output: the last column (charges), as a NumPy array.
y = data.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=40,
)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator).
model = LinearRegression().fit(x_train, y_train)
# score() gives the coefficient of determination (R^2) on the held-out test split.
r_squared = model.score(x_test, y_test)
print('Coefficient of Determination {}'.format(r_squared))

In [None]:
# Predict the response for every row of x (training and test rows alike).
y_pred = model.predict(x)
# Heading on one line, predictions on the next.
print('predicted response:')
print(y_pred)

In [None]:
# Display the weight (fitted coefficient) of each input feature.
# The feature names are all column labels except the last one, matching the
# columns that were used to build x.
columns_names = data.columns[0:-1].values
features_importance = model.coef_
plt.barh(columns_names, features_importance)
plt.title('Feature Importance')
plt.xlabel('Weight')
plt.ylabel('Feature')
# show must be called; the bare name `plt.show` only echoes the function object.
plt.show()