<a href="https://colab.research.google.com/github/valenlopez993/Linear_Regression_Health_Costs_Calculator/blob/main/linear_regression_health_costs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Getting the data

In [None]:
import sys
import os

if 'google.colab' in sys.modules:
    !wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
else:
    import subprocess

    def runCommand(command, verbose=False):
        """Run ``command`` through the system shell and return its exit status.

        Args:
            command: Command line handed to the shell as a single string.
            verbose: When true, print the captured stdout (stripped) followed
                by the captured stderr.

        Returns:
            The exit status reported for the command (0 means success).
        """
        # shell=True is kept because callers pass one command-line string;
        # only hard-coded, trusted commands should be given to this helper.
        process = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            shell=True,
        )
        std_out, std_err = process.stdout, process.stderr
        if verbose:
            print(std_out.strip(), std_err)
        elif process.returncode != 0:
            # A failed command used to pass unnoticed in quiet mode; report it
            # so the problem shows up here rather than in a later step.
            print(
                "Command failed with exit status {}: {}\n{}".format(
                    process.returncode, command, std_err.strip()
                ),
                file=sys.stderr,
            )
        return process.returncode
            
    files = os.listdir()
    if 'insurance.csv' not in files:
        runCommand("wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv")

In [None]:
# Read the CSV from the working directory into a DataFrame.
dataset = pd.read_csv('insurance.csv')
# Bare trailing expression so the notebook displays the final rows.
dataset.tail()

## Preprocessing the Data

First, all categorical columns (`sex`, `smoker` and `region`) must be converted to numeric codes.

In [None]:
# Replace each categorical column with the integer codes of its categories.
for column_name in ('sex', 'smoker', 'region'):
    dataset[column_name] = pd.Categorical(dataset[column_name]).codes

# Bare trailing expression so the notebook displays the encoded rows.
dataset.tail()

The data will be split into a `train_dataset` and a `test_dataset`, in proportions of 80% and 20% respectively.

In [None]:
# 80% of the rows go to training, the rest to testing; the fixed
# random_state makes the split reproducible between runs.
train_dataset, test_dataset = train_test_split(
    dataset,
    train_size=0.8,
    random_state=42,
)

Since the health cost is what we want to predict, the `expenses` column is used as the label for the model. As with the features, this gives us `train_labels` and `test_labels`.

In [None]:
# pop() removes the 'expenses' column from each frame in place and hands it
# back, so the frames keep only the features afterwards.
train_labels, test_labels = (
    frame.pop('expenses') for frame in (train_dataset, test_dataset)
)

## Linear Regression Model

In [None]:
# Feed-forward regressor: three ReLU hidden layers, each followed by dropout,
# then a single-unit output layer with no activation argument.
# NOTE(review): input_dim=6 presumably matches the six feature columns left
# after removing `expenses` -- confirm against the CSV.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_dim=6),
    tf.keras.layers.Dropout(0.15),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(40, activation='relu'),
    tf.keras.layers.Dropout(0.18),
    tf.keras.layers.Dense(1),
])

# Optimise mean squared error; MAE and MSE are also tracked as metrics.
model.compile(
    optimizer='adam',
    loss="mean_squared_error",
    metrics=['mae', 'mse'],
)

In [None]:
# Train on the training features/labels for 300 epochs.
model.fit(
    x=train_dataset,
    y=train_labels,
    epochs=300,
)

## Making Predictions

In [None]:
# Score the model on the test split. evaluate() is unpacked as the loss
# followed by the two metrics passed to compile() ('mae', 'mse').
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Plot predictions.
test_predictions = model.predict(test_dataset).flatten()

a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
# Derive the shared axis limit from the data: a fixed 50000 cap would draw
# any larger true or predicted value outside the visible area. The old cap is
# kept as a minimum so the view is unchanged when everything fits below it.
axis_max = max(float(test_labels.max()), float(test_predictions.max()))
lims = [0, max(50000, axis_max * 1.05)]
plt.xlim(lims)
plt.ylim(lims)
# Identity line: points on it are predictions equal to the true value.
_ = plt.plot(lims, lims)
