In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the raw table from disk
data = pd.read_csv('insurance.csv')

# Treat the following features as categorical, replacing every value with the
# integer code of its category (all other columns are left as loaded)
data = data.assign(**{
    column: data[column].astype('category').cat.codes
    for column in ('sex', 'smoker', 'region')
    })

# Features are every column except the target; the target is 'charges'
X, y = data.drop(columns = 'charges'), data['charges']

In [None]:
# Wrapper function for saving figures

def save_figure(name):
    """Write the current matplotlib figure to ``name + '.png'``.

    Parameters
    ----------
    name : str
        Output path without the extension; '.png' is appended.

    The figure is saved with a tight bounding box and an opaque white
    background.
    """
    target = name + '.png'
    options = dict(
        bbox_inches = 'tight',
        facecolor = 'white',
        transparent = False
        )
    plt.savefig(target, **options)

In [None]:
# Basic visualization

# Count of missing values per column (kept for inspection; not displayed here)
null_check = data.isnull().sum()

# One scatterplot of charges against each of these numeric features
for feature in ('bmi', 'age'):
    plt.figure()
    sns.scatterplot(data = data, x = feature, y = 'charges')
    save_figure('scatter_' + feature)

# Pairwise correlation between the features only (the target is not in X)
plt.figure(figsize = (10, 8))
feature_corr = X.corr()
sns.heatmap(feature_corr, cmap = 'Blues')
save_figure('heatmap')

#sns.pairplot(data)

In [None]:
# Decision tree model
#
# Grid-search the tree depth with 5-fold cross-validation. No `scoring` is
# passed to GridSearchCV, so the reported score is the regressor's own
# `score`, i.e. the coefficient of determination (R^2), not an accuracy.

# Note that cost complexity pruning does not appreciably improve model performance.

dt_params = {
    'max_depth': np.arange(3, 11, 1),
    #'ccp_alpha': np.linspace(0, 0.5)
    }

dt_search = GridSearchCV(
    # Fixed seed so that re-running the cell selects the same tree
    DecisionTreeRegressor(random_state = 0),
    dt_params,
    cv = 5,
    n_jobs = 8
    )

dt_search.fit(X, y)

# Pair each importance with its column name so the printout is readable
dt_importances = pd.Series(
    dt_search.best_estimator_.feature_importances_,
    index = X.columns
    ).to_dict()

print(
    'Best parameters: ', dt_search.best_params_, '\n',
    'Best R^2: ', dt_search.best_score_, '\n',
    'Feature importances: ', dt_importances
    )

In [None]:
# Random forest model
#
# Grid-search the number of trees and the tree depth with 5-fold
# cross-validation. No `scoring` is passed to GridSearchCV, so the reported
# score is the regressor's own `score`, i.e. R^2, not an accuracy.

rf_params = {
    'n_estimators': np.arange(50, 525, 25),
    'max_depth': np.arange(3, 11, 1),
    }

rf_search = GridSearchCV(
    # Fixed seed: the forest is randomized, so without it the selected
    # parameters and score change from run to run
    RandomForestRegressor(random_state = 0),
    rf_params,
    cv = 5,
    n_jobs = 8
    )

rf_search.fit(X, y)

print(
    'Best parameters: ', rf_search.best_params_, '\n',
    'Best R^2: ', rf_search.best_score_,
    )

In [None]:
# AdaBoost model
#
# Grid-search the weak learner's depth, the number of boosting rounds and the
# learning rate with 5-fold cross-validation. No `scoring` is passed to
# GridSearchCV, so the reported score is the regressor's R^2, not an accuracy.

# One candidate weak learner per tree depth
base_est = [DecisionTreeRegressor(max_depth = n) for n in np.arange(3, 11, 1)]

# Fixed seed so that re-running the cell gives the same result
ab_model = AdaBoostRegressor(random_state = 0)

# scikit-learn 1.2 renamed the `base_estimator` parameter to `estimator` and
# 1.4 removed the old name; use whichever keyword the installed version has.
if 'estimator' in ab_model.get_params(deep = False):
    base_est_key = 'estimator'
else:
    base_est_key = 'base_estimator'

ab_params = {
    base_est_key: base_est,
    'n_estimators': np.arange(10, 55, 5),
    # num = 50 is np.linspace's default, spelled out because it sets the grid size
    'learning_rate': np.linspace(0.01, 0.2, num = 50),
    }

ab_search = GridSearchCV(
    ab_model,
    ab_params,
    cv = 5,
    n_jobs = 8
    )

ab_search.fit(X, y)

print(
    'Best parameters: ', ab_search.best_params_, '\n',
    'Best R^2: ', ab_search.best_score_
    )

In [None]:
# Gradient boosting model
#
# Grid-search the number of boosting stages, the learning rate and the tree
# depth with 5-fold cross-validation. No `scoring` is passed to GridSearchCV,
# so the reported score is the regressor's R^2, not an accuracy.

gb_params = {
    'n_estimators': np.arange(10, 155, 10),
    # num = 50 is np.linspace's default, spelled out because it sets the grid size
    'learning_rate': np.linspace(0.01, 0.3, num = 50),
    'max_depth': np.arange(3, 11, 1)
    }

gb_search = GridSearchCV(
    # Fixed seed so that re-running the cell gives the same result
    GradientBoostingRegressor(random_state = 0),
    gb_params,
    cv = 5,
    n_jobs = 8
    )

gb_search.fit(X, y)

print(
    'Best parameters: ', gb_search.best_params_, '\n',
    'Best R^2: ', gb_search.best_score_
    )

In [None]:
# Generate scatterplots for each model

def plot_scatter(ytrue, yhat, name, limits = (0, 70000)):
    """Draw predicted against true values and save the figure via ``save_figure``.

    Parameters
    ----------
    ytrue : array-like
        Observed target values, drawn on the x axis (labelled 'True').
    yhat : array-like
        Predictions for the same rows, drawn on the y axis (labelled
        'Predicted').
    name : str
        Output name handed to ``save_figure``.
    limits : pair of numbers, optional
        Lower and upper bound applied to both axes, so the two axes share
        one scale. Defaults to (0, 70000), the range that was previously
        hard-coded.
    """
    plt.figure()
    sns.scatterplot(x = ytrue, y = yhat)
    plt.xlabel('True')
    plt.ylabel('Predicted')
    # Same window on both axes
    plt.xlim(limits)
    plt.ylim(limits)
    save_figure(name)

# Predictions from the best estimator found by each search.
# NOTE(review): these are made on the same X the searches were fitted on, so
# the resulting plots show in-sample fit, not held-out performance.
dt_yhat, rf_yhat, ab_yhat, gb_yhat = (
    search.best_estimator_.predict(X)
    for search in (dt_search, rf_search, ab_search, gb_search)
    )

# One predicted-vs-true figure per model
for model_yhat, figure_name in zip(
        (dt_yhat, rf_yhat, ab_yhat, gb_yhat),
        ('dt_scatter', 'rf_scatter', 'ab_scatter', 'gb_scatter')
        ):
    plot_scatter(y, model_yhat, figure_name)