## Overview of the ML Pipeline in Python Jupyter Notebook
Begin by importing the data from an S3 bucket.

In [None]:
import pandas as pd

# Read the dataset straight from S3. If you completed the Basic SageMaker Examples file,
# the CSV is already sitting in the default SageMaker bucket.
import sagemaker

bucket = sagemaker.Session().default_bucket()
insurance_uri = f's3://{bucket}/projectdata/insurance.csv'
df = pd.read_csv(insurance_uri)  # If connecting to S3 bucket

# Alternative 1 - Hard code the s3 path instead
# df = pd.read_csv('s3://bucketname/path/insurance.csv') # If connecting to S3 bucket

# Alternative 2 - Upload the file directly into the SageMaker filesystem and read it locally
# df = pd.read_csv('data/insurance.csv')

df.head()

# Later, to write back to CSV:
# df.to_csv('s3://bucket-name/file.csv', index=False)

  
    
    
    
    
## Exploratory Data Analysis (i.e. Data Understanding Phase)
### Begin with univariate analyses

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of df
df.describe()

In [None]:
# Skewness of each numeric column. df also holds string columns (sex, smoker, region),
# which make skew() raise on recent pandas unless they are excluded explicitly.
df.skew(numeric_only=True)


  
### Continue with bivariate analyses

In [None]:
# Pairwise Pearson correlations between the numeric columns only. The string columns
# (sex, smoker, region) are filtered out first because corr() raises on them in recent pandas.
df.select_dtypes(include='number').corr()

In [None]:
import seaborn as sns
# Grid of pairwise plots across the columns of df; the trailing ';' hides the returned object's repr
sns.pairplot(df);

In [None]:
# Apply seaborn's theme; color_codes=True remaps matplotlib's shorthand color codes ("b", "k", ...) to the seaborn palette
sns.set(color_codes=True)
# Joint distribution of bmi vs. charges with the marginal distributions on the sides
sns.jointplot(x='bmi', y='charges', data=df);

In [None]:
# Hexbin plot
# axes_style() called on its own only *returns* a style dict and changes nothing;
# used as a context manager it applies the "white" style to this figure only.
with sns.axes_style("white"):
    sns.jointplot(data=df, x='bmi', y='charges', kind="hex", color="k")

In [None]:
# Kernel density plot: bivariate KDE of bmi vs. charges with marginal densities
sns.jointplot(data=df, x="bmi", y="charges", kind="kde");

In [None]:
import matplotlib.pyplot as plt

# Deep contour plot: filled bivariate KDE of bmi vs. charges with many contour levels
f, ax = plt.subplots(figsize=(6, 6))
cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
# Current seaborn requires keyword x/y and uses `levels`/`fill` in place of the
# deprecated `n_levels`/`shade`; draw on the axes created above rather than relying
# on the implicit "current axes".
sns.kdeplot(data=df, x='bmi', y='charges', cmap=cmap, levels=60, fill=True, ax=ax);

In [None]:
# Charges aggregated per region (barplot's default estimator is the mean, drawn with error bars)
sns.barplot(data=df, x="region", y="charges");

In [None]:
from numpy import median
# Median charges by sex, split by smoker status; error bars show the standard deviation.
# NOTE(review): newer seaborn releases deprecate `ci=` in favour of `errorbar=` - confirm against the installed version.
sns.barplot(data=df, x="sex", y="charges", hue="smoker", estimator=median, ci="sd", capsize=.2, palette="Blues_d");

In [None]:
# Do smokers cost significantly more than non-smokers?

from scipy import stats

smoker_yes = df[df.smoker == 'yes']
smoker_no = df[df.smoker == 'no']

# Independent two-sample t-test on charges. SciPy's default assumes equal group
# variances; pass equal_var=False for Welch's test if that assumption is doubtful.
t, p = stats.ttest_ind(smoker_yes.charges, smoker_no.charges)

print('t-Statistic:\t' + str(round(t, 2)))
# Rounding p to two decimals prints very small p-values as 0.0, which hides how
# strong the evidence is; show three significant digits instead.
print(f'p-value:\t{p:.3g}')

In [None]:
# Do men cost significantly more than women?

men = df[df.sex == 'male']
women = df[df.sex == 'female']

# Independent two-sample t-test on charges (SciPy default: equal variances assumed)
t, p = stats.ttest_ind(men.charges, women.charges)

print('t-Statistic:\t' + str(round(t, 2)))
# Two-decimal rounding collapses small p-values (e.g. 0.004) to 0.0; keep three
# significant digits so the value can be compared against a threshold.
print(f'p-value:\t{p:.3g}')

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor # Import Decision Tree Regression algorithm
from sklearn.ensemble import GradientBoostingRegressor # Import scikit-learn's Gradient Boosting regressor (this is not the XGBoost library)
from sklearn.model_selection import train_test_split # Import train_test_split function
# For a complete list of available algorithms: https://scikit-learn.org/stable/supervised_learning.html
# Which one should I use?: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [None]:
# Create dummy codes for every non-numeric column. The label (charges) is numeric,
# so it is left untouched.
categorical_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]

# Encode all categorical columns in one call. dtype=int is forced because pandas >= 2.0
# produces bool dummies by default, and a later numeric-only column selection would
# silently drop them from the feature matrix.
# The guard keeps the cell safe to re-run once everything is already numeric.
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, dtype=int)

df.head()

In [None]:
# Split dataset in features and target variable

y = df.charges # Label
X = df.drop(columns=['charges']) # Features
# Keep numeric AND boolean columns: one-hot dummy columns are bool on pandas >= 2.0
# and would be discarded by a numeric-only filter, leaving the model without the
# categorical features.
X = X.select_dtypes(include=['number', 'bool'])
X.head()

In [None]:
# Hold out 30% of the rows for testing (70% training); the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
X_train.head()

In [None]:
# Create Decision Tree regressor object.
# A fixed random_state makes the fitted tree (and the metrics computed from it)
# reproducible across Restart & Run All.
clf = DecisionTreeRegressor(random_state=1)

# Train Decision Tree regressor
clf = clf.fit(X_train, y_train)

# Predict the labels for test dataset
y_pred = clf.predict(X_test)

In [None]:
# Actual vs. predicted charges side by side; show the first ten test rows
output_df = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
output_df.head(10)

In [None]:
# Fit metrics for the current predictions. Complete list of scikit-learn metrics:
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
from sklearn import metrics

r_squared = metrics.r2_score(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
rmse = metrics.mean_squared_error(y_test, y_pred)**(1/2)  # square root of the MSE

print(f'R squared:\t{r_squared}')
print(f'MAE:\t\t{mae}')
print(f'RMSE:\t\t{rmse}')

In [None]:
# Create gradient boosting regressor object (scikit-learn's GradientBoostingRegressor,
# not the XGBoost library). The fixed random_state keeps results reproducible.
clr = GradientBoostingRegressor(random_state=1)

# Train the gradient boosting regressor.
# This must call fit on `clr` itself: fitting `clf` here would retrain the decision
# tree and rebind `clr` to it, so the boosting model would never be trained or evaluated.
clr = clr.fit(X_train, y_train)

# Predict the labels for test dataset
y_pred = clr.predict(X_test)

In [None]:
# Actual vs. predicted charges side by side; show the first ten test rows
output_df = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
output_df.head(10)

In [None]:
from sklearn import metrics

# Compute the fit metrics for the current predictions first, then report them
r_squared = metrics.r2_score(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
rmse = metrics.mean_squared_error(y_test, y_pred)**(1/2)  # square root of the MSE

print(f'R squared:\t{r_squared}')
print(f'MAE:\t\t{mae}')
print(f'RMSE:\t\t{rmse}')

In [None]:
import pickle

# Save the model with the highest fit metric
# The context manager guarantees the file is flushed and closed even if dump() fails;
# a bare open() would leave the handle open until garbage collection.
with open('stored_model.sav', 'wb') as model_file:
    pickle.dump(clr, model_file)  # OPTION 1: pickle

In [None]:
# ...some time later

import pickle
import numpy as np

# OPTION 1: Using pickle
# load the model from 'stored_model.sav'
# SECURITY: unpickling executes arbitrary code - only load model files you created or trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('stored_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
print(type(loaded_model))

# for a single prediction, enter a row of data and reshape into numpy array
# The values must follow the same column order as the training features.
# NOTE(review): these values look pre-scaled, but no scaling step is visible in this
# notebook - confirm they match the units of the features the model was fitted on.
case = [0.543478, 0.245359, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]
print(f'Single prediction {case}: {loaded_model.predict(np.array(case).reshape(1, -1))[0]}\n')

# for a batch prediction, enter a Pandas DataFrame or a Numpy array of arrays
predictions = loaded_model.predict(X_test)
batch_results = pd.DataFrame({'Actual':y_test, 'Predicted':predictions, 'Diff':(predictions - y_test)})
print(f'MAE:\t{batch_results.Diff.abs().mean()}\n')
batch_results.head(5)