## Regression AutoML demo with PyCaret

In [None]:
conda install -p C:/Users/sudha/miniconda3/envs/pycaretenv ipykernel --update-deps --force-reinstall

In [None]:
# Presumably a smoke test that the selected kernel executes code — TODO confirm it is still needed
print("hello world")

### 1. Import libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# If VS Code Pylance marks the pycaret imports below as unresolved while the code still runs,
# the Jupyter kernel is using the correct conda environment but Pylance is pointed at a
# different Python interpreter (possibly from another conda environment).
# To fix this, open View -> Command Palette -> "Python: Select Interpreter"
# and choose the interpreter that belongs to this environment.
from pycaret.datasets import get_data
# NOTE(review): this wildcard import is what brings setup, get_config, compare_models,
# create_model, tune_model, predict_model, plot_model, finalize_model and save_model
# into scope for the cells below.
from pycaret.regression import *

# Render matplotlib figures at 300 DPI
mpl.rcParams['figure.dpi'] = 300

### 2. Load dataset

In [None]:
# This downloads the insurance.csv from the location
# https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/insurance.csv
# df = get_data('insurance')

# Some ISPs' name servers can't resolve this URL correctly (e.g. Jio).
# The alternative is to load the CSV manually (a copy is provided locally in this project).

In [None]:
# Read the insurance data from the CSV file (relative path, so it is looked up
# in the current working directory) and preview the first rows.
df = pd.read_csv("insurance.csv")
df.head()
# df.tail()

### 3. Mini EDA (Look for detailed EDA template later in the course)

In [None]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes
df.info()

#### 3.1 Different Plots

1. Certain plots can be done directly from Pandas
2. General plots can be done from matplotlib
3. Beautiful and powerful plots can be done with Seaborn
4. Interactive plots can be done with Plotly (community edition is sufficient)
5. 3D plots can be done with mpl_toolkits.mplot3d

In [None]:
# Numeric columns of the dataset (the last one, 'charges', is the regression target)
numeric = ['age', 'bmi', 'children', 'charges']

# Frequency histograms drawn directly by pandas, one subplot per numeric column:
# the x-axis is the column's value, the y-axis is the number of rows in each bin.
df.hist(column=numeric, bins=20, figsize=(8, 8))
plt.show()

##### 3.1.1 Digression: demonstration of various constructs used in subsequent code

In [None]:
# Demonstrate formatted string literals (f-strings) for debug printing
# https://note.nkmk.me/en/python-f-strings/
name_str_var, age_int_var, height_float_var = "John Smith", 25, 185.04567891

# Adjacent f-strings are concatenated; ':.3f' rounds the float to three decimals
string_val = (
    f"Name = {name_str_var}, "
    f"Age = {age_int_var}, "
    f"Height = {height_float_var:.3f}"
)
print(string_val)

In [None]:
# Build a small 2 x 2 integer array to demonstrate numpy's flatten()
array_2d = np.arange(1, 5).reshape(2, 2)
array_2d

In [None]:
# flatten() returns a one-dimensional copy of the array
array_2d.flatten()

In [None]:
# Demonstrate usage of value_counts() to generate a frequency table
categorical = ['smoker', 'sex', 'region']
col = categorical[0]  # first categorical column, i.e. 'smoker'
freq_series = df[col].value_counts()

# Use print with a formatted string instead of old-style concatenation
print(f"data type of freq_series={type(freq_series)}")
# Bare last expression so the notebook displays the frequency table itself
freq_series

In [None]:
# Bar chart of category frequencies, one subplot per categorical column
categorical = ['smoker', 'sex', 'region']
color = ['C0', 'C1', 'C2', 'C3']

# A 2 x 2 grid gives four subplots; each column is drawn on its own subplot
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))

# Inspect what plt.subplots handed back
print(f"data type of axes is {type(axes)}")  # the grid itself is a numpy ndarray
print(f"axes shape is {axes.shape}")  # 2 x 2
print(f"first element in axes is {axes[0,0]}")  # a single element of the grid

# Only three categorical columns are plotted, so switch the fourth subplot off
# (comment this line out to see the empty fourth axes drawn)
axes[1, 1].set_axis_off()

# Flatten the 2-D grid so it pairs up with the column names; zip stops after the third
for ax, col in zip(axes.flatten(), categorical):
    counts = df[col].value_counts()
    counts.plot.bar(ax=ax, color=color)
    ax.set_xlabel(col)

In [None]:
# Stacked histograms of 'charges', split by each categorical column in turn
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes[1, 1].set_axis_off()  # four subplots but only three columns: hide the spare one

for ax, col in zip(axes.flatten(), categorical):
    sns.histplot(data=df, x='charges', hue=col, multiple='stack', ax=ax)

In [None]:
# Columns for the pair plot; 'smoker' is included because it is used as the hue below
cols = ['age', 'bmi', 'charges', 'smoker']

# Pairwise plots of the selected columns, coloured by smoker status
sns.pairplot(df[cols], hue='smoker')
plt.show()

### 4. Initialize PyCaret environment

In [None]:
# Initialise the PyCaret regression experiment.
# The numeric features are every entry of `numeric` except the target, selected by
# name so the result does not depend on where 'charges' sits in that list.
target_col = 'charges'
reg = setup(data=df,
            target=target_col,
            train_size=0.8,        # 80 % of the rows for training, the rest held out
            session_id=145,        # fixed seed so the run is reproducible
            numeric_features=[c for c in numeric if c != target_col],
            categorical_features=categorical,
            transformation=True,   # apply a power transform (Yeo-Johnson by default) to make the data more Gaussian-like
            normalize=True         # rescale the features (z-score by default)
            )

In [None]:
# Display the object returned by setup()
reg

In [None]:
# Fetch the 'X_transformed' entry from the experiment configuration and preview it
# (presumably the feature matrix after PyCaret's preprocessing — TODO confirm)
df_transformed = get_config('X_transformed')
df_transformed.head()

### Comparing Regression Models

In [None]:
# Compare the candidate regressors, ranked by RMSE, and keep what compare_models returns
best_model = compare_models(sort='RMSE')
# Bare last expression so the notebook displays the selected estimator
best_model

### Creating the Model

In [None]:
# Train the estimator with ID 'gbr' using 10 cross-validation folds
model = create_model(estimator='gbr', fold=10)

### Tuning the Model

In [None]:
# Hyper-parameter search space for the gradient boosting regressor
params = {
        'learning_rate': [0.05, 0.08, 0.1],
        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
        # subsample is the fraction of training rows used per tree and must lie in (0, 1];
        # values above 1 are rejected by scikit-learn, so the grid stops at 1.0
        'subsample': [0.8, 0.9, 1.0],
        'n_estimators': [100, 200, 300, 400, 500]
    }

# Try 100 candidates drawn from the grid above, optimising RMSE over 10 CV folds
tuned_model = tune_model(model, optimize='RMSE', fold=10,
                         custom_grid=params, n_iter=100)

### Making Predictions

In [None]:
# Score the untuned model; no data is passed, so PyCaret predicts on its hold-out set
predictions = predict_model(model)
predictions.head()

In [None]:
# Same as above for the tuned model, so the two results can be compared
predictions2 = predict_model(tuned_model)
predictions2.head()

### Plotting the Model

In [None]:
# 'feature' plot for the tuned model, drawn at 4x scale
plot_model(tuned_model, plot='feature', scale=4)

In [None]:
# 'error' plot.
# NOTE(review): this plots `model` (the untuned estimator), whereas the feature plot
# and the finalize step use `tuned_model` — confirm this is intended.
plot_model(model, 'error')

### Finalizing and Saving the Model

In [None]:
# Finalize the tuned model (PyCaret refits it on the full dataset, hold-out included)
final_model = finalize_model(estimator=tuned_model)

# Persist the finalized model to disk under the name 'regression_model'
save_model(model=final_model, model_name='regression_model')