# Model Inspection

<a href="https://colab.research.google.com/github/thomasjpfan/ml-workshop-intermediate-2-of-2/blob/master/notebooks/03-model-inspection.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open and Execute in Google Colaboratory"></a>

In [None]:
# Install dependencies for google colab
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    %pip install -r https://raw.githubusercontent.com/thomasjpfan/ml-workshop-intermediate-2-of-2/master/requirements.txt

In [None]:
import sklearn
# The workshop material targets the scikit-learn 1.0 API; stop early on any
# other release rather than failing later in a less obvious way.
assert sklearn.__version__.startswith("1.0"), "Please install scikit-learn 1.0"

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
# Notebook-wide plotting defaults: seaborn theme with fonts scaled by 1.2.
sns.set_theme(font_scale=1.2)
# Default figure size of 12 x 8 (matplotlib interprets figsize in inches).
plt.rcParams['figure.figsize'] = [12, 8]
# Crop saved figures tightly around their contents.
plt.rcParams['savefig.bbox'] = 'tight'

# Show estimators as HTML diagrams instead of plain-text reprs in notebook output.
sklearn.set_config(display='diagram')

## Load the dataset

In [None]:
from sklearn.datasets import fetch_california_housing

california = fetch_california_housing(as_frame=True)
X, y = california.data, california.target

In [None]:
X.head()

In [None]:
y.head()

### Insert random data for demonstration

In [None]:
import numpy as np

# NOTE: despite the name, `ran_num` is not drawn from a random generator: it is
# the row counter 0, 1, ..., n_rows - 1 produced by `np.arange`. It is added
# before the train/test split below, so both splits contain the column.
X = X.assign(ran_num=np.arange(0, X.shape[0]))

### Split dataset

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)

## Train linear model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# Standardize every feature, then fit a ridge regression with default settings.
# The step names ('scale', 'reg') are used later to pull the fitted regressor
# out of the pipeline, e.g. ridge['reg'].coef_.
ridge = Pipeline([
    ('scale', StandardScaler()),
    ('reg', Ridge())
])
ridge.fit(X_train, y_train)

In [None]:
ridge.score(X_train, y_train)

In [None]:
ridge.score(X_test, y_test)

## Plot coefficients

Coefficients represent the relationship between a feature and the target assuming that all other features remain constant.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_linear_coef(coefs, names, ax=None, sorted=True):
    """Draw linear-model coefficients as a horizontal bar chart.

    Parameters
    ----------
    coefs : values placed in a single 'Coefficients' column, one per name.
    names : labels used as the index of that column (one bar per label).
    ax : axes to draw on; a new figure and axes are created when it is None.
    sorted : when truthy, the bars are ordered by coefficient value.

    Returns
    -------
    The axes the bars were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    coef_table = pd.DataFrame(coefs, columns=['Coefficients'], index=names)
    if sorted:
        coef_table = coef_table.sort_values(by='Coefficients')

    coef_table.plot(kind='barh', ax=ax)
    # Grey vertical reference line at zero, between negative and positive bars.
    ax.axvline(x=0, color='.5')
    return ax

plot_linear_coef(ridge['reg'].coef_, names=X_train.columns);

## Coefficient variability

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold

In [None]:
# Seed the splitter so the 25 fitted models (5 splits x 5 repeats), and the
# coefficient plots derived from them, are the same on every run. The seed
# matches the one used for the train/test split.
ridges_cv = cross_validate(
    ridge, X_train, y_train,
    cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=42),
    return_estimator=True)

In [None]:
ridges_cv

In [None]:
# One row per cross-validation model, one column per feature: the 'reg' step of
# each fitted pipeline supplies that model's coefficient vector.
ridge_coefs = pd.DataFrame(
   [model['reg'].coef_ for model in ridges_cv['estimator']],
   columns=X.columns
)

In [None]:
ridge_coefs.head()

### Plotting the variability of the coefficients

In [None]:
# `argsort` yields integer positions (not coefficient values) that order the
# features by mean coefficient; they are used for positional column selection.
sorted_ridge_coefs = ridge_coefs.mean().argsort()

In [None]:
ridge_coefs.iloc[:, sorted_ridge_coefs].boxplot(vert=False);

## Exercise 1

1. Use a `Lasso` to fit the training dataset with `alpha=0.06`. **Hint:** Be sure to use a pipeline.
2. Plot `Lasso`'s coefficients next to the `Ridge` coefficients. How do they differ? **Hint:** Use `plot_linear_coef`.
3. Use `RepeatedKFold` and `cross_validate` to check the variability of coefficients for `Lasso`.
    - **Hint**: Be sure to use `return_estimator=True`

In [None]:
from sklearn.linear_model import Lasso

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-intermediate-2-of-2/blob/master/notebooks/solutions/03-ex01-solutions.py).

In [None]:
# %load solutions/03-ex01-solutions.py

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

In [None]:
rf.score(X_train, y_train)

In [None]:
rf.score(X_test, y_test)

In [None]:
def plot_importances(importances, names, ax=None):
    """Plot feature importances as horizontal bars in ascending order.

    Parameters
    ----------
    importances : array-like with one importance value per feature. Lists and
        pandas Series are accepted as well as NumPy arrays.
    names : sequence of feature labels, aligned with ``importances``.
    ax : axes to draw on; a new figure and axes are created when it is None.

    Returns
    -------
    The axes the bars were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    # Convert up front: indexing with the argsort result below requires an
    # ndarray (a list would raise, a Series would index by label).
    importances = np.asarray(importances)
    order = np.argsort(importances)
    positions = range(len(importances))

    ax.barh(positions, importances[order])
    ax.set(yticks=positions, yticklabels=np.asarray(names)[order])
    return ax

In [None]:
importances = rf.feature_importances_
plot_importances(importances, X_train.columns);

Pay attention to `ran_num`!

#### Back to slides!

## Permutation Feature Importance

### Can be used on the test data!

In [None]:
from sklearn.inspection import permutation_importance

# Seed the shuffling so the reported importances are reproducible, matching the
# seeded call made for the gradient boosting model later in this notebook.
rf_perm_results = permutation_importance(
    rf, X_test, y_test, n_repeats=5, n_jobs=-1, random_state=0)

In [None]:
def plot_permutation_importance(perm_results, names, ax=None):
    """Box-plot permutation importances, ordered by mean importance.

    Parameters
    ----------
    perm_results : object exposing ``importances_mean`` and ``importances``;
        the rows of ``importances`` are selected in the sorted feature order.
    names : sequence of feature labels, aligned with ``importances_mean``.
    ax : axes to draw on; a new figure and axes are created when it is None.

    Returns
    -------
    The axes the boxes were drawn on.
    """
    order = perm_results.importances_mean.argsort()
    if ax is None:
        ax = plt.subplots()[1]

    # Transposed so that each feature ends up as one column of the plotted data.
    per_feature = perm_results.importances[order].T
    tick_names = np.array(names)[order]
    ax.boxplot(per_feature, vert=False, labels=tick_names)
    return ax

In [None]:
_ = plot_permutation_importance(rf_perm_results, X_test.columns)

## Exercise 2

1. Compute the permutation importance for `Lasso` on the test set.

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-intermediate-2-of-2/blob/master/notebooks/solutions/03-ex02-solutions.py).  

In [None]:
# %load solutions/03-ex02-solutions.py

## Partial Dependence

### Train a HistGradientBoostingRegressor

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor 

In [None]:
hist = HistGradientBoostingRegressor(random_state=0)
hist.fit(X_train, y_train)

In [None]:
hist.score(X_test, y_test)

### Get permutation importances

In [None]:
hist_perm_results = permutation_importance(hist, X_test, y_test, n_repeats=5, random_state=0)

In [None]:
# argsort orders feature positions by ascending mean permutation importance,
# so the last two entries are the most important features and the first two
# the least important ones.
feature_importance_ranking = hist_perm_results.importances_mean.argsort()
hist_top_2 = feature_importance_ranking[-2:].tolist()
hist_bottom_2 = feature_importance_ranking[:2].tolist()

In [None]:
features = hist_top_2 + hist_bottom_2

#### Plot Partial Dependence

In [None]:
from sklearn.inspection import PartialDependenceDisplay
PartialDependenceDisplay.from_estimator(
    hist, X_test, features=features, n_cols=2)

## Exercise 3

1. Load the dataset using `fetch_openml` with `data_id=531` and `as_frame=True`.
1. Split the data into a training and test set.
1. Train a `HistGradientBoostingRegressor` on the training set and evaluate it on the test set.
1. What are the 4 most important features according to permutation importance on the test set?
1. Plot the partial dependence for the 4 most important features according to permutation importance.

In [None]:
from sklearn.datasets import fetch_openml

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-intermediate-2-of-2/blob/master/notebooks/solutions/03-ex03-solutions.py). 

In [None]:
# %load solutions/03-ex03-solutions.py