# Model Calibration

<a href="https://colab.research.google.com/github/thomasjpfan/ml-workshop-intermediate-2-of-2/blob/master/notebooks/02-model-calibration.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open and Execute in Google Colaboratory"></a>

In [None]:
# Install the workshop's dependencies when running on Google Colab.
import sys
# Treat the session as Colab when the `google.colab` module is already loaded.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # IPython magic: installs the requirements file from the workshop repository.
    %pip install -r https://raw.githubusercontent.com/thomasjpfan/ml-workshop-intermediate-2-of-2/master/requirements.txt

In [None]:
import sklearn
# This notebook is written against the scikit-learn 1.0 series; stop right away
# with a readable message when a different version is installed.
assert sklearn.__version__.startswith("1.0"), "Please install scikit-learn 1.0"

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook-wide matplotlib defaults: text size, figure size, and how saved
# figures are cropped and at what resolution.
plt.rcParams.update({
    "font.size": 16,
    "figure.figsize": [12, 8],
    "savefig.bbox": "tight",
    "savefig.dpi": 300,
})

# Switch scikit-learn's estimator display setting to 'diagram'.
sklearn.set_config(display="diagram")

In [None]:
def plot_calibration_curve(y_true, y_prob, n_bins=5, ax=None, hist=True, normalize=False):
    """Plot a calibration curve, optionally over a histogram of the predictions.

    Parameters
    ----------
    y_true : array-like
        True labels, forwarded to ``calibration_curve``.
    y_prob : array-like
        Predicted probabilities, forwarded to ``calibration_curve`` and used
        for the histogram.
    n_bins : int, default=5
        Number of bins passed to ``calibration_curve``. The histogram uses
        ``max(10, n_bins)`` bins.
    ax : matplotlib Axes, default=None
        Axes to draw on. The current axes (``plt.gca()``) are used when omitted.
    hist : bool, default=True
        Whether to draw the histogram of ``y_prob`` behind the curve.
    normalize : bool, default=False
        Forwarded to ``calibration_curve`` only when truthy (see note below).

    Returns
    -------
    list
        The value returned by ``ax.plot`` for the calibration curve.
    """
    # `normalize` was deprecated in scikit-learn 1.1 and removed in 1.3. It is
    # forwarded only when explicitly requested, so the default call keeps
    # working on releases that no longer accept the argument.
    curve_kwargs = {"normalize": normalize} if normalize else {}
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, **curve_kwargs)
    if ax is None:
        ax = plt.gca()
    if hist:
        # Weights of 1/len(y_prob) make the bar heights sum to one.
        ax.hist(y_prob, weights=np.ones_like(y_prob) / len(y_prob), alpha=.4,
                bins=max(10, n_bins))
    # Dotted diagonal from (0, 0) to (1, 1) as the reference line.
    ax.plot([0, 1], [0, 1], ':', c='k')
    curve = ax.plot(prob_pred, prob_true, marker="o")

    ax.set_xlabel("predicted probability")
    ax.set_ylabel("fraction of positive samples")

    ax.set(aspect='equal')
    return curve

## Create dummy dataset

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Seed the generator so the dataset, and every score computed from it below,
# is the same on each run (the estimators in this notebook are seeded too).
X, y = make_classification(n_samples=10000, n_features=20,
                           n_informative=2, n_redundant=2,
                           random_state=42)

train_samples = 100  # Samples used for training the models

# The first `train_samples` rows are used for training; all remaining rows
# form the test set.
X_train = X[:train_samples]
X_test = X[train_samples:]
y_train = y[:train_samples]
y_test = y[train_samples:]

### Train linear model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Chain a StandardScaler with a seeded LogisticRegression and fit the whole
# pipeline on the training split. The fit call is the cell's last expression,
# so the notebook displays its return value.
lr = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
lr.fit(X_train, y_train)

## Calibration curve

In [None]:
from sklearn.calibration import calibration_curve

In [None]:
# Predicted probabilities of the logistic-regression pipeline on the test set.
# Later cells read column 1 (presumably the positive class — confirm via lr.classes_).
lr_proba = lr.predict_proba(X_test)

In [None]:
# Compute the calibration curve with 5 bins and print both returned arrays.
prob_true, prob_pred = calibration_curve(y_test, lr_proba[:, 1], n_bins=5)

print(prob_true)
print(prob_pred)

In [None]:
# Plot with the helper's defaults (n_bins=5, histogram enabled) on the current axes.
plot_calibration_curve(y_test, lr_proba[:, 1]);

In [None]:
from sklearn.metrics import brier_score_loss

In [None]:
# Brier score loss of the logistic regression on the test set; reused in plot titles below.
lr_brier = brier_score_loss(y_test, lr_proba[:, 1])

In [None]:
# Draw the same predictions with three different bin counts, one panel each.
# Ending the cell on a loop (not a bare `set_title` call) keeps the notebook
# from printing a stray `Text(...)` repr, matching the other plotting cells.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 8))
for ax, n_bins in zip((ax1, ax2, ax3), (5, 10, 30)):
    plot_calibration_curve(y_test, lr_proba[:, 1], n_bins=n_bins, ax=ax)
    ax.set_title(f"n_bins={n_bins}")

### Train Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seeded random forest fitted on the training split; the fit call is the
# cell's last expression, so the notebook displays its return value.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

In [None]:
# Predicted probabilities of the random forest on the test set.
rf_proba = rf.predict_proba(X_test)

In [None]:
# Brier score loss of the random forest on the test set; the bare name on the
# last line makes the notebook display the value.
rf_brier = brier_score_loss(y_test, rf_proba[:, 1])
rf_brier

### Train Single Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seeded decision tree fitted on the training split; the fit call is the
# cell's last expression, so the notebook displays its return value.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

In [None]:
# Predicted probabilities of the decision tree on the test set.
tree_proba = tree.predict_proba(X_test)

In [None]:
# Brier score loss of the decision tree on the test set; the bare name on the
# last line makes the notebook display the value.
tree_brier = brier_score_loss(y_test, tree_proba[:, 1])
tree_brier

In [None]:
# One panel per model, each titled with the model name and its Brier score.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 8))
panels = [
    (ax1, lr_proba, f"LogisticRegression: {lr_brier:0.4f}"),
    (ax2, tree_proba, f"DecisionTreeClassifier: {tree_brier:0.4f}"),
    (ax3, rf_proba, f"RandomForestClassifier: {rf_brier:0.4f}"),
]
for ax, proba, title in panels:
    plot_calibration_curve(y_test, proba[:, 1], n_bins=10, ax=ax)
    ax.set_title(title)

## Exercise 1

1. Train a `sklearn.naive_bayes.GaussianNB` on the training set.
2. Compute the Brier score loss on the test set for the `GaussianNB`.
3. Plot the calibration curve with `n_bins=10`.

In [None]:
# %load solutions/02-ex01-solutions.py

## Calibration

In [None]:
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Rebind `rf` to a fresh, unfitted forest and wrap it in CalibratedClassifierCV
# with method="isotonic"; fitting the wrapper uses the training split only.
rf = RandomForestClassifier(random_state=0)
cal_rf = CalibratedClassifierCV(rf, method="isotonic")
cal_rf.fit(X_train, y_train)

In [None]:
# Predicted probabilities of the calibrated forest on the test set.
cal_rf_proba = cal_rf.predict_proba(X_test)

In [None]:
# Brier score loss of the calibrated forest on the test set; shown in the plot title below.
cal_rf_brier = brier_score_loss(y_test, cal_rf_proba[:, 1])

In [None]:
# Side by side: the plain forest versus the calibrated forest, each panel
# titled with its Brier score.
fig, (ax1, ax2) = plt.subplots(1, 2)
for ax, proba, title in (
    (ax1, rf_proba, f"forest no calibration: {rf_brier:0.4f}"),
    (ax2, cal_rf_proba, f"calibrated: {cal_rf_brier:0.4f}"),
):
    plot_calibration_curve(y_test, proba[:, 1], ax=ax, n_bins=10)
    ax.set_title(title)

### Calibrating the linear model

In [None]:
# Rebind `lr` to a fresh, unfitted scaler + logistic-regression pipeline and
# wrap it in CalibratedClassifierCV with method='isotonic'; fitting the
# wrapper uses the training split only.
lr = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
cal_lr = CalibratedClassifierCV(lr, method='isotonic')
cal_lr.fit(X_train, y_train)

In [None]:
# Predicted probabilities of the calibrated pipeline on the test set.
cal_lr_proba = cal_lr.predict_proba(X_test)

# Brier score loss of the calibrated pipeline; shown in the plot title below.
cal_lr_brier = brier_score_loss(y_test, cal_lr_proba[:, 1])

In [None]:
# Side by side: the plain logistic regression versus its calibrated version,
# each panel titled with its Brier score.
fig, (ax1, ax2) = plt.subplots(1, 2)
comparisons = (
    (ax1, lr_proba, f"no calibration: {lr_brier:0.4f}"),
    (ax2, cal_lr_proba, f"calibrated: {cal_lr_brier:0.4f}"),
)
for ax, proba, title in comparisons:
    plot_calibration_curve(y_test, proba[:, 1], ax=ax, n_bins=10)
    ax.set_title(title)

## Exercise 2

0. Finish Exercise 1 so that you have a trained, uncalibrated `GaussianNB`.
1. Calibrate the `sklearn.naive_bayes.GaussianNB` on the training set.
2. Compute the brier score loss on the test set.
3. Plot the calibration curve with `n_bins=10`.
4. Did the calibration improve with `CalibratedClassifierCV`?

In [None]:
# %load solutions/02-ex02-solutions.py