# Imbalanced data

<a href="https://colab.research.google.com/github/thomasjpfan/ml-workshop-advanced/blob/master/notebooks/02-imbalanced-data.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open and Execute in Google Colaboratory"></a>

In [None]:
# Install the workshop dependencies when running on Google Colab.
# Colab is detected by checking whether the `google.colab` module is loaded.
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # `%pip` is an IPython magic, so this cell must run inside a notebook kernel.
    %pip install -r https://raw.githubusercontent.com/thomasjpfan/ml-workshop-advanced/master/requirements.txt

In [None]:
import sklearn
# This notebook targets the scikit-learn 1.0 API; stop early on any other version.
assert sklearn.__version__.startswith("1.0"), "Please install scikit-learn 1.0"

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Hide UserWarning messages so the cell outputs stay readable.
warnings.filterwarnings("ignore", category=UserWarning)

# Use a larger font and figure size for every plot in this notebook.
plt.rcParams.update({
    'font.size': 16,
    'figure.figsize': [12, 8],
})

# Display estimators as diagrams when they are the output of a cell.
sklearn.set_config(display='diagram')

## Load Mammography Data

In [None]:
from sklearn.datasets import fetch_openml

# Fetch OpenML dataset 310 (the mammography data used throughout this notebook).
# Request pandas containers explicitly: later cells index X_train / y_train with
# `.iloc`, so the notebook should not depend on the implicit `as_frame` default.
mammography = fetch_openml(data_id=310, as_frame=True)
X, y = mammography.data, mammography.target

In [None]:
# Binarize the target: entries equal to the string '1' become 1, all others 0.
y = (y == '1').astype(int)

In [None]:
np.bincount(y)

## Split data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on y so that both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

### Base models

#### DummyClassifier

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier

In [None]:
dc = DummyClassifier()

In [None]:
# 5-fold cross-validation of the dummy baseline on three metrics.
dc_reg_scores = cross_validate(
    dc,
    X_train,
    y_train,
    scoring=['roc_auc', 'average_precision', "accuracy"],
    cv=5,
)

In [None]:
dc_reg_scores

In [None]:
dc_reg_scores["test_accuracy"].mean()

In [None]:
dc_reg_scores["test_average_precision"].mean()

In [None]:
dc_reg_scores["test_roc_auc"].mean()

In [None]:
from sklearn.metrics import get_scorer

def compute_metrics(estimator):
    """Cross-validate ``estimator`` on the training split and average the scores.

    Runs 5-fold cross-validation on the notebook-level ``X_train`` / ``y_train``
    with the ``roc_auc`` and ``average_precision`` scorers.

    Parameters
    ----------
    estimator : estimator object
        Passed as-is to ``cross_validate``.

    Returns
    -------
    dict
        ``{"roc_auc": ..., "avg_precision": ...}`` holding the mean test score
        across the folds for each metric.
    """
    fold_scores = cross_validate(
        estimator,
        X_train,
        y_train,
        cv=5,
        scoring=['roc_auc', 'average_precision'],
    )
    mean_roc_auc = fold_scores["test_roc_auc"].mean()
    mean_avg_precision = fold_scores["test_average_precision"].mean()
    return {"roc_auc": mean_roc_auc, "avg_precision": mean_avg_precision}

#### Linear model

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

In [None]:
base_log_reg = LogisticRegression()

In [None]:
base_log_reg_metrics = compute_metrics(base_log_reg)
base_log_reg_metrics

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
base_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [None]:
base_rf_metrics = compute_metrics(base_rf)
base_rf_metrics

### Imbalanced-learn samplers

#### Under sampler

In [None]:
np.bincount(y_train)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
under_sampler = RandomUnderSampler(random_state=42)

In [None]:
X_train_subsample, y_train_subsample = under_sampler.fit_resample(X_train, y_train)

In [None]:
X_train.shape

In [None]:
X_train_subsample.shape

In [None]:
np.bincount(y_train_subsample)

#### Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
over_sampler = RandomOverSampler(random_state=42)

In [None]:
# NOTE: this reuses the `*_subsample` names from the under-sampling cells above,
# so after this line they hold the over-sampled training data instead.
X_train_subsample, y_train_subsample = over_sampler.fit_resample(X_train, y_train)

In [None]:
X_train_subsample.shape

In [None]:
np.bincount(y_train_subsample)

## Pipelines with imblearn

### Linear model with under sampling

In [None]:
from imblearn.pipeline import make_pipeline as make_imb_pipeline

In [None]:
# Putting the sampler in an imblearn pipeline means resampling happens only
# while fitting; data passed to predict/score is left untouched.
under_log_reg = make_imb_pipeline(
    RandomUnderSampler(random_state=42),
    LogisticRegression(random_state=42),
)

In [None]:
base_log_reg_metrics

In [None]:
compute_metrics(under_log_reg)

### Random Forest with under sampling

In [None]:
# Random under-sampling followed by a random forest.
under_rf = make_imb_pipeline(
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(random_state=42),
)

In [None]:
base_rf_metrics

In [None]:
compute_metrics(under_rf)

### Linear model with over sampling

In [None]:
# Random over-sampling followed by logistic regression.
over_log_reg = make_imb_pipeline(
    RandomOverSampler(random_state=42),
    LogisticRegression(random_state=42),
)

In [None]:
base_log_reg_metrics

In [None]:
compute_metrics(over_log_reg)

## Exercise 1

1. Use `make_imb_pipeline` with `RandomOverSampler(random_state=42)` to create a pipeline with a random forest called `over_rf`.
1. Compute our metrics using `compute_metrics`.

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-advanced/blob/master/notebooks/solutions/02-ex01-solutions.py).

In [None]:
# %load solutions/02-ex01-solutions.py

## Plotting curves for logistic regression

In [None]:
# Fit the plain, under-sampled and over-sampled logistic regressions on the
# training split (a loop produces no cell output, like the original trailing `;`).
for model in (base_log_reg, under_log_reg, over_log_reg):
    model.fit(X_train, y_train)

### Plotting

In [None]:
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import RocCurveDisplay

In [None]:
# Left axis: ROC curves; right axis: precision-recall curves, all on the test split.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
log_reg_variants = [
    ("original", base_log_reg),
    ("undersampling", under_log_reg),
    ("oversampling", over_log_reg),
]

for label, model in log_reg_variants:
    RocCurveDisplay.from_estimator(model, X_test, y_test, ax=ax1, name=label)

for label, model in log_reg_variants:
    PrecisionRecallDisplay.from_estimator(model, X_test, y_test, ax=ax2, name=label)

We create a custom plotting function for future use:

In [None]:
def plot_roc_and_precision_recall_curves(estimators):
    """Draw ROC and precision-recall curves for several estimators side by side.

    Curves are computed on the notebook-level ``X_test`` / ``y_test``. ROC curves
    go on the left axis and precision-recall curves on the right axis of a new
    24x8 figure.

    Parameters
    ----------
    estimators : iterable of (str, estimator) pairs
        Legend label and estimator for each curve. This function never calls
        ``fit``, so the estimators must be fitted beforehand.
    """
    _, axes = plt.subplots(1, 2, figsize=(24, 8))
    roc_ax, pr_ax = axes
    for label, model in estimators:
        RocCurveDisplay.from_estimator(model, X_test, y_test, ax=roc_ax, name=label)
        PrecisionRecallDisplay.from_estimator(model, X_test, y_test, ax=pr_ax, name=label)

In [None]:
plot_roc_and_precision_recall_curves([("original", base_log_reg),
                                      ("undersampling", under_log_reg),
                                      ("oversampling", over_log_reg)])

## Exercise 2

1. Train the three random forest models, `base_rf`, `under_rf`, `over_rf`.
1. Plot the ROC and precision-recall curves for the three random forest models.

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-advanced/blob/master/notebooks/solutions/02-ex02-solutions.py).

In [None]:
# %load solutions/02-ex02-solutions.py

#### Back to slides!

## Class-Weights

#### Linear model with class weights

In [None]:
# class_weight='balanced' weights each class inversely to its frequency in the
# training labels instead of resampling the data.
class_weight_log_reg = LogisticRegression(class_weight='balanced')

# Kept as the cell's last expression so the fitted estimator is displayed.
class_weight_log_reg.fit(X_train, y_train)

In [None]:
plot_roc_and_precision_recall_curves([("original", base_log_reg),
                                      ("class weighted", class_weight_log_reg)])

#### Random forest with class weights 

In [None]:
class_weight_rf = RandomForestClassifier(class_weight='balanced', random_state=42)
class_weight_rf.fit(X_train, y_train)

In [None]:
base_rf.fit(X_train, y_train)

In [None]:
plot_roc_and_precision_recall_curves([("original", base_rf),
                                      ("class weighted", class_weight_rf)])

#### Back to slides!

## Ensemble Resampling

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
balanced_rf = BalancedRandomForestClassifier(random_state=0)

balanced_rf.fit(X_train, y_train)

In [None]:
# Fit the under-sampling pipeline defined earlier in the notebook.
under_rf.fit(X_train, y_train)

# `over_rf` is the pipeline Exercise 1 asks for; it is (re)built here so this
# cell works even if the exercise solution was not run.
over_rf = make_imb_pipeline(RandomOverSampler(random_state=42), RandomForestClassifier(random_state=42))
over_rf.fit(X_train, y_train)

In [None]:
plot_roc_and_precision_recall_curves(
    [
        ("original", base_rf),
        ("undersampling", under_rf),
        ("oversampling", over_rf),
        ("balanced bagging", balanced_rf)
    ]
)

#### Back to slides!

## SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split with SMOTE (seeded for reproducibility).
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Shape after resampling; compare with X_train.shape.
X_train_smote.shape

In [None]:
np.bincount(y_train_smote)

In [None]:
# Scatter the features at column positions 3 and 4, before and after SMOTE,
# colouring each point by its class label.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
# Order the original samples by label; presumably so the class-1 points are
# drawn last and end up on top of the majority class.
sorting = np.argsort(y_train)

axes[0].set_title("Original")
axes[0].scatter(X_train.iloc[sorting, 3], X_train.iloc[sorting, 4], c=plt.cm.tab10(y_train.iloc[sorting]), alpha=.3, s=2)

# NOTE(review): the SMOTE panel is drawn in resampled order with alpha=1, unlike
# the sorted, semi-transparent original panel. Confirm this is intentional.
axes[1].set_title("SMOTE")
axes[1].scatter(X_train_smote.iloc[:, 3], X_train_smote.iloc[:, 4], c=plt.cm.tab10(y_train_smote), alpha=1, s=2)

In [None]:
base_log_reg_metrics

In [None]:
smote_log_reg = make_imb_pipeline(
    SMOTE(random_state=42), LogisticRegression())
compute_metrics(smote_log_reg)

In [None]:
base_rf_metrics

In [None]:
smote_rf = make_imb_pipeline(SMOTE(random_state=42), RandomForestClassifier(random_state=42, n_jobs=-1))
compute_metrics(smote_rf)

## Plotting all the versions of random forest

In [None]:
smote_rf.fit(X_train, y_train)

In [None]:
plot_roc_and_precision_recall_curves(
    [
        ("original", base_rf),
        ("smote", smote_rf),
    ]
)

## Exercise 3

1. Train a `HistGradientBoostingClassifier` on the training set.
2. Construct a pipeline with `SMOTE` and `HistGradientBoostingClassifier`, and fit it on the training set.
3. Plot the ROC and PR curves between the two models.

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-advanced/blob/master/notebooks/solutions/02-ex03-solutions.py).

In [None]:
# %load solutions/02-ex03-solutions.py