# Demonstration -- A standard machine learning pipeline without Memento

- The same virtual environment can be used as in `demo_with_memento.ipynb`.

To start this project:

```bash
# Using Python 3.9.x (Memento supports Python 3.7, 3.8 and 3.9)
conda create -n memento python=3.9
conda activate memento

# Install dependencies
pip install memento-ml scikit-learn jupyterlab
```


In [1]:
import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (FunctionTransformer, MinMaxScaler,
                                   StandardScaler)
from sklearn.svm import SVC

## Add Missing Values

In [2]:
def add_missing_values(X, missing_rate=0.1):
    """Blank out one randomly chosen feature in a fraction of the samples.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Input data. It is never modified; a copy is returned.
    missing_rate : float, default=0.1
        Fraction of samples, in ``[0, 1]``, that receive a missing value.
        Exactly ``int(n_samples * missing_rate)`` distinct samples are
        affected.

    Returns
    -------
    numpy.ndarray
        Copy of ``X`` in which each selected sample has exactly one feature
        set to ``np.nan``. Integer/boolean input is promoted to float because
        those dtypes cannot represent NaN.

    Raises
    ------
    ValueError
        If ``missing_rate`` is outside ``[0, 1]``.
    """
    if not 0 <= missing_rate <= 1:
        raise ValueError(
            "missing_rate must be between 0 and 1, got {!r}".format(missing_rate)
        )

    n_samples, n_features = X.shape
    n_missing_samples = int(n_samples * missing_rate)

    # NaN only exists for inexact (float/complex) dtypes; promote anything else
    # so the assignment below cannot fail on integer data.
    if np.issubdtype(X.dtype, np.inexact):
        X_missing = X.copy()
    else:
        X_missing = X.astype(float)

    if n_missing_samples == 0:
        # Nothing to blank out (also covers an empty dataset).
        return X_missing

    # Sample rows WITHOUT replacement: drawing with replacement can pick the
    # same row twice, which leaves fewer rows with a missing value than
    # requested and may blank two features of a single row.
    idx_missing_samples = np.random.choice(
        n_samples, size=n_missing_samples, replace=False
    )
    # One feature column per selected row.
    idx_missing_features = np.random.randint(0, n_features, n_missing_samples)

    X_missing[idx_missing_samples, idx_missing_features] = np.nan
    return X_missing

def load_breast_cancer():
    """Load scikit-learn's Breast Cancer data and inject missing values.

    The feature matrix is passed through ``add_missing_values`` with
    ``missing_rate=0.1``; the labels are returned untouched.

    Returns a ``(features_with_nan, labels)`` tuple.
    """
    features, labels = datasets.load_breast_cancer(return_X_y=True)
    return add_missing_values(features, missing_rate=0.1), labels

In [3]:
# Digits and Wine are loaded straight from scikit-learn as (features, labels).
digits_x, digits_y = datasets.load_digits(return_X_y=True)
wine_x, wine_y = datasets.load_wine(return_X_y=True)
# This calls the notebook's own load_breast_cancer() wrapper (not sklearn's),
# so the returned features contain NaNs added by add_missing_values().
breast_x, breast_y = load_breast_cancer()

## Benchmark 2 preprocessing methods and 3 classifiers

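- `make_pipeline` bundles the scaler with the classifier so that, during cross-validation, the scaler is fitted only on each training split (i.e. scaling happens after splitting, never on the full dataset).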
- To make the code as short as possible, we use `cross_val_score` which calls `StratifiedKFold` internally by default.


In [4]:
NUM_FOLD = 5  # value passed as ``cv`` to every cross_val_score call


def run_dataset(X, y, with_missing_val=False):
    """Cross-validate every scaling/classifier combination on one dataset.

    Scaling options (outer loop, in this order):
    1. None   -- no scaling step
    2. MinMax -- MinMaxScaler (rescale features to 0-1)
    3. STD    -- StandardScaler (0 mean, 1 standard deviation)

    Classifiers (inner loop, in this order):
    1. AdaBoost
    2. RandomForest
    3. SVM

    Every pipeline starts with an imputation step: a mean-strategy
    ``SimpleImputer`` when ``with_missing_val`` is true, otherwise an identity
    ``FunctionTransformer``.

    Returns a list of ``("<scaling>-<classifier>", mean CV score)`` tuples,
    one per combination, ordered as the loops above.
    """
    if with_missing_val:
        first_step = SimpleImputer(missing_values=np.nan, strategy="mean")
    else:
        # Pass-through transformer so all pipelines share the same layout.
        first_step = FunctionTransformer(lambda x: x)

    # (label, factory) pairs; a ``None`` factory means "no scaling step".
    scaler_options = (
        ("None", None),
        ("MinMax", MinMaxScaler),
        ("STD", StandardScaler),
    )
    classifier_options = (
        ("AdaBoost", AdaBoostClassifier),
        ("RandomForest", RandomForestClassifier),
        ("SVM", SVC),
    )

    results = []
    for scaler_label, make_scaler in scaler_options:
        for clf_label, make_clf in classifier_options:
            steps = [first_step]
            if make_scaler is not None:
                steps.append(make_scaler())
            steps.append(make_clf())

            fold_scores = cross_val_score(make_pipeline(*steps), X, y, cv=NUM_FOLD)
            label = scaler_label + "-" + clf_label
            results.append((label, np.mean(fold_scores)))
    return results


In [5]:
# One entry per dataset, in report order:
# (header line, features, labels, extra keyword arguments for run_dataset).
benchmarks = [
    ("=== Digits ===", digits_x, digits_y, {}),
    ("\n=== Wine ===", wine_x, wine_y, {}),
    ("\n=== Breast Cancer ===", breast_x, breast_y, {"with_missing_val": True}),
]

for header, data_x, data_y, options in benchmarks:
    # Run the benchmark first, then print its header and one line per pipeline.
    scores = run_dataset(data_x, data_y, **options)
    print(header)
    for name, score in scores:
        print("[{:20s}] Avg. score: {:.2f}%".format(name, score * 100))


=== Digits ===
[None-AdaBoost       ] Avg. score: 26.77%
[None-RandomForest   ] Avg. score: 93.10%
[None-SVM            ] Avg. score: 96.33%
[MinMax-AdaBoost     ] Avg. score: 26.71%
[MinMax-RandomForest ] Avg. score: 93.32%
[MinMax-SVM          ] Avg. score: 95.99%
[STD-AdaBoost        ] Avg. score: 26.77%
[STD-RandomForest    ] Avg. score: 93.94%
[STD-SVM             ] Avg. score: 94.60%

=== Wine ===
[None-AdaBoost       ] Avg. score: 80.84%
[None-RandomForest   ] Avg. score: 96.65%
[None-SVM            ] Avg. score: 66.35%
[MinMax-AdaBoost     ] Avg. score: 80.29%
[MinMax-RandomForest ] Avg. score: 97.76%
[MinMax-SVM          ] Avg. score: 97.76%
[STD-AdaBoost        ] Avg. score: 80.29%
[STD-RandomForest    ] Avg. score: 96.10%
[STD-SVM             ] Avg. score: 98.33%

=== Breast Cancer ===
[None-AdaBoost       ] Avg. score: 96.84%
[None-RandomForest   ] Avg. score: 95.78%
[None-SVM            ] Avg. score: 91.22%
[MinMax-AdaBoost     ] Avg. score: 96.66%
[MinMax-RandomForest ] A