# Part 1: Demonstration -- A standard machine learning pipeline without Memento

To start this project:

```bash
# Using Python 3.9.x
conda create -n memento python=3.9
conda activate memento

# Install dependencies
pip install -r requirements.txt

# Install local package
pip install .
```

In [1]:
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
import numpy as np

## Example 1: Train a classifier and return the average score from 5-fold CV

In [2]:
# Fixed seed for the shuffled fold assignment and the AdaBoost fit, so that
# re-running this cell in the same environment reproduces the same score.
SEED = 42

# Load data
digits_x, digits_y = datasets.load_digits(return_X_y=True)

# Preprocessing
minmax_scaler = MinMaxScaler()

# Classifiers
ada_boost_classifier = AdaBoostClassifier(random_state=SEED)

# Stratified 5-split cross-validation (shuffling is seeded)
k_fold = StratifiedKFold(5, shuffle=True, random_state=SEED)
scores = []
for train_idx, test_idx in k_fold.split(digits_x, digits_y):
    # Split into train-test sets
    train_X = digits_x[train_idx]
    train_y = digits_y[train_idx]
    test_X = digits_x[test_idx]
    test_y = digits_y[test_idx]

    # Only train the scaler on the training set; the test set is merely
    # transformed with the statistics learned from the training set.
    train_X = minmax_scaler.fit_transform(train_X)
    test_X = minmax_scaler.transform(test_X)

    # Train model on this fold and record its accuracy on the held-out part
    ada_boost_classifier.fit(train_X, train_y)
    score = ada_boost_classifier.score(test_X, test_y)
    scores.append(score)

print('[MinMax-AdaBoost] Avg. score: {:.2f}%'.format(np.mean(scores) * 100))

[MinMax-AdaBoost] Avg. score: 27.88%


## Example 2: Benchmark 2 preprocessing methods and 3 classifiers

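- `make_pipeline` bundles the scaler with the classifier, so within each cross-validation split the scaler is fitted on the training folds only (i.e. after splitting) and never sees the test fold.
- To make the code as short as possible, we use `cross_val_score`, which uses `StratifiedKFold` internally by default for classifiers when `cv` is an integer.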

In [3]:
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
import numpy as np

In [4]:
NUM_FOLD = 5

def run_dataset(X, y):
    """Benchmark every preprocessing/classifier combination on one dataset.

    Preprocessing options (two scalers plus a no-preprocessing baseline):
    1. No preprocessing
    2. Apply MinMax scaler (Rescale features to 0-1)
    3. Apply Standard scaler (0 mean, 1 standard deviation)

    Classifiers:
    1. AdaBoost
    2. RandomForest
    3. SVM

    Each combination is wrapped with ``make_pipeline`` and scored with
    ``cross_val_score`` using ``cv=NUM_FOLD``.

    Parameters
    ----------
    X : features, passed unchanged to ``cross_val_score``.
    y : target labels, passed unchanged to ``cross_val_score``.

    Returns
    -------
    list of (str, float)
        One ``('<Preprocessing>-<Classifier>', mean CV score)`` tuple per
        combination, grouped by preprocessing option (None, MinMax, STD) and,
        within each group, ordered AdaBoost, RandomForest, SVM.
    """
    # (label, factory) pairs; a factory of None means "no preprocessing step".
    scalers = [
        ('None', None),
        ('MinMax', MinMaxScaler),
        ('STD', StandardScaler),
    ]
    classifiers = [
        ('AdaBoost', AdaBoostClassifier),
        ('RandomForest', RandomForestClassifier),
        ('SVM', SVC),
    ]

    scores = []
    for scaler_name, make_scaler in scalers:
        for clf_name, make_clf in classifiers:
            # Build fresh estimator instances for every combination so that
            # no pipeline shares an estimator object with another one.
            steps = [make_clf()]
            if make_scaler is not None:
                steps.insert(0, make_scaler())
            pipeline = make_pipeline(*steps)

            fold_scores = cross_val_score(pipeline, X, y, cv=NUM_FOLD)
            label = '{}-{}'.format(scaler_name, clf_name)
            scores.append((label, np.mean(fold_scores)))
    return scores

In [5]:
# Load the three benchmark datasets up front; the per-dataset arrays stay
# available under their own names after this cell has run.
digits_x, digits_y = datasets.load_digits(return_X_y=True)
wine_x, wine_y = datasets.load_wine(return_X_y=True)
breast_x, breast_y = datasets.load_breast_cancer(return_X_y=True)

# (header line, features, labels) for each dataset, in reporting order.
# Every header after the first starts with a newline to separate the sections.
benchmarks = [
    ('=== Digits ===', digits_x, digits_y),
    ('\n=== Wine ===', wine_x, wine_y),
    ('\n=== Breast Cancer ===', breast_x, breast_y),
]

for header, features, labels in benchmarks:
    # Score all preprocessing/classifier combinations, then print one row each
    scores = run_dataset(features, labels)
    print(header)
    for name, score in scores:
        print('[{:20s}] Avg. score: {:.2f}%'.format(name, score * 100))


=== Digits ===
[None-AdaBoost       ] Avg. score: 26.71%
[None-RandomForest   ] Avg. score: 93.94%
[None-SVM            ] Avg. score: 96.33%
[MinMax-AdaBoost     ] Avg. score: 26.77%
[MinMax-RandomForest ] Avg. score: 93.88%
[MinMax-SVM          ] Avg. score: 95.99%
[STD-AdaBoost        ] Avg. score: 26.77%
[STD-RandomForest    ] Avg. score: 93.32%
[STD-SVM             ] Avg. score: 94.60%

=== Wine ===
[None-AdaBoost       ] Avg. score: 80.84%
[None-RandomForest   ] Avg. score: 97.76%
[None-SVM            ] Avg. score: 66.35%
[MinMax-AdaBoost     ] Avg. score: 80.29%
[MinMax-RandomForest ] Avg. score: 97.78%
[MinMax-SVM          ] Avg. score: 97.76%
[STD-AdaBoost        ] Avg. score: 80.29%
[STD-RandomForest    ] Avg. score: 97.21%
[STD-SVM             ] Avg. score: 98.33%

=== Breast Cancer ===
[None-AdaBoost       ] Avg. score: 97.01%
[None-RandomForest   ] Avg. score: 95.96%
[None-SVM            ] Avg. score: 91.22%
[MinMax-AdaBoost     ] Avg. score: 97.19%
[MinMax-RandomForest ] A