# Part 2: Demonstration -- A machine learning pipeline with Memento

The benefits of using Memento:

- Avoiding all the copy and paste when running repeated experiments;
- Running experiments in parallel;
- Focusing on the workflow of one experiment;
- Keeping all configurations in one place;
- Using checkpoints to keep track of progress;
- Sending notifications when experiments fail or finish.

To start this project:

```bash
# Using Python 3.9.x (Memento supports Python 3.7, 3.8 and 3.9)
conda create -n memento python=3.9
conda activate memento

# Install dependencies
pip install memento-ml scikit-learn jupyterlab
```

In [1]:
import functools
import logging

import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (FunctionTransformer, MinMaxScaler,
                                   StandardScaler)
from sklearn.svm import SVC

from memento import Config, ConsoleNotificationProvider, Context, Memento

In [2]:
logging.basicConfig(level=logging.INFO)

In [3]:
def add_missing_values(X, missing_rate=0.1, random_state=None):
    """Return a copy of ``X`` with one NaN injected into a fraction of its rows.

    Exactly ``int(n_samples * missing_rate)`` *distinct* rows are selected, and
    in each selected row one randomly chosen feature is set to ``np.nan``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data. It is never modified; a copy is returned.
    missing_rate : float, default=0.1
        Fraction of rows that receive a missing value. Must be in ``[0, 1]``.
    random_state : int or None, default=None
        Seed for a private ``np.random.RandomState``. When ``None``, NumPy's
        global random state is used (so ``np.random.seed`` still applies).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X``. Floating-point inputs keep their
        dtype; any other dtype is converted to ``float`` so NaN can be stored.

    Raises
    ------
    ValueError
        If ``missing_rate`` is outside ``[0, 1]``.
    """
    if not 0 <= missing_rate <= 1:
        raise ValueError(
            f"missing_rate must be between 0 and 1, got {missing_rate!r}"
        )

    X_missing = np.array(X)  # np.array copies by default, so X stays untouched
    if not np.issubdtype(X_missing.dtype, np.floating):
        # Integer/bool arrays cannot represent NaN; promote them to float.
        X_missing = X_missing.astype(float)

    n_samples, n_features = X_missing.shape
    n_missing_samples = int(n_samples * missing_rate)

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Sample rows WITHOUT replacement: with replacement the same row could be
    # drawn twice, leaving fewer affected rows than requested and possibly
    # removing two features from a single row.
    idx_missing_samples = rng.choice(n_samples, size=n_missing_samples, replace=False)
    idx_missing_features = rng.randint(0, n_features, n_missing_samples)

    X_missing[idx_missing_samples, idx_missing_features] = np.nan
    return X_missing

In [4]:
# Identity transformer: a FunctionTransformer *instance* whose function returns
# X unchanged. It serves as the "no preprocessing" option in the matrix.
DummyPreprocessor = FunctionTransformer(lambda x: x)

# Bind `return_X_y=True` with `partial` so every loader can be called with no
# arguments inside the experiment function (`X, y = config.dataset()`).
load_digits = functools.partial(datasets.load_digits, return_X_y=True)
load_wine = functools.partial(datasets.load_wine, return_X_y=True)
def load_breast_cancer():
    """Load the Breast Cancer dataset with NaNs injected into its features.

    The feature matrix is passed through ``add_missing_values`` with
    ``missing_rate=0.1``; the target vector is returned untouched.

    Returns
    -------
    tuple
        ``(X_with_missing_values, y)``.
    """
    features, target = datasets.load_breast_cancer(return_X_y=True)
    return add_missing_values(features, missing_rate=0.1), target

# Imputer configured to replace NaN entries using the "mean" strategy.
# A single shared instance is used so the `exclude` rules below can refer to it.
Imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Put all parameters in a configuration matrix.
# Each key under "parameters" becomes an attribute of `config` in the
# experiment function (e.g. `config.dataset`, `config.classifier`).
matrix = {
    "parameters": {
        # Zero-argument callables; the experiment calls them to get (X, y).
        "dataset": [
            load_digits,
            load_wine,
            load_breast_cancer,
        ],
        # First pipeline step: either pass-through or mean imputation.
        "preprocessing1": [
            DummyPreprocessor,
            Imputer,
        ],
        # Second pipeline step: pass-through or one of two scalers.
        "preprocessing2": [
            DummyPreprocessor,
            MinMaxScaler(),
            StandardScaler(),
        ],
        # Classifier *classes* (not instances); the experiment instantiates
        # them with `config.classifier()`.
        "classifier": [
            AdaBoostClassifier,
            RandomForestClassifier,
            SVC,
        ],
    },
    "settings": { # Global values; read in the experiment as config.settings["n_fold"]
        "n_fold": 5,
    },
    # Parameter combinations to skip: `load_breast_cancer` injects NaNs, so it
    # is never run without the imputer, while the other two loaders are never
    # paired with the imputer.
    "exclude": [
        {"dataset": load_breast_cancer, "preprocessing1": DummyPreprocessor},
        {"dataset": load_digits, "preprocessing1": Imputer},
        {"dataset": load_wine, "preprocessing1": Imputer},
    ]
}


The `experiment` function is the building block for **Memento**. 
It takes two parameters: `Context` and `Config`.
Memento will automatically figure out how many tasks it needs to create based on the configuration matrix, and execute them in parallel.
Each task will execute this `experiment` function but with different parameters (inside `Config`).

- The `Context` exposes a handler, so the user can access `checkpoint` in the `experiment` function.
- The `Config` provides one set of parameters (from the configuration matrix) to the experiment.


In [5]:
def experiment(context: Context, config: Config):
    """Run one cross-validated trial for a single parameter combination.

    The dataset loader and classifier class come from ``config``; the two
    preprocessing steps and the classifier are chained into one pipeline.
    Fold scores are checkpointed through ``context`` so that a task with an
    existing checkpoint restores them instead of recomputing.

    Returns the mean fold score multiplied by 100.
    """
    features, labels = config.dataset()
    estimator = config.classifier()

    model = make_pipeline(config.preprocessing1, config.preprocessing2, estimator)
    n_fold = config.settings["n_fold"]

    if not context.checkpoint_exist():
        # No saved progress: evaluate and persist the per-fold scores.
        fold_scores = cross_val_score(model, features, labels, cv=n_fold)
        context.checkpoint(fold_scores)
    else:
        fold_scores = context.restore()

    return fold_scores.mean() * 100

In [6]:
# Provider handed to Memento for notifications (console-based, per its name).
notification_provider = ConsoleNotificationProvider()
# Run `experiment` over the configurations described by `matrix` and collect
# the results; each result is read below via `.inner` and `.runtime`.
results = Memento(experiment, notification_provider).run(matrix)

INFO:memento.memento:Running configurations:
INFO:memento.memento:  {'dataset': functools.partial(<function load_digits at 0x0000015495312310>, return_X_y=True), 'preprocessing1': FunctionTransformer(func=<function <lambda> at 0x00000154B5A6D670>), 'preprocessing2': FunctionTransformer(func=<function <lambda> at 0x00000154B5A6D670>), 'classifier': <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>}
INFO:memento.memento:  {'dataset': functools.partial(<function load_digits at 0x0000015495312310>, return_X_y=True), 'preprocessing1': FunctionTransformer(func=<function <lambda> at 0x00000154B5A6D670>), 'preprocessing2': FunctionTransformer(func=<function <lambda> at 0x00000154B5A6D670>), 'classifier': <class 'sklearn.ensemble._forest.RandomForestClassifier'>}
INFO:memento.memento:  {'dataset': functools.partial(<function load_digits at 0x0000015495312310>, return_X_y=True), 'preprocessing1': FunctionTransformer(func=<function <lambda> at 0x00000154B5A6D670>), 'preprocessing2': 

All tasks completed


INFO:memento.memento:0/27 results retrieved from cache


If we rerun the cell above, since no parameters have changed and all results have been saved in the cache, the code will complete instantly.

In [7]:
# Show avg. accuracy in percent (note that the experiment function multiplies the mean score by 100)
[result.inner for result in results]

[26.765552460538533,
 93.7143299288146,
 96.32838130609717,
 26.765552460538533,
 93.60229031259672,
 95.9942742185082,
 26.765552460538533,
 94.2700402352213,
 94.60229031259672,
 80.84126984126983,
 96.09523809523809,
 66.34920634920634,
 80.28571428571428,
 97.20634920634922,
 97.76190476190477,
 80.28571428571428,
 97.2063492063492,
 98.33333333333334,
 97.19142990218911,
 95.786368576308,
 90.86632510479738,
 97.1914299021891,
 95.78481602235678,
 97.54075454122031,
 96.66200900481292,
 95.43238627542307,
 97.53920198726907]

In [8]:
# Show each experiment's runtime
[result.runtime for result in results]

[datetime.timedelta(seconds=1, microseconds=28007),
 datetime.timedelta(seconds=1, microseconds=345212),
 datetime.timedelta(microseconds=430004),
 datetime.timedelta(seconds=1, microseconds=49011),
 datetime.timedelta(seconds=1, microseconds=430011),
 datetime.timedelta(microseconds=421003),
 datetime.timedelta(seconds=1, microseconds=146010),
 datetime.timedelta(seconds=1, microseconds=312587),
 datetime.timedelta(microseconds=560004),
 datetime.timedelta(microseconds=401004),
 datetime.timedelta(microseconds=798006),
 datetime.timedelta(microseconds=38999),
 datetime.timedelta(microseconds=389002),
 datetime.timedelta(microseconds=671003),
 datetime.timedelta(microseconds=27999),
 datetime.timedelta(microseconds=406001),
 datetime.timedelta(microseconds=720004),
 datetime.timedelta(microseconds=27002),
 datetime.timedelta(microseconds=901006),
 datetime.timedelta(microseconds=866002),
 datetime.timedelta(microseconds=63000),
 datetime.timedelta(microseconds=735005),
 datetime.timede

## Streamline parameter tuning


In [9]:
# Configuration matrix for tuning an SVC: one dataset, one scaler, one
# classifier class, and a grid over the two hyperparameters.
matrix = {
    "parameters": {
        # Zero-argument loader; the experiment calls it to get (X, y).
        "dataset": [
            functools.partial(datasets.load_breast_cancer, return_X_y=True),
        ],
        "preprocessing": [
            StandardScaler(),
        ],
        # Classifier class; instantiated in the experiment with C and gamma.
        "classifier": [
            SVC,
        ],
        # Hyperparameter grid: passed to the classifier as `C=` and `gamma=`.
        "svm_C": [1, 10, 100, 1000],
        "svm_gamma": [0.001, 0.0001]
    },
    "settings": { # Global values; read in the experiment as config.settings["n_fold"]
        "n_fold": 5,
    },
}

In [10]:
def experiment(context: Context, config: Config):
    """Cross-validate one classifier configuration from the tuning matrix.

    ``config.svm_C`` and ``config.svm_gamma`` are forwarded to the classifier
    as ``C`` and ``gamma``; the classifier is placed after the configured
    preprocessing step in a pipeline. Fold scores are restored from a
    checkpoint when one exists, otherwise computed and checkpointed.

    Returns the mean fold score multiplied by 100.
    """
    data, target = config.dataset()
    hyperparams = {"C": config.svm_C, "gamma": config.svm_gamma}
    model = make_pipeline(config.preprocessing, config.classifier(**hyperparams))
    folds = config.settings["n_fold"]

    if context.checkpoint_exist():
        # Reuse the scores saved by an earlier run of this task.
        return context.restore().mean() * 100

    fold_scores = cross_val_score(model, data, target, cv=folds)
    context.checkpoint(fold_scores)
    return fold_scores.mean() * 100

In [11]:
# Provider handed to Memento for notifications (console-based, per its name).
notification_provider = ConsoleNotificationProvider()
# Run the tuning `experiment` over the configurations described by `matrix`.
results = Memento(experiment, notification_provider).run(matrix)

INFO:memento.memento:Running configurations:
INFO:memento.memento:  {'dataset': functools.partial(<function load_breast_cancer at 0x0000015495312280>, return_X_y=True), 'preprocessing': StandardScaler(), 'classifier': <class 'sklearn.svm._classes.SVC'>, 'svm_C': 1, 'svm_gamma': 0.001}
INFO:memento.memento:  {'dataset': functools.partial(<function load_breast_cancer at 0x0000015495312280>, return_X_y=True), 'preprocessing': StandardScaler(), 'classifier': <class 'sklearn.svm._classes.SVC'>, 'svm_C': 1, 'svm_gamma': 0.0001}
INFO:memento.memento:  {'dataset': functools.partial(<function load_breast_cancer at 0x0000015495312280>, return_X_y=True), 'preprocessing': StandardScaler(), 'classifier': <class 'sklearn.svm._classes.SVC'>, 'svm_C': 10, 'svm_gamma': 0.001}
INFO:memento.memento:  {'dataset': functools.partial(<function load_breast_cancer at 0x0000015495312280>, return_X_y=True), 'preprocessing': StandardScaler(), 'classifier': <class 'sklearn.svm._classes.SVC'>, 'svm_C': 10, 'svm_gam

All tasks completed


In [12]:
# Value returned by the experiment function (mean score * 100) for each result
[result.inner for result in results]

[94.73063188945815,
 79.62272938984628,
 97.01443875174661,
 94.73063188945815,
 97.01443875174661,
 97.18987734823784,
 97.36686849868033,
 97.18987734823784]