# Bayesian Optimization over standard imputers

In [None]:
import sys
import warnings
import pandas as pd
import numpy as np

from IPython.display import HTML, display
import tabulate

warnings.simplefilter("ignore")


from hyperimpute.utils.distributions import enable_reproducible_results

enable_reproducible_results()

# Load imputers

In [None]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin
from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan
from hyperimpute.utils.optimizer import EarlyStoppingExceeded, create_study

# Plugin registry; later cells use it to look imputers up by name
# (`imputers.get_type(...)` / `imputers.get(...)`).
imputers = Imputers()

# Return value is discarded here (not the last expression of the cell);
# presumably lists the available plugin names -- TODO confirm.
imputers.list()

# Names of the imputers considered by the search. `benchmark` evaluates them
# in exactly this order and passes the running best score from one to the next,
# so reordering this list can change the outcome.
imputers_seed = [
    "miracle",
    "miwae",
    "gain",
    "softimpute",
    "sinkhorn",
    "mean",
    "sklearn_ice",
    "most_frequent",
    "median",
    "EM",
    "sklearn_missforest",
]

# Target number of rows drawn from each dataset for the search phase.
subsample = 500

# Helpers

In [None]:
from sklearn.preprocessing import MinMaxScaler


def ampute(x, mechanism, p_miss):
    """Simulate missing values in ``x`` and return the pieces needed for scoring.

    ``simulate_nan`` is called with the data, the missing rate ``p_miss`` and
    the missingness ``mechanism``. Its result is indexed by the keys ``"mask"``
    and ``"X_incomp"``.

    Returns:
        A tuple ``(x, x_miss, mask)``: the input exactly as passed in, the
        ``"X_incomp"`` entry and the ``"mask"`` entry of the simulation result.
    """
    simulated = simulate_nan(x, p_miss, mechanism)
    mask, incomplete = simulated["mask"], simulated["X_incomp"]
    return x, incomplete, mask


def scale_data(X):
    """Return ``X`` as an array rescaled by a default-configured ``MinMaxScaler``.

    The scaler is fitted on the same data it transforms; a fresh scaler is
    created on every call and is not returned.
    """
    scaler = MinMaxScaler()
    rescaled = scaler.fit_transform(np.asarray(X))
    return np.asarray(rescaled)


def simulate_scenarios(X):
    """Build amputed copies of ``X`` for every missingness mechanism and rate.

    ``X`` is first rescaled with ``scale_data``. The result is a nested dict
    indexed as ``result[mechanism][p_miss]`` whose values are the tuples
    returned by ``ampute``, for the mechanisms "MAR", "MNAR", "MCAR" and the
    missing rates 0.2 and 0.3.
    """
    scaled = scale_data(X)

    mechanisms = ("MAR", "MNAR", "MCAR")
    percentages = (0.2, 0.3)

    # The nested comprehension visits (mechanism, rate) pairs in the same
    # order as two nested loops, so ``ampute`` is called in the same sequence.
    return {
        mechanism: {
            p_miss: ampute(scaled, mechanism, p_miss) for p_miss in percentages
        }
        for mechanism in mechanisms
    }

# Bayesian optimization (BO) core

In [None]:
from typing import Any
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import time


def evaluate_plugin(
    name: str,
    plugin: ImputerPlugin,
    X: np.ndarray,
    X_miss: np.ndarray,
    mask: np.ndarray,
    prev_best_score: float,
):
    """Score one imputer plugin, tuning its hyperparameters if the defaults fall short.

    The plugin is first evaluated with its default arguments. If that baseline
    RMSE already beats ``prev_best_score`` it is returned straight away and no
    search is run. Otherwise a minimising study is optimised for at most 50
    trials or 3 minutes, and the better (lower) of the baseline and the best
    trial is returned.

    Args:
        name: Label embedded in the study name.
        plugin: Imputer plugin; it is called as ``plugin(**kwargs)`` to build
            an imputer and queried via ``plugin.name()`` and
            ``plugin.sample_hyperparameters(trial)``.
        X: Reference data handed to ``RMSE``.
        X_miss: Data with missing values; each imputer receives a fresh copy.
        mask: Passed to ``RMSE`` with ``X`` -- presumably marks the amputed
            entries (confirm against ``RMSE``).
        prev_best_score: Lowest score obtained so far by other plugins.

    Returns:
        A ``(score, params)`` tuple. ``params`` is ``{}`` when the default
        configuration is the one being reported.
    """
    study, pruner = create_study(
        study_name=f"{name}_imputer_evaluation_{plugin.name()}",
        direction="minimize",
        load_if_exists=False,
        patience=5,
    )

    def evaluate_args(**kwargs: Any) -> float:
        # Fit on a copy so every configuration starts from untouched input.
        imputer = plugin(**kwargs)

        imputed = imputer.fit_transform(X_miss.copy())
        return RMSE(np.asarray(imputed), X, mask)

    baseline_score = evaluate_args()

    # Defaults already beat every plugin seen so far: skip the costly search.
    if baseline_score < prev_best_score:
        return baseline_score, {}

    # Tell the early-stopping helper about the scores the trials must beat.
    pruner.report_score(baseline_score)
    # ``benchmark`` starts from a large placeholder (999) before any plugin
    # has been scored; only report a previous best that looks like a real score.
    if prev_best_score < 100:
        pruner.report_score(prev_best_score)

    def objective(trial: optuna.Trial) -> float:
        args = plugin.sample_hyperparameters(trial)
        pruner.check_trial(trial)

        score = evaluate_args(**args)

        pruner.report_score(score)

        return score

    try:
        study.optimize(objective, n_trials=50, timeout=60 * 3)
    except EarlyStoppingExceeded:
        # Early stopping is the expected way for the search to end.
        pass

    try:
        best_value = study.best_value
        best_params = study.best_trial.params
    except Exception:
        # No usable trial (optuna raises when none completed): fall back to
        # the defaults. ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return baseline_score, {}

    # The study minimises RMSE, so the baseline wins only when it is lower.
    if baseline_score < best_value:
        return baseline_score, {}

    return best_value, best_params


def benchmark(
    name: str,
    X: np.ndarray,
    X_miss: np.ndarray,
    mask: np.ndarray,
):
    """Evaluate every imputer in ``imputers_seed`` on one amputed dataset.

    Plugins are processed in list order; each call to ``evaluate_plugin``
    receives the lowest score obtained so far. A failing plugin is reported
    on stdout and its exception is re-raised. Timing and the collected scores
    are printed before returning.

    Returns:
        A dict mapping plugin name to the ``(score, params)`` tuple returned
        by ``evaluate_plugin``.
    """
    scores = {}
    start = time.time()

    # Placeholder meaning "nothing scored yet".
    running_best = 999
    for plugin in imputers_seed:
        plugin_cls = imputers.get_type(plugin)
        try:
            score, params = evaluate_plugin(
                name, plugin_cls, X, X_miss, mask, running_best
            )
            running_best = min(running_best, score)
        except BaseException as e:
            print("      >>>  Plugin failed", plugin, e)
            raise e
        else:
            scores[plugin] = (score, params)

    print(f" iteration for {name} took {time.time() - start} seconds")
    print(" iteration scores", scores)
    return scores

# Dataset: UCI Airfoil Self-Noise Data Set

https://archive.ics.uci.edu/ml/datasets/airfoil+self-noise


In [None]:
# Load the airfoil self-noise data; the file has no header row.
# The separator is a real tab character. The two-character string "\\t"
# (backslash + t) is longer than one character, so pandas would treat it as a
# regular expression and fall back to the slower Python parsing engine.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
    header=None,
    sep="\t",
)

df

In [None]:
# Search phase: run the imputer search on a random subsample of the rows.
# The fraction is clamped to 1 so that a dataset with fewer than `subsample`
# rows is simply used whole; DataFrame.sample rejects frac > 1 unless
# sampling with replacement.
frac = min(subsample / len(df), 1)
X = df.sample(frac=frac).values

imputation_scenarios = simulate_scenarios(X)


results = []
candidates = {}
for scenario in ["MAR", "MCAR", "MNAR"]:
    print("Evaluating ", scenario)
    # Only the 0.3 missing-rate variant of each mechanism is searched.
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    bo_results = benchmark("airfoil", x, x_miss, mask)

    # Keep the plugin with the lowest score for this scenario.
    best_candidate = ""
    best_score = 99999
    best_params = {}
    for plugin, (score, params) in bo_results.items():
        if score < best_score:
            best_score = score
            best_candidate = plugin
            best_params = params

    results.append([scenario, best_candidate, best_score])
    # Remembered so later cells can refit the winner on the full dataset.
    candidates[scenario] = (best_candidate, best_params)

headers = ["Scenario", "BO selected estimator", "BO score"]

display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

In [None]:
# Full dataset evaluation
# Refit each scenario's selected imputer (with its selected parameters) on
# freshly amputed versions of the complete dataset and report the RMSE.
X = df.values
imputation_scenarios = simulate_scenarios(X)

results = []
for scenario, (plugin, plugin_params) in candidates.items():
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    model = imputers.get(plugin, **plugin_params)
    imputed = model.fit_transform(x_miss.copy())
    loss = RMSE(np.asarray(imputed), x, mask)

    results.append([scenario, plugin, loss])

headers = ["Scenario", "BO-selected model", "RMSE on full dataset"]

table_html = tabulate.tabulate(results, headers=headers, tablefmt="html")
display(HTML(table_html))

In [None]:
# Raw methods evaluation
# Score every reference imputer with its default settings on the same
# amputed data, one table row per scenario and one column per imputer.
ref_models = [
    "mean",
    "sklearn_missforest",
    "gain",
    "EM",
    "sklearn_ice",
    "softimpute",
    "sinkhorn",
    "miracle",
    "miwae",
]
results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    # Row starts with the scenario label, followed by one loss per model.
    local_res = [scenario]
    for plugin in ref_models:
        model = imputers.get(plugin)
        imputed = model.fit_transform(x_miss.copy())
        loss = RMSE(np.asarray(imputed), x, mask)
        local_res.append(loss)

    results.append(local_res)

headers = ["Scenario", *ref_models]

table_html = tabulate.tabulate(results, headers=headers, tablefmt="html")
display(HTML(table_html))

# Dataset: Blood Transfusion Service Center Data Set


https://archive.ics.uci.edu/ml/datasets/Blood+Transfusion+Service+Center

In [None]:
# Load the blood transfusion data; the first row of the file is used as the
# header (pandas default) and fields are comma-separated.
blood_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data"
df = pd.read_csv(blood_url, sep=",")

df

In [None]:
# Search phase on a random subsample; the fraction is capped at 1 so that a
# dataset smaller than `subsample` rows is used whole.
frac = min(subsample / len(df), 1)
X = df.sample(frac=frac).values

imputation_scenarios = simulate_scenarios(X)


results = []
candidates = {}
for scenario in ["MAR", "MCAR", "MNAR"]:
    print("Evaluating ", scenario)
    # Only the 0.3 missing-rate variant of each mechanism is searched.
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    bo_results = benchmark("blood", x, x_miss, mask)

    # Track the lowest-scoring plugin for this scenario.
    best_candidate, best_score, best_params = "", 99999, {}
    for plugin, (score, params) in bo_results.items():
        if score < best_score:
            best_candidate, best_score, best_params = plugin, score, params

    results.append([scenario, best_candidate, best_score])
    candidates[scenario] = (best_candidate, best_params)
# Bare expression: it is not the last statement of the cell, so it has no effect.
results

headers = ["Scenario", "BO selected estimator", "BO score"]

table_html = tabulate.tabulate(results, headers=headers, tablefmt="html")
display(HTML(table_html))

In [None]:
# Full dataset evaluation
# Refit each scenario's selected imputer (with its selected parameters) on
# freshly amputed versions of the complete dataset and report the RMSE.
X = df.values
imputation_scenarios = simulate_scenarios(X)

results = []
for scenario, (plugin, plugin_params) in candidates.items():
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    model = imputers.get(plugin, **plugin_params)
    imputed = model.fit_transform(x_miss.copy())
    loss = RMSE(np.asarray(imputed), x, mask)

    results.append([scenario, plugin, loss])

headers = ["Scenario", "BO-selected model", "RMSE on full dataset"]

table_html = tabulate.tabulate(results, headers=headers, tablefmt="html")
display(HTML(table_html))

In [None]:
# Other methods evaluation
# Score every reference imputer with its default settings on the same
# amputed data, one table row per scenario and one column per imputer.

ref_models = [
    "mean",
    "sklearn_missforest",
    "gain",
    "EM",
    "sklearn_ice",
    "softimpute",
    "sinkhorn",
    "miracle",
    "miwae",
]
results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    # Row starts with the scenario label, followed by one loss per model.
    local_res = [scenario]
    for plugin in ref_models:
        model = imputers.get(plugin)
        imputed = model.fit_transform(x_miss.copy())
        loss = RMSE(np.asarray(imputed), x, mask)
        local_res.append(loss)

    results.append(local_res)

headers = ["Scenario", *ref_models]

table_html = tabulate.tabulate(results, headers=headers, tablefmt="html")
display(HTML(table_html))