# Bayesian Optimization over standard imputers

In [1]:
import sys
import warnings
import pandas as pd
import numpy as np

from IPython.display import HTML, display
import tabulate

warnings.simplefilter("ignore")


from hyperimpute.utils.distributions import enable_reproducible_results

enable_reproducible_results()

# Load imputers

In [2]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin
from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan
from hyperimpute.utils.optimizer import EarlyStoppingExceeded, create_study

imputers = Imputers()

imputers.list()

imputers_seed = [
    "miracle",
    "miwae",
    "gain",
    "softimpute",
    "sinkhorn",
    "mean",
    "sklearn_ice",
    "most_frequent",
    "median",
    "EM",
    "sklearn_missforest",
]

subsample = 500

2022-01-12 00:19:19.822381: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-12 00:19:19.822408: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Instructions for updating:
non-resource variables are not supported in the long term


# Helpers

In [3]:
from sklearn.preprocessing import MinMaxScaler


def ampute(x, mechanism, p_miss):
    x_simulated = simulate_nan(x, p_miss, mechanism)

    mask = x_simulated["mask"]
    x_miss = x_simulated["X_incomp"]

    return x, x_miss, mask


def scale_data(X):
    X = np.asarray(X)
    preproc = MinMaxScaler()

    return np.asarray(preproc.fit_transform(X))


def simulate_scenarios(X):
    X = scale_data(X)

    datasets = {}

    mechanisms = ["MAR", "MNAR", "MCAR"]
    percentages = [0.2, 0.3]

    for ampute_mechanism in mechanisms:
        for p_miss in percentages:
            if ampute_mechanism not in datasets:
                datasets[ampute_mechanism] = {}

            datasets[ampute_mechanism][p_miss] = ampute(X, ampute_mechanism, p_miss)

    return datasets

# BO core

In [11]:
from typing import Any
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import time


def evaluate_plugin(
    name: str,
    plugin: ImputerPlugin,
    X: np.ndarray,
    X_miss: np.ndarray,
    mask: np.ndarray,
    prev_best_score: float,
):

    study, pruner = create_study(
        study_name=f"{name}_imputer_evaluation_{plugin.name()}",
        direction="minimize",
        load_if_exists=False,
        patience=5,
    )

    def evaluate_args(**kwargs: Any) -> float:
        imputer = plugin(**kwargs)

        imputed = imputer.fit_transform(X_miss.copy())
        return RMSE(np.asarray(imputed), X, mask)

    baseline_score = evaluate_args(**{})

    if baseline_score < prev_best_score:
        return baseline_score, {}

    pruner.report_score(baseline_score)
    if prev_best_score < 100:
        pruner.report_score(prev_best_score)

    def objective(trial: optuna.Trial) -> float:
        args = plugin.sample_hyperparameters(trial)
        pruner.check_trial(trial)

        score = evaluate_args(**args)

        pruner.report_score(score)

        return score

    try:
        study.optimize(objective, n_trials=50, timeout=60 * 3)
    except EarlyStoppingExceeded:
        pass
        # print(f"Early stopping triggered for imputer {plugin.name()}")

    try:
        if baseline_score > study.best_value:
            return baseline_score, {}

        return study.best_value, study.best_trial.params
    except BaseException:
        return baseline_score, {}


def benchmark(
    name: str,
    X: np.ndarray,
    X_miss: np.ndarray,
    mask: np.ndarray,
):
    scores = {}
    start = time.time()

    best_score = 999
    for plugin in imputers_seed:

        plugin_t = imputers.get_type(plugin)
        try:
            score, params = evaluate_plugin(name, plugin_t, X, X_miss, mask, best_score)
            if score < best_score:
                best_score = score
        except BaseException as e:
            print("      >>>  Plugin failed", plugin, e)
            raise e

        scores[plugin] = (score, params)

    print(f" iteration for {name} took {time.time() - start} seconds")
    print(" iteration scores", scores)
    return scores

# Dataset: UCI Airfoil Self-Noise Data Set

https://archive.ics.uci.edu/ml/datasets/airfoil+self-noise


In [5]:
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
    header=None,
    sep="\\t",
)

df

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461
...,...,...,...,...,...,...
1498,2500,15.6,0.1016,39.6,0.052849,110.264
1499,3150,15.6,0.1016,39.6,0.052849,109.254
1500,4000,15.6,0.1016,39.6,0.052849,106.604
1501,5000,15.6,0.1016,39.6,0.052849,106.224


In [6]:
frac = subsample / len(df)
X = df.sample(frac=frac).values

imputation_scenarios = simulate_scenarios(X)


results = []
candidates = {}
for scenario in ["MAR", "MCAR", "MNAR"]:
    print("Evaluating ", scenario)
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    bo_results = benchmark("airfoil", x, x_miss, mask)

    best_candidate = ""
    best_score = 99999
    best_params = {}
    for plugin in bo_results:
        score, params = bo_results[plugin]
        if score < best_score:
            best_score = score
            best_candidate = plugin
            best_params = params

    results.append([scenario, best_candidate, best_score])
    candidates[scenario] = (best_candidate, best_params)
results

headers = ["Scenario", "BO selected estimator", "BO score"]

display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Evaluating  MAR


2022-01-12 00:19:24.616916: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-01-12 00:19:24.616951: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-01-12 00:19:24.616974: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (bcebere-Latitude-E5570): /proc/driver/nvidia/version does not exist
2022-01-12 00:19:24.617205: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


 iteration for airfoil took 471.84167885780334 seconds
 iteration scores {'miracle': (0.23944381163968922, {}), 'miwae': (0.2792718537174337, {}), 'gain': (0.36395007501845633, {}), 'softimpute': (0.2592579805675715, {}), 'sinkhorn': (0.26285034309085253, {}), 'mean': (0.2609491921826622, {}), 'sklearn_ice': (0.24114893131219062, {}), 'most_frequent': (0.3340366136173224, {}), 'median': (0.3120505455297883, {}), 'EM': (0.239399140360265, {}), 'sklearn_missforest': (0.23517457099603345, {})}
Evaluating  MCAR
 iteration for airfoil took 504.14622807502747 seconds
 iteration scores {'miracle': (0.24720409089450374, {}), 'miwae': (0.2888107607113494, {}), 'gain': (0.2649810718607523, {'batch_size': 512, 'iterations': 1000, 'hint_rate': 0.8190691638010601, 'loss_alpha': 40}), 'softimpute': (0.3532868445658454, {}), 'sinkhorn': (0.2540941963007164, {'eps': 0.0017541402048089462, 'lr': 0.01, 'niter': 200, 'batchsize': 200, 'noise': 0.0001, 'scaling': 0.8920559600376342}), 'mean': (0.277884090

Scenario,BO selected estimator,BO score
MAR,sklearn_missforest,0.235175
MCAR,sklearn_missforest,0.22281
MNAR,EM,0.235456


In [7]:
# Full dataset evaluation
X = df.values
imputation_scenarios = simulate_scenarios(X)

results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    plugin, plugin_params = candidates[scenario]

    model = imputers.get(plugin, **plugin_params)

    imputed = model.fit_transform(x_miss.copy())

    loss = RMSE(np.asarray(imputed), x, mask)

    results.append([scenario, plugin, loss])

headers = ["Scenario", "BO-selected model", "RMSE on full dataset"]


display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Scenario,BO-selected model,RMSE on full dataset
MAR,sklearn_missforest,0.26868
MCAR,sklearn_missforest,0.232572
MNAR,EM,0.232368


In [8]:
# Raw methods evaluation
ref_models = [
    "mean",
    "sklearn_missforest",
    "gain",
    "EM",
    "sklearn_ice",
    "softimpute",
    "sinkhorn",
    "miracle",
    "miwae",
]
results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    local_res = [
        scenario,
    ]
    for plugin in ref_models:

        model = imputers.get(plugin)

        imputed = model.fit_transform(x_miss.copy())

        loss = RMSE(np.asarray(imputed), x, mask)

        local_res.append(loss)

    results.append(local_res)

headers = ["Scenario"] + ref_models


display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Scenario,mean,sklearn_missforest,gain,EM,sklearn_ice,softimpute,sinkhorn,miracle,miwae
MAR,0.280384,0.268111,0.269012,0.257384,0.256336,0.376559,0.282096,0.259814,0.31023
MCAR,0.274646,0.229501,0.291209,0.23171,0.242434,0.341643,0.250579,0.239689,0.2666
MNAR,0.275186,0.230792,0.30268,0.232368,0.251734,0.363269,0.256675,0.245357,0.281104


# Dataset: Blood Transfusion Service Center Data Set


https://archive.ics.uci.edu/ml/datasets/Blood+Transfusion+Service+Center

In [12]:
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data",
    sep=",",
)

df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [13]:
frac = min(subsample / len(df), 1)
X = df.sample(frac=frac).values

imputation_scenarios = simulate_scenarios(X)


results = []
candidates = {}
for scenario in ["MAR", "MCAR", "MNAR"]:
    print("Evaluating ", scenario)
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    bo_results = benchmark("blood", x, x_miss, mask)

    best_candidate = ""
    best_score = 99999
    best_params = {}
    for plugin in bo_results:
        score, params = bo_results[plugin]
        if score < best_score:
            best_score = score
            best_candidate = plugin
            best_params = params

    results.append([scenario, best_candidate, best_score])
    candidates[scenario] = (best_candidate, best_params)
results

headers = ["Scenario", "BO selected estimator", "BO score"]

display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Evaluating  MAR


[2022-01-12T01:38:45.186554+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T01:38:45.188958+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T01:38:45.191439+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T01:38:45.193838+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T01:38:45.195979+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T01:38:45.198809+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T01:38:45.201628+0200][478308][CRITICAL] EM step failed. Singular matrix


 iteration for blood took 556.8235321044922 seconds
 iteration scores {'miracle': (0.22281374759075392, {}), 'miwae': (0.260319048497951, {}), 'gain': (0.25306086015982776, {'batch_size': 512, 'iterations': 700, 'hint_rate': 0.9036179920917125, 'loss_alpha': 50}), 'softimpute': (0.28522732769840636, {}), 'sinkhorn': (0.270472102754256, {}), 'mean': (0.25886789951389994, {}), 'sklearn_ice': (0.22538548685909174, {}), 'most_frequent': (0.29872522941428054, {}), 'median': (0.2867958365315854, {}), 'EM': (nan, {}), 'sklearn_missforest': (0.23558441853099082, {})}
Evaluating  MCAR


[2022-01-12T01:49:45.103848+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T01:49:52.560761+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T01:50:00.027763+0200][478308][CRITICAL] EM step failed. Singular matrix


 iteration for blood took 762.2715146541595 seconds
 iteration scores {'miracle': (0.20947002652190388, {}), 'miwae': (0.24198870588467067, {}), 'gain': (0.2477777219454598, {}), 'softimpute': (0.26143704317207644, {}), 'sinkhorn': (0.25451037277872035, {}), 'mean': (0.23659587032628376, {}), 'sklearn_ice': (0.20962083230420084, {}), 'most_frequent': (0.2987056947354749, {}), 'median': (0.2623392887982259, {}), 'EM': (7.8939466025778335, {}), 'sklearn_missforest': (0.21992133090538618, {})}
Evaluating  MNAR


[2022-01-12T02:00:34.606225+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T02:00:34.619227+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T02:00:34.627477+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T02:00:34.635127+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T02:00:34.643377+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T02:00:34.651916+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T02:00:34.661754+0200][478308][CRITICAL] EM step failed. Singular matrix


 iteration for blood took 625.5450501441956 seconds
 iteration scores {'miracle': (0.22776309601912056, {}), 'miwae': (0.26717905752654464, {}), 'gain': (0.24690032677525536, {}), 'softimpute': (0.2587294265268598, {}), 'sinkhorn': (0.24746037415093045, {}), 'mean': (0.24077118923176727, {}), 'sklearn_ice': (0.22648552929626112, {}), 'most_frequent': (0.3021178491432924, {}), 'median': (0.2657084502674204, {}), 'EM': (nan, {}), 'sklearn_missforest': (0.22852292523002024, {})}


Scenario,BO selected estimator,BO score
MAR,miracle,0.222814
MCAR,miracle,0.20947
MNAR,sklearn_ice,0.226486


In [14]:
# Full dataset evaluation
X = df.values
imputation_scenarios = simulate_scenarios(X)

results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    plugin, plugin_params = candidates[scenario]

    model = imputers.get(plugin, **plugin_params)

    imputed = model.fit_transform(x_miss.copy())

    loss = RMSE(np.asarray(imputed), x, mask)

    results.append([scenario, plugin, loss])

headers = ["Scenario", "BO-selected model", "RMSE on full dataset"]


display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Scenario,BO-selected model,RMSE on full dataset
MAR,miracle,0.239968
MCAR,miracle,0.201536
MNAR,sklearn_ice,0.233664


In [15]:
# Other methods evaluation

ref_models = [
    "mean",
    "sklearn_missforest",
    "gain",
    "EM",
    "sklearn_ice",
    "softimpute",
    "sinkhorn",
    "miracle",
    "miwae",
]
results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    local_res = [
        scenario,
    ]
    for plugin in ref_models:

        model = imputers.get(plugin)

        imputed = model.fit_transform(x_miss.copy())

        loss = RMSE(np.asarray(imputed), x, mask)

        local_res.append(loss)

    results.append(local_res)

headers = ["Scenario"] + ref_models


display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

[2022-01-12T02:05:43.877128+0200][478308][CRITICAL] EM step failed. Singular matrix
[2022-01-12T02:13:28.133058+0200][478308][CRITICAL] EM step failed. Singular matrix


Scenario,mean,sklearn_missforest,gain,EM,sklearn_ice,softimpute,sinkhorn,miracle,miwae
MAR,0.281461,0.261399,0.305318,5.06981e+46,0.254981,0.293861,0.305232,0.241764,0.465015
MCAR,0.231743,0.211176,0.236752,0.196185,0.19779,0.238292,0.245107,0.198667,0.230015
MNAR,0.244997,0.247402,0.243897,0.223115,0.233664,0.288444,0.251398,0.229792,0.248547
