In [None]:
import numpy as np
import pandas as pd
import sys
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_iris


# Feature frames from scikit-learn's dataset loaders. Only the iris target is
# kept; the other targets are discarded.
X_raw_diab, _ = load_diabetes(as_frame=True, return_X_y=True)

X_raw_breast_cancer, _ = load_breast_cancer(as_frame=True, return_X_y=True)
X_raw_california, _ = fetch_california_housing(as_frame=True, return_X_y=True)
X_raw_iris, y_raw_iris = load_iris(as_frame=True, return_X_y=True)

# Whitespace-delimited numeric file; the first row is skipped
# (presumably a header line - TODO confirm against the file).
climate_model_samples = np.loadtxt(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows=1,
)
climate_model_df = pd.DataFrame(climate_model_samples)

# Benchmark datasets keyed by a short name. Files without a header row are
# read with header=None so that their first sample is not consumed as column
# names.
raw_datasets = {
    "iris": X_raw_iris,
    "parkinsons": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data",
        sep=",",
    ),
    "ionosphere": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
        sep=",",
        header=None,
    ),
    "credit": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",
        header=None,
    ),
    "blood": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data"
    ),
    "airfoil": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
        header=None,
        # A real tab character: a multi-character separator such as the
        # two-character string backslash + "t" is interpreted by pandas as a
        # regular expression and forces the slower Python parsing engine.
        sep="\t",
    ),
    "wine_red": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
        sep=";",
    ),
    "spam": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data",
        # spambase.data ships without a header row (column names live in a
        # separate .names file), so the first line is data, not labels.
        header=None,
    ),
    "wine_white": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
        sep=";",
    ),
    "letter": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
        header=None,
    ),
}

In [None]:
# Report the number of rows available in every benchmark dataset.
for dataset_name, frame in raw_datasets.items():
    print(dataset_name, len(frame))

In [None]:
from benchmark_imputation import simulate_scenarios
from hyperimpute.plugins.imputers import Imputers
import warnings
import pandas as pd

import hyperimpute.logger as log

# Register stderr as a sink for hyperimpute's logger at the "INFO" level.
log.add(sink=sys.stderr, level="INFO")

# Imputer plugin registry; get_imputer() looks plugins up by name through it.
imputers = Imputers()

# Install a global filter that ignores every Python warning from here on.
warnings.filterwarnings("ignore")

## Model insights

In [None]:
from hyperimpute.plugins.utils.metrics import RMSE
from benchmark_imputation import ws_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from time import time
from hyperimpute.utils.distributions import enable_reproducible_results
from joblib import Parallel, delayed

# Shared joblib executor (n_jobs=2) used to evaluate missingness levels
# concurrently in evaluate_model_trace_for_dataset.
dispatcher = Parallel(n_jobs=2)


def get_imputer():
    """Return a new "hyperimpute" plugin instance from the ``imputers`` registry.

    The plugin is requested with the "hyperband" optimizer and with
    ``select_lazy`` switched off.
    """
    imputer_options = {
        "optimizer": "hyperband",
        "select_lazy": False,
    }
    return imputers.get("hyperimpute", **imputer_options)


def evaluate_model_trace_for_dataset(
    name: str,
    X_raw: pd.DataFrame,
    scenarios: list = ["MAR"],
    miss_pct: list = [0.1, 0.3, 0.5, 0.7],
    debug: bool = True,
):
    """Collect the model-selection trace of the imputer for one dataset.

    For every requested scenario and missingness level, a fresh imputer from
    ``get_imputer()`` is fitted on the simulated incomplete data and the
    ``"models"`` entry of its ``trace()`` is recorded.

    Args:
        name: Dataset label, used only in progress messages.
        X_raw: Complete data passed to ``simulate_scenarios``.
        scenarios: Scenario keys to evaluate. The default list is only read,
            never mutated.
        miss_pct: Missingness levels to evaluate for each scenario. The
            default list is only read, never mutated.
        debug: Currently unused; kept for interface compatibility.

    Returns:
        ``{scenario: {missingness: {"model_trace": ..., "observed_rows": int}}}``
        where ``model_trace`` is ``None`` when the evaluation failed and
        ``observed_rows`` counts the rows of the incomplete data that contain
        no missing value.
    """
    # Indexed below as imputation_scenarios[scenario][missingness], yielding a
    # 3-tuple whose second element is the data with missing values.
    imputation_scenarios = simulate_scenarios(
        X_raw, column_limit=10, sample_columns=False
    )

    def _local_eval(scenario, missingness):
        """Fit one imputer and return its model trace, or None on failure."""
        # NOTE(review): presumably repeated here because this function may run
        # in a separate joblib worker where the global filter is not set.
        warnings.filterwarnings("ignore")

        _, x_miss, _ = imputation_scenarios[scenario][missingness]
        print("  evaluate ", name, scenario, missingness)
        try:
            model = get_imputer()
            # Fit on a copy so the simulated data stays untouched for the
            # observed-row count computed by the caller.
            model.fit_transform(x_miss.copy())

            return model.trace()["models"]

        except Exception as e:
            # Best effort: a failing scenario is reported and recorded as
            # None. Exception (rather than BaseException) keeps
            # KeyboardInterrupt and SystemExit able to stop the run.
            print("scenario failed", str(e))
            return None

    traces = {}
    for scenario in scenarios:
        traces[scenario] = {}

        # One job per missingness level; results come back in input order.
        local_traces = dispatcher(
            delayed(_local_eval)(scenario, miss) for miss in miss_pct
        )

        for local_trace, miss in zip(local_traces, miss_pct):
            _, x_miss, _ = imputation_scenarios[scenario][miss]
            # Rows in which no column is missing.
            observed_x = x_miss[~x_miss.isnull().any(axis=1)]
            traces[scenario][miss] = {
                "model_trace": local_trace,
                "observed_rows": len(observed_x),
            }

    return traces


def evaluate_dataset(dataset, seed: int = 0):
    """Run the model-trace evaluation for one entry of ``raw_datasets``.

    Args:
        dataset: Key into ``raw_datasets``.
        seed: Value passed to ``enable_reproducible_results``.

    Returns:
        The trace dictionary produced by ``evaluate_model_trace_for_dataset``.
    """
    enable_reproducible_results(seed)

    start = time()
    # Work on a copy: label-encoding the stored frame directly would
    # permanently overwrite the raw data shared through raw_datasets.
    df = raw_datasets[dataset].copy()
    for col in df.columns:
        # Object-typed columns are replaced by integer codes.
        if df[col].dtype == "object":
            df[col] = LabelEncoder().fit_transform(df[col])

    model_trace = evaluate_model_trace_for_dataset(dataset, df)

    print(f"evaluation for {dataset} took {time() - start}")
    return model_trace

In [None]:
import json


# Evaluate every dataset, keeping the results of those that succeed.
full_output = {}
for dataset in raw_datasets:
    print("eval dataset", dataset)
    try:
        full_output[dataset] = evaluate_dataset(dataset)
    except Exception as e:
        # Best effort: report the failure and move on to the next dataset.
        # Exception (rather than BaseException) lets KeyboardInterrupt and
        # SystemExit stop this long-running loop.
        print("scenario failed", dataset, e)

In [None]:
# Display the collected traces: dataset -> scenario -> missingness -> trace info.
full_output

# Plots

In [None]:
import json
import matplotlib.pyplot as plt

In [None]:
def parse_json_data(
    data, scenario="MAR", missingness_levels=("0.1", "0.3", "0.5", "0.7")
):
    """Turn per-dataset model selections into per-level selection frequencies.

    Args:
        data: Mapping ``dataset -> scenario -> missingness level -> iterable of
            selected model names``. A missing scenario or level key raises
            ``KeyError``.
        scenario: Scenario key to read from every dataset.
        missingness_levels: Level keys to aggregate, in the order in which
            they should appear in the output lists.

    Returns:
        ``{model: [share_at_level_0, share_at_level_1, ...]}`` where each share
        is the number of times the model was selected at that level divided by
        the total number of selections at that same level. Models never
        selected at a level (and levels without any selection) get ``0``.
        An empty ``data`` yields an empty dict.
    """
    levels = list(missingness_levels)

    # selection_counts[level][model] -> how often ``model`` was picked.
    selection_counts = {level: {} for level in levels}
    for data_set in data:
        for level in levels:
            per_level = selection_counts[level]
            for model in data[data_set][scenario][level]:
                per_level[model] = per_level.get(model, 0) + 1

    normalized_model_count = {}
    # The position comes from the requested level order, not from which
    # levels happened to receive selections, so columns never shift.
    for position, level in enumerate(levels):
        per_level = selection_counts[level]
        # Normalize by this level's own total so shares at a level sum to 1.
        level_total = sum(per_level.values())
        for model, count in per_level.items():
            shares = normalized_model_count.setdefault(model, [0] * len(levels))
            shares[position] = count / level_total

    return normalized_model_count

In [None]:
def plot_figure(normalized_model_count, missingness_rates=(0.1, 0.3, 0.5, 0.7)):
    """Plot one selection-frequency curve per model against missingness rate.

    Args:
        normalized_model_count: Mapping ``model name -> sequence of y-values``,
            one value per entry of ``missingness_rates``.
        missingness_rates: X positions (and tick locations) of the points.
    """
    # Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
    # and later releases dropped the old names; use whichever is installed.
    # If neither exists the current style is left unchanged.
    for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    rates = list(missingness_rates)
    plt.figure(figsize=(6, 6))
    for model, shares in normalized_model_count.items():
        plt.plot(
            rates,
            shares,
            label=model,
            marker="o",
            ms=8,
            alpha=0.75,
        )

    plt.xlabel("Missingness Rate", fontsize=20)
    plt.ylabel("Selection Probability", fontsize=20)
    plt.xticks(rates, fontsize=16)
    plt.yticks(fontsize=16)
    plt.legend(prop={"size": 12}, loc=2)
    plt.show()

In [None]:
# File holding the model selections that parse_json_data consumes.
json_file_path = "selected_models_by_column.json"

# Decode the JSON document directly from the open file handle.
with open(json_file_path, "r") as json_file:
    data = json.load(json_file)

In [None]:
# Aggregate the loaded selections for the "MAR" scenario and plot one curve
# per model.
normalized_model_count = parse_json_data(data, "MAR")
plot_figure(normalized_model_count)

## Illustrative Example


In [None]:
# Integer code of every model name; the code doubles as the model's position
# on the y-axis of the illustrative plot.
_model_names_in_axis_order = [
    "xgboost_regressor",
    "linear_regression",
    "neural_nets",
    "catboost_regressor",
    "random_forest_regressor",
    "logistic_regression",
    "random_forest",
    "catboost",
    "xgboost",
]
model_mapping = {
    model_name: code for code, model_name in enumerate(_model_names_in_axis_order)
}

In [None]:
# Hand-written example: the model chosen for three columns (1, 5 and 3) over
# eight successive iterations.
model_selection = {
    "models": {
        1: ["random_forest_regressor"] * 8,
        5: ["xgboost_regressor"] + ["catboost_regressor"] * 7,
        3: ["xgboost_regressor"] * 8,
    }
}

# Replace every model name by its integer code from model_mapping so the
# sequences can be drawn on a numeric axis.
for column, chosen_names in model_selection["models"].items():
    model_selection["models"][column] = [
        model_mapping[model_name] for model_name in chosen_names
    ]

In [None]:
# One line per column: the integer-coded model chosen at each iteration.
fig = plt.figure(figsize=(8, 6))
for column, selections in model_selection["models"].items():
    plt.plot(
        selections,
        label=f"column {column}",
        marker="o",
        ms=8,  # numeric marker size, consistent with plot_figure
        alpha=0.75,
    )

# Tick positions and labels both come from model_mapping so they cannot drift
# apart if a model is added or removed.
plt.yticks(
    ticks=list(model_mapping.values()),
    labels=list(model_mapping.keys()),
    fontsize=12,
    rotation=-45,
    va="bottom",
)
plt.xticks(fontsize=16)
plt.xlabel("Iterations", fontsize=20)
plt.ylabel("Model Selection", fontsize=20)
plt.legend(loc=1, prop={"size": 16})
plt.show()