## Load reference datasets

In [None]:
#!pip install xlrd
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_diabetes

# Feature frames of the scikit-learn reference datasets. The second return
# value (the target) is discarded.
X_raw_diab, _ = load_diabetes(as_frame=True, return_X_y=True)
X_raw_breast_cancer, _ = load_breast_cancer(as_frame=True, return_X_y=True)
X_raw_california, _ = fetch_california_housing(as_frame=True, return_X_y=True)

# The first line of the remote file is skipped (skiprows=1) before the
# remaining rows are parsed as a numeric matrix.
climate_model_samples = np.loadtxt(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows=1,
)
climate_model_df = pd.DataFrame(climate_model_samples)

# Dataset name -> raw DataFrame. Building this dict downloads every UCI file,
# so the cell needs network access.
raw_datasets = {
    "airfoil": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
        header=None,
        # A real tab character: the previous two-character "\\t" was treated
        # as a regex and forced pandas onto the slower python engine.
        sep="\t",
    ),
    "blood": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data"
    ),
    "bc": X_raw_breast_cancer,
    "california": X_raw_california,
    "clim_model": climate_model_df,
    # NOTE(review): reading the legacy .xls format presumably needs the
    # optional `xlrd` package (see the pip hint at the top of the cell).
    "concr_compr": pd.read_excel(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
    ),
    "concr_slump": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data"
    ),
    "conn_sonar": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data",
        header=None,
    ),
    "diabetes": X_raw_diab,
    "wine_red": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
        sep=";",
    ),
    "wine_white": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
        sep=";",
    ),
    "yeast": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data",
        # Raw string: "\s" is an invalid escape sequence in a normal literal.
        sep=r"\s+",
        header=None,
    ),
}

## Parse results

In [None]:
import pandas as pd
import numpy as np

from io import StringIO

rmse_key = "Mean RMSE"
wass_key = "Mean Wasserstein score"
pred_key = "Mean downstream prediction error"

# Dataset name -> {metric name -> tab-separated results table}. Each table is
# handed to pd.read_csv, so it may be a path, URL or file-like buffer.
# NOTE(review): the dict is empty here; it presumably has to be filled in
# before the cells below can plot anything.
data = {}


models_cnt = 7
# results[scenario][miss][metric][model] -> list with one value per dataset,
# in the order the datasets are iterated.
results = {}

df_names = []

remap_models = {
    "Our method": "hyperimpute",
    "sklearn_missforest": "missforest",
    "sklearn_ice": "ice",
}
norm_cols = [
    "Our method",
    "mean",
    "sklearn_missforest",
    "sklearn_ice",
    "gain",
    "sinkhorn",
    "softimpute",
]


def clamp_non_positive(df, floor=1e-6):
    """Return a copy of ``df`` with numeric entries <= 0 replaced by ``floor``.

    Non-numeric columns and NaN entries are left untouched. Working on an
    explicit copy avoids relying on the private ``_get_numeric_data`` accessor
    returning a view of ``df``, which is not guaranteed.
    """
    clamped = df.copy()
    numeric_cols = clamped.select_dtypes(include="number").columns
    if len(numeric_cols) > 0:
        numeric = clamped[numeric_cols]
        clamped[numeric_cols] = numeric.mask(numeric <= 0, floor)
    return clamped


for dataset in data:
    df_names.append(dataset)

    for metric in data[dataset]:
        print(metric, data[dataset][metric])

        # Prediction norm: keep every score strictly positive.
        df = clamp_non_positive(pd.read_csv(data[dataset][metric], sep="\t"))

        for scenario in ["MNAR", "MCAR", "MAR"]:
            scenario_results = results.setdefault(scenario, {})
            scenario_df = df[df["Scenario"] == scenario].drop(columns=["Scenario"])

            for miss in [0.1, 0.3, 0.5, 0.7]:
                # The key is created even when no row matches, so every
                # scenario always exposes all four missingness levels.
                miss_results = scenario_results.setdefault(miss, {})

                # Tolerant comparison: parsed floats need not be bit-identical
                # to the literals above.
                at_level = np.isclose(scenario_df["miss_pct [0, 1]"], miss)
                local_df = (
                    scenario_df[at_level]
                    .drop(columns=["miss_pct [0, 1]"])
                    .rename(columns=remap_models)
                )

                if local_df.empty:
                    continue

                metric_results = miss_results.setdefault(metric, {})
                for col in local_df.columns:
                    # Only the first matching row is recorded per dataset.
                    metric_results.setdefault(col, []).append(local_df[col].values[0])

## General overview

In [None]:
import matplotlib.pyplot as plt

fontsize = 14


def generate_plot_for_ax(ax, scenario, miss, metric):
    """Draw one grouped bar chart of ``metric`` on ``ax``.

    Every dataset forms a group of bars, with one bar per model taken from
    ``results[scenario][miss][metric]``.

    Args:
        ax: matplotlib Axes to draw on.
        scenario: key into ``results`` (e.g. "MAR").
        miss: missingness level key into ``results[scenario]``.
        metric: metric name key; also used as the y-axis label.

    Returns:
        The same ``ax``, for chaining.
    """
    per_model = results[scenario][miss][metric]
    n_models = len(per_model)
    n_groups = len(data)
    bar_width = 1
    # Distance between the first bars of two neighbouring groups. It must
    # exceed the number of bars in a group, otherwise groups overlap when
    # there are fewer datasets than models.
    group_stride = max(n_groups, n_models + 1)

    max_val = 0
    for idx, (model, values) in enumerate(per_model.items()):
        positions = [idx + group_stride * group for group in range(n_groups)]

        highest = max(values)
        if max_val < highest:
            max_val = highest

        ax.bar(
            positions,
            values,
            width=bar_width,
            label=str(model),
            edgecolor="k",
        )

    ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, 1),
        ncol=models_cnt,
        prop={"size": fontsize},
    )

    # Put each dataset label under the middle bar of its group.
    ax.set_xticks([group_stride * r + n_models // 2 for r in range(len(df_names))])
    ax.set_xticklabels(df_names, rotation=0, fontsize=fontsize)

    # Text properties are only accepted by set_yticks together with explicit
    # labels, so the label size is set through tick_params instead.
    ax.set_yticks(np.linspace(0, max_val + 0.1, num=5))
    ax.tick_params(axis="y", labelsize=fontsize)
    ax.set_ylabel(metric, fontsize=fontsize + 4)

    return ax


def generate_plot(scenario, miss):
    """Render and save the per-dataset overview for one scenario and level.

    One subplot row is drawn per metric found in ``results[scenario][miss]``.
    The figure is written to ``diagrams/general_overview_<scenario>_<miss>.png``
    and then shown.

    Args:
        scenario: key into ``results`` (e.g. "MAR").
        miss: missingness level key into ``results[scenario]``.
    """
    import os

    # The seaborn style sheets were renamed in newer matplotlib releases, so
    # use whichever spelling this installation provides.
    whitegrid = next(
        (
            name
            for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
            if name in plt.style.available
        ),
        None,
    )
    if whitegrid is not None:
        plt.style.use(whitegrid)

    metrics = list(results[scenario][miss])
    # squeeze=False keeps ``axs`` an array even when there is a single metric.
    fig, axs = plt.subplots(len(metrics), figsize=(20, 15), squeeze=False)

    for ax, metric in zip(axs.ravel(), metrics):
        generate_plot_for_ax(ax, scenario, miss, metric)

    plt.xlabel(f"{scenario} simulation with {miss} missingness", fontsize=fontsize)

    # savefig does not create missing directories.
    os.makedirs("diagrams", exist_ok=True)
    plt.savefig(f"diagrams/general_overview_{scenario}_{miss}.png")
    plt.show()


for scenario in ["MAR", "MCAR", "MNAR"]:
    for miss in [0.1, 0.3, 0.5, 0.7]:
        generate_plot(scenario, miss)

In [None]:
# Display the style sheet names this matplotlib installation offers, i.e. the
# values that can be passed to plt.style.use().
plt.style.available

## By missingness

In [None]:
import numpy as np

# Missingness levels shown on the x-axis; they are also the keys of
# results[scenario].
x_axis = [0.1, 0.3, 0.5, 0.7]

fontsize = 14


def generate_plot_for_ax(ax, scenario, metric):
    """Plot ``metric`` against the missingness level, one line per model.

    Each point is the mean over all datasets of
    ``results[scenario][miss][metric][model]``.

    Args:
        ax: matplotlib Axes to draw on.
        scenario: key into ``results`` (e.g. "MAR").
        metric: metric name key; also used as the y-axis label.

    Returns:
        The same ``ax``, for chaining.
    """
    by_miss = results[scenario]

    for model in by_miss[0.1][metric]:
        # Iterate x_axis itself so the y-values are aligned with the x-values
        # by construction rather than by dict insertion order.
        datapoints = [np.mean(by_miss[miss][metric][model]) for miss in x_axis]

        ax.plot(
            x_axis,
            datapoints,
            label=str(model),
            linewidth=2,
            marker="o",
        )

    # Text properties are only accepted by set_xticks together with explicit
    # labels; the label size is applied by tick_params below.
    ax.set_xticks(x_axis)
    ax.set_ylabel(metric, fontsize=fontsize)
    ax.tick_params(axis="both", which="major", labelsize=fontsize)

    return ax


def generate_plot(scenario):
    """Render and save the error-by-missingness figure for one scenario.

    One subplot column is drawn per metric found in ``results[scenario][0.1]``.
    The figure is written to ``diagrams/error_by_miss_<scenario>.png`` and
    then shown.

    Args:
        scenario: key into ``results`` (e.g. "MAR").
    """
    import os

    # The seaborn style sheets were renamed in newer matplotlib releases, so
    # use whichever spelling this installation provides.
    whitegrid = next(
        (
            name
            for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
            if name in plt.style.available
        ),
        None,
    )
    if whitegrid is not None:
        plt.style.use(whitegrid)

    metrics = list(results[scenario][0.1])
    # squeeze=False keeps ``axs`` an array even when there is a single metric.
    fig, axs = plt.subplots(1, len(metrics), figsize=(20, 7), squeeze=False)
    axes = axs.ravel()

    for ax, metric in zip(axes, metrics):
        generate_plot_for_ax(ax, scenario, metric)

    # A single legend on the first subplot serves all panels.
    axes[0].legend(
        loc="upper left",
        bbox_to_anchor=(0.5, 1.1),
        ncol=models_cnt,
        prop={"size": fontsize},
    )
    fig.suptitle(f"{scenario} simulation", fontsize=fontsize)

    # savefig does not create missing directories.
    os.makedirs("diagrams", exist_ok=True)
    plt.savefig(f"diagrams/error_by_miss_{scenario}.png")

    plt.show()


for scenario in ["MAR", "MCAR", "MNAR"]:
    generate_plot(scenario)

## By dataset size

In [None]:
import numpy as np

# Row count of every benchmarked dataset, in the iteration order of ``data``.
x_axis = [len(raw_datasets[dataset]) for dataset in data]
# Permutation that orders the datasets by increasing size.
sorted_args = np.argsort(x_axis)
# Evenly spaced tick positions spanning the smallest to the largest dataset.
norm_x_axis = np.linspace(min(x_axis), max(x_axis), len(x_axis))
fontsize = 14


def generate_plot_for_ax(ax, scenario, metric):
    """Plot ``metric`` against dataset size, one line per model.

    For every dataset the value is averaged over all missingness levels in
    ``results[scenario]``; datasets are then ordered by size.

    Args:
        ax: matplotlib Axes to draw on.
        scenario: key into ``results`` (e.g. "MAR").
        metric: metric name key; also used as the y-axis label.

    Returns:
        The same ``ax``, for chaining.
    """
    by_miss = results[scenario]
    sizes = np.asarray(x_axis)[sorted_args]

    for model in by_miss[0.1][metric]:
        # The entry at a given index belongs to the same dataset at every
        # missingness level, so average index-wise across the levels.
        n_datasets = len(by_miss[0.1][metric][model])
        datapoints = [
            np.mean([by_miss[miss][metric][model][df_idx] for miss in by_miss])
            for df_idx in range(n_datasets)
        ]

        ax.plot(
            sizes,
            np.asarray(datapoints)[sorted_args],
            label=str(model),
            linewidth=2,
        )

    # Text properties are only accepted by set_xticks together with explicit
    # labels; the label size is applied by tick_params below.
    ax.set_xticks(norm_x_axis)
    ax.set_ylabel(metric, fontsize=fontsize)
    ax.tick_params(axis="both", which="major", labelsize=fontsize)

    return ax


def generate_plot(scenario):
    """Render and save the error-by-dataset-size figure for one scenario.

    One subplot column is drawn per metric found in ``results[scenario][0.1]``.
    The figure is written to ``diagrams/error_by_df_size_<scenario>.png`` and
    then shown.

    Args:
        scenario: key into ``results`` (e.g. "MAR").
    """
    import os

    # The seaborn style sheets were renamed in newer matplotlib releases, so
    # use whichever spelling this installation provides.
    whitegrid = next(
        (
            name
            for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
            if name in plt.style.available
        ),
        None,
    )
    if whitegrid is not None:
        plt.style.use(whitegrid)

    metrics = list(results[scenario][0.1])
    # squeeze=False keeps ``axs`` an array even when there is a single metric.
    fig, axs = plt.subplots(1, len(metrics), figsize=(20, 7), squeeze=False)
    axes = axs.ravel()

    for ax, metric in zip(axes, metrics):
        generate_plot_for_ax(ax, scenario, metric)

    # A single legend on the first subplot serves all panels.
    axes[0].legend(
        loc="upper left",
        bbox_to_anchor=(0.5, 1.1),
        ncol=models_cnt,
        prop={"size": fontsize},
    )
    fig.suptitle(f"{scenario} simulation: error by dataset size", fontsize=fontsize)

    # savefig does not create missing directories.
    os.makedirs("diagrams", exist_ok=True)
    plt.savefig(f"diagrams/error_by_df_size_{scenario}.png")

    plt.show()


for scenario in ["MAR", "MCAR", "MNAR"]:
    generate_plot(scenario)

In [None]:
# Report how many rows each raw dataset contains.
for dataset, frame in raw_datasets.items():
    print(dataset, len(frame))