In [None]:
#!pip install xlrd
import numpy as np
import pandas as pd
import sys
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_iris
import hyperimpute.logger as log

# Feature frames from scikit-learn's dataset loaders. return_X_y=True yields a
# (features, target) pair; the target is discarded for every dataset except
# iris, where it is kept as y_raw_iris.
X_raw_diab, _ = load_diabetes(as_frame=True, return_X_y=True)

X_raw_breast_cancer, _ = load_breast_cancer(as_frame=True, return_X_y=True)
X_raw_california, _ = fetch_california_housing(as_frame=True, return_X_y=True)
X_raw_iris, y_raw_iris = load_iris(as_frame=True, return_X_y=True)

# pop_failures.dat from the UCI repository, parsed as a numeric matrix.
# skiprows=1 drops the first line of the file (presumably a header row).
climate_model_samples = np.loadtxt(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows=1,
)
# Wrapped in a DataFrame with default integer column labels.
climate_model_df = pd.DataFrame(climate_model_samples)

# Benchmark tables keyed by a short dataset name. Most entries are downloaded
# from the UCI repository when this cell runs; the rest reuse the scikit-learn
# frames loaded above. Insertion order is the order in which the benchmark loop
# visits the datasets.
raw_datasets = {
    "airfoil": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
        header=None,
        # A literal tab: the previous two-character regex "\\t" matched the same
        # delimiter but forced pandas onto its slower python parsing engine.
        sep="\t",
    ),
    "wine_white": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
        sep=";",
    ),
    # NOTE(review): no header=None here, so the first line of the file is
    # consumed as column names — confirm spambase.data really has a header row.
    "spam": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
    ),
    "california": X_raw_california,
    "sonar": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data",
        header=None,
    ),
    "libras": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data",
        sep=",",
        header=None,
    ),
    # Listed once: the dict literal used to contain a second, identical
    # "parkinsons" entry further down, which downloaded the file again and
    # silently replaced this value.
    "parkinsons": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data",
        sep=",",
    ),
    "blood": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data"
    ),  # prefers LRs
    "bc": X_raw_breast_cancer,  # prefers LR
    # Reading the legacy .xls format needs the xlrd package (see the
    # commented-out pip install at the top of this cell).
    "compression": pd.read_excel(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
    ),
    "slump": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data"
    ),
    "diabetes": X_raw_diab,
    "wine_red": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
        sep=";",
    ),
    "yeast": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data",
        # Raw string: "\s" is not a valid escape sequence in a normal literal.
        sep=r"\s+",
        header=None,
    ),
    "iris": X_raw_iris,
    "ionosphere": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
        sep=",",
        header=None,
    ),
    "credit": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",
        header=None,
    ),
    "letter": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
        header=None,
    ),
}

In [None]:
import sys
from benchmark_imputation import simulate_scenarios
from hyperimpute.plugins.imputers import Imputers
import warnings
import pandas as pd
import hyperimpute.logger as log
from hyperimpute.utils.metrics import print_score, generate_score
from hyperimpute.utils.distributions import enable_reproducible_results

# Presumably fixes the random seeds used by hyperimpute — see
# hyperimpute.utils.distributions to confirm exactly what is seeded.
enable_reproducible_results()

# Plugin registry; the factory functions below call imputers.get(name, **options).
imputers = Imputers()

# Uncomment to send hyperimpute log records to stderr at INFO level.
# log.add(sink=sys.stderr, level="INFO")
# Suppresses every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
from hyperimpute.plugins.utils.metrics import RMSE
from benchmark_imputation import ws_score
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from joblib import Parallel, delayed

# joblib executor limited to two concurrent jobs; used to run the repeated
# evaluations of each dataset.
dispatcher = Parallel(n_jobs=2)

# Result CSVs are written below this directory (relative to the working directory).
output_dir = Path("extras_gain_of_function_results")
# NOTE(review): not referenced anywhere else in the visible notebook.
baseline_regressors = ["linear_regression", "random_forest_regressor"]


def full_hyperimpute():
    """Build a "hyperimpute" plugin with optimizer="hyperband" and select_lazy=False.

    Every other option is left to the defaults applied by ``imputers.get``.
    """
    options = {
        "optimizer": "hyperband",
        "select_lazy": False,
    }
    return imputers.get("hyperimpute", **options)


def automl_constant_columns_change_iterations():
    """Build a "hyperimpute" plugin (hyperband optimizer) that selects per iteration only.

    Passes select_model_by_column=False, select_model_by_iteration=True and
    select_lazy=True.
    """
    options = {
        "optimizer": "hyperband",
        "select_model_by_column": False,
        "select_model_by_iteration": True,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def automl_change_columns_constant_iterations():
    """Build a "hyperimpute" plugin (hyperband optimizer) that selects per column only.

    Passes select_model_by_column=True, select_model_by_iteration=False and
    select_lazy=True.
    """
    options = {
        "optimizer": "hyperband",
        "select_model_by_column": True,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def automl_constant_columns_constant_iterations():
    """Build a "hyperimpute" plugin (hyperband optimizer) with both selection flags off.

    Passes select_model_by_column=False, select_model_by_iteration=False and
    select_lazy=True.
    """
    options = {
        "optimizer": "hyperband",
        "select_model_by_column": False,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def automl_use_only_lr():
    """Build a "hyperimpute" plugin (hyperband optimizer) seeded with linear models only.

    The regression seed list is ["linear_regression"] and the classifier seed
    list is ["logistic_regression"]; both selection flags are False and
    select_lazy is True.
    """
    options = {
        "optimizer": "hyperband",
        "regression_seed": ["linear_regression"],
        "classifier_seed": ["logistic_regression"],
        "select_model_by_column": False,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def automl_use_only_rf():
    """Build a "hyperimpute" plugin (hyperband optimizer) seeded with random forests only.

    The regression seed list is ["random_forest_regressor"] and the classifier
    seed list is ["random_forest"]; both selection flags are False and
    select_lazy is True.
    """
    options = {
        "optimizer": "hyperband",
        "regression_seed": ["random_forest_regressor"],
        "classifier_seed": ["random_forest"],
        "select_model_by_column": False,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def automl_use_only_cb():
    """Build a "hyperimpute" plugin (hyperband optimizer) seeded with CatBoost only.

    The regression seed list is ["catboost_regressor"] and the classifier seed
    list is ["catboost"]; both selection flags are False and select_lazy is
    True.
    """
    options = {
        "optimizer": "hyperband",
        "regression_seed": ["catboost_regressor"],
        "classifier_seed": ["catboost"],
        "select_model_by_column": False,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def no_automl_constant_columns_constant_iterations():
    """Build a "hyperimpute" plugin with optimizer="simple" and both selection flags off.

    Passes select_model_by_column=False, select_model_by_iteration=False and
    select_lazy=True.
    """
    options = {
        "optimizer": "simple",
        "select_model_by_column": False,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def ice_use_only_lr():
    """Build a "hyperimpute" plugin with optimizer="simple", seeded with linear models only.

    The regression seed list is ["linear_regression"] and the classifier seed
    list is ["logistic_regression"]; both selection flags are False and
    select_lazy is True.
    """
    options = {
        "optimizer": "simple",
        "regression_seed": ["linear_regression"],
        "classifier_seed": ["logistic_regression"],
        "select_model_by_column": False,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def ice_use_only_rf():
    """Build a "hyperimpute" plugin with optimizer="simple", seeded with random forests only.

    The regression seed list is ["random_forest_regressor"] and the classifier
    seed list is ["random_forest"]; both selection flags are False and
    select_lazy is True.
    """
    options = {
        "optimizer": "simple",
        "regression_seed": ["random_forest_regressor"],
        "classifier_seed": ["random_forest"],
        "select_model_by_column": False,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def ice_use_only_cb():
    """Build a "hyperimpute" plugin with optimizer="simple", seeded with CatBoost only.

    The regression seed list is ["catboost_regressor"] and the classifier seed
    list is ["catboost"]; both selection flags are False and select_lazy is
    True.
    """
    options = {
        "optimizer": "simple",
        "regression_seed": ["catboost_regressor"],
        "classifier_seed": ["catboost"],
        "select_model_by_column": False,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


def ice_use_only_xgb():
    """Build a "hyperimpute" plugin with optimizer="simple", seeded with XGBoost only.

    The regression seed list is ["xgboost_regressor"] and the classifier seed
    list is ["xgboost"]; both selection flags are False and select_lazy is
    True.
    """
    options = {
        "optimizer": "simple",
        "regression_seed": ["xgboost_regressor"],
        "classifier_seed": ["xgboost"],
        "select_model_by_column": False,
        "select_model_by_iteration": False,
        "select_lazy": True,
    }
    return imputers.get("hyperimpute", **options)


# Scenario label -> imputer instance. Each factory is called exactly once, when
# this cell runs, so the dict holds ready-made instances rather than factories.
# The key order defines the order of the score lists returned by
# evaluate_dataset_gain and of the score columns in the result tables.
evaluated_scenarios = {
    "Full HyperImpute": full_hyperimpute(),
    "constant_columns_change_iterations": automl_constant_columns_change_iterations(),
    "change_columns_constant_iterations": automl_change_columns_constant_iterations(),
    "constant_columns_constant_iterations": automl_constant_columns_constant_iterations(),
    "no_automl_constant_columns_constant_iterations": no_automl_constant_columns_constant_iterations(),
    "seeds_only_lr": automl_use_only_lr(),
    "seeds_only_rf": automl_use_only_rf(),
    "seeds_only_cb": automl_use_only_cb(),
    "ice_lr": ice_use_only_lr(),
    "ice_rf": ice_use_only_rf(),
    "ice_cb": ice_use_only_cb(),
    "ice_xgb": ice_use_only_xgb(),
}


def evaluate_dataset_gain(
    name: str,
    X_raw: pd.DataFrame,
    scenario: str = "MAR",
    missingness: float = 0.3,
    debug: bool = True,
):
    """Score every imputer in ``evaluated_scenarios`` on one simulated setup.

    Args:
        name: Dataset label; only used in the progress output.
        X_raw: Data passed to ``simulate_scenarios`` (with ``column_limit=10``).
        scenario: First-level key into the simulated scenarios.
        missingness: Second-level key into the simulated scenarios.
        debug: Accepted for interface compatibility; not used here.

    Returns:
        A ``(ws_scores, rmse_scores)`` pair of lists holding one value per
        entry of ``evaluated_scenarios``, in that dict's iteration order.

    Raises:
        Any exception raised by the lookup, the imputation or the scoring; a
        "scenario failed" message is printed before it is re-raised.
    """
    simulated = simulate_scenarios(X_raw, column_limit=10)

    ws_scores, rmse_scores = [], []
    try:
        x, x_miss, mask = simulated[scenario][missingness]

        for label, model in evaluated_scenarios.items():
            print("   evaluate ", name, label)

            # Each model gets its own copy of the incomplete data.
            imputed = model.fit_transform(x_miss.copy())

            ws_scores.append(ws_score(imputed, x))
            rmse_scores.append(
                RMSE(np.asarray(imputed), np.asarray(x), np.asarray(mask))
            )

    except BaseException as err:
        print("scenario failed", str(err))
        raise err
    return ws_scores, rmse_scores


# Column layout shared by the four result tables: the dataset name followed by
# one column per entry of `evaluated_scenarios`, in dict order.
out_keys = ["dataset"] + list(evaluated_scenarios.keys())

output_rmse_df = pd.DataFrame([], columns=out_keys)
output_rmse_std_df = pd.DataFrame([], columns=out_keys)
output_ws_df = pd.DataFrame([], columns=out_keys)
output_ws_std_df = pd.DataFrame([], columns=out_keys)


def _summarise_runs(dataset, run_scores):
    """Reduce repeated runs to two single-row frames laid out like ``out_keys``.

    ``run_scores`` holds one list per run, each with one score per scenario.
    Transposing regroups the values by scenario, and ``generate_score`` is
    applied to each group. Element 0 of its result fills the first returned
    frame and element 1 the second (presumably mean and spread — confirm
    against ``hyperimpute.utils.metrics.generate_score``).
    """
    first_parts = []
    second_parts = []
    for scenario_res in np.array(run_scores).T:
        score = generate_score(scenario_res)
        first_parts.append(score[0])
        second_parts.append(score[1])
    return (
        pd.DataFrame([[dataset] + first_parts], columns=out_keys),
        pd.DataFrame([[dataset] + second_parts], columns=out_keys),
    )


for dataset in raw_datasets:
    try:
        print(dataset)
        df = raw_datasets[dataset]

        # Replace object-dtype columns with LabelEncoder integer codes. This
        # writes into the frame stored in raw_datasets, not into a copy.
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = LabelEncoder().fit_transform(df[col])

        scenario = "MAR"
        miss = 0.3

        # Ten repetitions of the same experiment, handed to the joblib dispatcher.
        bench_res = dispatcher(
            delayed(evaluate_dataset_gain)(dataset, df, scenario, miss)
            for i in range(10)
        )

        local_ws_scores = [ws_res for ws_res, _ in bench_res]
        local_rmse_scores = [rmse_res for _, rmse_res in bench_res]

        # DataFrame.append was removed in pandas 2.0; pd.concat is its documented
        # replacement. The tables are extended once per dataset (rather than
        # built after the loop) so that the rows gathered so far survive if a
        # later dataset fails.
        ws_row, ws_std_row = _summarise_runs(dataset, local_ws_scores)
        output_ws_df = pd.concat([output_ws_df, ws_row])
        output_ws_std_df = pd.concat([output_ws_std_df, ws_std_row])

        rmse_row, rmse_std_row = _summarise_runs(dataset, local_rmse_scores)
        output_rmse_df = pd.concat([output_rmse_df, rmse_row])
        output_rmse_std_df = pd.concat([output_rmse_std_df, rmse_std_row])
    except BaseException as e:
        # Report which dataset was being processed, then propagate the original
        # exception with its traceback untouched.
        print("scenario failed", dataset, e)
        raise

In [None]:
output_dir = Path("extras_gain_of_function_results")
# to_csv does not create missing parent directories, so make sure the target
# folder exists before the first export.
output_dir.mkdir(parents=True, exist_ok=True)

# RMSE tables: the first element of each generate_score result, and the second
# element (stored under the "_std" name).
output_rmse_df.to_csv(output_dir / "rmse_eval.csv", index=None)
output_rmse_std_df.to_csv(output_dir / "rmse_std_eval.csv", index=None)

# Bare expression so the notebook renders the table as the cell output.
output_rmse_df

In [None]:
# Export the two tables built from ws_score results. Relies on output_dir having
# been defined by an earlier cell.
output_ws_df.to_csv(output_dir / "ws_eval.csv", index=None)
output_ws_std_df.to_csv(output_dir / "ws_std_eval.csv", index=None)

# Bare expression so the notebook renders the table as the cell output.
output_ws_df

In [None]:
def plot_mean_std(mean_df, std_df):
    """Return a copy of ``mean_df`` whose non-string cells are formatted score strings.

    Despite the name, nothing is plotted. Every cell that is not already a
    string is replaced by ``print_score((value, std))``, where ``std`` is the
    cell at the same position in ``std_df``. In each row, the cell with the
    smallest value among columns 1 onwards is additionally wrapped in ``**``.
    Column 0 is left out of that search; in the tables built in this notebook
    it holds the dataset name.

    Args:
        mean_df: Table of central values; it is not modified.
        std_df: Table with the same layout as ``mean_df``, read positionally.

    Returns:
        A new object-dtype DataFrame with the same shape, index and columns as
        ``mean_df``.
    """
    # Cast to object up front (astype returns a new frame, so mean_df stays
    # untouched). The cells are about to be overwritten with strings, and
    # writing a str into a numeric column relies on implicit upcasting that
    # pandas has deprecated.
    merged = mean_df.astype(object)

    for r in range(merged.shape[0]):
        # Computed before any cell of this row is rewritten; the +1 converts the
        # position within columns 1.. back into a full-row position.
        best_col = np.argmin(merged.iloc[r, :].values[1:]) + 1
        for c in range(merged.shape[1]):
            cell = merged.iloc[r, c]
            if isinstance(cell, str):
                continue

            text = print_score((cell, std_df.iloc[r, c]))
            if c == best_col:
                text = "**" + text + "**"
            merged.iloc[r, c] = text

    return merged


# Shorter column headers for the formatted CSV exports. Scenario names without
# an entry here (the seeds_only_* and ice_* columns) keep their original label
# when this mapping is passed to DataFrame.rename.
rename_cols = {
    "no_automl_constant_columns_constant_iterations": "no_automl",
    "constant_columns_change_iterations": "const_col_ch_iter",
    "change_columns_constant_iterations": "ch_col_const_iter",
    "constant_columns_constant_iterations": "const_col_const_iter",
    "Full HyperImpute": "full",
}

In [None]:
# Formatted RMSE table with shortened column names, written next to the numeric
# exports.
# NOTE(review): unlike the numeric exports, no index argument is passed here, so
# the frame's index is written as an extra first column — confirm that is intended.
plot_mean_std(output_rmse_df, output_rmse_std_df).rename(columns=rename_cols).to_csv(
    output_dir / "rmse_eval_str.csv"
)

In [None]:
# Formatted ws_score table with shortened column names, written next to the
# numeric exports.
# NOTE(review): unlike the numeric exports, no index argument is passed here, so
# the frame's index is written as an extra first column — confirm that is intended.
plot_mean_std(output_ws_df, output_ws_std_df).rename(columns=rename_cols).to_csv(
    output_dir / "ws_eval_str.csv"
)