In [None]:
#!pip install xlrd
import numpy as np
import pandas as pd
import sys
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_iris
import hyperimpute.logger as log

# Feature frames from scikit-learn loaders. With return_X_y=True each loader
# returns an (X, y) pair; the target is discarded everywhere except for iris,
# whose target is kept in y_raw_iris.
X_raw_diab, _ = load_diabetes(return_X_y=True, as_frame=True)
X_raw_breast_cancer, _ = load_breast_cancer(return_X_y=True, as_frame=True)
X_raw_california, _ = fetch_california_housing(return_X_y=True, as_frame=True)
X_raw_iris, y_raw_iris = load_iris(return_X_y=True, as_frame=True)

# Climate model simulation data, fetched from UCI as a whitespace-delimited
# numeric table. The first line of the file is skipped (presumably a header
# row - confirm against the dataset description).
climate_model_samples = np.loadtxt(
    fname="https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows=1,
)
climate_model_df = pd.DataFrame(data=climate_model_samples)

# Root of the UCI machine-learning repository; every remote file below is
# addressed relative to it.
UCI_BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases"

# Benchmark datasets keyed by a short name. Entries are either frames loaded
# earlier in this cell (sklearn / climate) or files downloaded from UCI while
# the dict literal is evaluated. Insertion order is the order in which the
# datasets are processed later on.
raw_datasets = {
    "airfoil": pd.read_csv(
        f"{UCI_BASE_URL}/00291/airfoil_self_noise.dat",
        header=None,
        # A literal tab: the two-character string "\\t" would be treated as a
        # regex and push pandas onto the slower python parsing engine.
        sep="\t",
    ),
    "blood": pd.read_csv(
        f"{UCI_BASE_URL}/blood-transfusion/transfusion.data"
    ),
    "bc": X_raw_breast_cancer,
    "california": X_raw_california,
    "climate": climate_model_df,
    "compression": pd.read_excel(
        f"{UCI_BASE_URL}/concrete/compressive/Concrete_Data.xls"
    ),
    "slump": pd.read_csv(
        f"{UCI_BASE_URL}/concrete/slump/slump_test.data"
    ),
    "sonar": pd.read_csv(
        f"{UCI_BASE_URL}/undocumented/connectionist-bench/sonar/sonar.all-data",
        header=None,
    ),
    "diabetes": X_raw_diab,
    "wine_red": pd.read_csv(
        f"{UCI_BASE_URL}/wine-quality/winequality-red.csv",
        sep=";",
    ),
    "wine_white": pd.read_csv(
        f"{UCI_BASE_URL}/wine-quality/winequality-white.csv",
        sep=";",
    ),
    "yeast": pd.read_csv(
        f"{UCI_BASE_URL}/yeast/yeast.data",
        # Raw string: "\s" is not a valid escape sequence in a plain literal.
        sep=r"\s+",
        header=None,
    ),
    "iris": X_raw_iris,
    "libras": pd.read_csv(
        f"{UCI_BASE_URL}/libras/movement_libras.data",
        sep=",",
        header=None,
    ),
    "parkinsons": pd.read_csv(
        f"{UCI_BASE_URL}/parkinsons/parkinsons.data",
        sep=",",
    ),
    "yacht": pd.read_csv(
        f"{UCI_BASE_URL}/00243/yacht_hydrodynamics.data",
        sep=r"\s+",
        header=None,
    ),
    "ionosphere": pd.read_csv(
        f"{UCI_BASE_URL}/ionosphere/ionosphere.data",
        sep=",",
        header=None,
    ),
    "letter": pd.read_csv(
        f"{UCI_BASE_URL}/letter-recognition/letter-recognition.data",
        header=None,
    ),
    "spam": pd.read_csv(
        f"{UCI_BASE_URL}/spambase/spambase.data",
        # spambase.data ships without a header row (the attribute names live
        # in spambase.names); without header=None the first observation would
        # be consumed as column labels.
        header=None,
    ),
    "credit": pd.read_csv(
        f"{UCI_BASE_URL}/credit-screening/crx.data",
        header=None,
    ),
}

In [None]:
import sys
from benchmark_imputation import simulate_scenarios
from hyperimpute.plugins.imputers import Imputers
import warnings
import pandas as pd
import hyperimpute.logger as log

# Uncomment to send hyperimpute's log output to stderr at INFO level:
# log.add(sink=sys.stderr, level="INFO")

# Shared Imputers instance; plugins are looked up on it via imputers.get(...).
imputers = Imputers()

# Silence every Python warning raised from this point on.
warnings.filterwarnings(action="ignore")

In [None]:
from hyperimpute.plugins.utils.metrics import RMSE
from benchmark_imputation import ws_score
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

# Scenario label -> (classifier_seed, regression_seed).
# Each pair is unpacked by evaluate_dataset_gain and forwarded to get_imputer;
# the labels double as the x-axis tick labels of the heatmaps, so the order of
# the entries is the order of the heatmap columns.
function_scenarios = dict(
    [
        ("LR", ([], ["linear_regression"])),
        ("Optimized LR", (["logistic_regression"], ["linear_regression"])),
        ("RF", ([], ["random_forest_regressor"])),
        ("Optimized RF", (["random_forest"], ["random_forest_regressor"])),
        (
            "LR + RF",
            (
                ["logistic_regression", "random_forest"],
                ["linear_regression", "random_forest_regressor"],
            ),
        ),
        ("HyperImpute", ([], [])),
    ]
)

def get_imputer(
    classifier_seed = None,
    regression_seed = None,
):
    """Build the "hyperimpute" plugin for one function-gain scenario.

    Args:
        classifier_seed: names of the classifier plugins to seed with. When
            empty or None, the "simple" optimizer and class_threshold=0 are
            used; otherwise "hyperband" and class_threshold=20.
        regression_seed: names of the regressor plugins to seed with. When
            empty or None, the plugin is requested with only the optimizer
            set, so neither seed list nor class_threshold is forwarded.

    Returns:
        Whatever ``imputers.get("hyperimpute", ...)`` returns.
    """
    # None replaces the former mutable ``[]`` defaults, and the lists are
    # copied so that neither a shared default object nor the caller's own
    # list is handed to (and possibly retained or mutated by) the plugin.
    classifier_seed = [] if classifier_seed is None else list(classifier_seed)
    regression_seed = [] if regression_seed is None else list(regression_seed)

    if classifier_seed:
        class_threshold = 20
        optimizer = "hyperband"
    else:
        class_threshold = 0
        optimizer = "simple"

    if not regression_seed:
        return imputers.get("hyperimpute",
            optimizer = optimizer,
        )

    return imputers.get("hyperimpute",
        optimizer = optimizer,
        classifier_seed = classifier_seed,
        regression_seed = regression_seed,
        class_threshold = class_threshold,
    )

def evaluate_dataset_gain(name: str, X_raw: pd.DataFrame, 
    scenario: str = "MAR",
    missingness: float = 0.3,
    debug: bool = True,
):
    """Score every entry of ``function_scenarios`` on one simulated dataset.

    ``simulate_scenarios(X_raw, column_limit=10)`` is indexed as
    ``[scenario][missingness]`` to obtain ``(x, x_miss, mask)``. For each
    function scenario an imputer is built with ``get_imputer``, fitted on a
    copy of ``x_miss``, and its output is scored with ``ws_score(xt, x)`` and
    ``RMSE(xt, x, mask)``.

    Args:
        name: dataset label; currently unused by this function.
        X_raw: dataset handed to ``simulate_scenarios``.
        scenario: first-level key into the simulated scenarios.
        missingness: second-level key into the simulated scenarios.
        debug: currently unused by this function.

    Returns:
        ``(ws_scores, rmse_scores)``: two lists with one entry per function
        scenario, in ``function_scenarios`` iteration order.

    Raises:
        Any exception raised during lookup, fitting or scoring is reported on
        stdout and re-raised unchanged.
    """
    imputation_scenarios = simulate_scenarios(X_raw, column_limit = 10)

    ws_scores = []
    rmse_scores = []
    try:
        x, x_miss, mask = imputation_scenarios[scenario][missingness]

        for classifier_seed, regression_seed in function_scenarios.values():
            model = get_imputer(classifier_seed = classifier_seed, regression_seed = regression_seed)

            # Fit on a copy so every scenario starts from the same input.
            xt = model.fit_transform(x_miss.copy())

            distribution_score = ws_score(xt, x)
            rmse_score = RMSE(np.asarray(xt), np.asarray(x), np.asarray(mask))

            ws_scores.append(distribution_score)
            rmse_scores.append(rmse_score)

    except Exception as e:
        # Exception rather than BaseException, so KeyboardInterrupt/SystemExit
        # pass straight through; bare `raise` re-raises the active exception.
        print("scenario failed", str(e))
        raise
    return ws_scores, rmse_scores

output_dir = Path("diagrams/function_gain")
# savefig does not create missing directories, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

# Settings shared by every dataset in this run.
scenario = "MAR"
miss = 0.3
colormap = "BuPu"
x_axis_labels = list(function_scenarios.keys())

for dataset in raw_datasets:
    try:
        df = raw_datasets[dataset]

        # Label-encode object columns. NOTE: this writes into the frame stored
        # in raw_datasets, so the encoding persists for later cells.
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = LabelEncoder().fit_transform(df[col])

        ws_scores, rmse_scores = evaluate_dataset_gain(dataset, df, scenario, miss)

        # Two stacked one-row heatmaps: ws_score on top, RMSE below, with one
        # column per function scenario.
        fig, axs = plt.subplots(2, figsize = (15, 5))

        y_axis_labels = ["$W^2_2$"]
        ax0 = sns.heatmap([ws_scores], ax = axs[0], linewidths=0.1,
                   yticklabels=y_axis_labels,
                   cmap=colormap)
        ax0.set_title(f"{dataset} {scenario} {miss}")

        y_axis_labels = ["RMSE"]
        sns.heatmap([rmse_scores], ax = axs[1], linewidths=0.1,
                    xticklabels=x_axis_labels, yticklabels=y_axis_labels,
                   cmap=colormap)

        axs[1].tick_params(axis='x', rotation=0)
        plt.subplots_adjust(hspace = 0.1)

        plt.savefig(output_dir / f"function_gain_{scenario}_{dataset}_{miss}.png")

        plt.show()
        # Release the figure; otherwise one figure per dataset stays in memory.
        plt.close(fig)
    except Exception as e:
        # Exception rather than BaseException, so KeyboardInterrupt is not
        # reported as a failed scenario; bare `raise` re-raises unchanged.
        print("scenario failed", dataset, e)
        raise