In [1]:
#!pip install xlrd
import numpy as np
import pandas as pd
import sys
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_iris
import hyperimpute.logger as log

X_raw_diab, _ = load_diabetes(as_frame=True, return_X_y=True)

X_raw_breast_cancer, _ = load_breast_cancer(as_frame=True, return_X_y=True)
X_raw_california, _ = fetch_california_housing(as_frame=True, return_X_y=True)
X_raw_iris, y_raw_iris = load_iris(as_frame = True, return_X_y = True)

climate_model_samples = np.loadtxt(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows=1,
)
climate_model_df = pd.DataFrame(climate_model_samples)

raw_datasets = {
    "airfoil": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
        header=None,
        sep="\\t",
    ),
    "blood": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data"
    ),
    "bc": X_raw_breast_cancer,
    "california": X_raw_california,
    "climate": climate_model_df,
    "compression": pd.read_excel(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
    ),
    "slump": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data"
    ),
    "sonar": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data",
        header=None,
    ),
    "diabetes": X_raw_diab,
    "wine_red": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
        sep=";",
    ),
    "wine_white": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
        sep=";",
    ),
    "yeast": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data",
        sep="\s+",
        header=None,
    ),
    "iris": X_raw_iris,
    "libras":pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data",sep=",", header = None),
    "parkinsons": pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data",sep=","),
    "yacht": pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data",sep="\s+", header = None),
    "ionosphere": pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",sep=",", header = None),
    "letter": pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data", header = None),
    "spam":pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"),
    "credit":pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data", header = None),
}

  return func(*args, **kwargs)


In [2]:
from benchmark_imputation import simulate_scenarios
from hyperimpute.plugins.imputers import Imputers
import warnings
import pandas as pd

imputers = Imputers()

warnings.filterwarnings('ignore')

2022-01-16 21:44:47.097679: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-16 21:44:47.097805: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
def get_imputer():
    return imputers.get("hyperimpute", 
        optimizer = "simple"
    )

def evaluate_dataset(name: str, X_raw: pd.DataFrame, 
    scenarios: list = ["MAR", "MCAR"],
    miss_pct: list = [0.1, 0.3, 0.5, 0.7],
    debug: bool = True,
):
    imputation_scenarios = simulate_scenarios(X_raw, column_limit = 10)

    out = {}
    for scenario in scenarios:
        out[scenario] = {}
        for missingness in miss_pct:
        
            try:
                x, x_miss, mask = imputation_scenarios[scenario][missingness]

                model = get_imputer()
                
                model.fit_transform(x_miss)
                
                mod_names = []
                for mod_idx in model.trace():
                    mod =  model.trace()[mod_idx][0]
                    mod_names.append(mod.name())
                out[scenario][missingness] = mod_names
                print("       > eval ", scenario, missingness, mod_names)
                
            except BaseException as e:
                print("scenario failed", str(e))
                continue
    return out

In [4]:
from sklearn.preprocessing import LabelEncoder

selected_models = {}
for dataset in raw_datasets:
    print("  > eval ", dataset)

    df = raw_datasets[dataset]
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = LabelEncoder().fit_transform(df[col])
    
    selected_models[dataset] = evaluate_dataset(dataset, df)

  > eval  airfoil
       > eval  MAR 0.1 ['xgboost_regressor', 'catboost_regressor', 'xgboost_regressor']
       > eval  MAR 0.3 ['xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor']
       > eval  MAR 0.5 ['xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor']
       > eval  MAR 0.7 ['random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor']
       > eval  MCAR 0.1 ['xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor']
       > eval  MCAR 0.3 ['xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor']
       > eval  MCAR 0.5 ['xgboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor']
       > eval  MCAR 0.7 ['random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor

       > eval  MAR 0.7 ['linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression']
       > eval  MCAR 0.1 ['linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression']
       > eval  MCAR 0.3 ['linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression']
       > eval  MCAR 0.5 ['linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression']
       > eval  MCAR 0.7 ['random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor', 'rando

       > eval  MCAR 0.3 ['catboost', 'random_forest_regressor', 'catboost_regressor', 'xgboost', 'linear_regression', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor']
       > eval  MCAR 0.5 ['neural_nets', 'linear_regression', 'linear_regression', 'catboost', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression', 'linear_regression']
       > eval  MCAR 0.7 ['random_forest_regressor', 'logistic_regression', 'random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor', 'random_forest_regressor', 'logistic_regression', 'random_forest_regressor']
  > eval  iris
       > eval  MAR 0.1 ['random_forest_regressor', 'linear_regression']
       > eval  MAR 0.3 ['random_forest_regressor', 'random_forest_regressor']
       > eval  MAR 0.5 ['linear_regression', 'linear_regression']
       > eval  MAR 0.7 ['linear_re

       > eval  MCAR 0.3 ['xgboost_regressor', 'catboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor']
       > eval  MCAR 0.5 ['xgboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor']
       > eval  MCAR 0.7 ['catboost_regressor', 'xgboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor']
  > eval  spam
       > eval  MAR 0.1 ['catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_regressor']
       > eval  MAR 0.3 ['catboost_regressor', 'random_forest_regressor', 'catboost_regressor', 'catboost_regressor', 'catboost_

In [5]:
import json

with open("general_results/selected_models.json", "w") as f:
    json.dump(selected_models, f)

In [72]:
selected_models

{'airfoil': {'MAR': {0.1: ['xgboost_regressor',
    'catboost_regressor',
    'xgboost_regressor'],
   0.3: ['xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor'],
   0.5: ['xgboost_regressor', 'xgboost_regressor', 'xgboost_regressor'],
   0.7: ['random_forest_regressor',
    'random_forest_regressor',
    'random_forest_regressor']},
  'MCAR': {0.1: ['xgboost_regressor',
    'xgboost_regressor',
    'xgboost_regressor',
    'xgboost_regressor',
    'xgboost_regressor',
    'xgboost_regressor'],
   0.3: ['xgboost_regressor',
    'xgboost_regressor',
    'xgboost_regressor',
    'xgboost_regressor',
    'xgboost_regressor',
    'xgboost_regressor'],
   0.5: ['xgboost_regressor',
    'catboost_regressor',
    'catboost_regressor',
    'catboost_regressor',
    'catboost_regressor',
    'catboost_regressor'],
   0.7: ['random_forest_regressor',
    'random_forest_regressor',
    'random_forest_regressor',
    'random_forest_regressor',
    'random_forest_regressor',
    'random_f

In [79]:
from IPython.display import HTML, display
import tabulate

print_headers = ["Dataset", "0.1 missingness", "0.3 missingness", "0.5 missingness", "0.7 missingness"]


def pretty_models(model_list):
    out = []
    for model in np.unique(model_list):
        if model == "xgboost":
            model = "XGBClassifier"
        elif model == "xgboost_regressor":
            model = "XGBRegressor"
        elif model == "catboost":
            model = "CatboostClassifier"
        elif model == "catboost_regressor":
            model = "CatboostRegressor"
        elif model == "random_forest":
            model = "RFClassifier"
        elif model == "random_forest_regressor":
            model = "RFRegressor"
        elif model == "logistic_regression":
            model = "LogisticRegression"
        elif model == "linear_regression":
            model = "LinearRegression"
        elif model == "neural_nets":
            model = "NNClassifier"
        else:
            raise RuntimeError(model)
        out.append(model)
        
    return sorted(out)


def print_scenario(scenario):
    print_data = []

    for dataset in selected_models:
        local_print_data = [dataset]
        for missingness in selected_models[dataset][scenario]:
            models = '\n'.join(pretty_models(selected_models[dataset][scenario][missingness]))
            local_print_data.append(models)

        print_data.append(local_print_data)

    display(tabulate.tabulate(print_data, headers=print_headers, tablefmt="html"))

In [80]:
print_scenario("MAR")

Dataset,0.1 missingness,0.3 missingness,0.5 missingness,0.7 missingness
airfoil,CatboostRegressor XGBRegressor,XGBRegressor,XGBRegressor,RFRegressor
blood,CatboostRegressor LinearRegression,CatboostRegressor LinearRegression RFRegressor,LinearRegression RFClassifier,LinearRegression
bc,LinearRegression,LinearRegression,LinearRegression,LinearRegression
california,CatboostRegressor XGBRegressor,CatboostRegressor XGBRegressor,CatboostRegressor LinearRegression RFRegressor XGBRegressor,LinearRegression
climate,RFClassifier RFRegressor,RFClassifier RFRegressor,RFRegressor,RFRegressor
compression,CatboostRegressor XGBRegressor,CatboostRegressor,CatboostRegressor LinearRegression RFRegressor XGBRegressor,LinearRegression
slump,LinearRegression,LinearRegression,LinearRegression,LinearRegression
sonar,RFRegressor,RFRegressor,RFRegressor,LinearRegression
diabetes,LinearRegression,LinearRegression RFRegressor,LinearRegression LogisticRegression,LinearRegression LogisticRegression
wine_red,CatboostRegressor,CatboostRegressor,LinearRegression,LinearRegression


In [81]:
print_scenario("MCAR")

Dataset,0.1 missingness,0.3 missingness,0.5 missingness,0.7 missingness
airfoil,XGBRegressor,XGBRegressor,CatboostRegressor XGBRegressor,RFRegressor
blood,CatboostRegressor LinearRegression NNClassifier,CatboostClassifier LinearRegression,RFClassifier RFRegressor,LinearRegression RFClassifier
bc,LinearRegression,LinearRegression,LinearRegression,LinearRegression
california,CatboostRegressor,CatboostRegressor,CatboostRegressor,CatboostRegressor XGBRegressor
climate,LogisticRegression RFClassifier RFRegressor,LogisticRegression RFRegressor,RFClassifier RFRegressor,LogisticRegression RFClassifier RFRegressor
compression,CatboostRegressor,LinearRegression,LinearRegression,LinearRegression
slump,LinearRegression,LinearRegression,LinearRegression,RFRegressor
sonar,RFRegressor,RFRegressor,RFRegressor,RFRegressor
diabetes,LinearRegression LogisticRegression RFRegressor,LinearRegression LogisticRegression,LinearRegression LogisticRegression,LinearRegression LogisticRegression
wine_red,CatboostRegressor XGBRegressor,CatboostRegressor,LinearRegression,LinearRegression


In [89]:
import copy

template = """
\midrule
\multirow{4}{*}{DATASET}  
& $0.1$ & MAR_0.1 & MCAR_0.1  \\
& $0.3$ & MAR_0.3 & MCAR_0.3  \\
& $0.5$ & MAR_0.5 & MCAR_0.5 \\
& $0.7$ & MAR_0.7 & MCAR_0.7 \\

"""

sorted_dfs = sorted(selected_models.keys())
def print_scenarios():
    print_data = []

    for dataset in sorted_dfs:
        latex_val = copy.copy(template)
        
        latex_val = latex_val.replace("DATASET", dataset)
        for scenario in selected_models[dataset]:
            for missingness in selected_models[dataset][scenario]:
                key = f"{scenario}_{missingness}"
                latex_val = latex_val.replace(key, ",".join(pretty_models(selected_models[dataset][scenario][missingness])))
        print(latex_val)
print_scenarios()


\midrule
\multirow{4}{*}{airfoil}  
& $0.1$ & CatboostRegressor,XGBRegressor & XGBRegressor  \
& $0.3$ & XGBRegressor & XGBRegressor  \
& $0.5$ & XGBRegressor & CatboostRegressor,XGBRegressor \
& $0.7$ & RFRegressor & RFRegressor \



\midrule
\multirow{4}{*}{bc}  
& $0.1$ & LinearRegression & LinearRegression  \
& $0.3$ & LinearRegression & LinearRegression  \
& $0.5$ & LinearRegression & LinearRegression \
& $0.7$ & LinearRegression & LinearRegression \



\midrule
\multirow{4}{*}{blood}  
& $0.1$ & CatboostRegressor,LinearRegression & CatboostRegressor,LinearRegression,NNClassifier  \
& $0.3$ & CatboostRegressor,LinearRegression,RFRegressor & CatboostClassifier,LinearRegression  \
& $0.5$ & LinearRegression,RFClassifier & RFClassifier,RFRegressor \
& $0.7$ & LinearRegression & LinearRegression,RFClassifier \



\midrule
\multirow{4}{*}{california}  
& $0.1$ & CatboostRegressor,XGBRegressor & CatboostRegressor  \
& $0.3$ & CatboostRegressor,XGBRegressor & CatboostRegressor  \
& $0.5

In [9]:
def get_imputer():
    return imputers.get("hyperimpute", 
        optimizer = "simple",
        n_inner_iter = 20,
        n_outer_iter = 10,
    )

def evaluate_convergence(name: str, X_raw: pd.DataFrame, 
    scenarios: list = ["MCAR"],
    miss_pct: list = [0.3],
    debug: bool = True,
):
    imputation_scenarios = simulate_scenarios(X_raw, column_limit = 10)

    out = {}
    for scenario in scenarios:
        out[scenario] = {}
        for missingness in miss_pct:
        
            try:
                x, x_miss, mask = imputation_scenarios[scenario][missingness]

                model = get_imputer()
                
                model.fit_transform(x_miss)
                
                trace = model.perf_trace()
                
                max_wait = 0
                
                for mod_idx in trace:
                    if max_wait < len(trace[mod_idx]):
                        max_wait = len(trace[mod_idx])
                        
                for mod_idx in trace:
                    if len(trace[mod_idx]) < max_wait:
                        trace[mod_idx] += [trace[mod_idx][-1]] * (max_wait - len(trace[mod_idx]))
                        
                for mod_idx in trace:
                    arr = np.asarray(trace[mod_idx])
                    if arr[0] > 0:
                        arr = 1 - arr
                    else:
                        arr = -arr
                    
                    print(len(arr), arr)
            except BaseException as e:
                raise e
                print("scenario failed", str(e))
                continue
    return scores

from sklearn.preprocessing import LabelEncoder

dataset = "airfoil"
df = raw_datasets[dataset]
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = LabelEncoder().fit_transform(df[col])
    
evaluate_convergence(dataset, df)

10 [0.01830315 0.0083259  0.0083259  0.0083259  0.0083259  0.0083259  0.0083259  0.0083259  0.0083259  0.0083259 ]
10 [0.02253    0.00252703 0.00022153 0.00022153 0.00022153 0.00022153 0.00022153 0.00022153 0.00022153 0.00022153]
10 [0.0564028 0.0193788 0.001206  0.0009708 0.0009708 0.0009708 0.0009708 0.0009708 0.0009708 0.0009708]
10 [0.30309533 0.22114067 0.13408667 0.087902   0.087902   0.087902   0.087902   0.087902   0.087902   0.087902  ]
10 [0.01588992 0.00049684 0.00005521 0.00005521 0.00005521 0.00002555 0.00002555 0.00002555 0.00002555 0.00002555]
10 [0.0068767  0.0068767  0.0068767  0.00650639 0.00650639 0.00650639 0.00650639 0.00650639 0.00650639 0.00650639]


NameError: name 'scores' is not defined