In [1]:
import copy
import os
import time
import random
import sys
import warnings

import numpy as np
import pandas as pd
from typing import Any


from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.distributions import enable_reproducible_results
import hyperimpute.logger as log

from benchmark_imputation import evaluate_dataset_repeated_internal

from IPython.display import HTML, display
import tabulate

import json

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# presumably fixes the random seeds used by hyperimpute — TODO confirm in hyperimpute.utils.distributions
enable_reproducible_results()

# Plugin registry; queried later through `imputers.get(...)`.
imputers = Imputers()
# Send hyperimpute log output to stderr using the "INFO" level.
log.add(sink=sys.stderr, level="INFO")

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
from pathlib import Path

def get_imputer():
    """Return the imputer under evaluation: the "hyperimpute" plugin with optimizer "simple"."""
    plugin_name = "hyperimpute"
    plugin_options = {"optimizer": "simple"}
    return imputers.get(plugin_name, **plugin_options)

def save_results(fname, results):
    """Write `results` as JSON into the tutorial results directory.

    Args:
        fname: Name of the file to create inside the results directory.
        results: JSON-serializable object to store.
    """
    results_dir = Path("tutorial_02_hyperimpute_example_results")
    # Create the directory on first use; later calls simply reuse it.
    results_dir.mkdir(parents=True, exist_ok=True)

    with open(results_dir / fname, 'w') as handle:
        json.dump(results, handle)
        
def evaluate_dataset_repeated(
    name,
    X_raw,
    y,
    ref_methods=["mean", "sklearn_missforest", "sklearn_ice", 
                 "gain", "sinkhorn", "softimpute", "miracle", "miwae"],
    scenarios=["MNAR", "MCAR", "MAR"],
    miss_pct=[0.1, 0.3, 0.5, 0.7],
    n_iter=1,
    debug=False,
):
    """Benchmark the imputer from `get_imputer()` on one dataset and save the outcome.

    Every argument is forwarded unchanged to `evaluate_dataset_repeated_internal`;
    whatever that call returns is written by `save_results` to a file named `name`.
    Nothing is returned.

    Args:
        name: Dataset label; also used as the results file name.
        X_raw: Feature data handed to the benchmark.
        y: Target data handed to the benchmark.
        ref_methods: Names of the reference imputers to compare against.
        scenarios: Missingness scenario names to run.
        miss_pct: Missingness ratios to run.
        n_iter: Forwarded as `n_iter` (presumably the number of repetitions — confirm in benchmark_imputation).
        debug: Forwarded as `debug`.
    """
    # The list defaults above are only read here, never modified.
    benchmark_settings = dict(
        ref_methods=ref_methods,
        scenarios=scenarios,
        miss_pct=miss_pct,
        n_iter=n_iter,
        debug=debug,
    )

    benchmark_output = evaluate_dataset_repeated_internal(
        name=name,
        evaluated_model=get_imputer(),
        X_raw=X_raw,
        y=y,
        **benchmark_settings,
    )

    save_results(name, benchmark_output)

## Sanity check in debug mode

In [3]:
# The airfoil file is tab-separated with no header row. A literal tab ("\t") is
# used instead of the two-character pattern "\\t", which pandas treats as a
# regular expression and therefore parses with the slower Python engine.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat', header = None, sep ="\t")

# Column 5 is used as the target; the remaining columns are the features.
y = df[5]
X_raw = df.drop(columns = [5])

# Minimal configuration: one scenario, one missingness ratio, one iteration and
# two reference methods, with debug enabled.
evaluate_dataset_repeated("airfoil_debug", X_raw, y,
            scenarios =  ["MNAR"],
            ref_methods=["mean", "miracle"],
            debug = True,
            n_iter = 1,
            miss_pct = [0.3]
)

> evaluation trial  0
  > eval  MNAR 0.3


[2022-01-13T12:34:26.576142+0200][353152][INFO]   > BO iter 0
[2022-01-13T12:34:26.969530+0200][353152][INFO]      >>> Column 0 <-- score -0.02207321259644897 <-- Model random_forest_regressor
[2022-01-13T12:34:27.392770+0200][353152][INFO]      >>> Column 1 <-- score -0.02305180517219732 <-- Model catboost_regressor
[2022-01-13T12:34:27.770646+0200][353152][INFO]      >>> Column 2 <-- score -0.03354103691649689 <-- Model catboost_regressor
[2022-01-13T12:34:31.111581+0200][353152][INFO]   > BO iter 1
[2022-01-13T12:34:31.537858+0200][353152][INFO]      >>> Column 3 <-- score -0.04553676254540827 <-- Model catboost_regressor
[2022-01-13T12:34:31.886386+0200][353152][INFO]      >>> Column 4 <-- score -0.0005044679073548755 <-- Model xgboost_regressor
[2022-01-13T12:34:32.296918+0200][353152][INFO]      >>> Column 2 <-- score -0.0019666368204545367 <-- Model catboost_regressor
[2022-01-13T12:34:35.606147+0200][353152][INFO]   > BO iter 2
[2022-01-13T12:34:36.060592+0200][353152][INFO]   

benchmark took  62.69257068634033
RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,miracle
MNAR,0.3,0.212094,0.291261,0.265912




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,miracle
MNAR,0.3,0.0579122,0.368713,0.226537




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,miracle
MNAR,0.3,7.04437,5.78295,7.58695


# Datasets

In [3]:
# Redundant re-import (the first cell already imports `log`); harmless.
import hyperimpute.logger as log

# presumably detaches the stderr sink added during setup so the long benchmark
# runs below do not flood the output — TODO confirm in hyperimpute.logger
log.remove()

|           Dataset          |  Size |
|:--------------------------:|:-----:|
|     airfoil self noise     |  1503 |
|     blood transfusion      |  748  |
|  breast cancer diagnostic  |  569  |
|         california         | 20640 |
|   climate model crashes    |  540  |
|    concrete compression    |  1030 |
|       concrete slump       |  103  |
| connectionist bench sonar  |  208  |
|  connectionist bench vowel |  990  |
|            iris            |  150  |
|      wine quality red      |  1599 |
|     wine quality white     |  4898 |
|            yeast           |  1484 |

## Dataset: UCI Airfoil Self-Noise Data Set

https://archive.ics.uci.edu/ml/datasets/airfoil+self-noise


In [None]:
# Tab-separated file without a header row. A literal tab ("\t") replaces the
# two-character pattern "\\t", which pandas would interpret as a regex and parse
# with the slower Python engine.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat', header = None, sep ="\t")

df

In [None]:
# The final column is the target; everything else is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=last_col)
y = df[last_col]

evaluate_dataset_repeated(name="airfoil", X_raw=X_raw, y=y)

## Dataset: UCI Blood Transfusion Service Center Data Set

https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data

In [None]:
# Load the transfusion table directly from the UCI repository.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data"
)

df

In [None]:
# The final column is the target; everything else is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=last_col)
y = df[last_col]

evaluate_dataset_repeated(name="blood", X_raw=X_raw, y=y)

## Dataset: Breast Cancer Wisconsin (Diagnostic)

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [None]:
from sklearn.datasets import load_breast_cancer

# Request the (features, target) pair as pandas objects.
X_raw, y = load_breast_cancer(return_X_y=True, as_frame=True)

X_raw

In [None]:
# Full default benchmark on the breast cancer data.
evaluate_dataset_repeated(name="bc", X_raw=X_raw, y=y)

## Dataset: California Housing



In [None]:
from sklearn.datasets import fetch_california_housing

# Request the (features, target) pair as pandas objects.
X_raw, y = fetch_california_housing(return_X_y=True, as_frame=True)

X_raw

In [None]:
# Only three reference imputers are compared here — presumably to keep the
# runtime manageable on this much larger dataset (confirm with the authors).
evaluate_dataset_repeated(
    name="california",
    X_raw=X_raw,
    y=y,
    ref_methods=["mean", "sklearn_missforest", "sklearn_ice"],
)

## Dataset: Climate Model Simulation Crashes
https://archive.ics.uci.edu/ml/datasets/climate+model+simulation+crashes

In [None]:
# Read the numeric table, skipping its first line, and wrap it in a DataFrame.
samples = np.loadtxt(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows=1,
)
df = pd.DataFrame(data=samples)

df

In [None]:
# The final column is the target; everything else is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=last_col)
y = df[last_col]

evaluate_dataset_repeated(name="climate_model", X_raw=X_raw, y=y)

## Concrete Compressive Strength Data Set
https://archive.ics.uci.edu/ml/datasets/concrete+compressive+strength

In [None]:
# The compressive strength data is published as an Excel workbook.
df = pd.read_excel(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
)

df

In [None]:
# The final column is the target; everything else is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=last_col)
y = df[last_col]

evaluate_dataset_repeated(name="concrete_compressive", X_raw=X_raw, y=y)

## Concrete Slump Test Data Set

https://archive.ics.uci.edu/ml/datasets/concrete+slump+test

In [None]:
# Comma-separated file with a header row (pandas defaults).
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data"
)

df

In [None]:
# The final column is the target. The "No" column is removed from the features
# together with the target.
last_col = df.columns[-1]
X_raw = df.drop(columns=["No", last_col])
y = df[last_col]

# NOTE(review): the result name is spelled "concret_slump"; it is kept as-is so
# the output file name does not change.
evaluate_dataset_repeated(name="concret_slump", X_raw=X_raw, y=y)

## Connectionist Bench (Sonar, Mines vs. Rocks) Data Set

https://archive.ics.uci.edu/ml/datasets/connectionist+bench+(sonar,+mines+vs.+rocks)

In [None]:
# The sonar file has no header row, so columns get integer labels.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data",
    header=None,
)

df

In [None]:
# The final column holds the class label; encode 'M' as 1 and anything else as 0.
last_col = df.columns[-1]
X_raw = df.drop(columns=last_col)
y = df[last_col].eq('M').astype(int)

evaluate_dataset_repeated(name="connectionist_sonar", X_raw=X_raw, y=y)

## Wine-Red dataset

In [None]:
# Wine Quality Data Set (red variant); fields are separated by semicolons.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
    sep=';',
)

df

In [None]:
last_col = df.columns[-1]

# Re-encode the target labels as consecutive integers starting at 0, ordered by
# label value.
mapped_labels = sorted(df[last_col].unique())
mapping = {label: idx for idx, label in enumerate(mapped_labels)}
y = df[last_col].map(mapping)

X_raw = df.drop(columns=last_col)

evaluate_dataset_repeated(name="wine_red", X_raw=X_raw, y=y)

## Wine-White dataset

In [8]:
# White variant of the Wine Quality data; fields are separated by semicolons.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    sep=';',
)

df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [None]:
last_col = df.columns[-1]

# Re-encode the target labels as consecutive integers starting at 0, ordered by
# label value.
mapped_labels = sorted(df[last_col].unique())
mapping = {label: idx for idx, label in enumerate(mapped_labels)}
y = df[last_col].map(mapping)

X_raw = df.drop(columns=last_col)

evaluate_dataset_repeated(name="wine_white", X_raw=X_raw, y=y)

## Yeast Data Set


In [6]:
from sklearn.preprocessing import LabelEncoder

# Raw string for the whitespace regex: in a normal literal "\s" is an invalid
# escape sequence (DeprecationWarning on older Pythons, SyntaxWarning from 3.12).
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data", sep=r"\s+", header = None)

# Column 0 is discarded (presumably a sequence identifier — confirm against the
# dataset description).
df = df.drop(columns = [0])

# Column 9 is the last column and is later used as the target; turn its labels
# into integer codes.
for col in [9]:
    df[col] = LabelEncoder().fit_transform(df[col])

df

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,6
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,6
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,6
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,7
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,6
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,4
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,7
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,4
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,7


In [7]:
# The final column is the target; everything else is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=last_col)
y = df[last_col]

evaluate_dataset_repeated(name="yeast", X_raw=X_raw, y=y)

benchmark took  1753.116238117218
RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute,miracle,miwae
MNAR,0.1,0.125945,0.138525,0.124456,0.124563,0.130307,0.157895,0.140038,0.125815,0.14437
MNAR,0.3,0.122669,0.120924,0.138543,0.474408,0.143974,0.146723,0.14805,0.116743,0.211633
MNAR,0.5,0.143019,0.117326,0.13094,0.390336,0.161258,0.142737,0.199797,0.121093,0.170488
MNAR,0.7,0.131071,0.116965,0.189294,0.129047,0.270225,0.143452,0.243708,0.13265,0.189738
MCAR,0.1,0.102408,0.110117,0.105621,0.102596,0.112878,0.134744,0.117223,0.103615,0.180807
MCAR,0.3,0.12223,0.11964,0.117849,0.114406,0.12425,0.141791,0.14305,0.116015,0.149102
MCAR,0.5,0.227462,0.11546,0.132303,0.15258,0.148788,0.145145,0.166237,0.116336,0.206251
MCAR,0.7,0.123386,0.118576,0.1414,0.121886,0.194473,0.149475,0.227563,0.226918,0.125716
MAR,0.1,0.128327,0.139384,0.130077,0.135631,0.168567,0.152255,0.276507,0.130812,0.241397
MAR,0.3,0.1234,0.132449,0.130485,0.122758,0.135329,0.152172,0.139881,0.126263,0.200659




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute,miracle,miwae
MNAR,0.1,0.0435414,0.0672621,0.0437175,0.0421781,0.0452205,0.0243323,0.0377831,0.0509687,0.0464627
MNAR,0.3,0.0734489,0.17509,0.114883,0.167717,0.176139,0.0706597,0.113106,0.140562,0.198684
MNAR,0.5,0.097977,0.294353,0.139545,0.251447,0.346287,0.126023,0.266176,0.285289,0.187895
MNAR,0.7,0.218443,0.40494,0.310635,0.298272,1.10663,0.175972,0.630418,0.50876,0.248995
MCAR,0.1,0.0376662,0.0590753,0.0411014,0.0379584,0.0438002,0.0204809,0.0355375,0.0471644,0.034616
MCAR,0.3,0.0693771,0.175408,0.0990736,0.103725,0.149387,0.0636714,0.10563,0.148063,0.10094
MCAR,0.5,0.207005,0.289849,0.143499,0.207843,0.329727,0.114058,0.209622,0.269919,0.338447
MCAR,0.7,0.242141,0.404758,0.230241,0.279569,0.596333,0.176029,0.588903,0.768945,0.263097
MAR,0.1,0.0258716,0.0386576,0.0264184,0.0226136,0.0294993,0.0158894,0.037025,0.0286939,0.0193067
MAR,0.3,0.058119,0.0960556,0.0533997,0.056955,0.0821483,0.0385997,0.0484831,0.0703301,0.0552522




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute,miracle,miwae
MNAR,0.1,0.333842,0.538279,-0.408829,-0.163807,-0.0273539,-0.270486,-0.220029,-0.401102,-0.105056
MNAR,0.3,-0.575868,1.09426,0.884959,0.832516,0.136983,0.834459,1.21054,0.529605,-0.644091
MNAR,0.5,1.09163,1.4047,0.546397,0.0979955,0.486862,0.127368,1.48724,1.0671,0.831851
MNAR,0.7,3.75583,3.79441,2.2919,2.81196,1.00333,-0.0093457,-0.304572,2.04517,-0.137345
MCAR,0.1,0.038646,0.345303,0.305334,0.223507,0.272635,-0.300882,0.582269,0.149741,-0.925991
MCAR,0.3,0.086084,1.40627,-0.185991,-0.845075,0.585701,-0.100261,-0.058128,-0.57747,-1.38511
MCAR,0.5,1.22857,3.90463,1.79153,-0.288388,-0.336008,0.210883,2.17748,1.62608,1.43795
MCAR,0.7,4.63469,6.47155,5.22561,1.36868,0.49681,1.15989,1.67849,2.66846,0.505834
MAR,0.1,0.153869,0.819711,0.579431,0.215426,0.249278,0.0124508,0.163121,0.438526,0.115126
MAR,0.3,-0.822019,0.514985,-0.143287,0.205721,-0.0106678,0.438263,0.409718,0.833489,0.01622


## Diabetes


In [4]:
from sklearn.datasets import load_diabetes

# Request the (features, target) pair as pandas objects.
X, y = load_diabetes(return_X_y=True, as_frame=True)

X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [5]:
# Full default benchmark on the diabetes data.
evaluate_dataset_repeated(name="diabetes", X_raw=X, y=y)

benchmark took  1108.6461713314056
RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute,miracle,miwae
MNAR,0.1,0.222286,0.258611,0.207309,0.192466,0.222272,0.317546,0.245276,0.194138,0.350423
MNAR,0.3,0.238452,0.253043,0.215814,0.203783,0.222732,0.289086,0.243985,0.20139,0.273777
MNAR,0.5,0.303333,0.250482,0.240296,0.243825,0.24251,0.267965,0.262982,0.273621,0.267322
MNAR,0.7,0.344291,0.253511,0.276074,0.249656,0.284133,0.249511,0.282974,0.391341,0.277253
MCAR,0.1,0.222616,0.229916,0.205023,0.189248,0.225382,0.268853,0.221326,0.191103,0.247555
MCAR,0.3,0.151096,0.173332,0.142765,0.149044,0.150582,0.148372,0.166595,0.133709,0.207213
MCAR,0.5,0.195569,0.187574,0.173077,0.166205,0.172847,0.166627,0.179487,0.176458,0.286744
MCAR,0.7,0.323852,0.244087,0.261368,0.222087,0.30203,0.286302,0.265912,0.503505,0.263141
MAR,0.1,0.330602,0.316601,0.302709,0.275914,0.295659,0.385305,0.319713,0.276921,0.333016
MAR,0.3,0.126193,0.182665,0.132853,0.125831,0.157598,0.150815,0.163263,0.128699,0.196782




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute,miracle,miwae
MNAR,0.1,0.0469299,0.153246,0.0729738,0.0637556,0.0741792,0.102312,0.0712301,0.0657199,0.136464
MNAR,0.3,0.109953,0.481376,0.237646,0.176171,0.256243,0.263264,0.230681,0.218274,0.254482
MNAR,0.5,0.338342,0.790299,0.272211,0.348411,0.553691,0.399136,0.34895,0.545561,0.544419
MNAR,0.7,0.559856,1.101,0.488051,0.531964,0.776831,0.519599,0.547205,0.927716,0.724568
MCAR,0.1,0.0384297,0.142666,0.0691897,0.0604959,0.0827683,0.0594415,0.0698783,0.0662842,0.0974609
MCAR,0.3,0.0818189,0.334039,0.117347,0.0875486,0.149664,0.102048,0.124006,0.124035,0.124664
MCAR,0.5,0.177118,0.610403,0.200746,0.2027,0.365513,0.223856,0.250454,0.341473,0.230906
MCAR,0.7,0.309347,1.03505,0.354639,0.540964,0.98299,0.566729,0.476189,1.87154,0.567891
MAR,0.1,0.0466857,0.102145,0.0592968,0.0537279,0.0597768,0.0456732,0.0629418,0.0543772,0.0756726
MAR,0.3,0.0576067,0.166087,0.0629178,0.0608146,0.0994516,0.0609499,0.0976604,0.0726027,0.0575837




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute,miracle,miwae
MNAR,0.1,-325.036,-864.114,-339.023,95.8308,-420.316,149.551,200.118,170.937,-256.16
MNAR,0.3,-146.815,119.157,-114.473,171.962,1004.1,-355.237,-276.54,344.523,394.528
MNAR,0.5,1051.91,1059.82,587.53,-35.1043,91.2758,-467.944,-236.901,-735.295,-444.136
MNAR,0.7,639.678,1138.42,811.315,-687.703,737.055,364.014,586.555,776.966,356.153
MCAR,0.1,-121.636,-379.81,-295.321,-599.168,-61.5442,-174.726,301.752,174.434,140.785
MCAR,0.3,-376.672,104.626,-339.398,-540.824,334.725,-135.747,339.069,468.572,34.6182
MCAR,0.5,-132.582,1376.02,373.641,808.288,239.636,388.16,1880.01,503.429,215.095
MCAR,0.7,981.332,4824.0,2133.63,2609.51,1066.92,1663.97,1607.16,2318.03,360.839
MAR,0.1,304.113,-366.073,-299.474,-566.946,9.13203,-195.649,-255.315,110.042,98.1373
MAR,0.3,402.979,276.052,-362.258,791.25,83.7996,284.527,115.899,-869.057,707.78


## Iris


In [None]:
from sklearn.datasets import load_iris

# Request the (features, target) pair as pandas objects.
X, y = load_iris(return_X_y=True, as_frame=True)

X

In [None]:
# Full default benchmark on the iris data.
evaluate_dataset_repeated(name="iris", X_raw=X, y=y)

# Conclusion
