In [1]:
import copy
import os
import time
import random
import sys
import warnings

import numpy as np
import pandas as pd
from typing import Any


from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.distributions import enable_reproducible_results
import hyperimpute.logger as log

from benchmark_imputation import evaluate_dataset_repeated_internal

from IPython.display import HTML, display
import tabulate

# Suppress every Python warning for the rest of the notebook session.
warnings.filterwarnings('ignore')
# NOTE(review): presumably seeds the random generators used by hyperimpute so
# repeated runs give the same numbers — confirm in hyperimpute.utils.distributions.
enable_reproducible_results()

# Shared plugin loader; get_imputer() asks it for the "hyperimpute" plugin.
imputers = Imputers()
# Send hyperimpute log records of level INFO and above to stderr.
log.add(sink=sys.stderr, level="INFO")

In [2]:
def get_imputer():
    """Return the "hyperimpute" plugin from the module-level ``imputers`` loader.

    The plugin is requested with ``optimizer="simple"``; a new lookup is made
    on every call.
    """
    imputer_options = {"optimizer": "simple"}
    return imputers.get("hyperimpute", **imputer_options)

def evaluate_dataset_repeated(
    name,
    X_raw,
    y,
    ref_methods=None,
    scenarios=None,
    miss_pct=None,
    n_iter=2,
    debug=False,
):
    """Benchmark the imputer built by ``get_imputer`` on a single dataset.

    Thin wrapper around ``evaluate_dataset_repeated_internal``: it supplies a
    freshly built imputer as ``evaluated_model`` and forwards every other
    argument unchanged.

    Args:
        name: Dataset label forwarded to the benchmark.
        X_raw: Feature table (the notebook passes pandas DataFrames).
        y: Target values matching the rows of ``X_raw``.
        ref_methods: Names of the reference imputers to compare against.
            Defaults to ``["mean", "missforest", "ice", "gain", "sinkhorn",
            "softimpute"]``.
        scenarios: Missingness scenarios to simulate. Defaults to ``["MNAR"]``.
        miss_pct: Missingness rates to evaluate. Defaults to
            ``[0.1, 0.3, 0.5, 0.7]``.
        n_iter: Forwarded as-is (presumably the number of repeated trials —
            confirm in ``benchmark_imputation``).
        debug: Forwarded as-is.

    Returns:
        Whatever ``evaluate_dataset_repeated_internal`` returns.
    """
    # Default lists are created per call: list literals in the signature would
    # be shared between calls, so any in-place change made downstream would
    # silently leak into every later benchmark run.
    if ref_methods is None:
        ref_methods = ["mean", "missforest", "ice", "gain", "sinkhorn", "softimpute"]
    if scenarios is None:
        scenarios = ["MNAR"]
    if miss_pct is None:
        miss_pct = [0.1, 0.3, 0.5, 0.7]

    return evaluate_dataset_repeated_internal(
        name=name,
        evaluated_model=get_imputer(),
        X_raw=X_raw,
        y=y,
        ref_methods=ref_methods,
        scenarios=scenarios,
        miss_pct=miss_pct,
        n_iter=n_iter,
        debug=debug,
    )


## Sanity check in debug mode

In [3]:
# Quick smoke test: a single trial against the "mean" baseline only, with debug on.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat',
    header=None,
    # Literal tab. A two-character "\\t" is treated by pandas as a regular
    # expression, which forces the slower Python parsing engine.
    sep="\t",
)

# Column 5 is used as the target; columns 0-4 are the features.
y = df[5]
X_raw = df.drop(columns=[5])

evaluate_dataset_repeated(
    "airfoil",
    X_raw,
    y,
    scenarios=["MNAR"],
    ref_methods=["mean"],
    debug=True,
    n_iter=1,
    miss_pct=[0.3, 0.5],
)

[2021-12-28T17:56:04.167172+0200][2160642][INFO]   > BO iter 0


> evaluation trial  0
  > eval  MNAR 0.3


[2021-12-28T17:56:04.803025+0200][2160642][INFO]      >>> Column 0 <-- score -0.022112253926842934 <-- Model random_forest_regressor
[2021-12-28T17:56:05.279050+0200][2160642][INFO]      >>> Column 1 <-- score -0.023227102227457337 <-- Model random_forest_regressor
[2021-12-28T17:56:06.993310+0200][2160642][INFO]      >>> Column 2 <-- score 0.9467684 <-- Model catboost
[2021-12-28T17:56:08.292088+0200][2160642][INFO]      >>> Column 3 <-- score 0.81683 <-- Model catboost
[2021-12-28T17:56:08.725737+0200][2160642][INFO]      >>> Column 4 <-- score -0.01805905811542978 <-- Model catboost_regressor
[2021-12-28T17:56:44.812234+0200][2160642][INFO]   > BO iter 1
[2021-12-28T17:56:46.384266+0200][2160642][INFO]      >>> Column 3 <-- score 0.8890893333333334 <-- Model catboost
[2021-12-28T17:56:46.822382+0200][2160642][INFO]      >>> Column 4 <-- score -0.00010503702123890898 <-- Model xgboost_regressor
[2021-12-28T17:56:48.518586+0200][2160642][INFO]      >>> Column 2 <-- score 0.9957724 <--

  > eval  MNAR 0.5


[2021-12-28T17:59:18.660317+0200][2160642][INFO]      >>> Column 0 <-- score -0.028178478163543193 <-- Model random_forest_regressor
[2021-12-28T17:59:19.216352+0200][2160642][INFO]      >>> Column 1 <-- score -0.028439419642828853 <-- Model random_forest_regressor
[2021-12-28T17:59:20.219195+0200][2160642][INFO]      >>> Column 2 <-- score 0.8449091601065641 <-- Model catboost
[2021-12-28T17:59:20.990599+0200][2160642][INFO]      >>> Column 3 <-- score 0.633037342816909 <-- Model xgboost
[2021-12-28T17:59:21.395366+0200][2160642][INFO]      >>> Column 4 <-- score -0.029780313809634813 <-- Model catboost_regressor
[2021-12-28T18:00:00.016349+0200][2160642][INFO]   > BO iter 1
[2021-12-28T18:00:01.308748+0200][2160642][INFO]      >>> Column 2 <-- score 0.9917646508957128 <-- Model catboost
[2021-12-28T18:00:01.902412+0200][2160642][INFO]      >>> Column 1 <-- score -0.0008081790000816433 <-- Model catboost_regressor
[2021-12-28T18:00:02.930187+0200][2160642][INFO]      >>> Column 3 <-- 

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean
MNAR,0.3,0.217078,0.291261
MNAR,0.5,0.28535,0.293272




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean
MNAR,0.3,0.0670451,0.368713
MNAR,0.5,0.08482,0.581678




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean
MNAR,0.3,7.35963,7.2194
MNAR,0.5,9.48815,23.4791


# Datasets

In [4]:
import hyperimpute.logger as log

# Detach the logger sink(s) so the benchmark cells below print only result tables.
# NOTE(review): assumes remove() without arguments drops every registered sink,
# including the stderr one added in the setup cell — confirm in hyperimpute.logger.
log.remove()

|           Dataset          |  Size |
|:--------------------------:|:-----:|
|     airfoil self noise     |  1503 |
|     blood transfusion      |  748  |
|  breast cancer diagnostic  |  569  |
|         california         | 20640 |
|   climate model crashes    |  540  |
|    concrete compression    |  1030 |
|       concrete slump       |  103  |
| connectionist bench sonar  |  208  |
|  connectionist bench vowel |  990  |
|            iris            |  150  |
|      wine quality red      |  1599 |
|     wine quality white     |  4898 |
|            yeast           |  1484 |

## Dataset: UCI Airfoil Self-Noise Data Set

https://archive.ics.uci.edu/ml/datasets/airfoil+self-noise


In [5]:
# Tab-separated file without a header row, so columns are labelled 0..5.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat',
    header=None,
    # Literal tab. A two-character "\\t" is treated by pandas as a regular
    # expression, which forces the slower Python parsing engine.
    sep="\t",
)

df

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461
...,...,...,...,...,...,...
1498,2500,15.6,0.1016,39.6,0.052849,110.264
1499,3150,15.6,0.1016,39.6,0.052849,109.254
1500,4000,15.6,0.1016,39.6,0.052849,106.604
1501,5000,15.6,0.1016,39.6,0.052849,106.224


In [6]:
# The final column is the target; every column before it is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

# Full benchmark with the default reference imputers and missingness rates.
evaluate_dataset_repeated(name="airfoil", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.144708,0.288164,0.2139,0.252906,0.276345,0.260976,0.491963
MNAR,0.3,0.217241,0.291256,0.256207,0.276267,0.288532,0.278979,0.474374
MNAR,0.5,0.296338,0.289964,0.298987,0.296867,0.305134,0.290346,0.422935
MNAR,0.7,0.340418,0.292197,0.336271,0.349725,0.340133,0.306781,0.438317




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0133248,0.118729,0.057651,0.0656187,0.0933934,0.0459619,0.105292
MNAR,0.3,0.0391352,0.35331,0.150004,0.174236,0.277195,0.157011,0.220076
MNAR,0.5,0.128265,0.59418,0.328645,0.363237,0.443169,0.294438,0.567461
MNAR,0.7,0.336491,0.819319,0.492555,0.523901,0.73967,0.432831,1.07156




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,1.05919,2.05349,1.63053,2.23704,2.33516,1.77581,2.953
MNAR,0.3,4.38635,6.45973,6.9211,8.60349,6.50533,5.91243,7.17626
MNAR,0.5,9.92052,19.4891,13.5252,16.5408,15.0103,15.133,19.3609
MNAR,0.7,23.0555,31.4659,24.5489,26.5661,25.7929,23.6032,40.5113


## Dataset: UCI Blood Transfusion Service Center Data Set

https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data

In [7]:
# Comma-separated file whose first line holds the column names.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data"
)

df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [8]:
# The final column (donation flag) is the target; the rest are features.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

evaluate_dataset_repeated(name="blood", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.164804,0.224422,0.14047,0.15852,0.2265,0.170516,0.181528
MNAR,0.3,0.218616,0.176396,0.125179,0.143957,0.191075,0.17383,0.173564
MNAR,0.5,0.185113,0.171459,0.144833,0.146948,0.18769,0.181997,0.241073
MNAR,0.7,0.300299,0.169033,0.157616,0.155229,0.216624,0.187166,0.229547




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0174071,0.0558912,0.0208889,0.0191985,0.0521654,0.0188856,0.0213952
MNAR,0.3,0.0875289,0.147992,0.0618508,0.0608915,0.129178,0.0545285,0.0838654
MNAR,0.5,0.0962936,0.232759,0.107907,0.104739,0.150373,0.0842868,0.204581
MNAR,0.7,0.375563,0.320429,0.170906,0.193656,0.295661,0.120397,0.317621




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,-0.0026738,0.0,0.0026738,0.0026738,0.0026738,0.0,0.0026738
MNAR,0.3,0.0,0.0026738,-0.0026738,-0.0026738,-0.00534759,0.00534759,0.0
MNAR,0.5,0.00534759,0.00534759,0.0,-0.0026738,0.0,-0.00534759,0.0026738
MNAR,0.7,0.0,-1.38778e-17,0.0,0.0026738,0.0,0.0026738,-2.77556e-17


## Dataset: Breast Cancer Wisconsin (Diagnostic)

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [9]:
from sklearn.datasets import load_breast_cancer

# return_X_y=True splits features from target; as_frame=True returns pandas objects.
X_raw, y = load_breast_cancer(return_X_y=True, as_frame=True)

X_raw

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [10]:
# Full benchmark on the breast-cancer features with the default settings.
evaluate_dataset_repeated(name="bc", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0500823,0.172573,0.0708669,0.0463529,0.0769067,0.0854349,0.109586
MNAR,0.3,0.0489212,0.157164,0.0588115,0.0337353,0.0573996,0.0728904,0.0956118
MNAR,0.5,0.051741,0.158649,0.0637953,0.0387232,0.061998,0.0776887,0.0939525
MNAR,0.7,0.0648283,0.15608,0.0794214,0.0610326,0.105739,0.0935093,0.105789




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.00966045,0.0978985,0.0218317,0.00967066,0.0305486,0.0231261,0.0336695
MNAR,0.3,0.0215591,0.255641,0.0413653,0.0174521,0.05566,0.0519733,0.0811908
MNAR,0.5,0.0469398,0.442698,0.0865996,0.0318441,0.105329,0.109949,0.130773
MNAR,0.7,0.0885137,0.633934,0.144621,0.0848959,0.201956,0.214609,0.173438




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0034965,-0.0034965,-0.0034965,0.0,-0.0034965,0.00699301,0.00699301
MNAR,0.3,0.0034965,0.0034965,-0.0034965,0.0,0.0034965,0.0034965,0.0034965
MNAR,0.5,0.0034965,0.0034965,0.0,0.0034965,0.0,0.00699301,-0.0034965
MNAR,0.7,-0.00699301,0.0034965,0.00699301,-0.0104895,0.0,-0.0034965,0.0034965


## Dataset: California Housing



In [11]:
from sklearn.datasets import fetch_california_housing

# return_X_y=True splits features from target; as_frame=True returns pandas objects.
X_raw, y = fetch_california_housing(return_X_y=True, as_frame=True)

X_raw

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [12]:
# Full benchmark on the California housing features with the default settings.
evaluate_dataset_repeated(name="california", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0850631,0.148011,0.0971206,0.12305,0.155804,0.163337,0.19019
MNAR,0.3,0.119716,0.14626,0.113112,0.127818,0.252883,0.179639,0.203635
MNAR,0.5,0.155559,0.145574,0.134367,0.171084,0.182343,0.186639,0.232857
MNAR,0.7,0.159224,0.146142,0.150149,0.233411,0.196303,0.192536,0.286745




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.00966953,0.0738321,0.0244431,0.022453,0.0622053,0.0316327,0.0451187
MNAR,0.3,0.0550806,0.220222,0.0823236,0.0869749,0.223576,0.110115,0.117346
MNAR,0.5,0.175012,0.364354,0.188968,0.183464,0.288004,0.195871,0.263305
MNAR,0.7,0.19686,0.513229,0.276308,0.354337,0.455993,0.319911,0.659091




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0115255,0.0229007,0.0214946,0.0132075,0.0289336,0.0222988,0.0311324
MNAR,0.3,0.0771492,0.0642283,0.0552563,0.0750979,0.0769797,0.0965283,0.08634
MNAR,0.5,0.14881,0.136758,0.123395,0.131455,0.127111,0.207312,0.151784
MNAR,0.7,0.346278,0.373748,0.24603,0.288346,0.308006,0.37621,0.229418


## Dataset: Climate Model Simulation Crashes
https://archive.ics.uci.edu/ml/datasets/climate+model+simulation+crashes

In [13]:
# skiprows=1 drops the first line of the file (presumably the header row —
# confirm against the dataset description).
samples = np.loadtxt(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows=1,
)
# Wrap the numeric array in a frame; columns get default integer labels.
df = pd.DataFrame(data=samples)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1.0,1.0,0.859036,0.927825,0.252866,0.298838,0.170521,0.735936,0.428325,0.567947,...,0.245675,0.104226,0.869091,0.997518,0.448620,0.307522,0.858310,0.796997,0.869893,0.0
1,1.0,2.0,0.606041,0.457728,0.359448,0.306957,0.843331,0.934851,0.444572,0.828015,...,0.616870,0.975786,0.914344,0.845247,0.864152,0.346713,0.356573,0.438447,0.512256,1.0
2,1.0,3.0,0.997600,0.373238,0.517399,0.504993,0.618903,0.605571,0.746225,0.195928,...,0.679355,0.803413,0.643995,0.718441,0.924775,0.315371,0.250642,0.285636,0.365858,1.0
3,1.0,4.0,0.783408,0.104055,0.197533,0.421837,0.742056,0.490828,0.005525,0.392123,...,0.471463,0.597879,0.761659,0.362751,0.912819,0.977971,0.845921,0.699431,0.475987,1.0
4,1.0,5.0,0.406250,0.513199,0.061812,0.635837,0.844798,0.441502,0.191926,0.487546,...,0.551543,0.743877,0.312349,0.650223,0.522261,0.043545,0.376660,0.280098,0.132283,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,3.0,176.0,0.657136,0.489375,0.133713,0.411950,0.087780,0.356289,0.480204,0.029678,...,0.280546,0.384117,0.885948,0.768482,0.459479,0.334482,0.573002,0.610183,0.737706,1.0
536,3.0,177.0,0.915894,0.842720,0.518947,0.090622,0.336981,0.893576,0.978703,0.674868,...,0.798108,0.353546,0.044796,0.990900,0.347027,0.512499,0.810549,0.593332,0.142565,0.0
537,3.0,178.0,0.478600,0.941185,0.769245,0.950776,0.189406,0.112743,0.745645,0.527096,...,0.193103,0.829563,0.101506,0.548878,0.381966,0.198811,0.867108,0.461632,0.652817,1.0
538,3.0,179.0,0.007793,0.779287,0.867468,0.704820,0.983282,0.420303,0.710612,0.174746,...,0.761134,0.436714,0.690132,0.825133,0.981656,0.113193,0.364799,0.201469,0.536535,1.0


In [14]:
# The final column is the target; every column before it is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

evaluate_dataset_repeated(name="climate_model", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.298743,0.291078,0.298766,0.291174,0.38073,0.326369,0.298671
MNAR,0.3,0.353979,0.306099,0.319163,0.306126,0.368946,0.338511,0.350395
MNAR,0.5,0.355531,0.298662,0.341402,0.298843,0.361791,0.325642,0.323398
MNAR,0.7,0.366916,0.289632,0.36191,0.306556,0.352193,0.305717,0.303755




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.176842,0.201252,0.173079,0.201059,0.0921369,0.118547,0.160969
MNAR,0.3,0.400732,0.606718,0.468939,0.605685,0.293228,0.370513,0.441785
MNAR,0.5,0.496068,1.03484,0.518399,1.03041,0.458838,0.689379,0.765584
MNAR,0.7,0.468083,1.39675,0.495648,1.18787,0.624923,1.04504,1.08678




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0,0,0,0,0,0,0
MNAR,0.3,0,0,0,0,0,0,0
MNAR,0.5,0,0,0,0,0,0,0
MNAR,0.7,0,0,0,0,0,0,0


## Concrete Compressive Strength Data Set
https://archive.ics.uci.edu/ml/datasets/concrete+compressive+strength

In [17]:
# Excel workbook; the first row supplies the column names.
df = pd.read_excel(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
)

df



Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.052780
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.284354
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.178794
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.696601
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.768036


In [18]:
# The final column (compressive strength) is the target; the rest are features.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

evaluate_dataset_repeated(name="concrete_compressive", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.127659,0.229549,0.161401,0.14268,0.21864,0.151985,0.271812
MNAR,0.3,0.14105,0.218032,0.183882,0.173305,0.212105,0.173747,0.274559
MNAR,0.5,0.23432,0.222959,0.215246,0.217602,0.233711,0.211424,0.304984
MNAR,0.7,0.262718,0.22627,0.256473,0.219176,0.262162,0.236142,0.367232




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.02841,0.150003,0.0567961,0.0464503,0.121426,0.03645,0.0936959
MNAR,0.3,0.0752258,0.416005,0.162091,0.151437,0.313251,0.110078,0.194939
MNAR,0.5,0.289068,0.726082,0.282916,0.407049,0.622954,0.268095,0.322919
MNAR,0.7,0.428775,1.02878,0.534083,0.70146,0.85769,0.471369,0.7353




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,12.1776,6.0547,6.64722,3.1711,7.67449,5.63959,11.3421
MNAR,0.3,19.9213,42.8667,34.5156,22.9598,35.301,37.2671,39.9327
MNAR,0.5,62.4502,73.452,68.3492,53.7658,75.1467,50.4143,70.4116
MNAR,0.7,129.838,136.197,151.009,128.801,140.449,137.929,127.668


## Concrete Slump Test Data Set

https://archive.ics.uci.edu/ml/datasets/concrete+slump+test

In [19]:
# Comma-separated file whose first line holds the column names.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data"
)

df

Unnamed: 0,No,Cement,Slag,Fly ash,Water,SP,Coarse Aggr.,Fine Aggr.,SLUMP(cm),FLOW(cm),Compressive Strength (28-day)(Mpa)
0,1,273.0,82.0,105.0,210.0,9.0,904.0,680.0,23.0,62.0,34.99
1,2,163.0,149.0,191.0,180.0,12.0,843.0,746.0,0.0,20.0,41.14
2,3,162.0,148.0,191.0,179.0,16.0,840.0,743.0,1.0,20.0,41.81
3,4,162.0,148.0,190.0,179.0,19.0,838.0,741.0,3.0,21.5,42.08
4,5,154.0,112.0,144.0,220.0,10.0,923.0,658.0,20.0,64.0,26.82
...,...,...,...,...,...,...,...,...,...,...,...
98,99,248.3,101.0,239.1,168.9,7.7,954.2,640.6,0.0,20.0,49.97
99,100,248.0,101.0,239.9,169.1,7.7,949.9,644.1,2.0,20.0,50.23
100,101,258.8,88.0,239.6,175.3,7.6,938.9,646.0,0.0,20.0,50.50
101,102,297.1,40.9,239.9,194.0,7.5,908.9,651.8,27.5,67.0,49.17


In [20]:
# The final column is the target. "No" is a running row number, so it is
# dropped from the features together with the target column.
last_col = df.columns[-1]
X_raw = df.drop(columns=["No", last_col])
y = df[last_col]

# NOTE(review): "concret_slump" looks like a typo for "concrete_slump"; it is
# left untouched because the name is passed through to the benchmark.
evaluate_dataset_repeated(name="concret_slump", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.143125,0.268542,0.202363,0.161628,0.223312,0.208112,0.299692
MNAR,0.3,0.244332,0.287321,0.246143,0.220585,0.294042,0.237259,0.341461
MNAR,0.5,0.282429,0.282655,0.259785,0.25283,0.341443,0.262996,0.363972
MNAR,0.7,0.327394,0.295106,0.330625,0.290618,0.401101,0.286423,0.377579




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0559945,0.182286,0.0856876,0.0638095,0.126284,0.0935587,0.114004
MNAR,0.3,0.210167,0.604646,0.289922,0.269237,0.340178,0.258384,0.349979
MNAR,0.5,0.355165,0.985586,0.369878,0.550043,0.572694,0.501129,0.570151
MNAR,0.7,0.574186,1.36914,0.603596,0.855377,0.810402,0.81221,0.809319




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.200741,3.58583,-0.323835,1.21367,0.953388,3.16196,-0.317518
MNAR,0.3,4.52869,18.7524,14.543,10.5688,13.1498,12.9133,15.8334
MNAR,0.5,7.62541,34.139,23.859,12.553,18.846,7.80416,27.3853
MNAR,0.7,45.2355,42.8609,66.6874,23.4305,28.542,30.5659,43.0757


## Connectionist Bench (Sonar, Mines vs. Rocks) Data Set

https://archive.ics.uci.edu/ml/datasets/connectionist+bench+(sonar,+mines+vs.+rocks)

In [21]:
# Comma-separated file without a header row, so columns are labelled 0..60.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data",
    header=None,
)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.0140,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.2280,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.0180,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.0100,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.0150,0.0085,0.0073,0.0050,0.0044,0.0040,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.0590,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.0110,0.0015,0.0072,0.0048,0.0107,0.0094,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.0187,0.0346,0.0168,0.0177,0.0393,0.1630,0.2028,0.1694,0.2328,0.2684,...,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157,M
204,0.0323,0.0101,0.0298,0.0564,0.0760,0.0958,0.0990,0.1018,0.1030,0.2154,...,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067,M
205,0.0522,0.0437,0.0180,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.0160,0.0029,0.0051,0.0062,0.0089,0.0140,0.0138,0.0077,0.0031,M
206,0.0303,0.0353,0.0490,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048,M


In [22]:
# The final column holds the class letter; encode it as 1 for 'M' and 0 otherwise.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col].eq('M').astype(int)

evaluate_dataset_repeated(name="connectionist_sonar", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.132944,0.236434,0.13402,0.127612,0.226908,0.155026,0.208471
MNAR,0.3,0.105842,0.212061,0.109972,0.0996079,0.203512,0.133231,0.179371
MNAR,0.5,0.117097,0.216269,0.119613,0.106197,0.201091,0.144205,0.186268
MNAR,0.7,0.149001,0.194133,0.13453,0.135913,0.215859,0.157296,0.174844




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0544924,0.144948,0.0561602,0.0461353,0.0595784,0.0641676,0.0995767
MNAR,0.3,0.0983328,0.395963,0.100255,0.0671456,0.119543,0.139126,0.199219
MNAR,0.5,0.161854,0.681462,0.16818,0.0990853,0.161084,0.283811,0.380363
MNAR,0.7,0.301265,0.799918,0.276173,0.225508,0.310164,0.484101,0.493367




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0384615,-0.00961538,-0.00961538,1.38778e-17,-0.0192308,0.0192308,-0.0480769
MNAR,0.3,-0.0192308,0.0,-0.0192308,0.0192308,0.00961538,0.00961538,-0.00961538
MNAR,0.5,0.0,-0.0288462,0.0288462,0.00961538,-0.0288462,0.0384615,0.0192308
MNAR,0.7,-0.00961538,0.00961538,0.00961538,0.0288462,-1.38778e-17,0.00961538,-0.00961538


## Wine-Red dataset

In [23]:
# Wine Quality Data Set (red wines): semicolon-separated, header in the first line.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
    sep=";",
)

df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [24]:
last_col = df.columns[-1]

# Re-index the distinct target values to consecutive integers 0..k-1,
# keeping their sorted order.
mapped_labels = sorted(df[last_col].unique())
mapping = {label: idx for idx, label in enumerate(mapped_labels)}
y = df[last_col].map(mapping)

X_raw = df.drop(columns=[last_col])

evaluate_dataset_repeated(name="wine_red", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0793192,0.142182,0.0993018,0.0938755,0.119726,0.101626,0.131569
MNAR,0.3,0.10228,0.13478,0.103629,0.108172,0.131254,0.105966,0.140035
MNAR,0.5,0.115014,0.138227,0.118793,0.11813,0.14564,0.117033,0.145233
MNAR,0.7,0.1393,0.143592,0.143854,0.125666,0.146758,0.134211,0.164857




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0155575,0.0817115,0.0313744,0.0208089,0.0491792,0.0197792,0.0407482
MNAR,0.3,0.0476172,0.240676,0.0877433,0.0581179,0.159379,0.0665156,0.115418
MNAR,0.5,0.0893502,0.40958,0.155191,0.128161,0.272691,0.13666,0.200833
MNAR,0.7,0.155995,0.612671,0.274585,0.330354,0.447788,0.241305,0.303967




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,-0.0075,-0.015,-0.01125,-0.01125,0.01375,0.01375,0.0125
MNAR,0.3,-0.00625,0.0175,0.01,0.0075,0.02625,0.0075,0.02625
MNAR,0.5,0.0275,0.045,0.03125,-0.01875,0.05125,0.035,0.0325
MNAR,0.7,0.07375,0.0875,0.03,-0.02125,0.04,0.0925,0.0625


## Wine-White dataset

In [25]:
# White-wine variant of the Wine Quality set: semicolon-separated, header in the first line.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    sep=";",
)

df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [26]:
last_col = df.columns[-1]

# Re-index the distinct target values to consecutive integers 0..k-1,
# keeping their sorted order.
mapped_labels = sorted(df[last_col].unique())
mapping = {label: idx for idx, label in enumerate(mapped_labels)}
y = df[last_col].map(mapping)

X_raw = df.drop(columns=[last_col])

evaluate_dataset_repeated(name="wine_white", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0764606,0.107663,0.0899638,0.081524,0.104303,0.11503,0.11019
MNAR,0.3,0.0826739,0.109283,0.0858944,0.0904319,0.116092,0.111547,0.113623
MNAR,0.5,0.109128,0.116303,0.10763,0.104696,0.128051,0.124692,0.137258
MNAR,0.7,0.115674,0.11002,0.11404,0.108375,0.121911,0.118799,0.128982




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0199282,0.0611466,0.0322991,0.0219552,0.0498956,0.0124373,0.0305315
MNAR,0.3,0.0531883,0.185615,0.0824925,0.05151,0.126319,0.0453742,0.0907645
MNAR,0.5,0.0983222,0.340803,0.145784,0.11738,0.21426,0.0930199,0.149696
MNAR,0.7,0.1149,0.438104,0.226542,0.233775,0.240696,0.15108,0.204872




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,-0.00367347,-0.0155102,-0.0106122,-0.00653061,0.0044898,0.00653061,-0.00367347
MNAR,0.3,0.0187755,0.00816327,0.0126531,0.00734694,0.0102041,0.00122449,0.0167347
MNAR,0.5,0.00857143,-0.0318367,-0.00163265,0.00489796,-0.00857143,0.0134694,0.00408163
MNAR,0.7,0.0395918,0.00530612,0.0155102,0.022449,0.0404082,0.0469388,0.0387755


## Yeast Data Set


In [27]:
from sklearn.preprocessing import LabelEncoder

# Whitespace-delimited file without a header row. The raw string keeps "\s" a
# regex token; in a plain string literal it is an invalid escape sequence that
# Python warns about.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data",
    sep=r"\s+",
    header=None,
)

# Column 0 is discarded before benchmarking (presumably a non-numeric record
# identifier — confirm against the dataset description).
df = df.drop(columns=[0])

# Column 9 is integer-encoded so the target is numeric.
df[9] = LabelEncoder().fit_transform(df[9])

df

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,6
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,6
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,6
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,7
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,6
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,4
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,7
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,4
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,7


In [28]:
# The final column (encoded class) is the target; the rest are features.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

evaluate_dataset_repeated(name="yeast", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0996731,0.112021,0.114111,0.101011,0.14094,0.138608,0.132191
MNAR,0.3,0.113581,0.118736,0.126055,0.817868,0.151548,0.143931,0.150059
MNAR,0.5,0.26021,0.118045,0.174599,0.210869,0.139037,0.146162,0.180206
MNAR,0.7,0.274545,0.117681,0.164879,0.135578,0.140059,0.143889,0.239026




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.035277,0.0568872,0.0421737,0.0373,0.0555351,0.0290176,0.0271993
MNAR,0.3,0.0860163,0.176392,0.102672,0.224053,0.142408,0.0802766,0.11073
MNAR,0.5,0.337209,0.292389,0.222547,0.19814,0.228046,0.139659,0.21813
MNAR,0.7,0.524418,0.410635,0.281003,0.298496,0.289385,0.199691,0.611932




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.549567,-0.316927,-0.00613426,-0.0211455,-0.195356,-0.502976,0.268317
MNAR,0.3,1.30303,0.964024,0.425304,0.331436,0.252578,-0.60134,-0.716556
MNAR,0.5,0.0936687,1.27095,1.56778,0.526099,0.970464,1.35449,0.749686
MNAR,0.7,2.59162,5.73852,4.95898,1.17125,1.6188,2.07922,1.32384


## Diabetes


In [29]:
from sklearn.datasets import load_diabetes

# Fetch the diabetes dataset as pandas objects: `X` holds the features and
# `y` the target.
X, y = load_diabetes(return_X_y=True, as_frame=True)

X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [30]:
# Benchmark on the diabetes features/target using the wrapper's defaults.
evaluate_dataset_repeated(name="diabetes", X_raw=X, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.176864,0.216373,0.175713,0.161813,0.185126,0.214137,0.208031
MNAR,0.3,0.151521,0.182763,0.143738,0.139092,0.166386,0.153483,0.169905
MNAR,0.5,0.227668,0.216416,0.201701,0.226108,0.232758,0.228835,0.215941
MNAR,0.7,0.179144,0.188415,0.184893,0.166721,0.211558,0.178487,0.195383




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0422108,0.132053,0.0615625,0.054641,0.0752719,0.0509197,0.0663924
MNAR,0.3,0.110242,0.358718,0.128995,0.103213,0.21638,0.116605,0.165332
MNAR,0.5,0.171192,0.657878,0.267726,0.27156,0.453477,0.306193,0.302122
MNAR,0.7,0.366252,0.850465,0.349789,0.462821,0.608505,0.352183,0.42478




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,-264.919,136.92,288.093,105.422,21.2783,350.611,91.2088
MNAR,0.3,-257.081,492.632,-61.7086,-224.735,-160.726,310.911,157.233
MNAR,0.5,348.563,1001.38,553.485,628.357,113.858,112.279,609.294
MNAR,0.7,1012.27,2251.0,1534.6,1244.64,1554.91,744.919,1028.25


## Iris


In [31]:
from sklearn.datasets import load_iris

# Fetch the iris dataset as pandas objects: `X` holds the four measurement
# columns and `y` the target.
X, y = load_iris(return_X_y=True, as_frame=True)

X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [32]:
# Benchmark on the iris features/target using the wrapper's defaults.
evaluate_dataset_repeated(name="iris", X_raw=X, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.109464,0.260318,0.126617,0.113694,0.138309,0.12568,0.139134
MNAR,0.3,0.127895,0.28486,0.157074,0.150146,0.203458,0.179286,0.234296
MNAR,0.5,0.220656,0.282144,0.202143,0.196524,0.219567,0.21162,0.302076
MNAR,0.7,0.312318,0.29283,0.23034,0.258736,0.269414,0.25989,0.450384




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0218233,0.0927906,0.0285739,0.0218347,0.0277367,0.0239422,0.0315751
MNAR,0.3,0.0534257,0.266748,0.066535,0.0594183,0.144287,0.0842251,0.10803
MNAR,0.5,0.150529,0.486255,0.139007,0.165694,0.175254,0.205944,0.336353
MNAR,0.7,0.415498,0.692087,0.304326,0.479452,0.373021,0.376784,0.854559




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0,0.0131579,-0.0131579,0.0,0.0,0.0,0.0131579
MNAR,0.3,0.0,0.0131579,0.0,0.0,0.0263158,-0.0131579,0.0131579
MNAR,0.5,0.0921053,0.0657895,0.0394737,0.0131579,0.0394737,0.0,0.0131579
MNAR,0.7,0.0789474,0.0394737,0.0657895,0.144737,0.0394737,0.0394737,0.0657895


# Conclusion
