In [1]:
import copy
import os
import random
import sys
import time
import warnings
from typing import Any

import numpy as np
import pandas as pd
import tabulate
from IPython.display import HTML, display
from sklearn.datasets import fetch_california_housing, load_breast_cancer

import hyperimpute.logger as log
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.distributions import enable_reproducible_results

from benchmark_imputation import evaluate_dataset_repeated_internal

# Suppress every Python warning from this point on.
warnings.filterwarnings('ignore')
# NOTE(review): presumably fixes the random seeds used by hyperimpute so runs
# are repeatable — confirm in hyperimpute.utils.distributions.
enable_reproducible_results()

# Plugin registry; get_imputer() looks up the evaluated model through it.
imputers = Imputers()
# Send hyperimpute log records at level INFO to stderr.
log.add(sink=sys.stderr, level="INFO")

In [2]:
def get_imputer():
    """Build the imputer under evaluation.

    Looks up the ``"hyperimpute"`` plugin in the module-level ``imputers``
    registry and requests it with ``optimizer="simple"``.

    Returns:
        Whatever ``imputers.get`` returns for that plugin/configuration.
    """
    plugin_options = {"optimizer": "simple"}
    return imputers.get("hyperimpute", **plugin_options)

# Benchmark defaults, stored as immutable tuples so that no call can alter
# them; every call hands fresh list copies to the benchmark implementation.
_DEFAULT_REF_METHODS = ("mean", "sklearn_missforest", "sklearn_ice", "gain", "sinkhorn", "softimpute")
_DEFAULT_SCENARIOS = ("MNAR", "MCAR", "MAR")
_DEFAULT_MISS_PCT = (0.1, 0.3, 0.5, 0.7)


def evaluate_dataset_repeated(
    name,
    X_raw,
    y,
    ref_methods=None,
    scenarios=None,
    miss_pct=None,
    n_iter=2,
    debug=False,
):
    """Benchmark the imputer from ``get_imputer()`` on one dataset.

    Thin wrapper around ``evaluate_dataset_repeated_internal``: it supplies
    the evaluated model and forwards every other argument unchanged.

    Args:
        name: Dataset label forwarded to the benchmark.
        X_raw: Feature table forwarded to the benchmark.
        y: Target forwarded to the benchmark.
        ref_methods: Names of the reference imputers to compare against.
            ``None`` (default) selects mean, sklearn_missforest, sklearn_ice,
            gain, sinkhorn and softimpute.
        scenarios: Missingness scenarios. ``None`` (default) selects
            MNAR, MCAR and MAR.
        miss_pct: Missingness rates. ``None`` (default) selects
            0.1, 0.3, 0.5 and 0.7.
        n_iter: Forwarded as ``n_iter`` (presumably the number of repeated
            trials — confirm in benchmark_imputation).
        debug: Forwarded as ``debug``.

    Returns:
        The return value of ``evaluate_dataset_repeated_internal``.
    """
    # The original signature used list literals as defaults. Python creates a
    # default object once, at definition time, so those lists were shared by
    # every call and handed to external code that could mutate them.
    if ref_methods is None:
        ref_methods = list(_DEFAULT_REF_METHODS)
    if scenarios is None:
        scenarios = list(_DEFAULT_SCENARIOS)
    if miss_pct is None:
        miss_pct = list(_DEFAULT_MISS_PCT)

    return evaluate_dataset_repeated_internal(
        name=name,
        evaluated_model=get_imputer(),
        X_raw=X_raw,
        y=y,
        ref_methods=ref_methods,
        scenarios=scenarios,
        miss_pct=miss_pct,
        n_iter=n_iter,
        debug=debug,
    )


## Sanity check in debug mode

In [3]:
# The airfoil file is tab-separated and has no header row, so pandas labels
# the columns with integers. The separator is a literal tab ("\t"): the
# previous "\\t" is a two-character string that pandas treats as a regular
# expression, which forces the slower Python parsing engine.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat', header = None, sep = "\t")

# Column 5 is used as the prediction target; the other columns are features.
y = df[5]
X_raw = df.drop(columns = [5])

# Reduced run: one scenario, one baseline, a single trial and two
# missingness rates, with debug output enabled.
evaluate_dataset_repeated("airfoil", X_raw, y,
            scenarios =  ["MNAR"],
            ref_methods=["mean"],
            debug = True,
            n_iter = 1,
            miss_pct = [0.3, 0.5]
)

> evaluation trial  0
  > eval  MNAR 0.3


[2022-01-10T14:04:04.087921+0200][892856][INFO]   > BO iter 0
[2022-01-10T14:04:04.681761+0200][892856][INFO]      >>> Column 0 <-- score -0.02205200381533318 <-- Model random_forest_regressor
[2022-01-10T14:04:05.234518+0200][892856][INFO]      >>> Column 1 <-- score -0.024103476543516647 <-- Model catboost_regressor
[2022-01-10T14:04:07.013883+0200][892856][INFO]      >>> Column 2 <-- score 0.9467684 <-- Model catboost
[2022-01-10T14:04:08.220652+0200][892856][INFO]      >>> Column 3 <-- score 0.81683 <-- Model catboost
[2022-01-10T14:04:08.615515+0200][892856][INFO]      >>> Column 4 <-- score -0.01805905811542978 <-- Model catboost_regressor
[2022-01-10T14:04:48.322921+0200][892856][INFO]   > BO iter 1
[2022-01-10T14:04:49.669120+0200][892856][INFO]      >>> Column 3 <-- score 0.9173439999999999 <-- Model catboost
[2022-01-10T14:04:50.102060+0200][892856][INFO]      >>> Column 4 <-- score -0.00014304528998656266 <-- Model xgboost_regressor
[2022-01-10T14:04:51.589785+0200][892856][

  > eval  MNAR 0.5


[2022-01-10T14:07:10.429804+0200][892856][INFO]      >>> Column 0 <-- score -0.028120643458368982 <-- Model random_forest_regressor
[2022-01-10T14:07:10.833265+0200][892856][INFO]      >>> Column 1 <-- score -0.029424511091093154 <-- Model catboost_regressor
[2022-01-10T14:07:11.632924+0200][892856][INFO]      >>> Column 2 <-- score 0.8449091601065641 <-- Model catboost
[2022-01-10T14:07:12.282331+0200][892856][INFO]      >>> Column 3 <-- score 0.633037342816909 <-- Model xgboost
[2022-01-10T14:07:12.658105+0200][892856][INFO]      >>> Column 4 <-- score -0.029780313809634813 <-- Model catboost_regressor
[2022-01-10T14:07:47.406283+0200][892856][INFO]   > BO iter 1
[2022-01-10T14:07:48.345480+0200][892856][INFO]      >>> Column 2 <-- score 0.9954343021586576 <-- Model catboost
[2022-01-10T14:07:48.852982+0200][892856][INFO]      >>> Column 1 <-- score -0.0007007482960861136 <-- Model xgboost_regressor
[2022-01-10T14:07:49.661525+0200][892856][INFO]      >>> Column 3 <-- score 0.8357438

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean
MNAR,0.3,0.215636,0.291261
MNAR,0.5,0.28744,0.293272




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean
MNAR,0.3,0.049091,0.368713
MNAR,0.5,0.0767515,0.581678




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean
MNAR,0.3,5.68334,6.11345
MNAR,0.5,15.4386,17.8344


# Datasets

In [4]:
# `log` is already imported in the notebook's first cell, so it is not
# re-imported here.
# NOTE(review): presumably this detaches the stderr sink registered during
# setup, so the benchmark runs below do not print INFO logs — confirm
# against hyperimpute.logger.
log.remove()

|           Dataset          |  Size |
|:--------------------------:|:-----:|
|     airfoil self noise     |  1503 |
|     blood transfusion      |  748  |
|  breast cancer diagnostic  |  569  |
|         california         | 20640 |
|   climate model crashes    |  540  |
|    concrete compression    |  1030 |
|       concrete slump       |  103  |
| connectionist bench sonar  |  208  |
|  connectionist bench vowel |  990  |
|            iris            |  150  |
|      wine quality red      |  1599 |
|     wine quality white     |  4898 |
|            yeast           |  1484 |

## Dataset: UCI Airfoil Self-Noise Data Set

https://archive.ics.uci.edu/ml/datasets/airfoil+self-noise


In [5]:
# Tab-separated file without a header row. A literal tab ("\t") is used as
# the separator: the previous "\\t" is a two-character string that pandas
# treats as a regular expression, which forces the slower Python parsing
# engine.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat', header = None, sep = "\t")

df

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461
...,...,...,...,...,...,...
1498,2500,15.6,0.1016,39.6,0.052849,110.264
1499,3150,15.6,0.1016,39.6,0.052849,109.254
1500,4000,15.6,0.1016,39.6,0.052849,106.604
1501,5000,15.6,0.1016,39.6,0.052849,106.224


In [6]:
# The final column is the prediction target; every other column is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns = last_col)
y = df[last_col]

# Benchmark with the default baselines, scenarios and missingness rates.
evaluate_dataset_repeated(name = "airfoil", X_raw = X_raw, y = y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0986067,0.272058,0.186676,0.227327,0.250581,0.247976,0.436362
MNAR,0.3,0.215678,0.290283,0.250333,0.277035,0.2893,0.285903,0.440387
MNAR,0.5,0.279726,0.290034,0.296214,0.295891,0.299723,0.292813,0.437269
MNAR,0.7,0.37544,0.290367,0.344691,0.378536,0.313747,0.301324,0.441572
MCAR,0.1,0.148841,0.295521,0.219529,0.250741,0.306174,0.288508,0.366133
MCAR,0.3,0.210631,0.288045,0.250799,0.262798,0.268198,0.279614,0.385917
MCAR,0.5,0.286537,0.287997,0.297014,0.282577,0.302108,0.29154,0.436472
MCAR,0.7,0.379508,0.288652,0.342933,0.305802,0.297301,0.303154,0.437688
MAR,0.1,0.0983867,0.308118,0.240904,0.280889,0.334779,0.284804,0.478445
MAR,0.3,0.14679,0.27941,0.245977,0.246446,0.25488,0.249318,0.406816




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0103882,0.11047,0.0500276,0.0581412,0.0835655,0.0387163,0.100338
MNAR,0.3,0.04038,0.364479,0.18494,0.192564,0.274264,0.162139,0.469811
MNAR,0.5,0.110037,0.581481,0.28756,0.336475,0.441395,0.29108,0.750241
MNAR,0.7,0.365855,0.823487,0.533157,0.565133,0.607638,0.445415,0.777694
MCAR,0.1,0.0113089,0.12084,0.0587849,0.0704895,0.0895503,0.05022,0.0615458
MCAR,0.3,0.0522155,0.342383,0.155944,0.176834,0.270703,0.150233,0.176548
MCAR,0.5,0.103163,0.578916,0.271038,0.329172,0.500393,0.276782,0.753827
MCAR,0.7,0.565523,0.812215,0.401302,0.555483,0.581656,0.435361,0.827179
MAR,0.1,0.00450576,0.079268,0.0468246,0.0497269,0.0624417,0.0300007,0.0782933
MAR,0.3,0.020294,0.204405,0.0971576,0.111182,0.128481,0.0787673,0.202243




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,1.29417,2.52297,1.47202,1.84762,1.96122,2.34969,2.47851
MNAR,0.3,5.13616,7.01838,5.04973,7.0159,7.07673,7.36324,6.28799
MNAR,0.5,14.5545,18.9397,12.5652,18.6493,11.0757,14.2694,16.2646
MNAR,0.7,26.6019,31.8432,28.7775,28.0672,29.8481,31.5055,41.1186
MCAR,0.1,0.564624,2.24016,1.28312,1.65849,2.07417,1.62574,2.33165
MCAR,0.3,3.5256,5.58734,7.00772,5.56349,6.83604,5.43883,6.91711
MCAR,0.5,9.38671,16.3566,17.3259,15.3147,16.3353,17.1037,16.9102
MCAR,0.7,25.8197,35.0125,22.24,26.3216,30.8277,24.993,35.7535
MAR,0.1,0.51869,0.616982,0.491072,0.822526,0.918958,1.25038,1.22748
MAR,0.3,2.48678,4.02872,3.73681,3.28798,3.98137,3.51572,3.52378


## Dataset: UCI Blood Transfusion Service Center Data Set

https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data

In [7]:
# Comma-separated file; its first row supplies the column names.
df = pd.read_csv(
    filepath_or_buffer = 'https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data',
)

df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [8]:
# Split off the final column as the target and keep the rest as features.
last_col = df.columns[-1]
X_raw = df.drop(columns = last_col)
y = df[last_col]

evaluate_dataset_repeated(name = "blood", X_raw = X_raw, y = y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.107036,0.191052,0.106587,0.136593,0.133743,0.139856,0.165121
MNAR,0.3,0.181682,0.176675,0.132719,0.145754,0.197329,0.168701,0.1719
MNAR,0.5,0.164543,0.173913,0.14747,0.149937,0.170609,0.17989,0.214656
MNAR,0.7,0.158862,0.173131,0.151286,0.159304,0.178154,0.192607,0.252713
MCAR,0.1,0.0970866,0.155451,0.0997683,0.10482,0.139083,0.155043,0.124951
MCAR,0.3,0.148492,0.165388,0.118234,0.120529,0.1417,0.159256,0.154474
MCAR,0.5,0.169088,0.157143,0.127761,0.129058,0.134831,0.166082,0.176362
MCAR,0.7,0.169216,0.164527,0.160072,0.164768,0.193833,0.186153,0.234159
MAR,0.1,0.164865,0.289819,0.171008,0.237158,0.217491,0.203912,0.230159
MAR,0.3,0.128786,0.202495,0.125144,0.154332,0.158673,0.163129,0.167993




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0156187,0.0567258,0.0216311,0.0177337,0.0330522,0.0208131,0.0254905
MNAR,0.3,0.044267,0.140247,0.0513067,0.0552674,0.101583,0.0466682,0.0784723
MNAR,0.5,0.11399,0.240806,0.112246,0.108117,0.17535,0.0852097,0.203378
MNAR,0.7,0.171779,0.324773,0.214187,0.197793,0.267342,0.138072,0.338153
MCAR,0.1,0.0127873,0.0466441,0.0152332,0.0170337,0.0353856,0.0197545,0.0164807
MCAR,0.3,0.0532218,0.137657,0.0550801,0.0564629,0.0965844,0.0438431,0.062981
MCAR,0.5,0.110908,0.221599,0.0848354,0.0980125,0.103952,0.0719762,0.158793
MCAR,0.7,0.188273,0.322504,0.230888,0.207838,0.295957,0.127567,0.30562
MAR,0.1,0.0140132,0.0374067,0.0147051,0.0146386,0.0269153,0.0150613,0.0179505
MAR,0.3,0.0235338,0.0767404,0.0297305,0.0248873,0.0393886,0.0278266,0.0472658




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0,0.0,-0.0026738,0.0,-0.00802139,0.0026738,0.0026738
MNAR,0.3,-0.0026738,-0.0026738,0.0026738,0.0106952,0.0026738,0.0026738,0.0
MNAR,0.5,-0.00802139,-0.0026738,0.00534759,-0.0026738,0.0026738,0.00534759,0.00802139
MNAR,0.7,0.0,0.0026738,0.0,-0.0026738,0.0160428,0.0026738,0.0026738
MCAR,0.1,0.0,-0.0026738,0.0,0.0,0.0,0.0,0.0
MCAR,0.3,-0.00534759,-0.0026738,-1.38778e-17,0.0026738,0.0,0.0,0.0
MCAR,0.5,0.0026738,0.0,-0.0026738,0.0,0.0,0.0026738,-0.0026738
MCAR,0.7,-0.00534759,-0.0026738,0.0,-0.00802139,0.0,0.0026738,0.0026738
MAR,0.1,-0.00534759,0.0026738,-0.00534759,-0.0026738,0.0026738,0.0,-0.00802139
MAR,0.3,-0.0026738,0.0,0.0,2.77556e-17,0.0,0.0,0.0026738


## Dataset: Breast Cancer Wisconsin (Diagnostic)

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [9]:
# return_X_y=True yields a (features, target) pair; as_frame=True makes the
# features a DataFrame and the target a Series.
X_raw, y = load_breast_cancer(as_frame = True, return_X_y = True)

X_raw

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [10]:
# Benchmark the breast cancer features/target loaded in the previous cell.
evaluate_dataset_repeated(name = "bc", X_raw = X_raw, y = y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0637595,0.163947,0.0787796,0.0557502,0.0896828,0.0849071,0.101623
MNAR,0.3,0.0646503,0.153299,0.0813538,0.0604925,0.0860487,0.0849523,0.104183
MNAR,0.5,0.0644767,0.163802,0.0742579,0.0496812,0.0984978,0.0885914,0.101692
MNAR,0.7,0.0675687,0.153037,0.0732236,0.0519485,0.0890631,0.0890356,0.104894
MCAR,0.1,0.02777,0.150324,0.0477258,0.0279667,0.0472133,0.057807,0.0827789
MCAR,0.3,0.0470322,0.158456,0.0612905,0.0409766,0.0499785,0.0683846,0.0945808
MCAR,0.5,0.0535745,0.141308,0.0673203,0.0467213,0.0573777,0.0744077,0.0948212
MCAR,0.7,0.0806574,0.154039,0.0930907,0.0711052,0.0923276,0.0939902,0.115596
MAR,0.1,0.0405203,0.170915,0.0651997,0.0403405,0.0571908,0.0794754,0.11333
MAR,0.3,0.0518775,0.166173,0.076351,0.0488705,0.076312,0.0931705,0.115604




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0124585,0.0928417,0.0258667,0.0118873,0.029174,0.0247242,0.0339945
MNAR,0.3,0.0298356,0.260124,0.0609565,0.0310944,0.0894322,0.0634569,0.0790714
MNAR,0.5,0.0481931,0.452693,0.0947168,0.039478,0.148972,0.125417,0.117372
MNAR,0.7,0.0875882,0.619413,0.125803,0.0656361,0.206787,0.193557,0.153924
MCAR,0.1,0.00693256,0.0911905,0.0156451,0.00707256,0.0207969,0.0176994,0.0265012
MCAR,0.3,0.0203121,0.294,0.0535445,0.0206046,0.0498083,0.0510044,0.078328
MCAR,0.5,0.0464788,0.407161,0.0777633,0.0432597,0.0840819,0.0924129,0.145384
MCAR,0.7,0.095433,0.658966,0.135722,0.0992739,0.247121,0.187393,0.219553
MAR,0.1,0.00518684,0.0480664,0.0112657,0.00515654,0.00926664,0.0130059,0.0156088
MAR,0.3,0.0121215,0.138208,0.0315389,0.0112894,0.0315419,0.0410869,0.0520661




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.00699301,-0.0034965,3.4694500000000004e-18,-0.00699301,0.0,0.0034965,0.0034965
MNAR,0.3,-0.0034965,0.00699301,-0.0104895,0.0104895,0.0,-0.0034965,-0.0034965
MNAR,0.5,0.0,-0.0034965,-0.0034965,0.00699301,-0.00699301,0.0034965,0.0034965
MNAR,0.7,0.0034965,-3.4694500000000004e-18,0.00699301,0.0104895,0.0104895,0.0034965,0.00699301
MCAR,0.1,-0.00699301,0.0034965,0.0034965,-0.0034965,0.0034965,0.0034965,0.0
MCAR,0.3,0.0034965,0.0,0.0,0.00699301,0.0,0.0,-3.4694500000000004e-18
MCAR,0.5,-0.0034965,0.0,-0.00699301,-0.0034965,-0.00699301,-0.00699301,0.0034965
MCAR,0.7,-0.00699301,0.00699301,0.0034965,0.0,-0.00699301,-0.0034965,0.0034965
MAR,0.1,0.0,-0.0034965,0.0104895,-0.00699301,-0.00699301,-0.0034965,0.0
MAR,0.3,-0.0034965,-0.0034965,0.0034965,-0.0034965,0.0104895,-0.0034965,-0.00699301


## Dataset: California Housing



In [11]:
# return_X_y=True yields a (features, target) pair; as_frame=True makes the
# features a DataFrame and the target a Series.
X_raw, y = fetch_california_housing(as_frame = True, return_X_y = True)

X_raw

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [12]:
# Benchmark the California housing features/target loaded in the previous cell.
evaluate_dataset_repeated(name = "california", X_raw = X_raw, y = y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0915515,0.143948,0.102043,0.145124,0.141932,0.16135,0.193783
MNAR,0.3,0.119835,0.145231,0.115949,0.184623,0.179337,0.17876,0.208331
MNAR,0.5,0.143098,0.151738,0.138369,0.159449,0.192473,0.192777,0.245518
MNAR,0.7,0.167246,0.149685,0.161949,0.368182,0.236329,0.195107,0.26613
MCAR,0.1,0.0841602,0.14543,0.0970414,0.0971789,0.1319,0.160162,0.183931
MCAR,0.3,0.119616,0.14669,0.120585,0.195421,0.183569,0.179778,0.204303
MCAR,0.5,0.140269,0.146237,0.144308,0.138382,0.156179,0.187814,0.227107
MCAR,0.7,0.162331,0.146426,0.174246,0.141734,0.214964,0.193607,0.281409
MAR,0.1,0.0674522,0.160662,0.0867636,0.117405,0.117726,0.166988,0.190475
MAR,0.3,0.0731485,0.152029,0.0879051,0.122508,0.122458,0.1728,0.193063




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0101604,0.0718975,0.0263332,0.0250003,0.064111,0.0317999,0.0461059
MNAR,0.3,0.0295214,0.218148,0.0996353,0.0881546,0.179462,0.0963679,0.113842
MNAR,0.5,0.0948878,0.373089,0.167046,0.177903,0.325065,0.215107,0.301403
MNAR,0.7,0.226915,0.51538,0.386714,0.370503,0.656617,0.339763,0.651661
MCAR,0.1,0.00949899,0.0723175,0.0241912,0.0218338,0.05953,0.0271474,0.0420138
MCAR,0.3,0.0349798,0.217816,0.0847088,0.084525,0.243712,0.0922918,0.123359
MCAR,0.5,0.111286,0.362705,0.198423,0.183595,0.358763,0.176397,0.289495
MCAR,0.7,0.220163,0.509417,0.406713,0.388806,0.595262,0.304925,0.662695
MAR,0.1,0.00442705,0.0437257,0.0126508,0.0108587,0.0280408,0.0192817,0.0287789
MAR,0.3,0.010814,0.112513,0.0380483,0.0311819,0.0766291,0.0713668,0.0741027




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0205503,0.0176833,0.0142691,0.0134756,0.0315728,0.0317001,0.0240688
MNAR,0.3,0.0782913,0.0606707,0.0507076,0.0735444,0.0800056,0.0953938,0.076138
MNAR,0.5,0.133866,0.125778,0.122595,0.135267,0.155998,0.190752,0.154784
MNAR,0.7,0.342732,0.316518,0.274455,0.236819,0.244545,0.375236,0.246133
MCAR,0.1,0.00871884,0.0153014,0.0118297,0.0182376,0.0272195,0.0267989,0.0167541
MCAR,0.3,0.048985,0.0535833,0.054224,0.0694741,0.0523693,0.0872123,0.0732474
MCAR,0.5,0.136253,0.109327,0.116507,0.137226,0.125197,0.154612,0.123697
MCAR,0.7,0.25815,0.305142,0.243458,0.2169,0.253796,0.31976,0.233827
MAR,0.1,0.006111,0.00682104,0.00300369,0.00790716,0.0041553,0.0158152,0.0101721
MAR,0.3,0.0236548,0.0232284,0.0253908,0.036127,0.0324183,0.0479124,0.040395


## Dataset: Climate Model Simulation Crashes
https://archive.ics.uci.edu/ml/datasets/climate+model+simulation+crashes

In [13]:
# Parse the numeric table, skipping the file's first line (skiprows=1), then
# wrap the array in a DataFrame with default integer column labels.
samples = np.loadtxt(
    fname = "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows = 1,
)
df = pd.DataFrame(data = samples)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1.0,1.0,0.859036,0.927825,0.252866,0.298838,0.170521,0.735936,0.428325,0.567947,...,0.245675,0.104226,0.869091,0.997518,0.448620,0.307522,0.858310,0.796997,0.869893,0.0
1,1.0,2.0,0.606041,0.457728,0.359448,0.306957,0.843331,0.934851,0.444572,0.828015,...,0.616870,0.975786,0.914344,0.845247,0.864152,0.346713,0.356573,0.438447,0.512256,1.0
2,1.0,3.0,0.997600,0.373238,0.517399,0.504993,0.618903,0.605571,0.746225,0.195928,...,0.679355,0.803413,0.643995,0.718441,0.924775,0.315371,0.250642,0.285636,0.365858,1.0
3,1.0,4.0,0.783408,0.104055,0.197533,0.421837,0.742056,0.490828,0.005525,0.392123,...,0.471463,0.597879,0.761659,0.362751,0.912819,0.977971,0.845921,0.699431,0.475987,1.0
4,1.0,5.0,0.406250,0.513199,0.061812,0.635837,0.844798,0.441502,0.191926,0.487546,...,0.551543,0.743877,0.312349,0.650223,0.522261,0.043545,0.376660,0.280098,0.132283,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,3.0,176.0,0.657136,0.489375,0.133713,0.411950,0.087780,0.356289,0.480204,0.029678,...,0.280546,0.384117,0.885948,0.768482,0.459479,0.334482,0.573002,0.610183,0.737706,1.0
536,3.0,177.0,0.915894,0.842720,0.518947,0.090622,0.336981,0.893576,0.978703,0.674868,...,0.798108,0.353546,0.044796,0.990900,0.347027,0.512499,0.810549,0.593332,0.142565,0.0
537,3.0,178.0,0.478600,0.941185,0.769245,0.950776,0.189406,0.112743,0.745645,0.527096,...,0.193103,0.829563,0.101506,0.548878,0.381966,0.198811,0.867108,0.461632,0.652817,1.0
538,3.0,179.0,0.007793,0.779287,0.867468,0.704820,0.983282,0.420303,0.710612,0.174746,...,0.761134,0.436714,0.690132,0.825133,0.981656,0.113193,0.364799,0.201469,0.536535,1.0


In [14]:
# The final column is the prediction target; every other column is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns = last_col)
y = df[last_col]

evaluate_dataset_repeated(name = "climate_model", X_raw = X_raw, y = y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.348344,0.299622,0.306973,0.2997,0.374355,0.340907,0.33533
MNAR,0.3,0.303099,0.290407,0.302913,0.290969,0.359613,0.32006,0.297738
MNAR,0.5,0.355093,0.298895,0.335911,0.299032,0.356459,0.32455,0.318874
MNAR,0.7,0.40656,0.297874,0.372265,0.298117,0.371931,0.311861,0.319547
MCAR,0.1,0.298403,0.294878,0.300951,0.294966,0.368697,0.328919,0.303682
MCAR,0.3,0.299506,0.287019,0.297278,0.287092,0.355362,0.320066,0.297813
MCAR,0.5,0.360329,0.29718,0.343941,0.2974,0.3505,0.323381,0.326665
MCAR,0.7,0.388618,0.299781,0.377757,0.300057,0.352285,0.316842,0.320691
MAR,0.1,0.298287,0.294781,0.297903,0.294716,0.363022,0.330395,0.303692
MAR,0.3,0.311131,0.293247,0.30676,0.293322,0.36892,0.319853,0.301395




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.192149,0.224449,0.196111,0.224277,0.126343,0.133588,0.177043
MNAR,0.3,0.44126,0.611357,0.462124,0.603943,0.276301,0.378949,0.48431
MNAR,0.5,0.494457,1.03545,0.56202,1.02968,0.509785,0.69361,0.750685
MNAR,0.7,0.598606,1.42296,0.537782,1.41351,0.577223,1.05806,1.05097
MCAR,0.1,0.184806,0.202217,0.176339,0.201445,0.092429,0.118353,0.161766
MCAR,0.3,0.449507,0.59156,0.467262,0.590814,0.245229,0.357904,0.461514
MCAR,0.5,0.456505,1.02661,0.518165,1.01841,0.498616,0.677539,0.741436
MCAR,0.7,0.435986,1.43918,0.544117,1.42031,0.666806,1.07588,1.09139
MAR,0.1,0.0919412,0.100191,0.0874736,0.0986202,0.0546528,0.0595508,0.0800308
MAR,0.3,0.228389,0.310805,0.243868,0.310441,0.153399,0.204111,0.246639




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0,0,0,0,0,0,0
MNAR,0.3,0,0,0,0,0,0,0
MNAR,0.5,0,0,0,0,0,0,0
MNAR,0.7,0,0,0,0,0,0,0
MCAR,0.1,0,0,0,0,0,0,0
MCAR,0.3,0,0,0,0,0,0,0
MCAR,0.5,0,0,0,0,0,0,0
MCAR,0.7,0,0,0,0,0,0,0
MAR,0.1,0,0,0,0,0,0,0
MAR,0.3,0,0,0,0,0,0,0


## Concrete Compressive Strength Data Set
https://archive.ics.uci.edu/ml/datasets/concrete+compressive+strength

In [29]:
!pip install xlrd

df = pd.read_excel("https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls")

df

Collecting xlrd
  Using cached xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
Installing collected packages: xlrd
Successfully installed xlrd-2.0.1


Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.052780
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.284354
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.178794
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.696601
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.768036


In [30]:
# Split off the final column as the target and keep the rest as features.
last_col = df.columns[-1]
X_raw = df.drop(columns = last_col)
y = df[last_col]

evaluate_dataset_repeated(name = "concrete_compressive", X_raw = X_raw, y = y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0940349,0.22225,0.15215,0.136101,0.200575,0.151814,0.246851
MNAR,0.3,0.194456,0.226681,0.19537,0.187603,0.22084,0.18051,0.288269
MNAR,0.5,0.218036,0.226004,0.229436,0.209854,0.260343,0.213475,0.307882
MNAR,0.7,0.263805,0.227033,0.247898,0.22688,0.283977,0.240279,0.356887
MCAR,0.1,0.100578,0.216655,0.150913,0.12996,0.187544,0.140732,0.255672
MCAR,0.3,0.14756,0.222711,0.184524,0.172764,0.211863,0.178445,0.273807
MCAR,0.5,0.205265,0.223112,0.219509,0.209906,0.215142,0.212105,0.296698
MCAR,0.7,0.27182,0.224851,0.254024,0.220676,0.245867,0.236163,0.341985
MAR,0.1,0.0519031,0.227351,0.141076,0.116302,0.168484,0.12336,0.2554
MAR,0.3,0.103349,0.236091,0.160966,0.153866,0.212644,0.143825,0.291286




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0221071,0.138579,0.0556629,0.0419581,0.0975725,0.0326731,0.0773395
MNAR,0.3,0.125384,0.445556,0.198477,0.17023,0.340145,0.120054,0.211062
MNAR,0.5,0.232955,0.731695,0.335363,0.407201,0.670765,0.259761,0.342324
MNAR,0.7,0.509905,1.03032,0.508757,0.723998,0.954728,0.497416,0.642554
MCAR,0.1,0.0213613,0.141884,0.052611,0.0380386,0.100016,0.0305115,0.0766001
MCAR,0.3,0.0762109,0.437841,0.16915,0.166279,0.34535,0.110086,0.207777
MCAR,0.5,0.23167,0.716386,0.274024,0.417758,0.579133,0.224676,0.303713
MCAR,0.7,0.582292,1.01705,0.426211,0.738687,0.764653,0.404411,0.624289
MAR,0.1,0.00539368,0.0737409,0.026636,0.0170408,0.0443708,0.0144965,0.0384795
MAR,0.3,0.0410947,0.246525,0.0927978,0.0807225,0.130237,0.0553919,0.158797




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,9.41204,9.18873,8.8479,6.2478,18.2295,5.68571,11.2376
MNAR,0.3,35.3349,36.6276,27.9812,27.3018,29.8729,22.6012,30.2076
MNAR,0.5,56.6285,74.6833,68.2607,47.6967,60.7214,65.7633,68.0139
MNAR,0.7,106.766,185.13,114.124,94.2846,95.578,112.562,138.722
MCAR,0.1,9.66555,10.0514,8.54655,10.5808,10.1737,2.36426,6.54345
MCAR,0.3,24.5924,30.0201,25.3201,18.9705,31.3126,21.0862,34.6465
MCAR,0.5,50.0585,81.6305,62.4228,45.5063,45.5098,50.8514,55.9028
MCAR,0.7,129.888,175.059,125.322,95.9453,94.6527,92.144,84.7685
MAR,0.1,-0.540469,4.9828,2.2257,0.946496,3.55901,1.77865,2.43189
MAR,0.3,10.1598,17.6618,5.43349,11.5452,17.9366,19.1186,14.2081


## Concrete Slump Test Data Set

https://archive.ics.uci.edu/ml/datasets/concrete+slump+test

In [31]:
# Comma-separated file; its first row supplies the column names.
df = pd.read_csv(
    filepath_or_buffer = "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data",
)

df

Unnamed: 0,No,Cement,Slag,Fly ash,Water,SP,Coarse Aggr.,Fine Aggr.,SLUMP(cm),FLOW(cm),Compressive Strength (28-day)(Mpa)
0,1,273.0,82.0,105.0,210.0,9.0,904.0,680.0,23.0,62.0,34.99
1,2,163.0,149.0,191.0,180.0,12.0,843.0,746.0,0.0,20.0,41.14
2,3,162.0,148.0,191.0,179.0,16.0,840.0,743.0,1.0,20.0,41.81
3,4,162.0,148.0,190.0,179.0,19.0,838.0,741.0,3.0,21.5,42.08
4,5,154.0,112.0,144.0,220.0,10.0,923.0,658.0,20.0,64.0,26.82
...,...,...,...,...,...,...,...,...,...,...,...
98,99,248.3,101.0,239.1,168.9,7.7,954.2,640.6,0.0,20.0,49.97
99,100,248.0,101.0,239.9,169.1,7.7,949.9,644.1,2.0,20.0,50.23
100,101,258.8,88.0,239.6,175.3,7.6,938.9,646.0,0.0,20.0,50.50
101,102,297.1,40.9,239.9,194.0,7.5,908.9,651.8,27.5,67.0,49.17


In [32]:
# The final column is the target. The features exclude both the target and
# the "No" column.
last_col = df.columns[-1]
X_raw = df.drop(["No", last_col], axis = 1)
y = df[last_col]

# NOTE(review): the label is spelled "concret_slump"; kept byte-for-byte in
# case benchmark results are keyed by it — confirm before renaming.
evaluate_dataset_repeated(name = "concret_slump", X_raw = X_raw, y = y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.130375,0.283323,0.211235,0.134277,0.220507,0.213268,0.30492
MNAR,0.3,0.241477,0.278454,0.236863,0.202408,0.26795,0.238882,0.32237
MNAR,0.5,0.330353,0.298425,0.30807,0.269544,0.346342,0.276316,0.334532
MNAR,0.7,0.346621,0.299631,0.337322,0.284316,0.376743,0.292003,0.403617
MCAR,0.1,0.163037,0.293472,0.248296,0.157655,0.249313,0.236992,0.317033
MCAR,0.3,0.211108,0.278632,0.216435,0.186238,0.268776,0.213614,0.296852
MCAR,0.5,0.309865,0.29566,0.287036,0.255685,0.316697,0.261557,0.367314
MCAR,0.7,0.322832,0.291131,0.345523,0.291846,0.37099,0.292587,0.399494
MAR,0.1,0.119885,0.293494,0.226955,0.102931,0.207582,0.202099,0.282046
MAR,0.3,0.172502,0.293237,0.206328,0.150157,0.221518,0.214266,0.344548




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0461909,0.173773,0.0845788,0.0486956,0.0978614,0.0872435,0.104688
MNAR,0.3,0.208933,0.571832,0.26061,0.226782,0.341554,0.274339,0.33843
MNAR,0.5,0.409014,1.04776,0.399077,0.571993,0.579578,0.52708,0.485203
MNAR,0.7,0.702569,1.38442,0.767783,0.906589,0.92399,0.856881,0.733975
MCAR,0.1,0.0502471,0.186615,0.095609,0.0519445,0.132589,0.093605,0.107195
MCAR,0.3,0.191518,0.559619,0.248085,0.207897,0.34035,0.244101,0.302157
MCAR,0.5,0.359089,0.968491,0.396657,0.537859,0.442039,0.44183,0.503903
MCAR,0.7,0.611101,1.37965,0.614147,0.945749,0.821574,0.795384,0.828595
MAR,0.1,0.0225262,0.0989172,0.0450807,0.0228727,0.0548978,0.0395293,0.0598786
MAR,0.3,0.0937575,0.290612,0.118053,0.0872645,0.122738,0.144169,0.205225




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,7.96247,2.28302,0.573614,3.97118,7.7063,-0.45968,2.30549
MNAR,0.3,-0.808287,8.3421,8.55458,-1.90941,15.9373,5.94932,11.8155
MNAR,0.5,28.0917,21.9629,31.7976,15.4809,19.9786,26.8156,40.1851
MNAR,0.7,63.8402,56.3414,49.5495,72.7562,35.5308,30.4647,45.2678
MCAR,0.1,-1.35237,8.98464,4.06845,2.43222,8.18902,-1.26109,4.61878
MCAR,0.3,5.60905,3.01918,10.2971,0.00573064,6.70799,14.3468,6.59705
MCAR,0.5,19.5933,25.7159,9.53351,39.5015,25.719,27.1808,44.8976
MCAR,0.7,49.2285,44.3687,63.3818,18.0193,51.0445,43.1846,52.4711
MAR,0.1,-0.668643,3.70474,3.52281,1.53253,-0.687184,-0.930021,0.390918
MAR,0.3,2.42668,8.876,3.94847,-3.31228,1.81334,-5.11865,10.9326


## Connectionist Bench (Sonar, Mines vs. Rocks) Data Set

https://archive.ics.uci.edu/ml/datasets/connectionist+bench+(sonar,+mines+vs.+rocks)

In [17]:
# Comma-separated file without a header row, so columns get integer labels.
df = pd.read_csv(
    filepath_or_buffer = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data",
    header = None,
)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.0140,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.2280,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.0180,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.0100,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.0150,0.0085,0.0073,0.0050,0.0044,0.0040,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.0590,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.0110,0.0015,0.0072,0.0048,0.0107,0.0094,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.0187,0.0346,0.0168,0.0177,0.0393,0.1630,0.2028,0.1694,0.2328,0.2684,...,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157,M
204,0.0323,0.0101,0.0298,0.0564,0.0760,0.0958,0.0990,0.1018,0.1030,0.2154,...,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067,M
205,0.0522,0.0437,0.0180,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.0160,0.0029,0.0051,0.0062,0.0089,0.0140,0.0138,0.0077,0.0031,M
206,0.0303,0.0353,0.0490,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048,M


In [18]:
# The class label lives in the final column: 'M' rows become 1, all others 0.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col].eq('M').astype(int)

# Runs with the wrapper's default ref_methods / scenarios / miss_pct / n_iter.
evaluate_dataset_repeated(name="connectionist_sonar", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.11339,0.203983,0.122131,0.112133,0.244101,0.137241,0.191149
MNAR,0.3,0.103983,0.219757,0.108497,0.0960202,0.179494,0.136901,0.181443
MNAR,0.5,0.117793,0.226408,0.120348,0.114148,0.192123,0.153932,0.191956
MNAR,0.7,0.16367,0.213808,0.138275,0.132605,0.238028,0.162024,0.193332
MCAR,0.1,0.103802,0.203918,0.105869,0.0931318,0.184962,0.140971,0.191673
MCAR,0.3,0.112887,0.203704,0.116589,0.106728,0.152549,0.1344,0.175826
MCAR,0.5,0.11053,0.206342,0.109171,0.115628,0.16517,0.131706,0.175777
MCAR,0.7,0.152643,0.203511,0.128906,0.126024,0.224702,0.145341,0.175946
MAR,0.1,0.0915571,0.202557,0.100542,0.0875968,0.163752,0.11411,0.199227
MAR,0.3,0.113281,0.244143,0.118229,0.0974136,0.194051,0.152377,0.215799




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.039141,0.127442,0.0414071,0.0356584,0.0550976,0.0506709,0.0833342
MNAR,0.3,0.102041,0.414358,0.103631,0.0727541,0.0968783,0.162503,0.19904
MNAR,0.5,0.183563,0.74189,0.1774,0.121807,0.194948,0.312559,0.35639
MNAR,0.7,0.306249,0.925297,0.276164,0.219029,0.430423,0.497412,0.592448
MCAR,0.1,0.033652,0.12318,0.0358301,0.0278663,0.0422361,0.0490939,0.0716809
MCAR,0.3,0.12207,0.407299,0.12091,0.0884303,0.0896623,0.153672,0.20827
MCAR,0.5,0.153352,0.635783,0.159491,0.118548,0.128172,0.256794,0.382165
MCAR,0.7,0.209651,0.848601,0.239734,0.175036,0.300417,0.3994,0.457189
MAR,0.1,0.0150432,0.0604385,0.0172931,0.0141212,0.0233292,0.0198877,0.0369629
MAR,0.3,0.0518238,0.237908,0.0521409,0.0360766,0.0692119,0.0946851,0.115355




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,-0.00961538,-0.00961538,-0.0192308,-0.0192308,-0.00961538,-0.00961538,-1.38778e-17
MNAR,0.3,-0.0384615,-0.0192308,-1.38778e-17,0.0192308,0.0384615,0.00961538,0.0384615
MNAR,0.5,0.0192308,-0.0480769,0.0,0.0288462,0.0,-0.00961538,-0.00961538
MNAR,0.7,0.0673077,0.0480769,-0.00961538,0.0288462,-0.00961538,-0.0192308,0.0288462
MCAR,0.1,0.0,-0.0288462,1.38778e-17,0.0384615,0.00961538,-0.0192308,0.0
MCAR,0.3,-0.0288462,-0.0288462,0.00961538,1.38778e-17,-1.38778e-17,0.0192308,0.0384615
MCAR,0.5,-0.00961538,-0.0865385,-0.00961538,0.0384615,-0.00961538,-0.00961538,0.0576923
MCAR,0.7,0.0192308,-0.0480769,-0.0480769,-0.0192308,-0.00961538,-0.00961538,-0.00961538
MAR,0.1,0.0,-1.38778e-17,-0.0192308,0.0288462,-0.00961538,0.0480769,-0.0192308
MAR,0.3,0.0192308,-0.00961538,-0.0192308,0.0192308,0.0192308,-0.0192308,-0.0192308


## Wine-Red dataset

In [19]:
# Wine Quality Data Set (red wines); the CSV uses ';' as its field separator.
wine_red_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(wine_red_url, sep=';')

df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [20]:
# The final column ("quality") is the target; every other column is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])

# Re-index the distinct target values as consecutive integers 0..k-1,
# preserving their ascending order.
mapped_labels = sorted(df[last_col].unique())
mapping = {label: idx for idx, label in enumerate(mapped_labels)}
y = df[last_col].map(mapping)

# Runs with the wrapper's default ref_methods / scenarios / miss_pct / n_iter.
evaluate_dataset_repeated(name="wine_red", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0752039,0.1401,0.098928,0.0959491,0.128485,0.100422,0.130893
MNAR,0.3,0.0987004,0.149223,0.113852,0.111393,0.161089,0.115829,0.147675
MNAR,0.5,0.119887,0.129554,0.110075,0.118754,0.134804,0.109477,0.140377
MNAR,0.7,0.133398,0.138674,0.131645,0.118621,0.143345,0.125157,0.15213
MCAR,0.1,0.0658238,0.124059,0.0870472,0.0807754,0.103502,0.0901291,0.120696
MCAR,0.3,0.0893625,0.131478,0.102247,0.103136,0.117732,0.102584,0.124882
MCAR,0.5,0.107639,0.133288,0.116596,0.133035,0.119456,0.111167,0.13598
MCAR,0.7,0.123825,0.13401,0.137249,0.116828,0.128673,0.121395,0.146836
MAR,0.1,0.0728211,0.130312,0.0934271,0.0929654,0.105813,0.0979824,0.138476
MAR,0.3,0.0760118,0.141953,0.0957656,0.0912189,0.116302,0.0961798,0.138254




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0142913,0.0807123,0.0330839,0.0232251,0.0523492,0.020918,0.0436199
MNAR,0.3,0.0451937,0.271517,0.0961348,0.0692068,0.190704,0.0729256,0.137669
MNAR,0.5,0.101179,0.376945,0.144568,0.114874,0.280272,0.110811,0.166052
MNAR,0.7,0.205393,0.568047,0.193939,0.302736,0.440325,0.213925,0.333902
MCAR,0.1,0.012467,0.0711498,0.0294814,0.0197489,0.0411159,0.0162572,0.0367373
MCAR,0.3,0.047676,0.22644,0.0875655,0.0613968,0.173698,0.0530271,0.125836
MCAR,0.5,0.0912907,0.388815,0.159041,0.139215,0.265805,0.104339,0.187216
MCAR,0.7,0.167877,0.549259,0.215117,0.272314,0.378635,0.194025,0.310065
MAR,0.1,0.00700614,0.0363547,0.0167247,0.0105962,0.0232236,0.0113816,0.0204545
MAR,0.3,0.0182825,0.128698,0.0388142,0.0321971,0.0648991,0.0320215,0.0558386




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.015,-0.0175,-0.0075,0.01,-0.04375,0.01,0.02375
MNAR,0.3,0.01,0.09,0.05375,0.04,0.03875,-0.01125,-0.025
MNAR,0.5,0.02625,0.0275,-0.00875,0.0375,0.0525,0.0125,0.07375
MNAR,0.7,0.0475,0.0475,0.05375,0.03625,0.075,0.02625,0.05625
MCAR,0.1,-0.00375,0.0225,0.0,-0.02,0.00375,0.0125,0.0225
MCAR,0.3,0.01875,0.02625,0.0125,0.00625,0.0,-0.01375,0.0225
MCAR,0.5,0.03375,0.00625,-0.01,0.0025,5.55112e-17,-0.0025,0.00125
MCAR,0.7,0.03875,0.0125,-0.02125,-0.015,0.03375,0.0,0.045
MAR,0.1,-0.00125,0.00125,-0.00125,0.00125,0.0225,0.00375,0.01125
MAR,0.3,0.0175,-0.03125,0.00125,-0.01375,-0.025,0.005,-0.01125


## Wine-White dataset

In [21]:
# Wine Quality Data Set (white wines); the CSV uses ';' as its field separator.
wine_white_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
df = pd.read_csv(wine_white_url, sep=';')

df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [22]:
# The final column ("quality") is the target; every other column is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])

# Map each distinct target value to its rank (0..k-1) among the sorted values.
mapped_labels = sorted(df[last_col].unique())
mapping = dict(zip(mapped_labels, range(len(mapped_labels))))
y = df[last_col].map(mapping)

# Runs with the wrapper's default ref_methods / scenarios / miss_pct / n_iter.
evaluate_dataset_repeated(name="wine_white", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0727354,0.103042,0.0854247,0.0847544,0.107913,0.106509,0.110193
MNAR,0.3,0.0813748,0.108733,0.0847014,0.0831427,0.140518,0.110404,0.114335
MNAR,0.5,0.0944585,0.108495,0.0971849,0.093809,0.12526,0.114727,0.12573
MNAR,0.7,0.119635,0.110697,0.119605,0.0998229,0.113239,0.121258,0.130591
MCAR,0.1,0.0679117,0.102801,0.0813494,0.0723241,0.0973954,0.107067,0.106009
MCAR,0.3,0.0882778,0.117533,0.0964852,0.0930527,0.109951,0.122417,0.128209
MCAR,0.5,0.0965341,0.107996,0.0973386,0.0908435,0.113358,0.114363,0.129853
MCAR,0.7,0.0946295,0.104888,0.0990581,0.0908909,0.163107,0.111558,0.127448
MAR,0.1,0.060646,0.0877463,0.0718315,0.0649366,0.0806419,0.0895635,0.0883445
MAR,0.3,0.0679351,0.0951652,0.0736501,0.0701594,0.0859406,0.0913712,0.0972755




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0159007,0.0592228,0.0297268,0.0200129,0.0408071,0.0111219,0.0318135
MNAR,0.3,0.0457558,0.182691,0.0716587,0.0494507,0.118678,0.0433221,0.0768762
MNAR,0.5,0.0870612,0.304643,0.124173,0.111142,0.205734,0.0859616,0.127245
MNAR,0.7,0.127535,0.440035,0.246507,0.239924,0.328454,0.164552,0.225277
MCAR,0.1,0.0166395,0.0592331,0.0279893,0.0206139,0.0449555,0.0115891,0.0296626
MCAR,0.3,0.0607302,0.206864,0.0841483,0.0611244,0.153677,0.0454123,0.0933889
MCAR,0.5,0.079619,0.306602,0.114975,0.12076,0.245624,0.0767952,0.130842
MCAR,0.7,0.111466,0.412276,0.157383,0.212668,0.492855,0.133099,0.223587
MAR,0.1,0.00544151,0.02536,0.0124135,0.00681189,0.0145951,0.00549712,0.0110102
MAR,0.3,0.018118,0.083792,0.0326016,0.0242433,0.0537354,0.0180017,0.0338188




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.00122449,-0.00571429,0.00653061,0.0044898,-0.00530612,-0.00653061,0.00408163
MNAR,0.3,0.0118367,0.00204082,0.0118367,-0.0044898,0.0130612,0.00693878,0.0187755
MNAR,0.5,0.0293878,0.00816327,-0.000408163,0.0191837,0.0240816,0.0191837,0.0171429
MNAR,0.7,0.0428571,-0.00122449,0.0261224,0.0232653,0.0330612,0.0371429,0.0314286
MCAR,0.1,0.00285714,-0.00244898,-0.00244898,-0.00163265,0.000408163,0.0077551,-0.00204082
MCAR,0.3,-0.00163265,-0.00571429,0.0110204,0.00734694,-0.00571429,0.00979592,0.0122449
MCAR,0.5,0.0126531,0.00489796,0.0,0.00938776,0.0155102,0.0179592,0.0167347
MCAR,0.7,0.037551,-0.0179592,0.0179592,0.0212245,0.0146939,0.035102,0.00979592
MAR,0.1,0.00204082,-0.00734694,0.0077551,-0.00285714,0.0,-0.00367347,0.000816327
MAR,0.3,-0.00122449,-0.00204082,0.00163265,-0.00204082,-0.00244898,0.0122449,-0.000816327


## Yeast Data Set


In [23]:
from sklearn.preprocessing import LabelEncoder

# yeast.data is whitespace-delimited and has no header row.
# The separator is a regex, so it is written as a raw string: in a normal string
# literal "\s" is an invalid escape sequence that newer Python versions warn about.
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data", sep=r"\s+", header=None)

# Column 0 is excluded from the benchmark
# (presumably a per-row identifier; confirm against the dataset description).
df = df.drop(columns=[0])

# Column 9 (the last column, used as the target in the next cell) is converted
# to integer codes.
for col in [9]:
    df[col] = LabelEncoder().fit_transform(df[col])

df

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,6
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,6
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,6
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,7
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,6
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,4
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,7
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,4
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,7


In [24]:
# The final column holds the integer-encoded class label; the rest are features.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

# Runs with the wrapper's default ref_methods / scenarios / miss_pct / n_iter.
evaluate_dataset_repeated(name="yeast", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.117453,0.128456,0.117004,0.11725,0.133013,0.155637,0.133258
MNAR,0.3,0.118561,0.121012,0.119037,1.39098,0.19128,0.146783,0.145906
MNAR,0.5,0.190731,0.120352,0.143276,0.371297,0.159744,0.146622,0.200788
MNAR,0.7,0.198841,0.117189,0.170802,0.129153,0.175309,0.142518,0.240509
MCAR,0.1,0.168633,0.112562,0.103377,0.102249,0.120164,0.143704,0.116916
MCAR,0.3,0.118312,0.117005,0.125661,0.11032,0.121774,0.14208,0.138731
MCAR,0.5,0.237552,0.116969,0.14096,0.150004,0.206444,0.147035,0.166718
MCAR,0.7,0.161953,0.118011,0.144744,0.128346,0.176713,0.144901,0.229307
MAR,0.1,0.125897,0.135135,0.122007,0.126504,0.159721,0.150642,0.199633
MAR,0.3,0.266251,0.130611,0.126043,0.119048,0.136373,0.155481,0.127852




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0397536,0.0632104,0.0413296,0.0397329,0.0555037,0.0286407,0.0367047
MNAR,0.3,0.083123,0.174674,0.101324,0.357128,0.184749,0.0747036,0.105476
MNAR,0.5,0.160801,0.295353,0.14523,0.248863,0.325795,0.139273,0.271506
MNAR,0.7,0.306656,0.40518,0.288229,0.290754,0.510224,0.191372,0.628813
MCAR,0.1,0.0538398,0.0593431,0.0397077,0.0384134,0.0604229,0.0280981,0.033949
MCAR,0.3,0.0742524,0.17586,0.102824,0.103922,0.150473,0.069786,0.106247
MCAR,0.5,0.262739,0.287709,0.178311,0.191461,0.411392,0.122976,0.219374
MCAR,0.7,0.208346,0.408149,0.215432,0.292964,0.486606,0.197344,0.594472
MAR,0.1,0.0221726,0.0355844,0.0236326,0.0222871,0.0274116,0.0152322,0.0277986
MAR,0.3,0.137498,0.0873706,0.0510539,0.0537098,0.0555516,0.046173,0.048455




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.201638,0.0923407,-0.389095,-0.493224,0.782682,-0.509497,0.405683
MNAR,0.3,0.375411,0.96063,0.842549,-0.193667,-0.146521,0.302913,0.212279
MNAR,0.5,1.58783,2.77712,2.14531,0.116879,0.618502,-0.0104523,1.01299
MNAR,0.7,0.834788,3.95717,4.20832,1.56217,1.3302,1.3004,1.43382
MCAR,0.1,0.363696,-0.0475145,-0.746165,-0.552416,-0.442602,-0.0682176,-0.270399
MCAR,0.3,0.549162,0.479243,1.50842,0.573042,-0.306409,0.0432456,0.4662
MCAR,0.5,1.12726,3.65199,1.34299,0.066058,0.7376,0.143703,0.741687
MCAR,0.7,2.16439,4.16481,3.34055,1.17458,0.916137,0.931748,0.82719
MAR,0.1,-0.0692218,-0.573825,0.359598,0.22131,0.409627,0.0774603,0.639419
MAR,0.3,-0.444984,0.159684,0.984682,0.0739514,0.0308037,-0.228744,-0.17658


## Diabetes


In [25]:
from sklearn.datasets import load_diabetes

# return_X_y=True yields a (features, target) pair; as_frame=True requests
# pandas objects rather than NumPy arrays.
diabetes_pair = load_diabetes(return_X_y=True, as_frame=True)
X, y = diabetes_pair

X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [26]:
# Runs with the wrapper's default ref_methods / scenarios / miss_pct / n_iter.
evaluate_dataset_repeated(name="diabetes", X_raw=X, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.171038,0.219262,0.17393,0.156444,0.203461,0.224312,0.209269
MNAR,0.3,0.247768,0.253631,0.21967,0.210364,0.28393,0.294137,0.256262
MNAR,0.5,0.274188,0.251127,0.249813,0.273615,0.263317,0.252487,0.259368
MNAR,0.7,0.228865,0.220812,0.22766,0.20796,0.242141,0.213044,0.232639
MCAR,0.1,0.216405,0.235566,0.198886,0.187848,0.236192,0.254857,0.223859
MCAR,0.3,0.191774,0.210162,0.174294,0.169589,0.191429,0.215433,0.20602
MCAR,0.5,0.222698,0.21836,0.208813,0.206705,0.22332,0.230181,0.219028
MCAR,0.7,0.278577,0.247615,0.267709,0.235345,0.293898,0.272291,0.263805
MAR,0.1,0.226106,0.249057,0.214452,0.197476,0.240354,0.270125,0.237593
MAR,0.3,0.118328,0.185551,0.131873,0.121798,0.146699,0.143538,0.16247




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0428952,0.13214,0.0592628,0.0495317,0.0604662,0.0634267,0.065605
MNAR,0.3,0.0988352,0.481369,0.224999,0.189858,0.282589,0.262226,0.216425
MNAR,0.5,0.259592,0.777904,0.290456,0.339046,0.458913,0.372996,0.354659
MNAR,0.7,0.341592,0.969183,0.329661,0.457325,0.769646,0.421081,0.450833
MCAR,0.1,0.0386656,0.144598,0.0709735,0.0616672,0.0795193,0.0579012,0.0698245
MCAR,0.3,0.097943,0.397765,0.159226,0.142097,0.269102,0.176567,0.175105
MCAR,0.5,0.201357,0.676736,0.235086,0.255888,0.441081,0.305133,0.292846
MCAR,0.7,0.418405,1.06725,0.371492,0.636168,0.811101,0.595991,0.50604
MAR,0.1,0.0343452,0.0810219,0.0430111,0.0381005,0.0269162,0.0426411,0.0442794
MAR,0.3,0.0547215,0.177168,0.0707401,0.0599276,0.0795133,0.0562363,0.0934271




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,-174.688,-653.302,-99.9496,151.011,492.685,135.864,-217.574
MNAR,0.3,-149.672,-193.562,-256.899,-40.191,111.596,1040.98,263.968
MNAR,0.5,873.529,589.867,-164.112,365.04,-629.636,615.471,-349.13
MNAR,0.7,170.321,2543.92,143.36,722.382,644.621,1520.57,-460.063
MCAR,0.1,42.1537,28.7045,99.1212,-50.1681,176.69,139.434,5.16636
MCAR,0.3,48.9922,-660.979,229.135,-107.609,-136.785,107.562,2.88626
MCAR,0.5,483.071,910.833,-36.745,12.1008,90.2823,954.205,473.454
MCAR,0.7,1534.4,2963.66,101.025,768.183,420.687,1375.5,807.717
MAR,0.1,117.263,-264.87,-484.231,-252.58,174.881,-464.719,-202.045
MAR,0.3,-26.9662,-42.9027,120.558,-344.131,-243.062,-113.291,-230.841


## Iris


In [27]:
from sklearn.datasets import load_iris

# return_X_y=True yields a (features, target) pair; as_frame=True requests
# pandas objects rather than NumPy arrays.
iris_pair = load_iris(return_X_y=True, as_frame=True)
X, y = iris_pair

X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [28]:
# Runs with the wrapper's default ref_methods / scenarios / miss_pct / n_iter.
evaluate_dataset_repeated(name="iris", X_raw=X, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0992483,0.278265,0.117544,0.104121,0.158497,0.121427,0.143558
MNAR,0.3,0.191878,0.296008,0.13512,0.161858,0.204936,0.195676,0.296036
MNAR,0.5,0.180703,0.267481,0.191555,0.192633,0.234012,0.19366,0.319524
MNAR,0.7,0.32939,0.280567,0.273619,0.255019,0.333107,0.266085,0.424247
MCAR,0.1,0.0957085,0.263385,0.0950338,0.113609,0.176241,0.131824,0.167136
MCAR,0.3,0.147434,0.26443,0.139063,0.148951,0.303854,0.175324,0.260917
MCAR,0.5,0.169367,0.263858,0.189129,0.175805,0.192942,0.185237,0.295343
MCAR,0.7,0.31489,0.265026,0.280265,0.239678,0.226659,0.22398,0.372002
MAR,0.1,0.0775927,0.338458,0.0753215,0.0859418,0.134853,0.094665,0.102482
MAR,0.3,0.145011,0.323802,0.164489,0.0995753,0.169479,0.100504,0.116267




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0237793,0.0916426,0.0236495,0.0239845,0.038757,0.0246055,0.0270527
MNAR,0.3,0.0836966,0.292719,0.0710119,0.075946,0.150953,0.116724,0.169207
MNAR,0.5,0.157546,0.459574,0.142295,0.161612,0.295782,0.186099,0.380906
MNAR,0.7,0.486287,0.678923,0.394089,0.446654,0.523881,0.365465,0.874629
MCAR,0.1,0.0209528,0.0948043,0.0201746,0.0181155,0.0504063,0.0205177,0.0352554
MCAR,0.3,0.0579812,0.260053,0.0565371,0.0589651,0.249283,0.0613123,0.127948
MCAR,0.5,0.127017,0.455372,0.138308,0.143383,0.176283,0.150144,0.33195
MCAR,0.7,0.407001,0.613905,0.280116,0.395784,0.343552,0.250077,0.639133
MAR,0.1,0.00849533,0.066873,0.0102317,0.00818319,0.0194126,0.0100425,0.011999
MAR,0.3,0.041083,0.176571,0.0483087,0.0225182,0.0658428,0.0244998,0.0308981




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,sklearn_missforest,sklearn_ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0131579,0.0,-0.0131579,-0.0131579,0.0,0.0131579,0.0
MNAR,0.3,0.0131579,-0.0394737,0.0,-0.0263158,0.0263158,0.0131579,-0.0131579
MNAR,0.5,0.0657895,0.0131579,0.0263158,0.0,0.0789474,0.0131579,0.0
MNAR,0.7,0.0921053,0.118421,0.118421,0.0921053,0.0394737,0.0657895,0.0657895
MCAR,0.1,0.0263158,0.0,0.0131579,-0.0131579,-0.0263158,0.0,0.0
MCAR,0.3,0.0131579,0.0263158,0.0,0.0131579,-0.0131579,0.0131579,0.0131579
MCAR,0.5,-0.0131579,0.0131579,0.0131579,0.0,0.0657895,-0.0263158,0.0131579
MCAR,0.7,0.0263158,0.0263158,0.0789474,0.131579,0.0394737,0.0921053,0.0657895
MAR,0.1,0.0,0.0263158,0.0,0.0,0.0,0.0,0.0
MAR,0.3,0.0131579,0.0131579,0.0131579,-0.0131579,0.0131579,0.0,0.0263158


# Conclusion
