In [1]:
import copy
import os
import time
import random
import sys
import warnings

import numpy as np
import pandas as pd
from typing import Any


from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.distributions import enable_reproducible_results
import hyperimpute.logger as log

from benchmark_imputation import evaluate_dataset_repeated_internal

from IPython.display import HTML, display
import tabulate

# Silence every Python warning for the rest of the session. This also hides
# deprecation and parser warnings, so re-enable them when debugging.
warnings.filterwarnings('ignore')
# NOTE(review): presumably seeds the random number generators so repeated runs
# match — confirm in hyperimpute.utils.distributions.
enable_reproducible_results()

# NOTE(review): get_imputer() builds its own Imputers() instance; no cell in this
# part of the notebook reads `imputers` — confirm whether it is still needed.
imputers = Imputers()
# Emit hyperimpute log records to stderr at the INFO level.
log.add(sink=sys.stderr, level="INFO")

In [2]:
def get_imputer():
    """Return the imputer under evaluation: the "hyperimpute" plugin with the "bayesian" optimizer."""
    plugin_registry = Imputers()
    return plugin_registry.get("hyperimpute", optimizer="bayesian")


def evaluate_dataset_repeated(
    name,
    X_raw,
    y,
    ref_methods=("mean", "missforest", "ice", "gain", "sinkhorn", "softimpute"),
    scenarios=("MNAR",),
    miss_pct=(0.1, 0.3, 0.5, 0.7),
    n_iter=2,
    debug=False,
):
    """Benchmark the imputer returned by ``get_imputer()`` on a single dataset.

    Thin wrapper around ``evaluate_dataset_repeated_internal``: it supplies the
    evaluated model and forwards every other argument.

    Args:
        name: Label identifying the dataset; forwarded unchanged.
        X_raw: Feature table (the cells in this notebook pass a pandas DataFrame).
        y: Target values matching ``X_raw``.
        ref_methods: Names of the reference imputers to compare against.
        scenarios: Missingness scenarios to simulate (e.g. ``"MNAR"``).
        miss_pct: Missingness rates, as fractions in [0, 1].
        n_iter: Forwarded as ``n_iter`` (presumably the number of repeated
            evaluation trials — confirm in ``benchmark_imputation``).
        debug: Forwarded as ``debug``.

    Returns:
        Whatever ``evaluate_dataset_repeated_internal`` returns.
    """
    # The defaults are immutable tuples rather than lists, so no state can leak
    # between calls through a shared default object. Every sequence is copied
    # into a fresh list before being handed over, which keeps the list type
    # the internal function received before and protects caller-owned lists.
    return evaluate_dataset_repeated_internal(
        name=name,
        evaluated_model=get_imputer(),
        X_raw=X_raw,
        y=y,
        ref_methods=list(ref_methods),
        scenarios=list(scenarios),
        miss_pct=list(miss_pct),
        n_iter=n_iter,
        debug=debug,
    )


## Sanity check in debug mode

In [3]:
# Quick end-to-end check on the airfoil data before the full benchmark.
# The file is tab-separated with no header row, so pandas labels the columns 0..5.
# A literal tab is used as the separator: the two-character string "\\t" is
# treated as a regular expression and pushes pandas onto the slower Python
# parsing engine (the resulting ParserWarning is hidden by the global filter).
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat',
    header=None,
    sep="\t",
)

# The last column (label 5) is the target; the remaining columns are the features.
last_col = df.columns[-1]
y = df[last_col]
X_raw = df.drop(columns=[last_col])

# Reduced settings keep the check short: one scenario, one reference method
# and two missingness rates, with debug output enabled.
evaluate_dataset_repeated(
    "airfoil",
    X_raw,
    y,
    scenarios=["MNAR"],
    ref_methods=["mean"],
    debug=True,
    n_iter=2,
    miss_pct=[0.3, 0.5],
)

[2021-12-29T22:42:40.747224+0200][1230602][INFO]   > BO iter 0


> evaluation trial  0
  > eval  MNAR 0.3


[2021-12-29T22:42:45.771979+0200][1230602][INFO]      >>> Column 0 <-- score -0.021701651773402764 <-- Model random_forest_regressor({'criterion': 0, 'max_features': 0, 'min_samples_split': 10, 'min_samples_leaf': 10})
[2021-12-29T22:42:51.981005+0200][1230602][INFO]      >>> Column 1 <-- score -0.020823876426386972 <-- Model xgboost_regressor({'max_depth': 3, 'lr': 0.01})
[2021-12-29T22:43:06.300874+0200][1230602][INFO]      >>> Column 2 <-- score 0.9620216308366576 <-- Model catboost({'depth': 4, 'grow_policy': 2})
[2021-12-29T22:43:13.925942+0200][1230602][INFO]      >>> Column 3 <-- score 0.8665721111778001 <-- Model catboost({'depth': 5, 'grow_policy': 2})
[2021-12-29T22:43:18.752967+0200][1230602][INFO]      >>> Column 4 <-- score -0.014874342577024567 <-- Model catboost_regressor({'depth': 4, 'grow_policy': 2})
[2021-12-29T22:43:49.796798+0200][1230602][INFO]   > BO iter 1
[2021-12-29T22:43:54.901244+0200][1230602][INFO]      >>> Column 3 <-- score 0.9519096474239254 <-- Model c

  > eval  MNAR 0.5


[2021-12-29T22:47:18.126475+0200][1230602][INFO]      >>> Column 0 <-- score -0.027050020255943358 <-- Model random_forest_regressor({'criterion': 0, 'max_features': 1, 'min_samples_split': 2, 'min_samples_leaf': 2})
[2021-12-29T22:47:22.407588+0200][1230602][INFO]      >>> Column 1 <-- score -0.027427547197500127 <-- Model xgboost_regressor({'max_depth': 3, 'lr': 0.01})
[2021-12-29T22:47:39.102746+0200][1230602][INFO]      >>> Column 2 <-- score 0.8835116078761293 <-- Model catboost({'depth': 4, 'grow_policy': 2})
[2021-12-29T22:47:47.241784+0200][1230602][INFO]      >>> Column 3 <-- score 0.6665841393972116 <-- Model catboost({'depth': 6, 'grow_policy': 2})
[2021-12-29T22:47:51.113107+0200][1230602][INFO]      >>> Column 4 <-- score -0.026836283728656273 <-- Model catboost_regressor({'depth': 5, 'grow_policy': 1})
[2021-12-29T22:48:20.659333+0200][1230602][INFO]   > BO iter 1
[2021-12-29T22:48:35.303550+0200][1230602][INFO]      >>> Column 2 <-- score 0.9977014602487831 <-- Model cat

> evaluation trial  1
  > eval  MNAR 0.3


[2021-12-29T22:52:01.929616+0200][1230602][INFO]      >>> Column 0 <-- score -0.016596744901440846 <-- Model random_forest_regressor({'criterion': 0, 'max_features': 0, 'min_samples_split': 2, 'min_samples_leaf': 2})
[2021-12-29T22:52:06.135245+0200][1230602][INFO]      >>> Column 1 <-- score -0.015857848327026438 <-- Model xgboost_regressor({'max_depth': 3, 'lr': 0.01})
[2021-12-29T22:52:16.590905+0200][1230602][INFO]      >>> Column 2 <-- score 0.959888787843592 <-- Model catboost({'depth': 4, 'grow_policy': 0})
[2021-12-29T22:52:29.048396+0200][1230602][INFO]      >>> Column 3 <-- score 0.8977133910573212 <-- Model catboost({'depth': 5, 'grow_policy': 0})
[2021-12-29T22:52:32.666832+0200][1230602][INFO]      >>> Column 4 <-- score -0.01598776472093104 <-- Model xgboost_regressor({'max_depth': 3, 'lr': 0.1})
[2021-12-29T22:53:00.336390+0200][1230602][INFO]   > BO iter 1
[2021-12-29T22:53:09.562614+0200][1230602][INFO]      >>> Column 3 <-- score 0.9587314276420895 <-- Model catboost(

  > eval  MNAR 0.5


[2021-12-29T22:56:34.832052+0200][1230602][INFO]      >>> Column 0 <-- score -0.023000500859409644 <-- Model random_forest_regressor({'criterion': 0, 'max_features': 1, 'min_samples_split': 10, 'min_samples_leaf': 2})
[2021-12-29T22:56:38.663904+0200][1230602][INFO]      >>> Column 1 <-- score -0.02540047136344101 <-- Model xgboost_regressor({'max_depth': 2, 'lr': 0.1})
[2021-12-29T22:56:47.744598+0200][1230602][INFO]      >>> Column 2 <-- score 0.8826239715662005 <-- Model catboost({})
[2021-12-29T22:56:53.686216+0200][1230602][INFO]      >>> Column 3 <-- score 0.7229325224597529 <-- Model catboost({})
[2021-12-29T22:56:57.702539+0200][1230602][INFO]      >>> Column 4 <-- score -0.03133347596309755 <-- Model random_forest_regressor({'criterion': 0, 'max_features': 2, 'min_samples_split': 10, 'min_samples_leaf': 5})
[2021-12-29T22:57:28.609724+0200][1230602][INFO]   > BO iter 1
[2021-12-29T22:57:38.319429+0200][1230602][INFO]      >>> Column 3 <-- score 0.8651682919528211 <-- Model cat

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean
MNAR,0.3,0.210953,0.28825
MNAR,0.5,0.304341,0.294885




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean
MNAR,0.3,0.0425288,0.354737
MNAR,0.5,0.0987608,0.588281




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean
MNAR,0.3,4.49123,8.25341
MNAR,0.5,11.0653,17.9549


# Datasets

In [4]:
# `log` is already imported in the setup cell; re-importing it here is harmless.
import hyperimpute.logger as log

# NOTE(review): presumably detaches the stderr INFO sink registered in the setup
# cell so that the benchmark cells below print only their result tables —
# confirm against hyperimpute.logger.
log.remove()

|           Dataset          |  Size |
|:--------------------------:|:-----:|
|     airfoil self noise     |  1503 |
|     blood transfusion      |  748  |
|  breast cancer diagnostic  |  569  |
|         california         | 20640 |
|   climate model crashes    |  540  |
|    concrete compression    |  1030 |
|       concrete slump       |  103  |
| connectionist bench sonar  |  208  |
|  connectionist bench vowel |  990  |
|            iris            |  150  |
|      wine quality red      |  1599 |
|     wine quality white     |  4898 |
|            yeast           |  1484 |

## Dataset: UCI Airfoil Self-Noise Data Set

https://archive.ics.uci.edu/ml/datasets/airfoil+self-noise


In [5]:
# Load the airfoil self-noise data. The file is tab-separated and has no header
# row, so pandas labels the columns 0..5. A literal tab is used as the separator:
# the two-character string "\\t" is parsed as a regular expression and forces
# pandas onto the slower Python parsing engine.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat',
    header=None,
    sep="\t",
)

df

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461
...,...,...,...,...,...,...
1498,2500,15.6,0.1016,39.6,0.052849,110.264
1499,3150,15.6,0.1016,39.6,0.052849,109.254
1500,4000,15.6,0.1016,39.6,0.052849,106.604
1501,5000,15.6,0.1016,39.6,0.052849,106.224


In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max); the target column 5 is included.
df.describe()

Unnamed: 0,0,1,2,3,4,5
count,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0
mean,2886.380572,6.782302,0.136548,50.860745,0.01114,124.835943
std,3152.573137,5.918128,0.093541,15.572784,0.01315,6.898657
min,200.0,0.0,0.0254,31.7,0.000401,103.38
25%,800.0,2.0,0.0508,39.6,0.002535,120.191
50%,1600.0,5.4,0.1016,39.6,0.004957,125.721
75%,4000.0,9.9,0.2286,71.3,0.015576,129.9955
max,20000.0,22.2,0.3048,71.3,0.058411,140.987


In [7]:
# Split the frame: the final column is the target, every other column is a feature.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df.loc[:, last_col]

# Full benchmark using the wrapper's default reference methods, scenarios and missingness rates.
evaluate_dataset_repeated(name="airfoil", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.137888,0.289021,0.211573,0.24471,0.265159,0.265082,0.39394
MNAR,0.3,0.203378,0.283704,0.242587,0.272988,0.298877,0.274423,0.400286
MNAR,0.5,0.303077,0.288694,0.298657,0.291961,0.321405,0.290374,0.458197
MNAR,0.7,0.343444,0.289026,0.302604,0.301296,0.320849,0.303579,0.423807




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0121973,0.111784,0.0555301,0.0624729,0.0819451,0.0468091,0.0702572
MNAR,0.3,0.0488025,0.347131,0.159852,0.194978,0.234571,0.152696,0.186347
MNAR,0.5,0.124324,0.58995,0.329407,0.354073,0.389086,0.287382,0.600654
MNAR,0.7,0.410425,0.826842,0.468159,0.546595,0.599986,0.434184,0.804399




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,1.49529,2.42751,1.38854,2.41034,2.28515,2.0251,2.11694
MNAR,0.3,4.20158,6.83927,7.49641,5.7809,6.78144,5.94334,8.24971
MNAR,0.5,9.77463,15.236,15.9471,18.2523,14.5127,15.579,16.8694
MNAR,0.7,24.0166,33.7427,25.2919,33.2813,30.5611,28.8577,27.4864


## Dataset: UCI Blood Transfusion Service Center Data Set

https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data

In [8]:
# Load the blood transfusion data; the first row of the file supplies the column names.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data',
)

df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [9]:
# Per-column summary statistics; the binary donation label (last column) is included.
df.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [10]:
# The donation flag in the final column is the target; the remaining columns are features.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df.loc[:, last_col]

# Full benchmark with the wrapper's default settings.
evaluate_dataset_repeated(name="blood", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.135292,0.214845,0.124245,0.157324,0.177167,0.158293,0.16884
MNAR,0.3,0.123604,0.176414,0.129511,0.139195,0.160631,0.162339,0.166191
MNAR,0.5,0.21528,0.175235,0.146192,0.148925,0.177908,0.176406,0.23916
MNAR,0.7,0.173779,0.168315,0.1579,0.159423,0.195158,0.191376,0.256895




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0158006,0.0559862,0.0188127,0.0179273,0.0432258,0.0203438,0.0237798
MNAR,0.3,0.0353447,0.135569,0.0602915,0.0469719,0.105831,0.051749,0.0706487
MNAR,0.5,0.171217,0.2358,0.104643,0.0972316,0.167547,0.0801171,0.202968
MNAR,0.7,0.182826,0.317938,0.183051,0.189964,0.186451,0.132598,0.46329




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0,0.0,0.0,0.0026738,0.0,0.0,-1.38778e-17
MNAR,0.3,0.0026738,0.0,0.00534759,0.0,0.00802139,0.0026738,-0.0026738
MNAR,0.5,0.0,0.0026738,-0.0026738,-0.0026738,0.0,0.0026738,0.0
MNAR,0.7,-0.0026738,0.0026738,0.0,-0.0026738,2.77556e-17,0.0026738,0.0187166


## Dataset: Breast Cancer Wisconsin (Diagnostic)

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [11]:
from sklearn.datasets import load_breast_cancer

# return_X_y=True yields a (features, target) pair; as_frame=True makes both pandas objects.
X_raw, y = load_breast_cancer(
    return_X_y=True,
    as_frame=True,
)

X_raw

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [12]:
# Summary statistics of the feature columns only; the target `y` is held in a separate variable.
X_raw.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [13]:
# Full benchmark on the breast cancer features with the wrapper's default settings.
evaluate_dataset_repeated(name="bc", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0505683,0.182645,0.0732241,0.0455135,0.0857182,0.0831076,0.106746
MNAR,0.3,0.0559658,0.170334,0.07472,0.0550407,0.1187,0.0838445,0.0999355
MNAR,0.5,0.0605019,0.154272,0.0723372,0.0532613,0.078384,0.083486,0.100944
MNAR,0.7,0.0756699,0.155519,0.0873359,0.0650654,0.111181,0.0951874,0.110033




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0107113,0.106369,0.024249,0.0103572,0.0253049,0.0241849,0.0326098
MNAR,0.3,0.0280414,0.300947,0.0588902,0.027658,0.115786,0.0610202,0.0951024
MNAR,0.5,0.0447511,0.443124,0.0978966,0.0400208,0.116901,0.121238,0.124762
MNAR,0.7,0.124466,0.628104,0.157033,0.0799024,0.236744,0.201571,0.159839




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0,0.0104895,-0.013986,-0.0034965,-0.0034965,0.0034965,3.4694500000000004e-18
MNAR,0.3,0.0,0.0034965,-0.0104895,0.0034965,-0.0034965,0.00699301,0.0034965
MNAR,0.5,0.0,0.0,0.0104895,-0.0034965,0.00699301,-0.0104895,0.00699301
MNAR,0.7,0.0034965,-0.0034965,-0.00699301,0.0,-0.0034965,0.0034965,0.0104895


## Dataset: California Housing



In [14]:
from sklearn.datasets import fetch_california_housing

# return_X_y=True yields a (features, target) pair; as_frame=True makes both pandas objects.
X_raw, y = fetch_california_housing(
    return_X_y=True,
    as_frame=True,
)

X_raw

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [15]:
# Summary statistics of the feature columns only; the target `y` is held in a separate variable.
X_raw.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [16]:
# Full benchmark on the California housing features with the wrapper's default settings.
evaluate_dataset_repeated(name="california", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0986329,0.145088,0.10498,0.301198,0.152585,0.164476,0.208548
MNAR,0.3,0.12218,0.151184,0.11994,0.19681,0.192641,0.184766,0.205498
MNAR,0.5,0.146705,0.151366,0.138455,0.223805,0.270516,0.192213,0.228538
MNAR,0.7,0.156143,0.1566,0.150231,0.15345,0.210079,0.202274,0.257456




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.00919148,0.0723877,0.0253332,0.0298205,0.0616818,0.0296458,0.0474469
MNAR,0.3,0.0445679,0.225521,0.083701,0.0911464,0.192774,0.125537,0.130646
MNAR,0.5,0.131583,0.374481,0.195822,0.182245,0.612314,0.215724,0.273389
MNAR,0.7,0.282175,0.533653,0.306773,0.351669,0.495133,0.399231,0.599503




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0252396,0.0202783,0.0216428,0.0231539,0.025648,0.0332015,0.0302222
MNAR,0.3,0.0674465,0.0563046,0.0524139,0.0647359,0.0615216,0.0821126,0.0715752
MNAR,0.5,0.166391,0.138605,0.141016,0.137393,0.157064,0.172705,0.14757
MNAR,0.7,0.198069,0.337954,0.280199,0.311734,0.301791,0.332155,0.29119


## Dataset: Climate Model Simulation Crashes

https://archive.ics.uci.edu/ml/datasets/climate+model+simulation+crashes

In [17]:
# Read the numeric table, skipping the first line of the file; the resulting
# frame therefore has positional integer column labels.
samples = np.loadtxt(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows=1,
)
df = pd.DataFrame(data=samples)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1.0,1.0,0.859036,0.927825,0.252866,0.298838,0.170521,0.735936,0.428325,0.567947,...,0.245675,0.104226,0.869091,0.997518,0.448620,0.307522,0.858310,0.796997,0.869893,0.0
1,1.0,2.0,0.606041,0.457728,0.359448,0.306957,0.843331,0.934851,0.444572,0.828015,...,0.616870,0.975786,0.914344,0.845247,0.864152,0.346713,0.356573,0.438447,0.512256,1.0
2,1.0,3.0,0.997600,0.373238,0.517399,0.504993,0.618903,0.605571,0.746225,0.195928,...,0.679355,0.803413,0.643995,0.718441,0.924775,0.315371,0.250642,0.285636,0.365858,1.0
3,1.0,4.0,0.783408,0.104055,0.197533,0.421837,0.742056,0.490828,0.005525,0.392123,...,0.471463,0.597879,0.761659,0.362751,0.912819,0.977971,0.845921,0.699431,0.475987,1.0
4,1.0,5.0,0.406250,0.513199,0.061812,0.635837,0.844798,0.441502,0.191926,0.487546,...,0.551543,0.743877,0.312349,0.650223,0.522261,0.043545,0.376660,0.280098,0.132283,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,3.0,176.0,0.657136,0.489375,0.133713,0.411950,0.087780,0.356289,0.480204,0.029678,...,0.280546,0.384117,0.885948,0.768482,0.459479,0.334482,0.573002,0.610183,0.737706,1.0
536,3.0,177.0,0.915894,0.842720,0.518947,0.090622,0.336981,0.893576,0.978703,0.674868,...,0.798108,0.353546,0.044796,0.990900,0.347027,0.512499,0.810549,0.593332,0.142565,0.0
537,3.0,178.0,0.478600,0.941185,0.769245,0.950776,0.189406,0.112743,0.745645,0.527096,...,0.193103,0.829563,0.101506,0.548878,0.381966,0.198811,0.867108,0.461632,0.652817,1.0
538,3.0,179.0,0.007793,0.779287,0.867468,0.704820,0.983282,0.420303,0.710612,0.174746,...,0.761134,0.436714,0.690132,0.825133,0.981656,0.113193,0.364799,0.201469,0.536535,1.0


In [18]:
# Per-column summary statistics; the binary outcome in the last column (20) is included.
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
count,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,...,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0
mean,2.0,90.5,0.500026,0.500097,0.500027,0.500119,0.500001,0.499913,0.500059,0.500076,...,0.500111,0.499984,0.500032,0.499933,0.499944,0.499946,0.500044,0.50002,0.500021,0.914815
std,0.817254,52.008901,0.288939,0.288922,0.289067,0.288993,0.288827,0.288852,0.28901,0.288909,...,0.288966,0.289127,0.289014,0.288822,0.288949,0.288923,0.288813,0.288936,0.289013,0.279416
min,1.0,1.0,0.000414,0.001922,0.001181,0.001972,0.000858,0.000476,0.00459,0.000296,...,0.002015,0.000419,0.001188,0.001312,0.002509,0.000732,0.000891,0.000219,0.000263,0.0
25%,1.0,45.75,0.24965,0.251597,0.25154,0.250158,0.25063,0.251325,0.253048,0.250402,...,0.250758,0.251676,0.249669,0.249988,0.249586,0.249974,0.250412,0.252739,0.249723,1.0
50%,2.0,90.5,0.499998,0.499595,0.500104,0.500456,0.500903,0.499174,0.49907,0.500074,...,0.500393,0.500322,0.500151,0.500625,0.49908,0.499959,0.500384,0.498955,0.499431,1.0
75%,3.0,135.25,0.750042,0.750011,0.74918,0.750348,0.748988,0.748166,0.750109,0.749091,...,0.749447,0.749346,0.749164,0.749569,0.750012,0.747978,0.749256,0.748539,0.749792,1.0
max,3.0,180.0,0.999194,0.998815,0.998263,0.997673,0.998944,0.997142,0.99893,0.998506,...,0.999536,0.999942,0.997718,0.997518,0.999795,0.999155,0.997265,0.999306,0.999655,1.0


In [19]:
# The final column is the target; every other column is kept as a feature.
# NOTE(review): columns 0 and 1 take values 1..3 and 1..180 and look like
# study/run identifiers rather than measurements — confirm they belong in X_raw.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df.loc[:, last_col]

# Full benchmark with the wrapper's default settings.
evaluate_dataset_repeated(name="climate_model", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.287859,0.28306,0.290236,0.28311,0.363173,0.317211,0.291916
MNAR,0.3,0.294643,0.287712,0.297117,0.287956,0.354126,0.315111,0.296438
MNAR,0.5,0.346934,0.303221,0.343585,0.303478,0.364054,0.32801,0.325354
MNAR,0.7,0.378981,0.308003,0.386998,0.341389,0.378576,0.32534,0.335088




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.173329,0.192695,0.164193,0.191419,0.0899157,0.113467,0.15053
MNAR,0.3,0.457035,0.597078,0.443683,0.594507,0.283097,0.371211,0.466478
MNAR,0.5,0.558688,1.03845,0.601769,1.0329,0.521045,0.709754,0.784535
MNAR,0.7,0.620155,1.47216,0.579898,1.15299,0.617892,1.08635,1.05561




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0,0,0,0,0,0,0
MNAR,0.3,0,0,0,0,0,0,0
MNAR,0.5,0,0,0,0,0,0,0
MNAR,0.7,0,0,0,0,0,0,0


## Concrete Compressive Strength Data Set
https://archive.ics.uci.edu/ml/datasets/concrete+compressive+strength

In [20]:
# Load the concrete compressive strength workbook; the sheet's first row supplies the column names.
df = pd.read_excel(
    io="https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls",
)

df

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.052780
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.284354
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.178794
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.696601
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.768036


In [21]:
# Per-column summary statistics; the compressive-strength target (last column) is included.
df.describe()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,35.817836
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,16.705679
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.707115
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,34.442774
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,46.136287
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


In [22]:
# Compressive strength (final column) is the target; the remaining columns are features.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df.loc[:, last_col]

# Full benchmark with the wrapper's default settings.
evaluate_dataset_repeated(name="concrete_compressive", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.11887,0.219815,0.148811,0.138441,0.200937,0.131812,0.259624
MNAR,0.3,0.179697,0.222409,0.177142,0.170735,0.218329,0.175573,0.26632
MNAR,0.5,0.244706,0.228274,0.224279,0.212638,0.284013,0.217428,0.314198
MNAR,0.7,0.304949,0.226116,0.257923,0.224975,0.27479,0.235244,0.358522




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0282774,0.143162,0.0553229,0.0449431,0.111388,0.0325122,0.088109
MNAR,0.3,0.106218,0.424526,0.175923,0.159589,0.313519,0.12321,0.206319
MNAR,0.5,0.309506,0.741561,0.274542,0.391151,0.72792,0.295361,0.320342
MNAR,0.7,0.587368,1.02278,0.541158,0.760599,0.795753,0.466766,0.7164




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,9.10006,9.27888,10.208,6.13272,10.3282,4.10911,6.15998
MNAR,0.3,40.1454,30.3145,19.6877,33.4018,24.0301,26.8035,33.1615
MNAR,0.5,79.4334,97.6654,57.0396,62.0016,64.012,64.415,51.216
MNAR,0.7,132.075,245.709,161.868,106.247,115.551,113.482,132.084


## Concrete Slump Test Data Set

https://archive.ics.uci.edu/ml/datasets/concrete+slump+test

In [23]:
# Load the concrete slump test data; the first row of the file supplies the column names.
df = pd.read_csv(
    filepath_or_buffer="https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data",
)

df

Unnamed: 0,No,Cement,Slag,Fly ash,Water,SP,Coarse Aggr.,Fine Aggr.,SLUMP(cm),FLOW(cm),Compressive Strength (28-day)(Mpa)
0,1,273.0,82.0,105.0,210.0,9.0,904.0,680.0,23.0,62.0,34.99
1,2,163.0,149.0,191.0,180.0,12.0,843.0,746.0,0.0,20.0,41.14
2,3,162.0,148.0,191.0,179.0,16.0,840.0,743.0,1.0,20.0,41.81
3,4,162.0,148.0,190.0,179.0,19.0,838.0,741.0,3.0,21.5,42.08
4,5,154.0,112.0,144.0,220.0,10.0,923.0,658.0,20.0,64.0,26.82
...,...,...,...,...,...,...,...,...,...,...,...
98,99,248.3,101.0,239.1,168.9,7.7,954.2,640.6,0.0,20.0,49.97
99,100,248.0,101.0,239.9,169.1,7.7,949.9,644.1,2.0,20.0,50.23
100,101,258.8,88.0,239.6,175.3,7.6,938.9,646.0,0.0,20.0,50.50
101,102,297.1,40.9,239.9,194.0,7.5,908.9,651.8,27.5,67.0,49.17


In [24]:
# Per-column summary statistics; the "No" row counter and the 28-day strength target are included.
df.describe()

Unnamed: 0,No,Cement,Slag,Fly ash,Water,SP,Coarse Aggr.,Fine Aggr.,SLUMP(cm),FLOW(cm),Compressive Strength (28-day)(Mpa)
count,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0
mean,52.0,229.894175,77.973786,149.014563,197.167961,8.539806,883.978641,739.604854,18.048544,49.61068,36.039417
std,29.877528,78.87723,60.461363,85.41808,20.208158,2.80753,88.391393,63.342117,8.750844,17.56861,7.838232
min,1.0,137.0,0.0,0.0,160.0,4.4,708.0,640.6,0.0,20.0,17.19
25%,26.5,152.0,0.05,115.5,180.0,6.0,819.5,684.5,14.5,38.5,30.9
50%,52.0,248.0,100.0,164.0,196.0,8.0,879.0,742.7,21.5,54.0,35.52
75%,77.5,303.9,125.0,235.95,209.5,10.0,952.8,788.0,24.0,63.75,41.205
max,103.0,374.0,193.0,260.0,240.0,19.0,1049.9,902.0,29.0,78.0,58.53


In [25]:
# The final column (28-day compressive strength) is the prediction target.
last_col = df.columns[-1]
y = df[last_col]
# "No" is just a running row number (1..103), not a measurement, so it is
# dropped from the features together with the target.
X_raw = df.drop(columns = ["No", last_col])

# Label the run after this dataset. The cell previously reused "climate_model",
# copied from the climate section, which mislabels these results.
# NOTE(review): if the benchmark keys cached or saved results on this name, the
# old label may also have collided with the climate-model run — confirm.
evaluate_dataset_repeated("concrete_slump", X_raw, y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.144309,0.286147,0.185838,0.133588,0.263795,0.1965,0.287668
MNAR,0.3,0.215879,0.282419,0.231158,0.208653,0.288167,0.245092,0.320916
MNAR,0.5,0.288301,0.298447,0.295158,0.271829,0.365456,0.288173,0.382265
MNAR,0.7,0.345994,0.30125,0.340678,0.316409,0.414042,0.302856,0.436922




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.052878,0.208215,0.0921318,0.0583438,0.120742,0.0964822,0.125787
MNAR,0.3,0.198687,0.566755,0.246571,0.235115,0.366657,0.247672,0.297138
MNAR,0.5,0.408634,1.02423,0.479457,0.565182,0.69609,0.566204,0.577282
MNAR,0.7,0.657739,1.35478,0.646531,0.889705,0.6575,0.756153,0.709917




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.352813,0.176556,2.05079,0.109525,1.89019,7.42249,0.363184
MNAR,0.3,15.2492,1.70489,9.5842,1.2437,19.5515,19.3167,1.24542
MNAR,0.5,46.936,45.8749,27.9868,28.5731,41.3062,32.8364,32.0931
MNAR,0.7,66.2627,39.8308,30.2347,32.2555,39.6182,28.9435,14.5923


## Connectionist Bench (Sonar, Mines vs. Rocks) Data Set

https://archive.ics.uci.edu/ml/datasets/connectionist+bench+(sonar,+mines+vs.+rocks)

In [26]:
# Load the sonar data; the file has no header row, so the columns get integer labels.
df = pd.read_csv(
    filepath_or_buffer="https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data",
    header=None,
)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.0140,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.2280,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.0180,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.0100,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.0150,0.0085,0.0073,0.0050,0.0044,0.0040,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.0590,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.0110,0.0015,0.0072,0.0048,0.0107,0.0094,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.0187,0.0346,0.0168,0.0177,0.0393,0.1630,0.2028,0.1694,0.2328,0.2684,...,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157,M
204,0.0323,0.0101,0.0298,0.0564,0.0760,0.0958,0.0990,0.1018,0.1030,0.2154,...,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067,M
205,0.0522,0.0437,0.0180,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.0160,0.0029,0.0051,0.0062,0.0089,0.0140,0.0138,0.0077,0.0031,M
206,0.0303,0.0353,0.0490,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048,M


In [27]:
# Per-column summary statistics; only the numeric feature columns are reported
# (the string label column does not appear in the output).
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,...,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,...,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,...,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,...,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,...,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,...,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,...,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,...,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [28]:
# The last column carries the class label: encode 'M' rows as 1 and every
# other row as 0, and use all remaining columns as features.
last_col = df.columns[-1]
y = df[last_col].eq('M').astype(int)
X_raw = df.drop(columns=last_col)

# Benchmark with the defaults of evaluate_dataset_repeated (MNAR scenario,
# miss_pct 0.1-0.7, all reference imputers).
evaluate_dataset_repeated(name="connectionist_sonar", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0929781,0.219259,0.100708,0.0871726,0.183903,0.128057,0.178561
MNAR,0.3,0.10379,0.198791,0.107832,0.0966558,0.186849,0.130853,0.168566
MNAR,0.5,0.120779,0.209297,0.124019,0.140446,0.182964,0.150077,0.18493
MNAR,0.7,0.131001,0.217554,0.132781,0.133056,0.225955,0.158893,0.198128




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.036514,0.146117,0.0453367,0.0319113,0.0540783,0.0534029,0.0829495
MNAR,0.3,0.0990164,0.368971,0.101715,0.072218,0.129581,0.149063,0.21942
MNAR,0.5,0.201826,0.638238,0.201307,0.170371,0.186023,0.294707,0.376223
MNAR,0.7,0.272071,0.960081,0.285487,0.210516,0.332129,0.506233,0.580325




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.00961538,0.0,-0.00961538,-0.0480769,0.0288462,-0.0192308,0.0192308
MNAR,0.3,0.0384615,0.0288462,0.00961538,0.00961538,-0.00961538,-0.0192308,0.0384615
MNAR,0.5,-0.0384615,0.0384615,0.0192308,0.0384615,0.0384615,0.0384615,0.0673077
MNAR,0.7,-0.0480769,0.0673077,0.0,-0.0192308,0.0673077,0.00961538,0.0576923


## Wine-Red dataset

In [30]:
# Wine Quality Data Set (red wines); the CSV is semicolon-delimited.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
    sep=";",
)

df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [31]:
# Per-column summary statistics, including the integer "quality" target.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [32]:
# The last column ("quality") is the target. Remap its sorted distinct values
# to consecutive integer class ids starting at 0.
last_col = df.columns[-1]

y = df[last_col]
mapped_labels = sorted(y.unique())
mapping = {label: idx for idx, label in enumerate(mapped_labels)}
y = y.map(mapping)

X_raw = df.drop(columns=last_col)

# Benchmark with the defaults of evaluate_dataset_repeated (MNAR scenario,
# miss_pct 0.1-0.7, all reference imputers).
evaluate_dataset_repeated(name="wine_red", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0852813,0.144922,0.109323,0.103689,0.137154,0.106748,0.14743
MNAR,0.3,0.0953745,0.142997,0.110543,0.113395,0.151952,0.113129,0.142075
MNAR,0.5,0.108801,0.133806,0.115247,0.119512,0.147692,0.110525,0.141273
MNAR,0.7,0.144958,0.141423,0.138706,0.125455,0.180876,0.135913,0.157957




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0181747,0.0856393,0.0370266,0.0239706,0.0687607,0.0223589,0.0523251
MNAR,0.3,0.0512516,0.247021,0.0949845,0.0658551,0.171139,0.0625974,0.121308
MNAR,0.5,0.0698419,0.381822,0.137154,0.125719,0.277077,0.112709,0.189927
MNAR,0.7,0.164959,0.599117,0.274123,0.336311,0.585325,0.217572,0.306067




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,-0.0075,-0.0025,-0.0175,0.025,0.00125,0.01375,0.01125
MNAR,0.3,0.04875,0.01125,0.0125,0.03875,0.03625,5.55112e-17,-0.0075
MNAR,0.5,0.0075,0.04,-0.0175,0.025,0.00625,-0.00375,0.06375
MNAR,0.7,0.02,0.05125,0.06,0.0625,-0.0075,0.10375,0.08375


## Wine-White dataset

In [33]:
# Wine Quality Data Set (white wines); the CSV is semicolon-delimited.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    sep=";",
)

df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [34]:
# Per-column summary statistics, including the integer "quality" target.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [35]:
# The last column ("quality") is the target. Remap its sorted distinct values
# to consecutive integer class ids starting at 0.
last_col = df.columns[-1]

y = df[last_col]
mapped_labels = sorted(y.unique())
mapping = {label: idx for idx, label in enumerate(mapped_labels)}
y = y.map(mapping)

X_raw = df.drop(columns=last_col)

# Benchmark with the defaults of evaluate_dataset_repeated (MNAR scenario,
# miss_pct 0.1-0.7, all reference imputers).
evaluate_dataset_repeated(name="wine_white", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0719226,0.095612,0.0819784,0.0778147,0.0958793,0.102698,0.0990831
MNAR,0.3,0.0870807,0.108937,0.0886079,0.0847453,0.139656,0.111403,0.123357
MNAR,0.5,0.119318,0.107025,0.100734,0.0977815,0.113693,0.116393,0.117017
MNAR,0.7,0.122978,0.110922,0.111775,0.105843,0.113931,0.122556,0.135951




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0161069,0.0533173,0.0281104,0.0194983,0.0379897,0.0104158,0.0298755
MNAR,0.3,0.0463315,0.183328,0.0763232,0.0507978,0.102326,0.0439913,0.0797187
MNAR,0.5,0.0945249,0.311844,0.139418,0.115437,0.209301,0.0852539,0.153696
MNAR,0.7,0.0979981,0.442069,0.235005,0.241573,0.304403,0.151811,0.228573




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,-0.00326531,0.0077551,0.00530612,0.00408163,-0.00204082,-0.00244898,-0.00122449
MNAR,0.3,-0.00244898,0.000816327,0.00326531,-0.000408163,-0.00367347,0.0044898,0.0044898
MNAR,0.5,-0.00285714,-0.00367347,0.00408163,0.0122449,0.0138776,0.022449,-0.000816327
MNAR,0.7,0.022449,-0.0110204,0.00244898,-0.00367347,0.0171429,0.0461224,0.0261224


## Yeast Data Set


In [36]:
from sklearn.preprocessing import LabelEncoder

# Yeast data: whitespace-delimited with no header row, so columns get integer
# names. The separator is a regex; it is written as a raw string so that "\s"
# is not parsed as an (invalid) string escape sequence.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data",
    sep=r"\s+",
    header=None,
)

# Column 0 is discarded and takes no part in the benchmark.
df = df.drop(columns=[0])

# Column 9 (the last one) is the label column; encode its values as integers.
for col in [9]:
    df[col] = LabelEncoder().fit_transform(df[col])

df

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,6
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,6
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,6
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,7
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,6
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,4
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,7
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,4
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,7


In [37]:
# Per-column summary statistics, including the integer-encoded label column 9.
df.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,0.500121,0.499933,0.500034,0.261186,0.504717,0.0075,0.499885,0.276199,4.126011
std,0.137299,0.123924,0.08667,0.137098,0.048351,0.075683,0.057797,0.106491,3.037575
min,0.11,0.13,0.21,0.0,0.5,0.0,0.0,0.0,0.0
25%,0.41,0.42,0.46,0.17,0.5,0.0,0.48,0.22,0.0
50%,0.49,0.49,0.51,0.22,0.5,0.0,0.51,0.22,5.0
75%,0.58,0.57,0.55,0.32,0.5,0.0,0.53,0.3,7.0
max,1.0,1.0,1.0,1.0,1.0,0.83,0.73,1.0,9.0


In [38]:
# Use the last column as the target and every other column as a feature.
last_col = df.columns[-1]
y = df.loc[:, last_col]
X_raw = df.drop(columns=last_col)

# Benchmark with the defaults of evaluate_dataset_repeated (MNAR scenario,
# miss_pct 0.1-0.7, all reference imputers).
evaluate_dataset_repeated(name="yeast", X_raw=X_raw, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.108436,0.122706,0.107288,0.108289,0.13652,0.14728,0.12404
MNAR,0.3,0.116919,0.121848,0.128008,0.600405,0.151164,0.144832,0.149515
MNAR,0.5,0.177698,0.118528,0.144285,0.235861,0.141717,0.146581,0.184166
MNAR,0.7,0.220957,0.11882,0.142066,0.13298,0.171586,0.144496,0.239972




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0348034,0.0615386,0.0371888,0.0375528,0.0620434,0.0287413,0.0336986
MNAR,0.3,0.081175,0.17819,0.0982308,0.192141,0.146732,0.0820586,0.102601
MNAR,0.5,0.134002,0.291263,0.14562,0.206623,0.242683,0.128729,0.268121
MNAR,0.7,0.370202,0.404068,0.26118,0.3042,0.341991,0.190091,0.592934




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,-0.547675,0.539475,0.0864276,0.206234,-0.0422057,-0.190587,0.546038
MNAR,0.3,0.21414,1.28762,0.705546,0.336005,0.1365,0.0796255,-0.100846
MNAR,0.5,0.70831,1.96419,1.89355,0.975546,1.05103,-0.0663834,1.17159
MNAR,0.7,1.26263,5.17296,2.8893,1.42106,1.54964,0.886509,0.219599


## Diabetes


In [39]:
from sklearn.datasets import load_diabetes

# Load the scikit-learn diabetes dataset as pandas objects: X holds the
# features and y the target.
X, y = load_diabetes(return_X_y=True, as_frame=True)

X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [40]:
# Per-feature summary statistics of the diabetes features.
X.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-3.639623e-16,1.309912e-16,-8.013951e-16,1.289818e-16,-9.042540000000001e-17,1.301121e-16,-4.563971e-16,3.863174e-16,-3.848103e-16,-3.398488e-16
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123996,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260974,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665645,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324879,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670611,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947634,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564384,0.02835801,0.02984439,0.0293115,0.03430886,0.03243323,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320442,0.1539137,0.198788,0.1811791,0.1852344,0.133599,0.1356118


In [41]:
# Benchmark with the defaults of evaluate_dataset_repeated (MNAR scenario,
# miss_pct 0.1-0.7, all reference imputers).
evaluate_dataset_repeated(name="diabetes", X_raw=X, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.241812,0.243612,0.204244,0.19589,0.254049,0.262751,0.226236
MNAR,0.3,0.252966,0.249187,0.215432,0.21738,0.267172,0.287615,0.245666
MNAR,0.5,0.2659,0.251802,0.236297,0.253446,0.325002,0.29557,0.260098
MNAR,0.7,0.274771,0.250597,0.278175,0.239138,0.278413,0.293754,0.266219




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0389125,0.151145,0.0784611,0.0733704,0.0692896,0.0681265,0.0759945
MNAR,0.3,0.110436,0.454813,0.210285,0.176442,0.211969,0.234642,0.206431
MNAR,0.5,0.218449,0.780844,0.30507,0.322695,0.466059,0.4436,0.344194
MNAR,0.7,0.358546,1.04671,0.380985,0.591633,0.694597,0.5977,0.482607




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,-401.806,5.36933,54.2215,-336.643,-589.377,-80.1913,-166.889
MNAR,0.3,9.57904,470.9,-194.324,125.893,346.553,-123.286,-33.6629
MNAR,0.5,-144.156,664.627,634.879,199.592,639.278,101.567,-34.2925
MNAR,0.7,802.129,1844.95,699.076,82.5283,578.073,445.426,524.278


## Iris


In [42]:
from sklearn.datasets import load_iris

# Load the scikit-learn iris dataset as pandas objects: X holds the four
# measurement columns and y the target.
X, y = load_iris(return_X_y=True, as_frame=True)

X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [43]:
# Per-feature summary statistics of the iris measurements.
X.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [44]:
# Benchmark with the defaults of evaluate_dataset_repeated (MNAR scenario,
# miss_pct 0.1-0.7, all reference imputers).
evaluate_dataset_repeated(name="iris", X_raw=X, y=y)

RMSE score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0944589,0.281447,0.0819451,0.0989049,0.14856,0.106804,0.132979
MNAR,0.3,0.132906,0.268895,0.124243,0.14863,0.186381,0.168059,0.252428
MNAR,0.5,0.203705,0.282226,0.226493,0.183419,0.249081,0.205133,0.328937
MNAR,0.7,0.296212,0.284833,0.238229,0.229569,0.334297,0.236932,0.499636




Wasserstein score


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0173057,0.0943313,0.0169337,0.0164484,0.0360895,0.0186105,0.0262153
MNAR,0.3,0.0500398,0.244638,0.0537543,0.0554786,0.0967101,0.0712931,0.115862
MNAR,0.5,0.130084,0.487941,0.170699,0.173013,0.286511,0.207455,0.385221
MNAR,0.7,0.417552,0.669873,0.267829,0.375544,0.581315,0.345311,0.878772




Downstream model prediction error


Scenario,"miss_pct [0, 1]",Our method,mean,missforest,ice,gain,sinkhorn,softimpute
MNAR,0.1,0.0,-0.0131579,0.0,-0.0131579,-0.0263158,0.0,0.0
MNAR,0.3,0.0,0.0131579,0.0526316,-0.0131579,-0.0131579,-0.0131579,0.0
MNAR,0.5,0.0263158,0.0789474,0.0131579,0.0263158,0.0263158,0.0131579,0.0131579
MNAR,0.7,0.0657895,0.0394737,0.0394737,0.0789474,0.105263,0.0526316,0.105263


# Conclusion
