In [1]:
# imports
import pandas as pd
import numpy as np
from itertools import product, chain

from misscreate import induce_mcar, induce_mar, induce_mnar

In [3]:
# Read the three complete datasets (2h / 4h / 6h) from parquet.
df2h = pd.read_parquet('outdata/datasets/complete/data2h.parquet')
df4h = pd.read_parquet('outdata/datasets/complete/data4h.parquet')
df6h = pd.read_parquet('outdata/datasets/complete/data6h.parquet')

# Columns in which no missingness is induced: every non-numeric column of df2h plus the outcome.
# Only df2h's dtypes are inspected here; the same list is later passed for all three datasets.
no_miss_cols = [*df2h.select_dtypes(exclude='number').columns, 'hospital_expire_flag']

# Replace 'age' by its sorted integer codes (in place, per dataset) so it can be used as a
# conditional variable. Codes are computed independently for each frame.
for df in (df2h, df4h, df6h):
    df['age'] = pd.factorize(df['age'], sort=True)[0]

In [8]:
# options
# Number of repetitions generated for every parameter combination.
runs = 20

# Dataset key (used in the output file names) -> complete DataFrame.
datasets = {
    '2h': df2h,
    '4h': df4h,
    '6h': df6h,
}

# Parameter grids that are combined with itertools.product in the generation cells.
miss_rates = [0.05, 0.1, 0.25, 0.5, 0.75]
miss_types = ['single', 'patient', 'both']
miss_weights = ['equal', 'squared', [1, 1, 1, 5, 10]]

# NOTE(review): the generation cells visible in this notebook call induce_mcar / induce_mar /
# induce_mnar directly and hard-code cond_var='age'; confirm whether these two lists are
# still read anywhere before changing them.
miss_funs = [induce_mcar, induce_mar, induce_mnar]
cond_vars = ['age']


### MCAR

In [5]:
# 3m35s (timing noted in the original cell)
# Write one MCAR parquet file per (dataset, missing rate, missing type, run) combination.
options = list(product(datasets, miss_rates, miss_types))

# The loop variable is called `miss_type`, not `type`: loop variables in a notebook cell are
# kernel globals, so binding `type` here would shadow the builtin for every later cell.
for dfkey, rate, miss_type in options:
    for i in range(runs):
        # Columns listed in no_miss_cols are passed as ignore_cols.
        tmp = induce_mcar(df=datasets[dfkey], miss_rate=rate, miss_type=miss_type, ignore_cols=no_miss_cols)
        path = f"outdata/datasets/missing/mcar/mcar_{dfkey}_{rate}_{miss_type}_{i}.parquet"
        tmp.to_parquet(path)

### MAR

In [9]:
# 20m3s (timing noted in the original cell)
# Write one MAR parquet file per (dataset, missing rate, missing type, weights, run) combination.
options = list(product(datasets, miss_rates, miss_types, miss_weights))

# The loop variable is called `miss_type`, not `type`: loop variables in a notebook cell are
# kernel globals, so binding `type` here would shadow the builtin for every later cell.
for dfkey, rate, miss_type, weight in options:
    for i in range(runs):
        # The conditioning variable is fixed to 'age'.
        tmp = induce_mar(df=datasets[dfkey], miss_rate=rate, miss_type=miss_type, miss_weights=weight, cond_var='age', ignore_cols=no_miss_cols)
        # NOTE: a list-valued weight is interpolated via its repr, so those file names contain
        # brackets, commas and spaces (e.g. "..._[1, 1, 1, 5, 10]_0.parquet"). Kept unchanged
        # so that existing readers of these paths keep working.
        path = f"outdata/datasets/missing/mar/mar_{dfkey}_{rate}_{miss_type}_{weight}_{i}.parquet"
        tmp.to_parquet(path)

### MNAR

In [6]:
# 19m56s (timing noted in the original cell)
# Write one MNAR parquet file per (dataset, missing rate, missing type, weights, run) combination.
options = list(product(datasets, miss_rates, miss_types, miss_weights))

# The loop variable is called `miss_type`, not `type`: loop variables in a notebook cell are
# kernel globals, so binding `type` here would shadow the builtin for every later cell.
for dfkey, rate, miss_type, weight in options:
    for i in range(runs):
        tmp = induce_mnar(df=datasets[dfkey], miss_rate=rate, miss_type=miss_type, miss_weights=weight, ignore_cols=no_miss_cols)
        # NOTE: a list-valued weight is interpolated via its repr, so those file names contain
        # brackets, commas and spaces (e.g. "..._[1, 1, 1, 5, 10]_0.parquet"). Kept unchanged
        # so that existing readers of these paths keep working.
        path = f"outdata/datasets/missing/mnar/mnar_{dfkey}_{rate}_{miss_type}_{weight}_{i}.parquet"
        tmp.to_parquet(path)