### Missing At Random

In [1]:
import os
# Move the working directory up one level before the project-local import in
# the next cell (presumably so that `utils` resolves from the parent directory
# — TODO confirm).
# NOTE: this is a relative chdir, so re-running this cell without restarting
# the kernel moves up one more directory each time.
os.chdir("..")
import numpy as np
import pandas as pd

In [2]:
from utils.data import create_adult_dataset

In [3]:
# Build the dataset object, then take a private copy of its feature frame with
# the columns listed in `data.protected_features` removed.
data = create_adult_dataset()
X = (
    data.X
    .copy()
    .drop(columns=data.protected_features)
)
X.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,hours-per-week
0,39,7,9,13,4,1,1,4,40
1,50,6,9,13,2,4,0,4,13
2,38,4,11,9,0,6,1,4,40
3,53,4,1,7,2,6,0,2,40
4,28,4,9,13,2,10,5,2,40


In [4]:
# (n_rows, n_columns) of the feature frame built in the previous cell.
X.shape

(32561, 9)

In [5]:
from sklearn.preprocessing import StandardScaler
# Single module-level scaler instance. `convert_single_feature` below calls
# `scaler.fit_transform(...)` on it, so it is re-fit on whatever data was
# passed to the most recent call.
scaler = StandardScaler()

def convert_single_feature(K_df, observed_df, range_min, range_max, scalar_0=0.05, scalar_1=2):
    """Blank out entries of ``K_df`` with probabilities driven by ``observed_df``.

    A random logistic missingness model is drawn:

        p_i = sigmoid(a + sum_j w_j * z_ij)

    where ``z`` is ``observed_df`` standardized with the module-level
    ``scaler``, each weight ``w_j`` is a standard-normal draw multiplied by
    ``scalar_0`` and the intercept ``a`` is a standard-normal draw minus
    ``scalar_1``. The model is re-drawn (at most 10 times) until the mean
    probability lies strictly between ``range_min`` and ``range_max``. Each
    entry of ``K_df`` is then replaced by NaN with its own probability ``p_i``.

    Rows of ``K_df`` and ``observed_df`` are paired by position, not by index
    label, because the probabilities are computed on plain NumPy arrays.

    Parameters
    ----------
    K_df : pandas.Series
        Column whose values will be partially replaced by NaN.
    observed_df : pandas.DataFrame
        Fully observed columns that determine the missingness probabilities.
        Must have the same number of rows as ``K_df``.
    range_min, range_max : float
        Exclusive bounds on the *expected* missing ratio (mean of ``p``).
        The realized ratio after the Bernoulli draw is random and can land
        slightly outside this interval.
    scalar_0 : float, default 0.05
        Multiplier applied to the standard-normal weight draws.
    scalar_1 : float, default 2
        Amount subtracted from the standard-normal intercept draw.

    Returns
    -------
    pandas.Series or None
        A new object with NaN at the masked positions, or ``None`` when none
        of the 10 draws produced an expected ratio inside the requested range.

    Raises
    ------
    ValueError
        If ``observed_df`` and ``K_df`` have a different number of rows.
    """
    max_attempts = 10
    n_rows = K_df.shape[0]

    if len(observed_df) != n_rows:
        raise ValueError(
            "observed_df and K_df must have the same number of rows "
            "(got %d and %d)" % (len(observed_df), n_rows)
        )

    # Standardizing the observed columns does not depend on the random draw,
    # so do it once instead of on every retry.
    observed = scaler.fit_transform(observed_df.to_numpy())

    for _ in range(max_attempts):
        scalars = np.random.standard_normal(observed_df.shape[1]) * scalar_0
        scalar_a = np.random.standard_normal() - scalar_1
        M = scalar_a + (scalars * observed).sum(axis=1)
        p = 1 / (1 + np.exp(-M))
        ratio = p.sum() / n_rows
        if range_min < ratio < range_max:
            break
    else:
        # Every attempt fell outside (range_min, range_max).
        return None

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the
    # equivalent dtype.
    p_bi = np.random.binomial(1, p).astype(bool)
    # `where` keeps values where the condition is True, so pass the inverted
    # mask to blank out exactly the sampled positions.
    return K_df.where(~p_bi, other=np.nan)

In [6]:
# Inputs for one run: two observed columns drive the missingness of "education".
observed_df = X.loc[:, ["age", "workclass"]].copy()
K_df = X.loc[:, "education"].copy()

# Bounds of (0.0, 1.0) place no practical restriction on the expected ratio.
K_df_converted = convert_single_feature(
    K_df,
    observed_df,
    range_min=0.0,
    range_max=1.0,
)

# Number of entries that were replaced by NaN.
K_df_converted.isna().sum()

4723

In [7]:
# Summary statistics of the two columns used as `observed_df` in the previous cell.
X[["age", "workclass"]].describe()

Unnamed: 0,age,workclass
count,32561.0,32561.0
mean,38.581647,3.868892
std,13.640433,1.45596
min,17.0,0.0
25%,28.0,4.0
50%,37.0,4.0
75%,48.0,4.0
max,90.0,8.0


In [8]:
# Repeat the conversion 100 times with the expected missing ratio constrained
# to (0.1, 0.3) and record the realized fraction of NaN entries of each run.
ls = []
for _ in range(100):
    K_df_converted = convert_single_feature(K_df, observed_df, 0.1, 0.3)
    # convert_single_feature returns None when none of its internal draws
    # lands in the requested range; skip such runs instead of crashing on
    # `None.isnull()`. The list may therefore hold fewer than 100 values.
    if K_df_converted is None:
        continue
    ls.append(K_df_converted.isnull().sum() / X.shape[0])
ls

[0.19919535640797273,
 0.29068517551672246,
 0.19372869383618438,
 0.2749608427259605,
 0.2964896655508123,
 0.19495715733546268,
 0.23568072233653758,
 0.2970117625380056,
 0.13270476950953594,
 0.11154448573446761,
 0.12299990786523755,
 0.09628082675593501,
 0.18641933601547864,
 0.1939436749485581,
 0.2821473541967384,
 0.1028531064770738,
 0.10644636221246276,
 0.15165381898590338,
 0.20871594852737937,
 0.16492122477810878,
 0.23079757992690642,
 0.0986456189920457,
 0.1333190012591751,
 0.11765609164337705,
 0.10346733822671293,
 0.1492890267497927,
 0.22741930530389115,
 0.17164706243665734,
 0.29047019440434874,
 0.10586284205030558,
 0.2706919320659685,
 0.3027855409846135,
 0.15583059488344952,
 0.1802463069316053,
 0.1466478302263444,
 0.17579312674672154,
 0.1922238260495685,
 0.10822763428641627,
 0.20340284389300084,
 0.1495040078621664,
 0.12800589662479653,
 0.1229691962777556,
 0.24943951352845428,
 0.12109578944135622,
 0.17005005988759558,
 0.1562912686956789,
 0.16