In [1]:
import numpy as np
import torch
import pandas as pd

import torch.nn as nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
from missforest import MissForest
from sklearn.ensemble import RandomForestRegressor
import warnings

In [8]:
n = 2*10**4
d = 10

lr = 0.01
epochs = 60
l1_lambda = 0

X = np.random.uniform(0, 1, (n, d))
epsilon = np.random.normal(0, 0.25, n)
Y = np.zeros(n)
probs = np.zeros(n)

def reg_func(x):
    out = np.exp(10*x[1]**2) / (np.exp(10*x[1]**2) + np.exp(10*x[2]*x[3] + 2*x[3]))   
    return out
for i in range(n):
    probs[i] = reg_func(X[i,:])
    Y[i] = np.random.binomial(1, probs[i], 1)[0]

Omega = np.random.binomial(1, 0.5, (n, d))
for i in range(n):
    if probs[i] <= 0.6:
        Omega[i,1] = 0
    else:
        Omega[i,1] = 1
sample_mean = np.sum(X*Omega, axis = 0) / np.sum(Omega, axis = 0)
Z_ZI = X * Omega
Z_MI = X * Omega + sample_mean * (1 - Omega)
Z_RI = X * Omega + (1 - Omega) * np.random.uniform(0, 1, (n, d))


Z_ZI_train = Z_ZI[0:int(n/2), :]
Z_ZI_test = Z_ZI[int(n/2):n, :]
Z_MI_train = Z_MI[0:int(n/2), :]
Z_MI_test = Z_MI[int(n/2):n, :]
Omega_train = Omega[0:int(n/2), :]
Omega_test = Omega[int(n/2):n, :]

Y_train = Y[0:int(n/2)]
Y_test = Y[int(n/2):n]

Z_ZI_train = torch.tensor(Z_ZI_train, dtype=torch.float32)
Z_ZI_test = torch.tensor(Z_ZI_test, dtype=torch.float32)
Z_MI_train = torch.tensor(Z_MI_train, dtype=torch.float32)
Z_MI_test = torch.tensor(Z_MI_test, dtype=torch.float32)
Omega_train = torch.tensor(Omega_train, dtype=torch.float32)
Omega_test = torch.tensor(Omega_test, dtype=torch.float32)
Z_Omega_train = torch.cat((Z_ZI_train, Omega_train), dim = 1)
Z_Omega_test = torch.cat((Z_ZI_test, Omega_test), dim = 1)

Y_train = torch.tensor(Y_train, dtype=torch.long)

In [9]:
np.mean([min(y, 1-y) for y in probs])

np.float64(0.13590952064185927)

In [29]:
### pattern augmented NN
class PANN(nn.Module):
    def __init__(self):
        super().__init__()
        self.arch1 = nn.Sequential(
            nn.Linear(2*d, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 2),
        )

    def forward(self, x):
        out = self.arch1(x)
        return out
    
model_PA = PANN()
PA_train_data = TensorDataset(Z_Omega_train, Y_train)
PA_train_loader = DataLoader(dataset = PA_train_data, batch_size=20, shuffle=True)

optimizer = optim.SGD(model_PA.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):

    for x_batch, y_batch in PA_train_loader:
        optimizer.zero_grad()
        pred = model_PA(x_batch)
        loss = loss_fn(pred, y_batch)

        # L1 penalty
        l1_penalty = 0
        for param in model_PA.parameters():
            l1_penalty += torch.sum(torch.abs(param))
        # Add L1 penalty to the loss
        loss += l1_lambda * l1_penalty

        loss.backward()
        optimizer.step()

    if epoch % 10 == 9:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

_, labels = torch.max(model_PA(Z_Omega_test), dim=1)
pred = labels.detach().numpy()
ER_PA = np.mean(np.abs(pred - Y_test))
print(ER_PA)

Epoch 9, Loss: 0.7774490714073181
Epoch 19, Loss: 0.7479819059371948
Epoch 29, Loss: 0.31456458568573
Epoch 39, Loss: 0.34804767370224
Epoch 49, Loss: 0.20058691501617432
Epoch 59, Loss: 0.4071352779865265
0.136


In [18]:
### mean imputation
class MINN(nn.Module):
    def __init__(self):
        super().__init__()
        self.arch1 = nn.Sequential(
            nn.Linear(d, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 2),
        )

    def forward(self, x):
        out = self.arch1(x)
        return out
    
model_MI = MINN()
train_data = TensorDataset(Z_MI_train, Y_train)
train_loader = DataLoader(dataset = train_data, batch_size=20, shuffle=True)

optimizer = optim.SGD(model_MI.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):

    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        pred = model_MI(x_batch)
        loss = loss_fn(pred, y_batch)

        # L1 penalty
        l1_penalty = 0
        for param in model_MI.parameters():
            l1_penalty += torch.sum(torch.abs(param))
        # Add L1 penalty to the loss
        loss += l1_lambda * l1_penalty

        loss.backward()
        optimizer.step()

    if epoch % 10 == 9:
        print(f'Epoch {epoch}, Loss: {loss.item()}')


_, labels = torch.max(model_MI(Z_MI_test), dim=1)
pred = labels.detach().numpy()
ER_MI = np.mean(np.abs(pred - Y_test))
print(ER_MI)

Epoch 9, Loss: 0.683150053024292
Epoch 19, Loss: 0.6804202198982239
Epoch 29, Loss: 0.6860231161117554
Epoch 39, Loss: 0.628186047077179
Epoch 49, Loss: 0.7132441401481628
Epoch 59, Loss: 0.4683532118797302
0.3437


In [12]:
### MissForest imputation
Z_nan = np.copy(Z_ZI)
for i in range(n):
    for j in range(d):
        if Omega[i, j] == 0:
            Z_nan[i,j] = np.nan
Z_nan_train = pd.DataFrame(Z_nan[0:int(n/2), :])
Z_nan_test = pd.DataFrame(Z_nan[int(n/2):n, :])
rgr = RandomForestRegressor(n_jobs=-1)
warnings.filterwarnings('ignore')
mf = MissForest(rgr)
mf.fit(x=Z_nan_train)
Z_MF_train = mf.transform(Z_nan_train)
Z_MF_test = mf.transform(Z_nan_test)
Z_MF_train = Z_MF_train.to_numpy()
Z_MF_test = Z_MF_test.to_numpy()

Z_MF_train = torch.tensor(Z_MF_train, dtype=torch.float32)
Z_MF_test = torch.tensor(Z_MF_test, dtype=torch.float32)

100%|██████████| 5/5 [00:05<00:00,  1.17s/it]
100%|██████████| 5/5 [00:04<00:00,  1.21it/s]


In [13]:
### MissForest imputation
class MFNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.arch1 = nn.Sequential(
            nn.Linear(d, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 2),
        )

    def forward(self, x):
        out = self.arch1(x)
        return out
    
model_MF = MFNN()
train_data = TensorDataset(Z_MF_train, Y_train)
train_loader = DataLoader(dataset = train_data, batch_size=20, shuffle=True)

optimizer = optim.SGD(model_MF.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):

    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        pred = model_MF(x_batch)
        loss = loss_fn(pred, y_batch)

        # L1 penalty
        l1_penalty = 0
        for param in model_MF.parameters():
            l1_penalty += torch.sum(torch.abs(param))
        # Add L1 penalty to the loss
        loss += l1_lambda * l1_penalty

        loss.backward()
        optimizer.step()

    if epoch % 10 == 9:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

_, labels = torch.max(model_MF(Z_MF_test), dim=1)
pred = labels.detach().numpy()
ER_MF = np.mean(np.abs(pred - Y_test))
print(ER_MF)

Epoch 9, Loss: 0.6236905455589294
Epoch 19, Loss: 0.5519787073135376
Epoch 29, Loss: 0.5759331583976746
Epoch 39, Loss: 0.5725918412208557
Epoch 49, Loss: 0.49116554856300354
Epoch 59, Loss: 0.6833811402320862
0.3028


In [34]:
### MICE imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

Z_nan = np.copy(Z_ZI)
for i in range(n):
    for j in range(d):
        if Omega[i, j] == 0:
            Z_nan[i,j] = np.nan
Z_nan_train = pd.DataFrame(Z_nan[0:int(n/2), :])
Z_nan_test = pd.DataFrame(Z_nan[int(n/2):n, :])

imputer = IterativeImputer(max_iter=10, random_state=0)
fitted_imputer = imputer.fit(Z_nan_train)
Z_MICE_train = fitted_imputer.transform(Z_nan_train)
Z_MICE_test = fitted_imputer.transform(Z_nan_test)
Z_MICE_train = Z_MICE_train
Z_MICE_test = Z_MICE_test

Z_MICE_train = torch.tensor(Z_MICE_train, dtype=torch.float32)
Z_MICE_test = torch.tensor(Z_MICE_test, dtype=torch.float32)



In [35]:
### MICE imputation
class MICENN(nn.Module):
    def __init__(self):
        super().__init__()
        self.arch1 = nn.Sequential(
            nn.Linear(d, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 2),
        )

    def forward(self, x):
        out = self.arch1(x)
        return out
    
model_MICE = MICENN()
train_data = TensorDataset(Z_MICE_train, Y_train)
train_loader = DataLoader(dataset = train_data, batch_size=20, shuffle=True)

optimizer = optim.SGD(model_MICE.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):

    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        pred = model_MICE(x_batch)
        loss = loss_fn(pred, y_batch)

        # L1 penalty
        l1_penalty = 0
        for param in model_MICE.parameters():
            l1_penalty += torch.sum(torch.abs(param))
        # Add L1 penalty to the loss
        loss += l1_lambda * l1_penalty

        loss.backward()
        optimizer.step()

    if epoch % 10 == 9:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

_, labels = torch.max(model_MICE(Z_MICE_test), dim=1)
pred = labels.detach().numpy()
ER_MICE = np.mean(np.abs(pred - Y_test))
print(ER_MICE)

Epoch 9, Loss: 0.695248007774353
Epoch 19, Loss: 0.6834633946418762
Epoch 29, Loss: 0.7101015448570251
Epoch 39, Loss: 0.6057878732681274
Epoch 49, Loss: 0.7067535519599915
Epoch 59, Loss: 0.624566912651062
0.3473
