## libraries and function 

In [1]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization
import pandas as pd
import time

Collecting impyute
  Downloading https://files.pythonhosted.org/packages/37/28/86829f67c9affb847facaab94687761d3555539ec675f7577778c5b2680a/impyute-0.0.8-py2.py3-none-any.whl
Installing collected packages: impyute
Successfully installed impyute-0.0.8




The function `mle` allows us to compute the MLEs from training data with monotone missing data.

We denote
$$n = \begin{pmatrix}
n_1^{(1)} & n_1^{(2)} &...&n_1^{(K)}\\
\vdots & \vdots &\ddots&\vdots\\
n_G^{(1)} & n_G^{(2)} &...&n_G^{(K)}
\end{pmatrix}$$
$$p = (p_1,p_2,...,p_K)$$
G is the number of classes.

### MLE function 

In [2]:
import numpy as np
def mle(Xtrain, n, p, G):
    '''
    Xtrain: list of input. The ith element of the list contains the sample from
    the ith class.
    '''
    if p[0]==1:
        # the array that contains the means of each block for the 1st block
        mus = [np.mean(Xtrain[g][:,0]) for g in np.arange(G)]
        S = [(n[g,0]-1)*np.var(Xtrain[g][:,0]) for g in np.arange(G)]
    else:
        mus = [np.mean(Xtrain[g][:,0:p[0]], axis = 0) for g  in np.arange(G)]
        S = [(n[g,0]-1)*np.cov(Xtrain[g][:,0:p[0]],rowvar =False) 
             for g in np.arange(G)]
    
    mus = np.asarray(mus).T # so that each column is the mean of a class
    S = sum(S)/(sum(n[:,0])) 
    S = S.reshape((p[0],-1))
    for i in np.arange(1,len(p)):
        W = [(n[g,i]-1)*np.cov(Xtrain[g][0:n[g,i],0:p[i]],
                              rowvar=False) for g in np.arange(G)]
        W = sum(W)
        
        P = np.matmul(W[(p[i-1]):p[i], 0:p[i-1]],
                      np.linalg.inv(W[0:p[i-1],0:p[i-1]]))
        Q = (W[p[i-1]:p[i],p[i-1]:p[i]]-
            np.matmul(P, W[0:p[i-1],p[i-1]:p[i]]))/sum(n[:,i])
        xmeans = [np.mean(Xtrain[g][0:n[g,i],0:p[i]], axis = 0) 
                  for g in np.arange(G)]
        
        xmeans = np.asarray(xmeans)
        xmeans = xmeans.T
        mus = np.vstack((mus, xmeans[p[i-1]:p[i],:]
                       - np.matmul(P, xmeans[0:p[i-1]]-mus)))
        S21 = np.matmul(P, S)
        S = np.vstack((np.hstack((S, S21.T)),
                       np.hstack((S21, Q+np.matmul(P, S21.T)))))
    return [mus, S]

### nan function 


In [3]:
'''
function that create data list that contain missing values
The input X is a numpy array, y is the label
the function return a list where the ith element of 
the list belongs to the ith class
'''

def make_nan_list(X,y,G, n, p):
    # note that the label should go from 0 to G-1
    data = []
    for g in np.arange(G):
        data.append(X[y==g,:])
        for k in np.arange(len(p)-1):
            data[g][n[g,k+1]:n[g,k], p[k]:] = np.nan
    return data


In [4]:
def missing_rate(Xtrain, ytrain, n, p, G):
    # function that compute the missing rate of a given pattern    
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))
    return per_missing

### compute_err function 

In [None]:
def err(mus, S, mus_est, S_est):
  err_rate = (np.linalg.norm(mus_est-mus))/mus.size 
  err_rate += (np.linalg.norm(S_est-S))/S.size 
  return err_rate

In [None]:
def compute_err_EM(Xtrain, ytrain, n, p, G):      
    # make NAs
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    for g in range(G):
      Xtr_nan_list[g] = scaler.transform(Xtr_nan_list[g])

    mus = [np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus) # each row is a mean of a class
    S = [(sum(ytrain==g)-1)*np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)]
    S = np.asarray(S)/len(ytrain)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    start = time.time()
    Xtr_em = impy.em(Xtr_nan, loops=10)
    mus_em = np.array([np.mean(Xtr_em[ytrain==g,:], axis=0) for g in np.arange(G)])
    S_em = np.array([(sum(ytrain==g)-1)*np.cov(Xtr_em[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_em = S_em/len(ytrain)
    em_err = err(mus, S, mus_em, S_em)
    em_time = time.time()-start   

    return em_err, em_time, per_missing

In [None]:
def compute_err_MICE(Xtrain, ytrain, n, p, G):      
    # make NAs
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    for g in range(G):
      Xtr_nan_list[g] = scaler.transform(Xtr_nan_list[g])

    mus = [np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus) # each row is a mean of a class
    S = [(sum(ytrain==g)-1)*np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)]
    S = np.asarray(S)/len(ytrain)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))
    # MLEs approach
    start = time.time()
    Xtr_mice = IterativeImputer(max_iter=10).fit(Xtr_nan).transform(Xtr_nan)
    mus_mice = np.asarray([np.mean(Xtr_mice[ytrain==g,:], axis=0
                                   ) for g in np.arange(G)])
    S_mice = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_mice[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_mice = S_mice/len(ytrain)
    mice_err = err(mus, S, mus_mice, S_mice)
    mice_time = time.time()-start  

    return mice_err, mice_time, per_missing

In [None]:
def compute_err_MLE(Xtrain, ytrain, n, p, G):      
    # make NAs
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    for g in range(G):
      Xtr_nan_list[g] = scaler.transform(Xtr_nan_list[g])

    mus = [np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus) # each row is a mean of a class
    S = [(sum(ytrain==g)-1)*np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)]
    S = np.asarray(S)/len(ytrain)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))
    # MLEs approach
    start = time.time()
    mus_mle, S_mle = mle(Xtr_nan_list, n, p, G)
    mle_err = err(mus, S, mus_mle.T, S_mle)
    mle_time = time.time()-start    

    return mle_err, mle_time, per_missing

In [None]:
def compute_err_SOFT(Xtrain, ytrain, n, p, G):      
    # make NAs
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    for g in range(G):
      Xtr_nan_list[g] = scaler.transform(Xtr_nan_list[g])

    mus = [np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus) # each row is a mean of a class
    S = [(sum(ytrain==g)-1)*np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)]
    S = np.asarray(S)/len(ytrain)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    start = time.time()
    Xtr_softimpute = SoftImpute(max_iters = 100).fit_transform(Xtr_nan)
    mus_softimpute = np.asarray([np.mean(Xtr_softimpute[ytrain==g,:], axis=0
                                         ) for g in np.arange(G)])
    S_softimpute = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_softimpute[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_softimpute = S_softimpute/len(ytrain)
    softimpute_err =  err(mus, S, mus_softimpute, S_softimpute)
    softimpute_time = time.time()-start    

    return softimpute_err, softimpute_time, per_missing

# Import Fashion MNIST

In [5]:
import tensorflow as tf
fashion_mnist = tf.keras.datasets.fashion_mnist
(Xtrain, ytrain), (Xtest, ytest) = fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [6]:
Xtrain = Xtrain.astype(float).reshape((60000,784))
Xtest = Xtest.astype(float).reshape((10000,784))

X = np.vstack((Xtrain, Xtest))
y = np.hstack((ytrain, ytest))

# set random seed and shuffle the data
np.random.seed(1)
idx = np.arange(len(y))
np.random.shuffle(idx)
X, y = X[idx,:], y[idx]  

In [7]:
# number of sample per class in training data
ng = np.asarray([sum(y==i) for i in np.arange(10)])
ng

array([7000, 7000, 7000, 7000, 7000, 7000, 7000, 7000, 7000, 7000])

### 20%

In [None]:
n = np.hstack((ng.reshape((-1,1)), np.tile([5000,4500,4200, 4000],
                                 10).reshape((10,-1))))
p = np.array([380,450,500, 520,784])   
missing_rate(X, y, n, p, 10)

0.20280612244897958

In [None]:
compute_err_MLE(X, y, n, p, 10)

(0.00010190720380898267, 2.4604036808013916, 0.20280612244897958)

In [None]:
compute_err_SOFT(X, y, n, p, 10)

[SoftImpute] Max Singular Value of X_init = 3033.101261
[SoftImpute] Iter 1: observed MAE=0.122119 rank=503
[SoftImpute] Iter 2: observed MAE=0.122205 rank=497
[SoftImpute] Iter 3: observed MAE=0.122250 rank=495
[SoftImpute] Iter 4: observed MAE=0.122268 rank=494
[SoftImpute] Iter 5: observed MAE=0.122273 rank=494
[SoftImpute] Iter 6: observed MAE=0.122270 rank=493
[SoftImpute] Iter 7: observed MAE=0.122264 rank=493
[SoftImpute] Iter 8: observed MAE=0.122256 rank=493
[SoftImpute] Iter 9: observed MAE=0.122247 rank=493
[SoftImpute] Iter 10: observed MAE=0.122237 rank=493
[SoftImpute] Iter 11: observed MAE=0.122228 rank=493
[SoftImpute] Iter 12: observed MAE=0.122218 rank=493
[SoftImpute] Iter 13: observed MAE=0.122209 rank=493
[SoftImpute] Iter 14: observed MAE=0.122200 rank=493
[SoftImpute] Iter 15: observed MAE=0.122191 rank=493
[SoftImpute] Iter 16: observed MAE=0.122182 rank=493
[SoftImpute] Iter 17: observed MAE=0.122174 rank=493
[SoftImpute] Iter 18: observed MAE=0.122166 rank=493

(0.0064376999163578, 892.3388366699219, 0.20280612244897958)

### 30%

In [None]:
    n = np.hstack((ng.reshape((-1,1)), np.tile([4000,3500,3000, 2800],
                                 10).reshape((10,-1))))
    p = np.array([350,450,500, 520,784])   
    missing_rate(X, y, n, p, 10)

0.30317055393586007

In [None]:
compute_err_MLE(X, y, n, p, 10)

(0.00012185129481936274, 1.9895505905151367, 0.30317055393586007)

In [None]:
compute_err_SOFT(X, y, n, p, 10)

[SoftImpute] Max Singular Value of X_init = 2853.791506
[SoftImpute] Iter 1: observed MAE=0.120633 rank=492
[SoftImpute] Iter 2: observed MAE=0.120694 rank=486
[SoftImpute] Iter 3: observed MAE=0.120740 rank=483
[SoftImpute] Iter 4: observed MAE=0.120762 rank=482
[SoftImpute] Iter 5: observed MAE=0.120770 rank=481
[SoftImpute] Iter 6: observed MAE=0.120770 rank=480
[SoftImpute] Iter 7: observed MAE=0.120766 rank=479
[SoftImpute] Iter 8: observed MAE=0.120759 rank=479
[SoftImpute] Iter 9: observed MAE=0.120750 rank=479
[SoftImpute] Iter 10: observed MAE=0.120741 rank=479
[SoftImpute] Iter 11: observed MAE=0.120732 rank=479
[SoftImpute] Iter 12: observed MAE=0.120723 rank=479
[SoftImpute] Iter 13: observed MAE=0.120714 rank=478
[SoftImpute] Iter 14: observed MAE=0.120705 rank=478
[SoftImpute] Iter 15: observed MAE=0.120697 rank=478
[SoftImpute] Iter 16: observed MAE=0.120688 rank=478
[SoftImpute] Iter 17: observed MAE=0.120680 rank=478
[SoftImpute] Iter 18: observed MAE=0.120672 rank=478

(0.006434659425644326, 920.1538162231445, 0.30317055393586007)

## 40%

In [None]:
n = np.hstack((ng.reshape((-1,1)), np.tile([4500,3490,3350, 3100],
                                 10).reshape((10,-1))))
p = np.array([180,250,400, 450,784])   
missing_rate(X, y, n, p, 10)

0.398432944606414

In [None]:
compute_err_MLE(X, y, n, p, 10)

(0.00013589292309761314, 1.5331597328186035, 0.398432944606414)

In [None]:
compute_err_SOFT(X, y, n, p, 10)

[SoftImpute] Max Singular Value of X_init = 2565.429677
[SoftImpute] Iter 1: observed MAE=0.117258 rank=501
[SoftImpute] Iter 2: observed MAE=0.117371 rank=495
[SoftImpute] Iter 3: observed MAE=0.117440 rank=494
[SoftImpute] Iter 4: observed MAE=0.117476 rank=493
[SoftImpute] Iter 5: observed MAE=0.117494 rank=492
[SoftImpute] Iter 6: observed MAE=0.117501 rank=492
[SoftImpute] Iter 7: observed MAE=0.117502 rank=492
[SoftImpute] Iter 8: observed MAE=0.117499 rank=492
[SoftImpute] Iter 9: observed MAE=0.117494 rank=492
[SoftImpute] Iter 10: observed MAE=0.117487 rank=492
[SoftImpute] Iter 11: observed MAE=0.117480 rank=492
[SoftImpute] Iter 12: observed MAE=0.117472 rank=492
[SoftImpute] Iter 13: observed MAE=0.117464 rank=492
[SoftImpute] Iter 14: observed MAE=0.117456 rank=492
[SoftImpute] Iter 15: observed MAE=0.117448 rank=492
[SoftImpute] Iter 16: observed MAE=0.117440 rank=492
[SoftImpute] Iter 17: observed MAE=0.117432 rank=492
[SoftImpute] Iter 18: observed MAE=0.117424 rank=492

(0.006428482656460864, 945.2519092559814, 0.398432944606414)

## 50%

In [None]:
n = np.hstack((ng.reshape((-1,1)), np.tile([3500,3290,3100, 3000],
                                 10).reshape((10,-1))))
p = np.array([90,110,200, 250,784])   
missing_rate(X, y, n, p, 10)

0.4983418367346939

In [None]:
compute_err_MLE(X, y, n, p, 10)

(0.00015705266416606177, 0.971776008605957, 0.4983418367346939)

In [None]:
compute_err_SOFT(X, y, n, p, 10)

[SoftImpute] Max Singular Value of X_init = 2315.886393
[SoftImpute] Iter 1: observed MAE=0.116056 rank=500
[SoftImpute] Iter 2: observed MAE=0.116150 rank=497
[SoftImpute] Iter 3: observed MAE=0.116210 rank=497
[SoftImpute] Iter 4: observed MAE=0.116247 rank=496
[SoftImpute] Iter 5: observed MAE=0.116270 rank=496
[SoftImpute] Iter 6: observed MAE=0.116284 rank=496
[SoftImpute] Iter 7: observed MAE=0.116293 rank=496
[SoftImpute] Iter 8: observed MAE=0.116299 rank=496
[SoftImpute] Iter 9: observed MAE=0.116302 rank=496
[SoftImpute] Iter 10: observed MAE=0.116304 rank=496
[SoftImpute] Iter 11: observed MAE=0.116305 rank=496
[SoftImpute] Iter 12: observed MAE=0.116306 rank=496
[SoftImpute] Iter 13: observed MAE=0.116305 rank=496
[SoftImpute] Iter 14: observed MAE=0.116304 rank=496
[SoftImpute] Iter 15: observed MAE=0.116303 rank=496
[SoftImpute] Iter 16: observed MAE=0.116302 rank=496
[SoftImpute] Iter 17: observed MAE=0.116301 rank=496
[SoftImpute] Iter 18: observed MAE=0.116299 rank=496

(0.006425950734152229, 934.3220479488373, 0.4983418367346939)