## libraries and function 


In [1]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization
import pandas as pd
import time

Collecting impyute
  Downloading https://files.pythonhosted.org/packages/37/28/86829f67c9affb847facaab94687761d3555539ec675f7577778c5b2680a/impyute-0.0.8-py2.py3-none-any.whl
Installing collected packages: impyute
Successfully installed impyute-0.0.8




The function `mle` allows us to compute the MLEs from training data with monotone missing data.

We denote
$$n = \begin{pmatrix}
n_1^{(1)} & n_1^{(2)} &...&n_1^{(K)}\\
\vdots & \vdots &\ddots&\vdots\\
n_G^{(1)} & n_G^{(2)} &...&n_G^{(K)}
\end{pmatrix}$$
$$p = (p_1,p_2,...,p_K)$$
G is the number of classes.

### MLE function 

In [2]:
import numpy as np
def mle(Xtrain, n, p, G):
    '''
    Xtrain: list of input. The ith element of the list contains the sample from
    the ith class.
    '''
    if p[0]==1:
        # the array that contains the means of each block for the 1st block
        mus = [np.mean(Xtrain[g][:,0]) for g in np.arange(G)]
        S = [(n[g,0]-1)*np.var(Xtrain[g][:,0]) for g in np.arange(G)]
    else:
        mus = [np.mean(Xtrain[g][:,0:p[0]], axis = 0) for g  in np.arange(G)]
        S = [(n[g,0]-1)*np.cov(Xtrain[g][:,0:p[0]],rowvar =False) 
             for g in np.arange(G)]
    
    mus = np.asarray(mus).T # so that each column is the mean of a class
    S = sum(S)/(sum(n[:,0])) 
    S = S.reshape((p[0],-1))
    for i in np.arange(1,len(p)):
        W = [(n[g,i]-1)*np.cov(Xtrain[g][0:n[g,i],0:p[i]],
                              rowvar=False) for g in np.arange(G)]
        W = sum(W)
        
        P = np.matmul(W[(p[i-1]):p[i], 0:p[i-1]],
                      np.linalg.inv(W[0:p[i-1],0:p[i-1]]))
        Q = (W[p[i-1]:p[i],p[i-1]:p[i]]-
            np.matmul(P, W[0:p[i-1],p[i-1]:p[i]]))/sum(n[:,i])
        xmeans = [np.mean(Xtrain[g][0:n[g,i],0:p[i]], axis = 0) 
                  for g in np.arange(G)]
        
        xmeans = np.asarray(xmeans)
        xmeans = xmeans.T
        mus = np.vstack((mus, xmeans[p[i-1]:p[i],:]
                       - np.matmul(P, xmeans[0:p[i-1]]-mus)))
        S21 = np.matmul(P, S)
        S = np.vstack((np.hstack((S, S21.T)),
                       np.hstack((S21, Q+np.matmul(P, S21.T)))))
    return [mus, S]

### LDA and nan function 


In [3]:
# function that return the misclassfication rate
# for LDA with missing data
def lda_miss(mus, S, Xtest, ytrain, ytest, G):
    f = lambda g: np.log(np.mean(ytrain==g)) - np.matmul(
                  np.matmul(mus[:,g].T, np.linalg.inv(S)), mus[:,g]/2)
    last2 = [f(g) for g in np.arange(G)]
    
    h = lambda g,i: last2[g] + np.matmul(mus[:,g].T, np.matmul(
                    np.linalg.inv(S), Xtest[i,:].T))
    pred_label = [np.argmax([h(g,i) for g in np.arange(G)]) 
                  for i in np.arange(len(Xtest))]
    pred_label = np.asarray(pred_label)
    return np.mean(pred_label.flatten() != ytest)

'''
function that create data list that contain missing values
The input X is a numpy array, y is the label
the function return a list where the ith element of 
the list belongs to the ith class
'''

def make_nan_list(X,y,G, n, p):
    # note that the label should go from 0 to G-1
    data = []
    for g in np.arange(G):
        data.append(X[y==g,:])
        for k in np.arange(len(p)-1):
            data[g][n[g,k+1]:n[g,k], p[k]:] = np.nan
    return data

### compute_err function 

In [4]:
def missing_rate(Xtrain, ytrain, n, p, G):    
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))
    return per_missing

In [5]:
def compute_err_MICE(Xtrain, ytrain, Xtest, ytest, n, p, G):    
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    scaler = MinMaxScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)
    Xtr_nan_list2 = []
    for g in range(G):
      Xtr_nan_list2.append(scaler.transform(Xtr_nan_list[g]))

    
    #impute,classify and get the error rates for imputation approaches    
    start = time.time()
    Xtr_mice = IterativeImputer(max_iter=10).fit(Xtr_nan).transform(Xtr_nan)
    clf_mice = skLDA().fit(Xtr_mice, ytr)
    mice_err = np.mean(clf_mice.predict(Xtest).flatten() != ytest)
    mice_time = time.time()-start

    return mice_err, mice_time

In [6]:
def compute_err_MLE(Xtrain, ytrain, Xtest, ytest, n, p, G):    
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    scaler = MinMaxScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)
    Xtr_nan_list2 = []
    for g in range(G):
      Xtr_nan_list2.append(scaler.transform(Xtr_nan_list[g]))

    # MLEs approach
    start = time.time()
    mus, S = mle(Xtr_nan_list2, n, p, G)
    mle_err = lda_miss(mus, S, Xtest, ytrain, ytest, G)
    mle_time = time.time()-start
  
    return mle_err, mle_time

In [7]:
def compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n, p, G):    
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    scaler = MinMaxScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)
    Xtr_nan_list2 = []
    for g in range(G):
      Xtr_nan_list2.append(scaler.transform(Xtr_nan_list[g]))

    start = time.time()
    Xtr_softimpute = SoftImpute(max_iters = 100).fit_transform(Xtr_nan)
    clf_softimpute = skLDA().fit(Xtr_softimpute, ytr)
    softimpute_err = np.mean(clf_softimpute.predict(Xtest).flatten() != ytest)
    softimpute_time = time.time()-start

    return softimpute_err, softimpute_time

## Import Fashion MNIST

In [8]:
import tensorflow as tf
fashion_mnist = tf.keras.datasets.fashion_mnist
(Xtrain, ytrain), (Xtest, ytest) = fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [9]:
Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape

((60000, 28, 28), (10000, 28, 28), (60000,), (10000,))

In [10]:
Xtrain = Xtrain.astype(float).reshape((60000,784))

# set random seed and shuffle the data
np.random.seed(1)
idx = np.arange(len(ytrain))
np.random.shuffle(idx)
Xtrain, ytrain = Xtrain[idx,:], ytrain[idx]  

Xtrain.shape, ytrain.shape 

((60000, 784), (60000,))

In [11]:
# convert the test set to NumPy arrays and flatten the data
Xtest = Xtest.astype(float).reshape((10000,784))

In [12]:
    # number of sample per class in training data
    ng = np.asarray([sum(ytrain==i) for i in np.arange(10)])
    ng

array([6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000])

## 20%

In [12]:
n = np.hstack((ng.reshape((-1,1)), np.tile([4500,4200,4000, 3800],
                                 10).reshape((10,-1))))
p = np.array([310,400,480, 520,784])   
missing_rate(Xtrain, ytrain, n, p, 10)

0.1997874149659864

In [None]:
compute_err_MLE(Xtrain, ytrain, Xtest, ytest, n, p, 10)

(0.1842, 6260.772351264954)

In [None]:
compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n, p, 10)

[SoftImpute] Max Singular Value of X_init = 2160.256657
[SoftImpute] Iter 1: observed MAE=0.066428 rank=115
[SoftImpute] Iter 2: observed MAE=0.066349 rank=113
[SoftImpute] Iter 3: observed MAE=0.066339 rank=111
[SoftImpute] Iter 4: observed MAE=0.066337 rank=111
[SoftImpute] Iter 5: observed MAE=0.066336 rank=111
[SoftImpute] Iter 6: observed MAE=0.066334 rank=111
[SoftImpute] Iter 7: observed MAE=0.066333 rank=111
[SoftImpute] Iter 8: observed MAE=0.066331 rank=110
[SoftImpute] Iter 9: observed MAE=0.066329 rank=110
[SoftImpute] Iter 10: observed MAE=0.066328 rank=110
[SoftImpute] Iter 11: observed MAE=0.066326 rank=110
[SoftImpute] Iter 12: observed MAE=0.066325 rank=110
[SoftImpute] Iter 13: observed MAE=0.066324 rank=110
[SoftImpute] Iter 14: observed MAE=0.066323 rank=110
[SoftImpute] Iter 15: observed MAE=0.066322 rank=110
[SoftImpute] Iter 16: observed MAE=0.066321 rank=110
[SoftImpute] Iter 17: observed MAE=0.066320 rank=110
[SoftImpute] Iter 18: observed MAE=0.066320 rank=110

(0.1891, 1351.6873190402985)

## 30%

In [None]:
n = np.hstack((ng.reshape((-1,1)), np.tile([4400,4000,3400, 3000],
                                 10).reshape((10,-1))))
p = np.array([250,310,400, 450,784])   
missing_rate(Xtrain, ytrain, n, p, 10)

0.29931972789115646

In [None]:
compute_err_MLE(Xtrain, ytrain, Xtest, ytest, n, p, 10)

(0.1842, 6270.7891061306)

In [None]:
compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n, p, 10)

[SoftImpute] Max Singular Value of X_init = 1953.152221
[SoftImpute] Iter 1: observed MAE=0.064632 rank=124
[SoftImpute] Iter 2: observed MAE=0.064510 rank=118
[SoftImpute] Iter 3: observed MAE=0.064479 rank=116
[SoftImpute] Iter 4: observed MAE=0.064468 rank=115
[SoftImpute] Iter 5: observed MAE=0.064462 rank=115
[SoftImpute] Iter 6: observed MAE=0.064458 rank=115
[SoftImpute] Iter 7: observed MAE=0.064454 rank=115
[SoftImpute] Iter 8: observed MAE=0.064450 rank=115
[SoftImpute] Iter 9: observed MAE=0.064447 rank=115
[SoftImpute] Iter 10: observed MAE=0.064443 rank=115
[SoftImpute] Iter 11: observed MAE=0.064439 rank=115
[SoftImpute] Iter 12: observed MAE=0.064436 rank=115
[SoftImpute] Iter 13: observed MAE=0.064433 rank=115
[SoftImpute] Iter 14: observed MAE=0.064429 rank=115
[SoftImpute] Iter 15: observed MAE=0.064426 rank=114
[SoftImpute] Iter 16: observed MAE=0.064424 rank=114
[SoftImpute] Iter 17: observed MAE=0.064421 rank=114
[SoftImpute] Iter 18: observed MAE=0.064418 rank=114

(0.1947, 1621.440566778183)

## 40% 



In [13]:
n = np.hstack((ng.reshape((-1,1)), np.tile([3600,3400,3000, 2500],
                                 10).reshape((10,-1))))
p = np.array([200,220,300, 400,784])   
missing_rate(Xtrain, ytrain, n, p, 10)

0.4039115646258503

In [None]:
compute_err_MLE(Xtrain, ytrain, Xtest, ytest, n, p, 10)

(0.1844, 6533.909581661224)

In [None]:
compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n, p, 10)

[SoftImpute] Max Singular Value of X_init = 1752.070804
[SoftImpute] Iter 1: observed MAE=0.062324 rank=125
[SoftImpute] Iter 2: observed MAE=0.062224 rank=121
[SoftImpute] Iter 3: observed MAE=0.062203 rank=119
[SoftImpute] Iter 4: observed MAE=0.062197 rank=118
[SoftImpute] Iter 5: observed MAE=0.062195 rank=118
[SoftImpute] Iter 6: observed MAE=0.062193 rank=118
[SoftImpute] Iter 7: observed MAE=0.062191 rank=118
[SoftImpute] Iter 8: observed MAE=0.062188 rank=118
[SoftImpute] Iter 9: observed MAE=0.062186 rank=118
[SoftImpute] Iter 10: observed MAE=0.062184 rank=118
[SoftImpute] Iter 11: observed MAE=0.062182 rank=118
[SoftImpute] Iter 12: observed MAE=0.062180 rank=118
[SoftImpute] Iter 13: observed MAE=0.062178 rank=118
[SoftImpute] Iter 14: observed MAE=0.062176 rank=118
[SoftImpute] Iter 15: observed MAE=0.062174 rank=118
[SoftImpute] Iter 16: observed MAE=0.062173 rank=118
[SoftImpute] Iter 17: observed MAE=0.062171 rank=118
[SoftImpute] Iter 18: observed MAE=0.062170 rank=118

(0.2024, 1366.3064613342285)

## 50%

In [13]:
n = np.hstack((ng.reshape((-1,1)), np.tile([3000,2900,2700, 2500],
                                 10).reshape((10,-1))))
p = np.array([100,150,220, 250,784])   
missing_rate(Xtrain, ytrain, n, p, 10)

0.4963860544217687

In [14]:
compute_err_MLE(Xtrain, ytrain, Xtest, ytest, n, p, 10)

(0.185, 3550.665013074875)

In [15]:
compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n, p, 10)

[SoftImpute] Max Singular Value of X_init = 1679.875710
[SoftImpute] Iter 1: observed MAE=0.063546 rank=115
[SoftImpute] Iter 2: observed MAE=0.063487 rank=112
[SoftImpute] Iter 3: observed MAE=0.063484 rank=112
[SoftImpute] Iter 4: observed MAE=0.063489 rank=112
[SoftImpute] Iter 5: observed MAE=0.063495 rank=112
[SoftImpute] Iter 6: observed MAE=0.063498 rank=111
[SoftImpute] Iter 7: observed MAE=0.063501 rank=111
[SoftImpute] Iter 8: observed MAE=0.063502 rank=111
[SoftImpute] Iter 9: observed MAE=0.063504 rank=111
[SoftImpute] Iter 10: observed MAE=0.063505 rank=111
[SoftImpute] Iter 11: observed MAE=0.063506 rank=111
[SoftImpute] Iter 12: observed MAE=0.063506 rank=111
[SoftImpute] Iter 13: observed MAE=0.063507 rank=111
[SoftImpute] Iter 14: observed MAE=0.063508 rank=111
[SoftImpute] Iter 15: observed MAE=0.063508 rank=111
[SoftImpute] Iter 16: observed MAE=0.063508 rank=111
[SoftImpute] Iter 17: observed MAE=0.063509 rank=111
[SoftImpute] Iter 18: observed MAE=0.063509 rank=111

(0.1977, 770.5144274234772)