## Libraries and functions




In [7]:
# !pip install impyute
!pip install fancyimpute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
# import impyute as impy
from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization
import pandas as pd
import time
# !pip install missingpy
# from missingpy import MissForest
# note that MissForest uses sklearn.__version__ 0.22.2.post1



## MLE estimation function  




In [8]:
def diag_term(i,X,y):
  """Pooled within-class variance of feature ``i`` from its observed entries.

  Rows where ``X[:, i]`` is NaN are dropped. For every class label present in
  the remaining rows, the biased variance (``np.var``) is weighted by the
  class size, and the weighted sum is divided by the number of observed rows.

  Parameters
  ----------
  i : int
      Column index of the feature.
  X : numpy.ndarray
      2-D data array (rows are samples); missing values are NaN.
  y : array-like
      Class label of each row of ``X``.

  Returns
  -------
  float
      The pooled variance, or NaN when the column has no observed entry.
  """
  y = np.asarray(y)
  column = X[:, i]
  observed = ~np.isnan(column)
  values, labels = column[observed], y[observed]

  if labels.size == 0:
    # Nothing observed for this feature: the variance is undefined.
    return np.nan

  # Classes are picked with label masks instead of consecutive slices, so the
  # rows do not need to be sorted by class, and the set of classes comes from
  # the labels themselves rather than from a global class count.
  weighted = sum(np.count_nonzero(labels == c) * np.var(values[labels == c])
                 for c in np.unique(labels))
  return weighted / labels.size

In [9]:
def mle(X,y,G):
    '''
    Estimate the class means and the common covariance matrix from data with
    missing entries (NaN), handling one pair of features at a time.

    X: input, should be a 2-D numpy array (rows are samples); missing values are NaN
    y: label of each row; classes are selected with ``y == g`` for g = 0, ..., G-1
    G: number of classes
    output: the list [mus, S]
    - mus: p x G array, each COLUMN is the NaN-ignoring mean of a class
    - S: p x p common covariance matrix of class 0, 1, ..., G-1
    '''
    # Roots with |imaginary part| < epsilon are treated as real, and real
    # roots with |r| < epsilon are snapped to 0.
    epsilon = 1e-5
    y = np.asarray(y)
    p = X.shape[1]

    # Transposed so that each column is the mean of a class.
    mus = np.array([np.nanmean(X[y==g,:],axis=0) for g in range(G)]).T

    S = np.diag([diag_term(i,X,y) for i in range(p)])

    for i in range(p):
        for j in range(i):
            mat = X[:,[i,j]]

            # Keep only the rows where both features are observed.
            idx = ~np.isnan(mat).any(axis=1)
            mat, y_arr = mat[idx], y[idx]

            A = len(y_arr)
            if A == 0:
                # No jointly observed row: the cubic below degenerates to 0 = 0,
                # so the covariance is left at its initial value of 0.
                continue

            # Select each class with a label mask; this does not require the
            # rows to be sorted by class and stays aligned with ``mus`` even
            # when a class has no complete row for this pair.
            groups = [mat[y_arr == g] for g in range(G)]
            centered = [groups[g] - mus[[i,j],g] for g in range(G)]

            s11 = sum(np.dot(c[:,0], c[:,0]) for c in centered)
            s22 = sum(np.dot(c[:,1], c[:,1]) for c in centered)
            s12 = sum(np.dot(c[:,0], c[:,1]) for c in centered)

            # The candidate covariances are the real roots of this cubic.
            B = S[i,i]*S[j,j]*A - s22 * S[i,i] - s11 * S[j,j]
            coefficient = [-A, s12, B, s12*S[i,i]*S[j,j]]
            r = np.roots(coefficient)

            r = r[np.abs(np.imag(r)) < epsilon]
            r = np.real(r)
            r[np.abs(r) < epsilon] = 0

            if len(r) > 1:
                condi_var = S[j,j] - r**2/S[i,i]
                # A negative condi_var makes the log (and so eta) NaN; those
                # candidates are ignored when taking the maximum.
                with np.errstate(invalid='ignore', divide='ignore'):
                    eta = -A*np.log(condi_var)-(S[j,j]-2*r/S[i,i]*s12 +
                                                r**2/S[i,i]**2*s11)/condi_var
                valid = ~np.isnan(eta)
                if valid.any():
                    r = r[eta == eta[valid].max()]

            if len(r) > 1:
                # Still tied: take the root closest to the class-size-weighted
                # sum of sample covariances. Classes with fewer than two rows
                # are skipped because np.cov is undefined for them.
                # NOTE(review): this sum is not divided by A, exactly as in the
                # original code - confirm the intended scale.
                w01 = sum(len(grp)*np.cov(grp, rowvar=False)[0,1]
                          for grp in groups if len(grp) > 1)
                r = r[np.abs(r-w01).argmin()]

            # r is a 1-element array or a scalar here; store it as a plain
            # float, since assigning an array to one cell is deprecated.
            S[i,j] = S[j,i] = float(np.ravel(r)[0])
    return [mus, S]

### compute_err function 
  

In [10]:
def err(mus, S, mus_est, S_est):
  """Average of the size-normalised estimation errors of the means and the covariance.

  The mean error is ``||mus_est - mus|| / mus.size`` and the covariance error
  is the Euclidean norm of the flattened difference ``S_est - S`` divided by
  ``S.size``; the function returns the mean of the two.
  """
  mean_error = np.linalg.norm(mus_est - mus) / mus.size
  cov_error = np.linalg.norm(S_est.ravel() - S.ravel()) / S.size
  return np.mean([mean_error, cov_error])

def generate_nan(Xtrain, missing_rate):
  """Return a copy of ``Xtrain`` with a fraction ``missing_rate`` of entries set to NaN.

  Exactly ``round(missing_rate * Xtrain.size)`` distinct positions are chosen
  uniformly at random with the global NumPy random state, so
  ``np.random.seed`` controls the result. The input array is not modified.

  Parameters
  ----------
  Xtrain : numpy.ndarray
      Data array of any shape. Non-floating dtypes are converted to float so
      that NaN can be stored.
  missing_rate : float
      Fraction of entries to blank out, expected in [0, 1].

  Returns
  -------
  numpy.ndarray
      Array with the shape of ``Xtrain`` and NaN at the chosen positions.
  """
  flat = Xtrain.flatten()  # flatten() copies, so the caller's array is untouched
  if not np.issubdtype(flat.dtype, np.floating):
    flat = flat.astype(float)

  n_missing = int(round(missing_rate * flat.size))
  # Sample WITHOUT replacement: drawing positions with replacement produces
  # duplicates, which makes the realised missing rate lower than requested.
  na_id = np.random.choice(flat.size, n_missing, replace=False)
  flat[na_id] = np.nan
  return flat.reshape(Xtrain.shape)

### Algorithm to compute error for DPER only

In [11]:
def compute_err_mle(Xtrain, ytrain, G, missing_rate, runs = 10):
  """Mean and standard deviation of the DPER (``mle``) estimation error over several runs.

  Each run blanks out a fresh random subset of ``Xtrain`` with
  ``generate_nan``, standardises both the incomplete and the complete data
  with a ``StandardScaler`` fitted on the incomplete data, and compares the
  parameters estimated by ``mle`` on the incomplete data with those computed
  from the complete data, using ``err``.

  Parameters
  ----------
  Xtrain : numpy.ndarray
      Complete data, rows are samples.
  ytrain : numpy.ndarray
      Class label of each row; classes are selected with ``ytrain == g`` for
      g = 0, ..., G-1.
  G : int
      Number of classes.
  missing_rate : float
      Fraction of entries to set to NaN in each run.
  runs : int
      Number of repetitions.

  Returns
  -------
  (float, float)
      Mean and standard deviation of the per-run errors.
  """
  e_rate = []
  for _ in range(runs):
    # One mask per run; a second identical call would only discard the first.
    Xtr_nan = generate_nan(Xtrain, missing_rate)

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    # Keep the scaled complete data in its own variable: overwriting Xtrain
    # would make every later run start from already re-scaled data.
    X_full = scaler.transform(Xtrain)

    # Reference parameters from the complete data.
    class_masks = [ytrain == g for g in range(G)]
    mus = np.asarray([np.mean(X_full[m,:], axis=0) for m in class_masks]) # each row is a mean of a class
    S = [np.count_nonzero(m)*np.cov(X_full[m,:], rowvar=False)
         for m in class_masks]
    S = np.sum(S, axis = 0)/len(ytrain)

    # MLE approach; mle returns class means as columns, hence the transpose.
    mus_mle, S_mle = mle(Xtr_nan, ytrain, G)
    e_rate.append(err(mus, S, mus_mle.T, S_mle))
  e_rate = np.asarray(e_rate)
  return np.mean(e_rate), np.std(e_rate)

In [12]:
def compute_err_soft(Xtrain, ytrain, G, missing_rate, runs = 10):
  """Mean and standard deviation of the SoftImpute-based estimation error over several runs.

  Each run blanks out a fresh random subset of ``Xtrain`` with
  ``generate_nan``, standardises both the incomplete and the complete data
  with a ``StandardScaler`` fitted on the incomplete data, fills the missing
  entries with ``SoftImpute(max_iters=100)``, and compares the class means and
  pooled covariance of the imputed data with those of the complete data,
  using ``err``.

  Parameters
  ----------
  Xtrain : numpy.ndarray
      Complete data, rows are samples.
  ytrain : numpy.ndarray
      Class label of each row; classes are selected with ``ytrain == g`` for
      g = 0, ..., G-1.
  G : int
      Number of classes.
  missing_rate : float
      Fraction of entries to set to NaN in each run.
  runs : int
      Number of repetitions.

  Returns
  -------
  (float, float)
      Mean and standard deviation of the per-run errors.
  """
  e_rate = []
  for _ in range(runs):
    # One mask per run; a second identical call would only discard the first.
    Xtr_nan = generate_nan(Xtrain, missing_rate)

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    # Keep the scaled complete data in its own variable: overwriting Xtrain
    # would make every later run start from already re-scaled data.
    X_full = scaler.transform(Xtrain)

    class_masks = [ytrain == g for g in range(G)]
    n_total = len(ytrain)

    # Reference parameters from the complete data.
    mus = np.asarray([np.mean(X_full[m,:], axis=0) for m in class_masks]) # each row is a mean of a class
    S = [np.count_nonzero(m)*np.cov(X_full[m,:], rowvar=False)
         for m in class_masks]
    S = np.sum(S, axis = 0)/n_total

    # Same estimators applied to the SoftImpute-completed data.
    Xtr_softimpute = SoftImpute(max_iters = 100).fit_transform(Xtr_nan)
    mus_softimpute = np.asarray([np.mean(Xtr_softimpute[m,:], axis=0)
                                 for m in class_masks])
    S_softimpute = np.asarray([np.count_nonzero(m)*np.cov(Xtr_softimpute[m,:], rowvar=False)
                               for m in class_masks])
    S_softimpute = np.sum(S_softimpute, axis = 0)/n_total

    e_rate.append(err(mus, S, mus_softimpute, S_softimpute))
  e_rate = np.asarray(e_rate)
  return np.mean(e_rate), np.std(e_rate)

# FASHION MNIST

In [13]:
# TensorFlow is used in this cell for the Fashion-MNIST loader bundled with Keras.
import tensorflow as tf
import tensorflow_datasets as tfds  # NOTE(review): not referenced in the visible cells - confirm it is still needed

# load_data() returns (train images, train labels), (test images, test labels);
# the first call downloads the archives listed in the output below.
fashion_mnist = tf.keras.datasets.fashion_mnist
(Xtrain, ytrain), (Xtest, ytest) = fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [14]:
# Flatten every image to one row of pixels. The row count is taken from the
# array itself and -1 lets NumPy infer the pixel count, instead of hard-coding
# 60000 / 10000 / 784.
Xtrain = Xtrain.astype(float).reshape((len(Xtrain), -1))
Xtest = Xtest.astype(float).reshape((len(Xtest), -1))

# Pool the train and test splits into a single data set.
X = np.vstack((Xtrain, Xtest))
y = np.hstack((ytrain, ytest))

# set random seed and shuffle the data (same permutation for rows and labels)
np.random.seed(1)
idx = np.arange(len(y))
np.random.shuffle(idx)
X, y = X[idx,:], y[idx]

X.shape, y.shape

((70000, 784), (70000,))

In [15]:
# Keep only the pixel columns that are non-zero in more than 10 training
# images. The mask is named keep_cols so the builtin `id` is not shadowed,
# and it is computed in one vectorised call instead of a per-column loop.
keep_cols = np.count_nonzero(Xtrain, axis=0) > 10
# number of columns that are (almost) always zero and are therefore dropped
print(Xtrain.shape[1] - np.sum(keep_cols))
# NOTE(review): the mask comes from Xtrain only but is applied to the pooled,
# shuffled X - confirm this is intended.
X = X[:, keep_cols]

0


In [None]:
G = 10
# One row per missing rate, in increasing order; each row is the
# (mean, std) pair returned by compute_err_mle over 10 runs.
mle_err = np.array([compute_err_mle(X, y, G, rate, runs = 10)
                    for rate in (.2, .35, .5, .65, .8)])
print('fashion mnist assume equal cov: dper error')
print(mle_err.round(3))

In [None]:
G = 10
# One row per missing rate, in increasing order; each row is the
# (mean, std) pair returned by compute_err_soft over 10 runs.
soft_err = np.array([compute_err_soft(X, y, G, rate, runs = 10)
                     for rate in (.2, .35, .5, .65, .8)])
print('fashion mnist assume equal cov: soft impute error')
print(soft_err.round(3))