<a href="https://colab.research.google.com/github/thunguyen177/DPER/blob/main/DPER_without_equal_covariance_assumption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## libraries and function 

In [None]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization
import pandas as pd
import time 
!pip install missingpy
from missingpy import MissForest

Collecting impyute
  Downloading https://files.pythonhosted.org/packages/37/28/86829f67c9affb847facaab94687761d3555539ec675f7577778c5b2680a/impyute-0.0.8-py2.py3-none-any.whl
Installing collected packages: impyute
Successfully installed impyute-0.0.8




Collecting missingpy
[?25l  Downloading https://files.pythonhosted.org/packages/b5/be/998d04d27054b58f0974b5f09f8457778a0a72d4355e0b7ae877b6cfb850/missingpy-0.2.0-py3-none-any.whl (49kB)
[K     |██████▊                         | 10kB 16.3MB/s eta 0:00:01[K     |█████████████▍                  | 20kB 21.3MB/s eta 0:00:01[K     |████████████████████            | 30kB 15.9MB/s eta 0:00:01[K     |██████████████████████████▊     | 40kB 14.0MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 4.5MB/s 
[?25hInstalling collected packages: missingpy
Successfully installed missingpy-0.2.0




### MLE estimation function 

In [None]:
def diag_term(X,i):
  arr0 = X[:,i].flatten()
  arr = arr0[~np.isnan(arr0)]
  return np.var(arr)

def musMLE(X,y,G):
    n,p = X.shape[0], X.shape[1]
    f = lambda g: np.nanmean(X[y==g,:],axis=0)
    musMLE = np.array([f(g) for g in range(G)])    
    return musMLE.T

def Smle(X,y,musMLE,g):
    '''
    function to compute the covariance matrix for the g-th class
    X: input, should be a numpy array
    y: label
    G: number of classes
    g: class index
    output:
    - mus: each row is a class mean
    - S: common covariance matrix of class 1,2,..., G 
    '''
    epsilon = 1e-5 # define epsilon to put r down to 0 if r < epsilon
    Xg, yg = X[y==g,:], y[y==g]
    n,p = Xg.shape[0], Xg.shape[1] 
 
    S = np.diag([diag_term(Xg,i) for i in range(p)]) 

    for i in range(p):      
      for j in range(i):
        if ((S[i,i] == 0.) | (S[j,j] == 0.)):
          S[i,j] = S[j,i] = 0.
          continue

        mat = Xg[:,[i,j]]

        # drop rows with NA
        idx = ~np.isnan(mat).any(axis=1)
        mat, y_arr = mat[idx], yg[idx]
        A = mg = len(y_arr) 

        s11 = mg*np.var(mat[:,0])
        s22 = mg*np.var(mat[:,1])
        s12 = sum((mat[:,0]-musMLE[i,g])*(mat[:,1]-musMLE[j,g]))
        B = S[i,i]*S[j,j]*A - s22 * S[i,i] - s11 * S[j,j]
        coefficient = [-A, s12, B, s12*S[i,i]*S[j,j]]
        r = np.roots(coefficient)
        r = r[abs(np.imag(r)) < epsilon]
        r = np.real(r)
        r[abs(r) < epsilon] = 0
 
        if len(r)>1:
          condi_var = S[j,j] - r**2/S[i,i]
          eta = -A*np.log(condi_var)-(S[j,j]-2*r/S[i,i]*s12 + r**2/S[i,i]**2*s11)/condi_var
          r = r[eta == max(eta[~np.isnan(eta)])]

        if len(r) > 1: 
          if sum(r==0.0) == len(r):
            r = 0.
          else:  
            w = np.cov(mat, rowvar=False)  
            #r = r[w[0,1]*r>=0]
            r = r[np.abs(r-w[0,1]).argmin()] # select r that is closet to w[0,1] 

        S[i,j] = S[j,i] = r
    return S

### compute_err function 

In [None]:
def err(mus, S, mus_est, S_est):
  er = [np.linalg.norm(mus_est-mus)/mus.size,
        np.linalg.norm(S_est.flatten().flatten()-S.flatten())/S.size]  
  return np.mean(er)  

def generate_nan(Xtrain, missing_rate):
  Xshape = Xtrain.shape
  na_id = np.random.randint(0,Xtrain.size,round(missing_rate*Xtrain.size))
  Xtr_nan = Xtrain.flatten()
  Xtr_nan[na_id] = np.nan 
  return Xtr_nan.reshape(Xshape) 

def compute_err(Xtrain, ytrain, G, missing_rate):  
    Xtr_nan = generate_nan(Xtrain, missing_rate)
    
    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    
    # estimate parameters from full data
    # each row is a mean of a class
    mus = np.array([np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)])
    S = np.array([np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)])
    print('original mus',mus)
    print('S', S) 

    # MLEs approach
    start = time.time()
    mus_mle = musMLE(Xtr_nan,y,G)
    S_mle = np.array([Smle(Xtr_nan,y,mus_mle, g) for g in range(G)])   
    print('mus MLE',mus_mle)
    print('S MLE', S_mle)          
    mle_err = err(mus, S, mus_mle.T, S_mle)
    mle_time = time.time()-start

    start = time.time()
    Xtr_softimpute = SoftImpute(max_iters = 100).fit_transform(Xtr_nan)
    mus_softimpute = np.asarray([np.mean(Xtr_softimpute[ytrain==g,:], axis=0
                                         ) for g in np.arange(G)])
    S_softimpute = np.asarray([np.cov(Xtr_softimpute[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])

    print('mus soft impute', mus_softimpute)
    print('S soft impute', S_softimpute)
    softimpute_err =  err(mus, S, mus_softimpute, S_softimpute)
    softimpute_time = time.time()-start

    start = time.time()
    Xtr_mice = IterativeImputer(max_iter=100).fit(Xtr_nan).transform(Xtr_nan)
    mus_mice = np.asarray([np.mean(Xtr_mice[ytrain==g,:], axis=0
                                   ) for g in np.arange(G)])
    S_mice = np.asarray([np.cov(Xtr_mice[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])

    mice_err = err(mus, S, mus_mice, S_mice)
    mice_time = time.time()-start

    start = time.time()
    Xtr_nuclear = NuclearNormMinimization(max_iters=100).fit_transform(Xtr_nan)
    mus_nuclear = np.asarray([np.mean(Xtr_nuclear[ytrain==g,:], axis=0
                                      ) for g in np.arange(G)])
    S_nuclear = np.asarray([(sum(ytrain==g))*np.cov(Xtr_nuclear[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    nuclear_err = err(mus, S, mus_nuclear, S_nuclear)
    nuclear_time = time.time()-start
    
    start = time.time()
    Xtr_mforest = MissForest(random_state=0).fit_transform(Xtr_nan)
    mus_mforest = np.asarray([np.mean(Xtr_mforest[ytrain==g,:], axis=0
                                      ) for g in np.arange(G)])
    S_mforest = np.asarray([(sum(ytrain==g))*np.cov(Xtr_mforest[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    mforest_err = err(mus, S, mus_mforest, S_mforest)
    mforest_time = time.time()-start

    err_rate = np.vstack((mle_err, mice_err, softimpute_err,nuclear_err, mforest_err))
    return err_rate

## Inosphere

In [None]:
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
                  sep = ",", header = None)
# print(data.head())
data = pd.DataFrame.to_numpy(data)
X, y = data[:,:34].astype(np.float64), data[:,34]
le2 = LabelEncoder()
y = le2.fit_transform(y)
print(len(y))
X = np.delete(X,[0,1], axis = 1)
X.shape

351


(351, 32)

In [None]:
G = 2
np.random.seed(3)
e20 = compute_err(X, y, G,0.2)
e35 = compute_err(X, y, G,0.35)
e50 = compute_err(X, y, G,0.5)
e65 = compute_err(X, y, G,0.65)
e80 = compute_err(X, y, G,0.8)
res = np.asarray(list((e20, e35, e50, e65, e80)))

In [None]:
np.round(res.reshape((-1,5)).transpose(),3)

array([[0.003, 0.004, 0.006, 0.007, 0.008],
       [0.004, 0.005, 0.005, 0.006, 0.01 ],
       [0.003, 0.005, 0.005, 0.007, 0.009],
       [0.67 , 0.615, 0.57 , 0.531, 0.511],
       [0.717, 0.704, 0.693, 0.688, 0.682]])

In [None]:
np.around(e20,3)  

array([[0.003],
       [0.004],
       [0.003],
       [0.67 ],
       [0.717]])

# seeds 

In [None]:
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt',
                     sep = '\s+', header = None)
data = pd.DataFrame.to_numpy(data)
X,y = data[:,:7], data[:,7]-1 # reset the labels to go start from 0  

In [None]:
G = 3
np.random.seed(3)
e20 = compute_err(X, y, G,0.2)
e35 = compute_err(X, y, G,0.35)
e50 = compute_err(X, y, G,0.5)
e65 = compute_err(X, y, G,0.65)
e80 = compute_err(X, y, G,0.8)
res = np.asarray(list((e20, e35, e50, e65, e80)))

In [None]:
np.round(res.reshape((-1,5)).transpose(),3)

array([[0.004, 0.008, 0.007, 0.009, 0.009],
       [0.004, 0.009, 0.01 , 0.017, 0.025],
       [0.007, 0.015, 0.016, 0.025, 0.028],
       [0.522, 0.517, 0.459, 0.489, 0.486],
       [0.52 , 0.543, 0.504, 0.607, 0.647]])

In [None]:
np.around(e20,3)

array([[0.004],
       [0.004],
       [0.007],
       [0.522],
       [0.52 ]])

# wine
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

In [None]:
wine = datasets.load_wine()
X,y = wine.data, wine.target.ravel() 
# sum(y==0), sum(y==1), sum(y==2)

In [None]:
G = 3
np.random.seed(3)
e20 = compute_err(X, y, G,0.2)
e35 = compute_err(X, y, G,0.35)
e50 = compute_err(X, y, G,0.5)
e65 = compute_err(X, y, G,0.65)
e80 = compute_err(X, y, G,0.8)
res = np.asarray(list((e20, e35, e50, e65, e80)))

In [None]:
np.round(res.reshape((-1,5)).transpose(),3)

array([[0.005, 0.009, 0.008, 0.012, 0.013],
       [0.005, 0.01 , 0.011, 0.014, 0.018],
       [0.006, 0.012, 0.016, 0.023, 0.025],
       [0.27 , 0.256, 0.239, 0.229, 0.215],
       [0.288, 0.277, 0.276, 0.304, 0.286]])

In [None]:
np.around(e20,3)

array([[0.005],
       [0.005],
       [0.006],
       [0.27 ],
       [0.288]])

# Iris
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

In [None]:
iris = datasets.load_iris()
X,y = iris.data, iris.target.ravel() 
G = 3
np.random.seed(3)
e20 = compute_err(X, y, G,0.2)
e35 = compute_err(X, y, G,0.35)
e50 = compute_err(X, y, G,0.5)
e65 = compute_err(X, y, G,0.65)
e80 = compute_err(X, y, G,0.8)
res = np.asarray(list((e20, e35, e50, e65, e80)))

In [None]:
np.round(res.reshape((-1,5)).transpose(),3)

array([[0.007, 0.009, 0.006, 0.013, 0.024],
       [0.009, 0.015, 0.017, 0.033, 0.035],
       [0.013, 0.025, 0.026, 0.049, 0.049],
       [0.693, 0.665, 0.597, 0.672, 0.616],
       [0.71 , 0.857, 0.812, 0.998, 0.951]])

In [None]:
np.around(e20,3)

array([[0.007],
       [0.009],
       [0.013],
       [0.693],
       [0.71 ]])