<a href="https://colab.research.google.com/github/thunguyen177/DPER/blob/main/DPER_multiple_class_assuming_equal_covariance_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## libraries and function 


In [None]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization
import pandas as pd
import time
!pip install missingpy
from missingpy import MissForest
# note that MissForest uses sklearn.__version__ 0.22.2.post1

## MLE estimation function  




In [None]:
def diag_term(i,X,y):
  arr0 = X[:,i]
  nar2 = 0
  arr = arr0[~np.isnan(arr0)]
  y_arr = y[~np.isnan(arr0)]

  _, counts = np.unique(y_arr, return_counts=True)
  ind = np.insert(np.cumsum(counts), 0, 0)
  
  return sum([(ind[g]-ind[g-1])*np.var(arr[ind[g-1]:ind[g]]) for 
                       g in range(1,G+1)])/len(y_arr)                       

In [None]:
def mle(X,y,G):
    '''
    X: input, should be a numpy array
    y: label
    G: number of classes
    output:
    - mus: each row is a class mean
    - S: common covariance matrix of class 1,2,..., G 
    '''
    epsilon = 1e-5 # define epsilon to put r down to 0 if r < epsilon
    n,p = X.shape[0], X.shape[1]

    # Estimating class means
    mus = np.array([np.nanmean(X[y==g,:],axis=0) for g in range (G)]).T # so that each column is the mean of a class
 
    S = np.diag([diag_term(i,X,y) for i in range(p)]) 

    for i in range(p):      
      for j in range(i):
        mat = X[:,[i,j]]

        # drop rows with NA
        idx = ~np.isnan(mat).any(axis=1)
        mat, y_arr = mat[idx], y[idx]

        _, counts = np.unique(y_arr, return_counts=True)
        ind = np.insert(np.cumsum(counts), 0, 0)

        m_g = counts
 
        A = len(y_arr) 
        scaled_mat = [mat[ind[g-1]:ind[g],:]-mus[[i,j],g-1] for g in range(1,G+1)]   

        q = lambda g: np.dot(scaled_mat[g][:,0],scaled_mat[g][:,0])
        s11 = sum(map(q,range(G))) 
        q = lambda g: np.dot(scaled_mat[g][:,1],scaled_mat[g][:,1])
        s22 = sum(map(q,range(G))) 
        d = lambda g: np.dot(scaled_mat[g][:,0],scaled_mat[g][:,1])
        s12 = sum(map(d,range(G)))  

        start_solve = time.time()
        B = S[i,i]*S[j,j]*A - s22 * S[i,i] - s11 * S[j,j]
        coefficient = [-A, s12, B, s12*S[i,i]*S[j,j]]
        r = np.roots(coefficient)

        r = r[abs(np.imag(r)) < epsilon]
        r = np.real(r)
        r[abs(r) < epsilon] = 0

        if len(r)>1:
          condi_var = S[j,j] - r**2/S[i,i]
          eta = -A*np.log(condi_var)-(S[j,j]-2*r/S[i,i]*s12 +
                                      r**2/S[i,i]**2*s11)/condi_var
          # if condi_var <0 then eta = NA. in practice, it's impossible for cov to be negative
          #  therefore, we drop NA elements of eta  
          r = r[eta == max(eta[~np.isnan(eta)])]

        if len(r) > 1:        
            w = [m_g[g-1]*np.cov(mat[ind[g-1]:ind[g],], rowvar=False) for
                 g in range(1,G+1)]
            w = np.sum(w, axis = 0)    
            r = r[np.abs(r-w[0,1]).argmin()] # select r that is closet to w[0,1] 
              
        S[i,j] = S[j,i] = r
    return [mus, S]

### compute_err function 
  

In [None]:
def err(mus, S, mus_est, S_est):
  er = [np.linalg.norm(mus_est-mus)/mus.size,
         np.linalg.norm(S_est.flatten()-S.flatten())/S.size]
  return np.mean(er)  

def generate_nan(Xtrain, missing_rate):
  Xshape = Xtrain.shape
  na_id = np.random.randint(0,Xtrain.size,round(missing_rate*Xtrain.size))
  Xtr_nan = Xtrain.flatten()
  Xtr_nan[na_id] = np.nan 
  return Xtr_nan.reshape(Xshape)

def compute_err(Xtrain, ytrain, G, missing_rate):  
    Xtr_nan = generate_nan(Xtrain, missing_rate)
    
    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    
    # estimate parameters from full data
    mus = [np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus) # each row is a mean of a class
    S = [sum(ytrain==g)*np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)]
    S = np.sum(S, axis = 0)/len(ytrain)

    # MLEs approach
    start = time.time()
    mus_mle, S_mle = mle(Xtr_nan,ytrain, G)
    mle_err = err(mus, S, mus_mle.T, S_mle)
    mle_time = time.time()-start

    start = time.time()
    Xtr_softimpute = SoftImpute(max_iters = 100).fit_transform(Xtr_nan)
    mus_softimpute = np.asarray([np.mean(Xtr_softimpute[ytrain==g,:], axis=0
                                         ) for g in np.arange(G)])
    S_softimpute = np.asarray([(sum(ytrain==g))*np.cov(Xtr_softimpute[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_softimpute = np.sum(S_softimpute, axis = 0)/len(ytrain) 
    softimpute_err =  err(mus, S, mus_softimpute, S_softimpute)
    softimpute_time = time.time()-start

    start = time.time()
    Xtr_mice = IterativeImputer(max_iter=100).fit(Xtr_nan).transform(Xtr_nan)
    mus_mice = np.asarray([np.mean(Xtr_mice[ytrain==g,:], axis=0
                                   ) for g in np.arange(G)])
    S_mice = np.asarray([(sum(ytrain==g))*np.cov(Xtr_mice[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_mice = np.sum(S_mice, axis = 0)/len(ytrain) 
    mice_err = err(mus, S, mus_mice, S_mice)
    mice_time = time.time()-start

    start = time.time()
    Xtr_nuclear = NuclearNormMinimization(max_iters=100).fit_transform(Xtr_nan)
    mus_nuclear = np.asarray([np.mean(Xtr_nuclear[ytrain==g,:], axis=0
                                      ) for g in np.arange(G)])
    S_nuclear = np.asarray([(sum(ytrain==g))*np.cov(Xtr_nuclear[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_nuclear = np.sum(S_nuclear, axis = 0)/len(ytrain)
    nuclear_err = err(mus, S, mus_nuclear, S_nuclear)
    nuclear_time = time.time()-start 

    start = time.time()
    Xtr_mforest = MissForest(random_state=0).fit_transform(Xtr_nan)
    mus_mforest = np.asarray([np.mean(Xtr_mforest[ytrain==g,:], axis=0
                                      ) for g in np.arange(G)])
    S_mforest = np.asarray([(sum(ytrain==g))*np.cov(Xtr_mforest[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_mforest = np.sum(S_mforest, axis = 0)/len(ytrain)
    mforest_err = err(mus, S, mus_mforest, S_mforest)
    mforest_time = time.time()-start      

    err_rate = np.vstack((mle_err, mice_err, softimpute_err,nuclear_err,mforest_err))
    # running_time = np.array([mle_time, knn_time,mice_time,
    #                          softimpute_time,em_time, nuclear_time,
    #                         ])    
    return err_rate


## Inosphere

In [None]:
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
                  sep = ",", header = None)
# print(data.head())
data = pd.DataFrame.to_numpy(data)
X, y = data[:,:34].astype(np.float64), data[:,34]
le2 = LabelEncoder()
y = le2.fit_transform(y)
print(len(y))
X = np.delete(X,[0,1], axis = 1)
X.shape

351


(351, 32)

In [None]:
G = 2
np.random.seed(8)
e20 = compute_err(X, y, 2,0.2)
e35 = compute_err(X, y, 2,0.35)
e50 = compute_err(X, y, 2,0.5)
e65 = compute_err(X, y, 2,0.65)
e80 = compute_err(X, y, 2,0.8)
res = np.asarray(list((e20, e35, e50, e65, e80)))

In [None]:
np.round(res.reshape((-1,5)).transpose(),3)

array([[0.004, 0.005, 0.006, 0.008, 0.008],
       [0.003, 0.004, 0.006, 0.007, 0.01 ],
       [0.003, 0.004, 0.006, 0.008, 0.009],
       [0.003, 0.004, 0.006, 0.007, 0.009],
       [0.003, 0.004, 0.005, 0.007, 0.008]])

In [None]:
np.round(e20,3)

array([[0.004],
       [0.003],
       [0.003],
       [0.003],
       [0.003]])

# seeds 

In [None]:
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt',
                     sep = '\s+', header = None)
data = pd.DataFrame.to_numpy(data)
X,y = data[:,:7], data[:,7]-1 # reset the labels to go start from 0  

In [None]:
G = 3
np.random.seed(4)
e20 = compute_err(X, y, G,0.2)
e35 = compute_err(X, y, G,0.35)
e50 = compute_err(X, y, G,0.5)
e65 = compute_err(X, y, G,0.65)
e80 = compute_err(X, y, G,0.8)
res = np.asarray(list((e20, e35, e50, e65, e80)))

In [None]:
np.round(res.reshape((-1,5)).transpose(),3)

array([[0.004, 0.007, 0.009, 0.008, 0.014],
       [0.003, 0.006, 0.013, 0.014, 0.034],
       [0.005, 0.01 , 0.02 , 0.024, 0.033],
       [0.005, 0.01 , 0.019, 0.023, 0.025],
       [0.004, 0.008, 0.011, 0.016, 0.019]])

In [None]:
np.round(e20,3)

array([[0.004],
       [0.003],
       [0.005],
       [0.005],
       [0.004]])

# wine
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

In [None]:
wine = datasets.load_wine()
X,y = wine.data, wine.target.ravel() 
# sum(y==0), sum(y==1), sum(y==2)

In [None]:
G = 3
np.random.seed(11) 
e20 = compute_err(X, y, G,0.2)
e35 = compute_err(X, y, G,0.35)
e50 = compute_err(X, y, G,0.5)
e65 = compute_err(X, y, G,0.65)
e80 = compute_err(X, y, G,0.8)
res = np.asarray(list((e20, e35, e50, e65, e80)))

In [None]:
np.round(res.reshape((-1,5)).transpose(),3)

array([[0.005, 0.006, 0.009, 0.009, 0.012],
       [0.006, 0.009, 0.01 , 0.014, 0.021],
       [0.007, 0.011, 0.016, 0.021, 0.025],
       [0.007, 0.011, 0.015, 0.02 , 0.023],
       [0.005, 0.007, 0.01 , 0.013, 0.017]])

In [None]:
np.round(e20,3)

array([[0.005],
       [0.006],
       [0.007],
       [0.007],
       [0.005]])

# Iris
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

In [None]:
iris = datasets.load_iris()
X,y = iris.data, iris.target.ravel() 
G = 3
np.random.seed(6)
e20 = compute_err(X, y, G,0.2)
e35 = compute_err(X, y, G,0.35)
e50 = compute_err(X, y, G,0.5)
e65 = compute_err(X, y, G,0.65)
e80 = compute_err(X, y, G,0.8)
res = np.asarray(list((e20, e35, e50, e65, e80)))

In [None]:
np.round(res.reshape((-1,5)).transpose(),3)

array([[0.008, 0.01 , 0.012, 0.01 , 0.016],
       [0.008, 0.018, 0.029, 0.03 , 0.049],
       [0.014, 0.029, 0.043, 0.045, 0.069],
       [0.014, 0.028, 0.04 , 0.042, 0.061],
       [0.01 , 0.017, 0.021, 0.037, 0.042]])

In [None]:
np.round(e20,3)

array([[0.008],
       [0.008],
       [0.014],
       [0.014],
       [0.01 ]])