# Libraries and functions

In [None]:
# Install the third-party packages for this notebook via shell commands.
# NOTE(review): no versions are pinned, so a fresh run may pull different releases.
# NOTE(review): the visible import cell only imports from `fancyimpute` and
# `missforest` (plus scikit-learn/pandas); confirm the other packages are still needed.
!pip install scikit-learn
!pip install fancyimpute
!pip install DistributedMissForest
!pip install missingpy
!pip install MissForest
!pip install pandas

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29879 sha256=5b46b39b2e5c84895da9b35b41a5a9a5c27b9657e8fcaf2c78da0a3b7bb49a48
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-

In [None]:
import numpy as np
import pandas as pd
import time
import math

import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split
from sklearn.impute import IterativeImputer

from scipy import stats
from fancyimpute import SoftImpute
from missforest.missforest import MissForest
from numpy.linalg import norm, inv

In [None]:
#finding root closest CD
def solving(a,b,c,d,del_case):
  """Return the real root of a*x**3 + b*x**2 + c*x + d closest to ``del_case``.

  The coefficients are handed to ``np.roots`` (highest degree first) and only
  the roots flagged as real by ``np.isreal`` are considered. When exactly one
  real root exists it is returned directly; otherwise the real root with the
  smallest absolute distance to ``del_case`` is returned (first one on ties).
  """
  all_roots = np.roots([a, b, c, d])
  candidates = all_roots[np.isreal(all_roots)].real
  if candidates.size == 1:
    return candidates[0]
  # Distance of every real candidate to the reference value.
  distances = [abs(root - del_case) for root in candidates]
  return candidates[np.argmin(distances)]

#variance matrix err
def error(sig, sig_est):
  """Return the Euclidean norm of ``sig_est - sig`` divided by ``sig.size``.

  Both arrays are flattened before subtraction, so the result is a single
  scalar: the norm of the element-wise difference scaled by the number of
  entries in the reference array ``sig``.
  """
  gap = sig_est.ravel() - sig.ravel()
  return np.linalg.norm(gap) / sig.size

#normalizing data
def normalize_data(X):
  """Standardize ``X`` with a ``StandardScaler`` fitted on ``X`` itself.

  Returns the transformed data; the fitted scaler is discarded.
  """
  return StandardScaler().fit(X).transform(X)

#generating NaN
'''def generate_nan(X, missing_rate):
  na_id = np.random.randint(0,X.size,round(missing_rate*X.size))
  Xnan = X.flatten()
  Xnan[na_id] = np.nan
  return Xnan.reshape(X.shape)'''

def generate_nan(X, missing_rate):
    """Return a copy of ``X`` with a fraction of its entries replaced by NaN.

    The first row is always left untouched; NaNs are placed uniformly at
    random among the entries of the remaining rows. Positions are drawn
    WITHOUT replacement, so exactly ``round(missing_rate * n)`` entries are
    masked, where ``n`` is the number of entries outside the first row
    (capped at ``n``). Drawing with replacement would produce duplicate
    positions and therefore fewer NaNs than requested.

    Parameters
    ----------
    X : 2-D array-like
        Input data; it is not modified. Non-floating input is converted to
        float because NaN cannot be stored in integer arrays.
    missing_rate : float
        Fraction of the entries outside the first row to set to NaN.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X`` containing the injected NaNs.
        Uses the global ``np.random`` state, so ``np.random.seed`` makes the
        result reproducible.
    """
    X_copy = np.array(X, copy=True)
    if not np.issubdtype(X_copy.dtype, np.floating):
        X_copy = X_copy.astype(float)

    X_non_missing = X_copy[:1, :]   # first row stays fully observed
    X_missing = X_copy[1:, :]       # rows eligible for masking

    # Number of entries to mask, never more than what is available.
    n_nan = min(int(round(missing_rate * X_missing.size)), X_missing.size)

    X_nan = X_missing.flatten()
    if n_nan > 0:
        # replace=False guarantees n_nan distinct positions.
        na_id = np.random.choice(X_nan.size, n_nan, replace=False)
        X_nan[na_id] = np.nan
    X_nan = X_nan.reshape(X_missing.shape)

    return np.vstack((X_non_missing, X_nan))

# Algorithms

In [None]:
#For single class
def sig_estimate(X,mus0,mus1):
  """Estimate variances and covariance of one feature pair with missing values.

  Parameters
  ----------
  X : array whose transpose yields one sample per row, with the first
      feature at index 0 and the second at index 1 (the caller in this file
      passes a 2 x n_samples array).
  mus0, mus1 : means used to centre the first and second feature.

  Returns
  -------
  (sig11, sig22, sig12)
      ``sig11`` / ``sig22`` are the mean squared deviations over every sample
      where the respective feature is finite and the other one is finite or
      NaN. ``sig12`` is the real root of the cubic passed to ``solving`` that
      lies closest to the covariance computed from complete pairs only
      (clamped below at 0).

  Notes
  -----
  With fewer than two complete pairs the complete-pair covariance is
  undefined (its divisor ``m - 1`` is <= 0); 0 is then used as the
  root-selection target instead of dividing by zero.
  A feature without any usable observation still leads to a division by
  zero when its variance is normalised.
  """
  m = n = l = 0              # complete pairs / only first seen / only second seen
  s11 = s12 = s22 = 0        # sums over complete pairs only
  sig11 = sig22 = 0          # sums over every sample where the feature is usable
  for pair in X.T:
    first, second = pair[0], pair[1]
    if np.isfinite(first) and np.isfinite(second):
      m = m + 1
      d0 = first - mus0
      d1 = second - mus1
      s11 = s11 + d0**2
      s22 = s22 + d1**2
      s12 = s12 + d0*d1
      sig11 = sig11 + d0**2
      sig22 = sig22 + d1**2
    elif np.isfinite(first) and np.isnan(second):
      n = n + 1
      sig11 = sig11 + (first - mus0)**2
    elif np.isnan(first) and np.isfinite(second):
      l = l + 1
      sig22 = sig22 + (second - mus1)**2
  # Complete-pair covariance used only to pick among the cubic's real roots.
  del_case = max(s12/(m-1), 0) if m > 1 else 0
  sig11 = sig11/(m+n)
  sig22 = sig22/(m+l)
  sig12 = solving(-m, s12, (m*sig11*sig22 - s22*sig11 - s11*sig22), s12*sig11*sig22, del_case)
  return sig11, sig22, sig12

def DPER(X):
  """Estimate the covariance matrix of ``X`` (samples x features) with NaNs.

  Column means are computed with ``np.nanmean``. Every pair of columns
  ``(b, a)`` with ``b < a`` is passed to ``sig_estimate``, whose three
  outputs fill ``sig[b][b]``, ``sig[a][a]`` and the symmetric off-diagonal
  entry.

  A single-column input has no pairs, so its variance is computed directly
  as the mean squared deviation of the observed entries (otherwise the
  result would stay at 0).

  Returns
  -------
  numpy.ndarray of shape (n_features, n_features)
  """
  p = X.shape[1]
  sig = np.zeros((p, p))     # estimated covariance matrix
  # estimation of mean
  mu = np.nanmean(X, axis=0)
  if p == 1:
    # No feature pair exists, so the pairwise loop below would never run.
    sig[0][0] = np.nanmean((X[:, 0] - mu[0])**2)
    return sig
  # estimation of covariance, one feature pair at a time
  for a in range(p):
    for b in range(a):
      var_b, var_a, cov_ba = sig_estimate(np.array([X[:, b], X[:, a]]), mu[b], mu[a])
      sig[b][b] = var_b
      sig[a][a] = var_a
      sig[b][a] = sig[a][b] = cov_ba
  return sig

In [None]:
#For multiclass (X,y) where y is a class
def sig_estimate_multi(X,mu0,mu1,y):
  """Pooled variance/covariance estimate of one feature pair across classes.

  Parameters
  ----------
  X : array whose transpose yields one sample per row, with the first
      feature at index 0 and the second at index 1.
  mu0, mu1 : per-class means of the first / second feature, indexed by the
      class label ``g``.
  y : class label of every sample; samples are selected with ``y == g`` for
      ``g`` in ``range(number of unique labels)``.

  Returns
  -------
  (sig11, sig22, sig12)
      Statistics are accumulated per class around that class's means and
      summed over classes. ``sig12`` is the real root of the cubic passed to
      ``solving`` closest to the pooled complete-pair covariance (clamped
      below at 0).

  Notes
  -----
  The root-selection target is the pooled complete-pair cross product
  divided by ``m - 1``, mirroring ``sig_estimate``. With fewer than two
  complete pairs the target falls back to 0.
  """
  totals = np.zeros(8)  # [m,n,l,s11,s12,s22,sig11,sig22]
  numlabel = len(np.unique(y))
  samples = X.T
  for g in range(numlabel):
    m = n = l = 0
    s11 = s12 = s22 = sig11 = sig22 = 0
    mus0 = mu0[g]
    mus1 = mu1[g]
    for pair in samples[y == g]:
      first, second = pair[0], pair[1]
      if np.isfinite(first) and np.isfinite(second):
        m = m + 1
        d0 = first - mus0
        d1 = second - mus1
        s11 = s11 + d0**2
        s22 = s22 + d1**2
        s12 = s12 + d0*d1
        sig11 = sig11 + d0**2
        sig22 = sig22 + d1**2
      elif np.isfinite(first) and np.isnan(second):
        n = n + 1
        sig11 = sig11 + (first - mus0)**2
      elif np.isnan(first) and np.isfinite(second):
        l = l + 1
        sig22 = sig22 + (second - mus1)**2
    totals = totals + np.array([m, n, l, s11, s12, s22, sig11, sig22], dtype=float)
  m, n, l, s11, s12, s22, sig11, sig22 = totals
  # Complete-pair covariance used only to pick among the cubic's real roots.
  del_case = max(0, s12/(m-1)) if m > 1 else 0
  sig11 = sig11/(m+n)
  sig22 = sig22/(m+l)
  sig12 = solving(-m, s12, (m*sig11*sig22 - s22*sig11 - s11*sig22), s12*sig11*sig22, del_case)
  return sig11, sig22, sig12

def DPERmulticlass(X,y):            #with assumption of equal covariance matrices
  """Estimate one covariance matrix shared by all classes from data with NaNs.

  Parameters
  ----------
  X : array of shape (n_samples, n_features), possibly containing NaN.
  y : class label per sample; classes are addressed as ``y == g`` for ``g``
      in ``range(number of unique labels)``.

  Per-class column means are computed with ``np.nanmean``. Every pair of
  columns ``(b, a)`` with ``b < a`` is passed to ``sig_estimate_multi``,
  whose outputs fill the two diagonal entries and the symmetric
  off-diagonal entry.

  A single-column input has no pairs, so its pooled variance is computed
  directly as the mean squared deviation of the observed entries from their
  class means (otherwise the result would stay at 0).

  Returns
  -------
  numpy.ndarray of shape (n_features, n_features)
  """
  numlabel = len(np.unique(y))      #number of unique labels in y
  p = X.shape[1]
  sig = np.zeros((p, p))            #estimated covariance matrix
  #per-class means, one row per class
  mu = np.array([np.nanmean(X[y == g], axis=0) for g in range(numlabel)])
  if p == 1:
    # No feature pair exists, so the pairwise loop below would never run.
    resid = np.concatenate([X[y == g][:, 0] - mu[g, 0] for g in range(numlabel)])
    sig[0][0] = np.nanmean(resid**2)
    return sig
  #estimation of covariance matrix, one feature pair at a time
  for a in range(p):
    for b in range(a):
      var_b, var_a, cov_ba = sig_estimate_multi(np.array([X[:, b], X[:, a]]), mu[:, b], mu[:, a], y)
      sig[b][b] = var_b
      sig[a][a] = var_a
      sig[b][a] = sig[a][b] = cov_ba
  return sig

# Pima Indians diabetes dataset

In [None]:
def experiments(X, y, run_time, missing_rate):
    """Compare covariance estimates from DPER and three imputers on masked data.

    For each of ``run_time`` repetitions, NaNs are injected into ``X`` with
    ``generate_nan``; the data is imputed with IterativeImputer, MissForest
    and SoftImpute; and covariance matrices are estimated both per class and
    pooled over classes. Each estimate is scored with ``error`` against the
    corresponding covariance of the complete data.

    Returns
    -------
    (err_not, err_equal)
        Two lists with one array per repetition, ordered
        [DPER, MICE, MissForest, Soft-Impute]. ``err_not`` holds the errors of
        the per-class covariances, ``err_equal`` those of the pooled one.
    """
    G = len(np.unique(y))
    classes = range(G)

    def per_class_cov(data):
        # One sample covariance matrix per class, stacked along axis 0.
        return np.array([np.cov(data[y==g], rowvar = False) for g in classes])

    def pooled_cov(data):
        # Class covariances weighted by (class size - 1), divided by (N - G).
        weighted = [(sum(y==g) - 1)*np.cov(data[y==g], rowvar = False) for g in classes]
        return sum(weighted)/(len(y) - G)

    # Reference covariances from the complete data.
    S0 = pooled_cov(X)
    S2 = per_class_cov(X)

    err_equal, err_not = [], []
    for _ in range(run_time):
        Xnan = generate_nan(X, missing_rate)

        # Impute data (same order as always: MICE, MissForest, Soft-Impute)
        XMice = IterativeImputer(max_iter = 10).fit(Xnan).transform(Xnan)
        forest = MissForest()
        XMiss = forest.fit_transform(pd.DataFrame.from_records(Xnan)).to_numpy()
        XSoft = SoftImpute(max_iters = 10, verbose = False).fit_transform(Xnan)
        imputed = (XMice, XMiss, XSoft)

        # Not assuming equal covariance matrices
        SDper2 = np.array([DPER(Xnan[y==g]) for g in classes])
        separate = [SDper2] + [per_class_cov(filled) for filled in imputed]
        err_not.append(np.array([error(S2, est) for est in separate]))

        # Assuming equal covariance matrices
        SDper = DPERmulticlass(Xnan, y)
        shared = [SDper] + [pooled_cov(filled) for filled in imputed]
        err_equal.append(np.array([error(S0, est) for est in shared]))

    return err_not, err_equal

In [None]:
# Download the Pima Indians diabetes CSV (the file has no header row).
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
data = pd.read_csv(url,sep = ",", header = None)
# NOTE(review): this is not the last expression of the cell, so its value is
# presumably not displayed; only X.shape at the end is.
data.head()

# From here on `data` is a plain NumPy array instead of a DataFrame.
data = data.to_numpy()
# Features: every column except index 8 (cast to float32); labels: the last column.
X,y = data[:, [x for x in range(data.shape[1]) if x != 8]].astype(np.float32),data[:,-1]
# Number of distinct class labels.
G = len(np.unique(y))
# Re-encode the labels as integers 0..G-1, which the `y == g` selections in
# the estimation functions rely on.
le2 = LabelEncoder()
y = le2.fit_transform(y)
# Standardize the features before injecting missing values.
X = normalize_data(X)
X.shape

(768, 8)

In [None]:
# Number of repetitions per missing rate; every repetition re-masks the data
# and re-runs all imputers inside `experiments`.
run_time = 100
# eXX  -> errors without the equal-covariance assumption (first return value),
# eeXX -> errors under the equal-covariance assumption (second return value),
# where XX is the missing rate in percent.
e20,ee20 = experiments(X,y,run_time,.2)
e40,ee40 = experiments(X,y,run_time,.4)
e60,ee60 = experiments(X,y,run_time,.6)
e80,ee80 = experiments(X,y,run_time,.8)

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 7
[LightGBM] [Info] Start training from score -0.052952
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 7
[LightGBM] [Info] Start training from score -0.010183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set



[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.054263
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.268457
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.128100
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.459461
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.222914
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Inf



[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.054263
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score -0.122757
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score -0.215116
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.205064
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.086399
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [I



[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.136754
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score 0.048508
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score -0.035610
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score -0.106834
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 36, number of used features: 0
[LightGBM] [Info] Start training from score -0.065069
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of da



[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 25, number of used features: 0
[LightGBM] [Info] Start training from score 0.046014
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 25, number of used features: 0
[LightGBM] [Info] Start training from score 0.043361
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 25, number of used features: 0
[LightGBM] [Info] Start training from score 0.000751
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 25, number of used features: 0
[LightGBM] [Info] Start training from score 0.126933
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 25, number of used features: 0
[LightGBM] [Info] Start training from score -0.057997
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data



[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.639947
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.848324
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.149641
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.907270
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score -0.692891
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data poin

In [None]:
# Frame filled with the " ± " separator; its index/columns label every table below
# (rows: missing rates, columns: estimation methods).
pm = pd.DataFrame(np.full((4, 4), " ± "),
                  index = ["20%", "40%", "60%", "80%"],
                  columns = ["DPER", "MICE", "MissForest", "Soft-Impute"])

# Mean and standard deviation over the repetitions, one row per missing rate.
print('Without the assumption of equal covariance matrices')
not_err = pd.DataFrame(
    np.vstack([np.mean(runs, axis = 0) for runs in (e20, e40, e60, e80)]).round(3),
    index = pm.index,
    columns = pm.columns)
not_std = pd.DataFrame(
    np.vstack([np.std(runs, axis = 0) for runs in (e20, e40, e60, e80)]).round(3),
    index = pm.index,
    columns = pm.columns)
# String concatenation renders every cell as "mean ± std".
print(not_err.astype(str) + pm + not_std.astype(str))
print("\n")

print('Under the assumption of equal covariance matrices')
equal_err = pd.DataFrame(
    np.vstack([np.mean(runs, axis = 0) for runs in (ee20, ee40, ee60, ee80)]).round(3),
    index = pm.index,
    columns = pm.columns)
equal_std = pd.DataFrame(
    np.vstack([np.std(runs, axis = 0) for runs in (ee20, ee40, ee60, ee80)]).round(3),
    index = pm.index,
    columns = pm.columns)
print(equal_err.astype(str) + pm + equal_std.astype(str))

Without the assumption of equal covariance matrices
              DPER           MICE     MissForest    Soft-Impute
20%    0.004 ± 0.0  0.006 ± 0.001  0.005 ± 0.001  0.008 ± 0.001
40%  0.006 ± 0.001   0.01 ± 0.001  0.014 ± 0.001  0.014 ± 0.001
60%  0.008 ± 0.001  0.012 ± 0.001  0.018 ± 0.001  0.018 ± 0.001
80%  0.011 ± 0.001  0.014 ± 0.001  0.021 ± 0.001  0.021 ± 0.001


Under the assumption of equal covariance matrices
              DPER           MICE     MissForest    Soft-Impute
20%    0.003 ± 0.0  0.006 ± 0.001  0.006 ± 0.001   0.01 ± 0.001
40%  0.005 ± 0.001  0.011 ± 0.001  0.018 ± 0.002  0.018 ± 0.001
60%  0.007 ± 0.001  0.014 ± 0.001  0.023 ± 0.001  0.024 ± 0.001
80%  0.009 ± 0.001  0.017 ± 0.001  0.026 ± 0.002  0.028 ± 0.001
