#Library and functions

In [None]:
!pip install scikit-learn
!pip install fancyimpute
!pip install DistributedMissForest
!pip install MissForest

Collecting fancyimpute
  Using cached fancyimpute-0.7.0-py3-none-any.whl
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Using cached knnimpute-0.1.0-py3-none-any.whl
Collecting nose (from fancyimpute)
  Using cached nose-1.3.7-py3-none-any.whl (154 kB)
Installing collected packages: nose, knnimpute, fancyimpute
Successfully installed fancyimpute-0.7.0 knnimpute-0.1.0 nose-1.3.7
Collecting DistributedMissForest
  Using cached DistributedMissForest-1.4.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: DistributedMissForest
  Building wheel for DistributedMissForest (setup.py) ... [?25l[?25hdone
  Created wheel for DistributedMissForest: filename=DistributedMissForest-1.4-py3-none-any.whl size=17469 sha256=554a1937d83995b9bf9babe71d322082e8e724a1327aff8b31b557345d5052e1
  Stored in directory: /root/.cache/pip/wheels/62/93/ad/606370d3635fc2b977fafeeccd59929f886b09f19d4386e0ac
Successfully built DistributedMissForest
Installing c

In [None]:
import numpy as np
import pandas as pd
import time
import math

import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split
from sklearn.impute import IterativeImputer

from scipy import stats
from fancyimpute import SoftImpute
from missforest.missforest import MissForest
from numpy.linalg import norm, inv

In [None]:
#finding root closest CD
def solving(a,b,c,d,del_case):
  """Return the real root of a*x^3 + b*x^2 + c*x + d = 0 closest to del_case.

  The roots come from np.roots; only those flagged as real by np.isreal are
  candidates. When exactly one real root exists it is returned directly,
  otherwise the candidate with the smallest absolute distance to del_case
  wins (the first one on ties).
  """
  all_roots = np.roots([a, b, c, d])
  candidates = all_roots[np.isreal(all_roots)].real
  if candidates.size == 1:
    return candidates[0]
  # Distance of every real root to the reference value.
  gaps = np.abs(candidates - del_case)
  return candidates[np.argmin(gaps)]

#variance matrix err
def error(sig, sig_est):
  """Return the Euclidean norm of (sig_est - sig), divided by sig.size.

  Both inputs are flattened before subtraction, so they only need to hold
  the same number of elements.
  """
  residual = np.ravel(sig_est) - np.ravel(sig)
  return np.linalg.norm(residual) / sig.size

#normalizing data
def normalize_data(X):
  """Fit a StandardScaler on X and return X transformed by that scaler."""
  return StandardScaler().fit(X).transform(X)

#generating NaN
def generate_nan(X, missing_rate):
    """Return a copy of X in which a fraction of the entries is set to NaN.

    The first row (index 0) is always left complete; NaNs are only placed in
    the remaining rows. Exactly round(missing_rate * k) distinct positions
    are masked, where k is the number of entries outside the first row
    (capped at k). Positions are drawn from NumPy's global random state, so
    np.random.seed controls reproducibility.

    Parameters
    ----------
    X : array-like
        Input data; it is never modified.
    missing_rate : float
        Fraction of the maskable entries to replace with NaN. Must be >= 0;
        values above 1 behave like 1.

    Returns
    -------
    numpy.ndarray
        Array with the shape of X. Non-floating input is converted to float
        so that NaN can be stored.

    Raises
    ------
    ValueError
        If missing_rate is negative.
    """
    if missing_rate < 0:
        raise ValueError("missing_rate must be non-negative")

    X_out = np.array(X)  # independent copy; the caller's data stays intact
    if not np.issubdtype(X_out.dtype, np.floating):
        X_out = X_out.astype(float)

    maskable = X_out[1:]  # view on every row except the first
    n_missing = min(int(round(missing_rate * maskable.size)), maskable.size)
    if n_missing > 0:
        # Sample WITHOUT replacement: drawing with replacement repeats
        # positions, which silently lowers the realised missing rate.
        na_id = np.random.choice(maskable.size, n_missing, replace=False)
        maskable[np.unravel_index(na_id, maskable.shape)] = np.nan
    return X_out

#Algorithms

In [None]:
#For single class
def sig_estimate(X,mus0,mus1):
  """Estimate both variances and the covariance of one feature pair with NaNs.

  Parameters
  ----------
  X : array whose row 0 holds the first feature and row 1 the second, so
      that every column is one observation of the pair.
  mus0, mus1 : values used to centre the first / second feature.

  Returns
  -------
  (sig11, sig22, sig12)
      sig11 and sig22 are the summed squared deviations over every
      observation where the respective feature is finite (and the partner
      is finite or NaN), divided by the number of such observations.
      sig12 is the root picked by `solving` for the cubic built from the
      complete-pair sums, using the clamped complete-pair covariance
      s12/(m-1) as the reference value.
  """
  m = n = l = 0                 # counts: both observed / only first / only second
  s11 = s12 = s22 = 0           # sums over complete pairs only
  sig11 = sig22 = 0             # sums over every observation of each feature
  for obs in X.T:
    x0, x1 = obs[0], obs[1]
    seen0, seen1 = np.isfinite(x0), np.isfinite(x1)
    if seen0 and seen1:
      d0, d1 = x0 - mus0, x1 - mus1
      m += 1
      s11 += d0**2
      s22 += d1**2
      s12 += d0*d1
      sig11 += d0**2
      sig22 += d1**2
    elif seen0 and np.isnan(x1):
      n += 1
      sig11 += (x0 - mus0)**2
    elif seen1 and np.isnan(x0):
      l += 1
      sig22 += (x1 - mus1)**2
  # The reference for root selection is the complete-pair cross-product sum
  # (identical to s12), scaled by m-1 and clamped at zero.
  del_case = max(s12/(m-1), 0)
  sig11 = sig11/(m+n)
  sig22 = sig22/(m+l)
  cubic = (-m, s12, (m*sig11*sig22-s22*sig11-s11*sig22), s12*sig11*sig22)
  sig12 = solving(cubic[0], cubic[1], cubic[2], cubic[3], del_case)
  return sig11, sig22, sig12

def DPER(X):
  """Estimate the covariance matrix of X (rows = samples) in the presence of NaNs.

  Column means are taken with np.nanmean. Every pair of columns (b < a) is
  passed to `sig_estimate`, whose three return values fill sig[b, b],
  sig[a, a] and the symmetric off-diagonal entries. Diagonal entries are
  therefore only written when X has at least two columns.
  """
  p = X.shape[1]
  sig = np.zeros((p, p))          # estimated covariance matrix
  mu = np.nanmean(X, axis=0)      # estimated mean of each column
  for a in range(1, p):
    for b in range(a):
      pair = np.array([X[:, b], X[:, a]])
      var_b, var_a, cov_ba = sig_estimate(pair, mu[b], mu[a])
      sig[b, b] = var_b
      sig[a, a] = var_a
      sig[a, b] = sig[b, a] = cov_ba
  return sig

In [None]:
#For multiclass (X,y) where y is a class
def sig_estimate_multi(X,mu0,mu1,y):
  """Pooled variance/covariance estimate of one feature pair across classes.

  Parameters
  ----------
  X : array whose row 0 holds the first feature and row 1 the second, so
      that every column is one observation of the pair.
  mu0, mu1 : sequences indexed by class label giving the value used to
      centre the first / second feature within that class.
  y : class label of every observation. Labels are looked up as
      0 .. len(np.unique(y)) - 1.

  Returns
  -------
  (sig11, sig22, sig12)
      sig11 and sig22 are pooled sums of squared within-class deviations
      divided by the number of contributing observations. sig12 is the
      root picked by `solving` for the cubic built from the pooled
      complete-pair sums.
  """
  res=np.array([0]*8)  # pooled [m,n,l,s11,s12,s22,sig11,sig22]
  numlabel=len(np.unique(y))
  for g in range(numlabel):
    # Per-class tallies: m = both observed, n = only first, l = only second.
    m=n=l=s11=s12=s22=sig11=sig22=0
    mus0=mu0[g]
    mus1=mu1[g]
    Xg=(X.T)[y==g]
    for i in Xg:
      if np.isfinite(i[0]) and np.isfinite(i[1]):
        m=m+1
        s11=s11+(i[0]-mus0)**2
        s22=s22+(i[1]-mus1)**2
        s12=s12+(i[0]-mus0)*(i[1]-mus1)
        sig11=sig11+(i[0]-mus0)**2
        sig22=sig22+(i[1]-mus1)**2
      elif np.isfinite(i[0]) and np.isnan(i[1]):
        n=n+1
        sig11=sig11+(i[0]-mus0)**2
      elif np.isnan(i[0]) and np.isfinite(i[1]):
        l=l+1
        sig22=sig22+(i[1]-mus1)**2
    res = res+np.array([m,n,l,s11,s12,s22,sig11,sig22])
  m,n,l,s11,s12,s22,sig11,sig22 = res
  # Reference value for choosing among the real roots: the clamped
  # complete-pair covariance, exactly as in sig_estimate. Previously this
  # value was never accumulated and therefore always 0, so the root nearest
  # to zero was chosen regardless of the data. With fewer than two complete
  # pairs no such estimate exists, so fall back to 0.
  del_case = max(0,s12/(m-1)) if m > 1 else 0
  sig11=sig11/(m+n)
  sig22=sig22/(m+l)
  sig12=solving(-m,s12,(m*sig11*sig22-s22*sig11-s11*sig22),s12*sig11*sig22,del_case)
  return sig11,sig22,sig12

def DPERmulticlass(X,y):            #with assumption of equal covariance matrices
  """Estimate one covariance matrix shared by all classes from data with NaNs.

  X holds samples in rows; y holds the class label of each row, with labels
  looked up as 0 .. len(np.unique(y)) - 1. Per-class column means come from
  np.nanmean. Every pair of columns (b < a) is passed to
  `sig_estimate_multi`, whose three return values fill sig[b, b], sig[a, a]
  and the symmetric off-diagonal entries.
  """
  n_classes = len(np.unique(y))     # number of unique labels in y
  p = X.shape[1]
  sig = np.zeros((p, p))            # estimated covariance matrix
  # Row g of mu is the NaN-ignoring column mean of class g.
  class_means = [np.nanmean(X[y == g], axis=0) for g in range(n_classes)]
  mu = np.array(class_means)
  for a in range(1, p):
    for b in range(a):
      pair = np.array([X[:, b], X[:, a]])
      var_b, var_a, cov_ba = sig_estimate_multi(pair, mu[:, b], mu[:, a], y)
      sig[b, b] = var_b
      sig[a, a] = var_a
      sig[a, b] = sig[b, a] = cov_ba
  return sig

#Ionosphere dataset

In [None]:
def experiments(X, y, run_time, missing_rate):
    """Compare covariance-estimation errors of DPER and three imputers.

    For each of `run_time` repetitions, NaNs are injected into X with
    `generate_nan`, the data is imputed with IterativeImputer, MissForest and
    SoftImpute, and covariance estimates are scored with `error` against the
    covariances of the complete X.

    Parameters
    ----------
    X : complete data, samples in rows.
    y : class label per row; labels are looked up as 0 .. G-1.
    run_time : number of repetitions.
    missing_rate : passed through to `generate_nan`.

    Returns
    -------
    (err_not, err_equal)
        Two lists with one length-4 array per repetition, ordered
        [DPER, MICE, MissForest, Soft-Impute]. err_not scores the stacked
        per-class covariances, err_equal scores the pooled covariance.
    """
    G = len(np.unique(y))
    classes = range(G)

    def per_class_cov(Z):
        # One sample covariance matrix per class, stacked along axis 0.
        return np.array([np.cov(Z[y==g], rowvar = False) for g in classes])

    def pooled_cov(Z):
        # Class covariances weighted by (class size - 1), divided by len(y) - G.
        weighted = [(sum(y==g) - 1)*np.cov(Z[y==g], rowvar = False) for g in classes]
        return sum(weighted)/(len(y) - G)

    # Reference covariances from the complete data.
    S0 = pooled_cov(X)
    S2 = per_class_cov(X)

    err_not, err_equal = [], []
    for _ in range(run_time):
        Xnan = generate_nan(X, missing_rate)

        # Impute data (call order kept: MICE, MissForest, Soft-Impute)
        XMice = IterativeImputer(max_iter = 10).fit(Xnan).transform(Xnan)
        XMiss = MissForest().fit_transform(pd.DataFrame.from_records(Xnan)).to_numpy()
        XSoft = SoftImpute(max_iters = 10, verbose = False).fit_transform(Xnan)
        imputed = (XMice, XMiss, XSoft)

        # Not assuming equal covariance matrices
        SDper2 = np.array([DPER(Xnan[y==g]) for g in classes])
        scores_not = [error(S2, SDper2)] + [error(S2, per_class_cov(Z)) for Z in imputed]
        err_not.append(np.array(scores_not))

        # Assuming equal covariance matrices
        SDper = DPERmulticlass(Xnan,y)
        scores_equal = [error(S0, SDper)] + [error(S0, pooled_cov(Z)) for Z in imputed]
        err_equal.append(np.array(scores_equal))

    return err_not, err_equal

In [None]:
# Download the ionosphere data and convert it to a NumPy array straight away.
data = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
    sep = ",", header = None,
).to_numpy()

# Columns 0-33 are read as float features, column 34 as the class label.
X = data[:,:34].astype(np.float64)
y = data[:,34]

# Encode the labels as integers 0 .. G-1.
le2 = LabelEncoder()
y = le2.fit_transform(y)
G = len(np.unique(y))

# Drop the first two feature columns.
# NOTE(review): presumably because they carry little information - confirm.
X = np.delete(X,[0,1], axis = 1)

# Show the number of samples per class, then the shape of the feature matrix.
for g in range(G):
  print(sum(y==g))
X.shape

126
225


(351, 32)

In [None]:
# Number of repetitions passed to every experiments() call below.
run_time = 100
# Each call returns (errors without / with the equal-covariance assumption);
# the numeric suffix is the missing rate in percent.
# NOTE(review): no random seed is set in this cell, so the results depend on
# the global RNG state and on the order of these calls (80, 20, 40, 60).
e80,ee80 = experiments(X,y,run_time,.8)
e20,ee20 = experiments(X,y,run_time,.2)
e40,ee40 = experiments(X,y,run_time,.4)
e60,ee60 = experiments(X,y,run_time,.6)

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.582120
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score -0.321920
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.569710
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score -0.296740
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.369460
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data poi

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.582120
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score -0.321920
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.569710
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score -0.296740
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.369460
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data poi

In [None]:
# 4x4 frame of " ± " separators; its index/columns label every table below.
pm = pd.DataFrame(np.full((4, 4), " ± "),
                  index = ["20%", "40%", "60%", "80%"],
                  columns = ["DPER", "MICE", "MissForest", "Soft-Impute"])


def _percent_table(stat, runs):
    """Apply stat(run, axis=0)*100 to each run and tabulate the rounded rows."""
    rows = np.vstack([stat(run, axis = 0)*100 for run in runs])
    return pd.DataFrame(rows.round(3), index = pm.index, columns = pm.columns)


# Mean and standard deviation over repetitions, one row per missing rate.
print('Without the assumption of equal covariance matrices')
not_err = _percent_table(np.mean, (e20, e40, e60, e80))
not_std = _percent_table(np.std, (e20, e40, e60, e80))
print(not_err.astype(str)+pm+not_std.astype(str))
print("\n")
print('Under the assumption of equal covariance matrices')
equal_err = _percent_table(np.mean, (ee20, ee40, ee60, ee80))
equal_std = _percent_table(np.std, (ee20, ee40, ee60, ee80))
print(equal_err.astype(str)+pm+equal_std.astype(str))

Without the assumption of equal covariance matrices
              DPER           MICE     MissForest    Soft-Impute
20%  0.049 ± 0.002  0.058 ± 0.004  0.082 ± 0.003  0.059 ± 0.002
40%  0.078 ± 0.003  0.073 ± 0.003  0.127 ± 0.003  0.103 ± 0.002
60%  0.107 ± 0.004  0.093 ± 0.004  0.158 ± 0.003  0.139 ± 0.003
80%  0.141 ± 0.005  0.121 ± 0.006  0.181 ± 0.002  0.168 ± 0.002


Under the assumption of equal covariance matrices
              DPER           MICE     MissForest    Soft-Impute
20%  0.039 ± 0.002  0.043 ± 0.003  0.098 ± 0.004  0.062 ± 0.002
40%  0.062 ± 0.002  0.058 ± 0.003  0.159 ± 0.004  0.121 ± 0.003
60%  0.084 ± 0.003  0.084 ± 0.005  0.201 ± 0.004   0.17 ± 0.004
80%   0.11 ± 0.004  0.127 ± 0.012  0.231 ± 0.003  0.209 ± 0.004
