# Libraries and functions

In [None]:
!pip install scikit-learn
!pip install fancyimpute
!pip install DistributedMissForest
!pip install MissForest

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29879 sha256=f60ab6637e96ae4a038ce57f1a6114e476b78a949c2580afa99c9e3c6432e54d
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-

In [None]:
import numpy as np
import pandas as pd
import time
import math

import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split
from sklearn.impute import IterativeImputer

from scipy import stats
from fancyimpute import SoftImpute
from missforest.missforest import MissForest
from numpy.linalg import norm, inv

In [None]:
#Covariance matrix err
def error(sig, sig_est):
    """Return the size-normalised Euclidean distance between two matrices.

    Both arrays are flattened, the 2-norm of their element-wise difference
    is taken, and the result is divided by the number of entries in ``sig``.

    Parameters
    ----------
    sig : np.ndarray
        Reference matrix.
    sig_est : np.ndarray
        Estimated matrix with the same number of entries as ``sig``.

    Returns
    -------
    float
        ``||sig_est - sig||_2 / sig.size``.
    """
    difference = sig_est.flatten() - sig.flatten()
    return np.linalg.norm(difference) / sig.size

def normalize_data(X):
    """Standardise ``X`` with a ``StandardScaler`` fitted on ``X`` itself.

    Parameters
    ----------
    X : array-like
        Data passed unchanged to ``StandardScaler.fit`` and ``transform``.

    Returns
    -------
    The array returned by ``StandardScaler.transform(X)``.
    """
    # ``fit`` returns the fitted scaler, so the two calls can be chained.
    return StandardScaler().fit(X).transform(X)

def generate_nan(X, missing_rate):
    """Return a copy of ``X`` with a fraction of its entries replaced by NaN.

    The first row is always kept fully observed; NaNs are placed only in
    rows ``1..n-1``. Exactly ``round(missing_rate * size)`` distinct entries
    of those rows are set to NaN, where ``size`` is their total entry count.
    Positions are drawn from NumPy's global random state.

    Parameters
    ----------
    X : np.ndarray
        2-D data array. Integer and boolean arrays are promoted to float so
        that NaN can be stored; ``X`` itself is never modified.
    missing_rate : float
        Fraction of the entries in rows ``1..n-1`` to remove, in ``[0, 1]``.

    Returns
    -------
    np.ndarray
        Array with the same shape as ``X`` containing the NaNs.

    Raises
    ------
    ValueError
        If ``missing_rate`` lies outside ``[0, 1]``.
    """
    if not 0 <= missing_rate <= 1:
        raise ValueError("missing_rate must be between 0 and 1")

    X_copy = np.copy(X)
    if X_copy.dtype.kind in "biu":
        # NaN cannot be stored in an integer/boolean array.
        X_copy = X_copy.astype(float)

    X_non_missing = X_copy[:1, :]
    X_missing = X_copy[1:, :]

    n_missing = int(round(missing_rate * X_missing.size))
    # Sample WITHOUT replacement: drawing indices with replacement produces
    # duplicates, so fewer than ``n_missing`` entries would actually be
    # removed and the achieved missing rate would fall below the request.
    na_id = np.random.choice(X_missing.size, n_missing, replace=False)

    X_nan = X_missing.flatten()
    X_nan[na_id] = np.nan
    return np.vstack((X_non_missing, X_nan.reshape(X_missing.shape)))

In [None]:
#Single_class
def diag_term(X,i):
    """Variance (``np.var``, i.e. ddof=0) of the non-NaN entries of column ``i``.

    Parameters
    ----------
    X : np.ndarray
        2-D data array that may contain NaN.
    i : int
        Index of the column to summarise.

    Returns
    -------
    float
        ``np.var`` of the observed values of ``X[:, i]``.
    """
    column = X[:, i].flatten()
    observed = column[np.logical_not(np.isnan(column))]
    return np.var(observed)

def DPER(X):
    """Estimate a covariance matrix from single-class data containing NaNs.

    Diagonal entries are the ddof=0 variances of the observed values of each
    feature. For every feature pair ``(i, j)`` the off-diagonal entry is a
    real root of a cubic polynomial built from the rows in which both
    features are observed. When several real roots exist, the one with the
    largest ``eta`` score is kept; remaining ties are resolved by taking the
    root closest to the sample covariance of the jointly observed rows.

    Parameters
    ----------
    X : array-like, shape (n, p)
        Data matrix; missing values are encoded as NaN.

    Returns
    -------
    np.ndarray, shape (p, p)
        Symmetric covariance estimate. An off-diagonal entry is 0 when
        either variance is not strictly positive (zero or NaN) or when the
        two features are never observed together.

    Raises
    ------
    ValueError
        If the cubic yields no real root for some pair.
    """
    X = np.asarray(X, dtype=float)
    epsilon = 1e-5  # |imag| or |root| below this is treated as exactly 0
    p = X.shape[1]
    mus = np.nanmean(X, axis=0)

    # Diagonal: variance of the observed entries of each column.
    S = np.diag([np.var(X[:, k][~np.isnan(X[:, k])]) for k in range(p)])

    for i in range(p):
        for j in range(i):
            # A zero (constant feature) or NaN (never observed) variance
            # gives no usable covariance; the entry stays at 0.
            if not (S[i, i] > 0 and S[j, j] > 0):
                S[i, j] = S[j, i] = 0.
                continue

            # Keep only the rows where both features are observed.
            pair = X[:, [i, j]]
            pair = pair[~np.isnan(pair).any(axis=1)]
            A = len(pair)
            if A == 0:
                # No joint observations: leave the covariance at 0 instead of
                # feeding NaN coefficients (variance of an empty slice) to
                # the root finder.
                continue

            s11 = A * np.var(pair[:, 0])
            s22 = A * np.var(pair[:, 1])
            s12 = np.sum((pair[:, 0] - mus[i]) * (pair[:, 1] - mus[j]))

            B = S[i, i] * S[j, j] * A - s22 * S[i, i] - s11 * S[j, j]
            r = np.roots([-A, s12, B, s12 * S[i, i] * S[j, j]])

            # Keep the (numerically) real roots and snap tiny ones to 0.
            r = np.real(r[np.abs(np.imag(r)) < epsilon])
            r[np.abs(r) < epsilon] = 0

            if len(r) > 1:
                # A negative conditional variance makes the log NaN; such
                # roots are excluded from the maximisation below.
                with np.errstate(divide="ignore", invalid="ignore"):
                    condi_var = S[j, j] - r**2 / S[i, i]
                    eta = (-A * np.log(condi_var)
                           - (S[j, j] - 2 * r / S[i, i] * s12
                              + r**2 / S[i, i]**2 * s11) / condi_var)
                valid = ~np.isnan(eta)
                if valid.any():
                    r = r[eta == eta[valid].max()]

            if len(r) > 1:
                if np.all(r == 0.0) or A < 2:
                    # All candidates coincide, or the sample covariance is
                    # undefined for a single row: take the first candidate.
                    r = r[:1]
                else:
                    # Select the root closest to the sample covariance.
                    w = np.cov(pair, rowvar=False)
                    r = r[[np.abs(r - w[0, 1]).argmin()]]

            if len(r) == 0:
                raise ValueError(
                    "no real root found for features %d and %d" % (i, j))

            # Store a scalar: assigning a length-1 array to a single element
            # is deprecated since NumPy 1.25.
            S[i, j] = S[j, i] = float(r[0])
    return S

In [None]:
#Multiclass
def diag_term_m(i,X,y):
    """Pooled within-class variance of the observed entries of column ``i``.

    For every class the ddof=0 variance of the non-NaN values of ``X[:, i]``
    is computed, weighted by the number of such values in that class, and
    the weighted sum is divided by the total number of observed values.

    Rows are grouped by comparing labels, so the result does not depend on
    the order of the rows (the data need not be sorted by class), and a
    class with no observed value in this column is simply skipped.

    Parameters
    ----------
    i : int
        Column index.
    X : np.ndarray, shape (n, p)
        Data matrix; missing values are encoded as NaN.
    y : array-like, shape (n,)
        Class label of every row.

    Returns
    -------
    float
        The pooled variance, or NaN when the column has no observed value.
    """
    y = np.asarray(y)
    column = X[:, i]
    observed = ~np.isnan(column)
    values, labels = column[observed], y[observed]
    if labels.size == 0:
        return np.nan

    weighted = 0.0
    for label in np.unique(labels):
        group = values[labels == label]
        weighted += group.size * np.var(group)
    return weighted / labels.size

def DPERm(X,y):
    """Estimate a covariance matrix shared by all classes from data with NaNs.

    Diagonal entries are pooled within-class ddof=0 variances of the
    observed values. For every feature pair ``(i, j)`` the off-diagonal
    entry is a real root of a cubic polynomial built from the rows in which
    both features are observed, centred with the per-class means. When
    several real roots exist, the one with the largest ``eta`` score is
    kept; remaining ties are resolved by taking the root closest to the
    pooled sample covariance of the jointly observed rows.

    Rows are assigned to classes by comparing labels, so the rows need not
    be sorted by class and labels need not be ``0..G-1``.

    NOTE(review): the tie-break reference is divided by the number of
    jointly observed rows so that it is on the covariance scale, matching
    ``np.cov`` in the single-class ``DPER``. The earlier code compared the
    roots against the unnormalised sum ``sum_g m_g * cov_g``.

    Parameters
    ----------
    X : array-like, shape (n, p)
        Data matrix; missing values are encoded as NaN.
    y : array-like, shape (n,)
        Class label of every row.

    Returns
    -------
    np.ndarray, shape (p, p)
        Symmetric covariance estimate. An off-diagonal entry is 0 when
        either variance is not strictly positive (zero or NaN) or when the
        two features are never observed together.

    Raises
    ------
    ValueError
        If the cubic yields no real root for some pair.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    epsilon = 1e-5  # |imag| or |root| below this is treated as exactly 0
    p = X.shape[1]

    class_masks = [y == label for label in np.unique(y)]
    observed = ~np.isnan(X)

    # Class means: column g holds the mean vector of the g-th class.
    mus = np.array([np.nanmean(X[mask], axis=0) for mask in class_masks]).T

    def pooled_variance(k):
        # Count-weighted average of the per-class variances of column k.
        total, count = 0.0, 0
        for mask in class_masks:
            values = X[mask & observed[:, k], k]
            if values.size:
                total += values.size * np.var(values)
                count += values.size
        return total / count if count else np.nan

    S = np.diag([pooled_variance(k) for k in range(p)])

    for i in range(p):
        for j in range(i):
            # A zero or NaN variance gives no usable covariance (and would
            # divide by zero below); the entry stays at 0.
            if not (S[i, i] > 0 and S[j, j] > 0):
                continue

            # Rows where both features are observed, split by class.
            complete = observed[:, i] & observed[:, j]
            A = int(np.count_nonzero(complete))
            if A == 0:
                continue
            groups = [X[mask & complete][:, [i, j]] for mask in class_masks]

            # Sums of squares / cross-products around the class means.
            s11 = s22 = s12 = 0.0
            for g, rows in enumerate(groups):
                centred = rows - mus[[i, j], g]
                s11 += np.dot(centred[:, 0], centred[:, 0])
                s22 += np.dot(centred[:, 1], centred[:, 1])
                s12 += np.dot(centred[:, 0], centred[:, 1])

            B = S[i, i] * S[j, j] * A - s22 * S[i, i] - s11 * S[j, j]
            r = np.roots([-A, s12, B, s12 * S[i, i] * S[j, j]])

            # Keep the (numerically) real roots and snap tiny ones to 0.
            r = np.real(r[np.abs(np.imag(r)) < epsilon])
            r[np.abs(r) < epsilon] = 0

            if len(r) > 1:
                # A negative conditional variance makes the log NaN; such
                # roots are excluded from the maximisation below.
                with np.errstate(divide="ignore", invalid="ignore"):
                    condi_var = S[j, j] - r**2 / S[i, i]
                    eta = (-A * np.log(condi_var)
                           - (S[j, j] - 2 * r / S[i, i] * s12
                              + r**2 / S[i, i]**2 * s11) / condi_var)
                valid = ~np.isnan(eta)
                if valid.any():
                    r = r[eta == eta[valid].max()]

            if len(r) > 1:
                # Select the root closest to the pooled sample covariance of
                # the complete rows; classes with fewer than two complete
                # rows have no sample covariance and are skipped.
                scatter = sum(len(rows) * np.cov(rows, rowvar=False)[0, 1]
                              for rows in groups if len(rows) > 1)
                r = r[[np.abs(r - scatter / A).argmin()]]

            if len(r) == 0:
                raise ValueError(
                    "no real root found for features %d and %d" % (i, j))

            # Store a scalar: assigning a length-1 array to a single element
            # is deprecated since NumPy 1.25.
            S[i, j] = S[j, i] = float(r[0])
    return S

In [None]:
def measure_time_not_assume(X, y, run_time, missing_rate):
    """Time per-class covariance estimation (no equal-covariance assumption).

    For each of ``run_time`` repetitions a fresh NaN pattern is generated
    with ``generate_nan`` and four approaches are timed on it, in this
    order: DPER applied to each class, then IterativeImputer (MICE),
    MissForest and SoftImpute, each followed by a per-class ``np.cov``.
    The covariance estimates themselves are discarded; only the elapsed
    times are kept.

    Parameters
    ----------
    X : np.ndarray, shape (n, p)
        Fully observed data matrix.
    y : np.ndarray, shape (n,)
        Class labels; rows of class ``g`` are selected with ``y == g`` for
        ``g`` in ``range(G)``, so labels are expected to be ``0..G-1``.
    run_time : int
        Number of repetitions to average over.
    missing_rate : float
        Missing rate forwarded to ``generate_nan``.

    Returns
    -------
    tuple of float
        Mean running time in seconds for (DPER, MICE, MissForest,
        SoftImpute).
    """
    G = len(np.unique(y))

    def per_class_cov(X_filled):
        # One sample covariance matrix per class of the imputed data.
        return [np.cov(X_filled[y == g], rowvar=False) for g in range(G)]

    def run_dper(Xnan):
        return [DPER(Xnan[y == g]) for g in range(G)]

    def run_mice(Xnan):
        XMice = IterativeImputer(max_iter=100).fit(Xnan).transform(Xnan)
        return per_class_cov(XMice)

    def run_missforest(Xnan):
        Xd = pd.DataFrame.from_records(Xnan)
        XMiss = MissForest().fit_transform(Xd).to_numpy()
        return per_class_cov(XMiss)

    def run_soft(Xnan):
        XSoft = SoftImpute(max_iters=100).fit_transform(Xnan)
        return per_class_cov(XSoft)

    methods = (run_dper, run_mice, run_missforest, run_soft)
    timings = [[] for _ in methods]

    for _ in range(run_time):
        Xnan = generate_nan(X, missing_rate)
        for elapsed, method in zip(timings, methods):
            # perf_counter is monotonic and high-resolution; time.time()
            # follows the wall clock, which can be adjusted mid-measurement.
            start = time.perf_counter()
            method(Xnan)
            elapsed.append(time.perf_counter() - start)

    return tuple(np.mean(elapsed) for elapsed in timings)

In [None]:
def measure_time_assume(X, y, run_time, missing_rate):
    """Time pooled covariance estimation (equal covariance across classes).

    For each of ``run_time`` repetitions a fresh NaN pattern is generated
    with ``generate_nan`` and four approaches are timed on it, in this
    order: ``DPERm``, then IterativeImputer (MICE), MissForest and
    SoftImpute, each followed by a pooled covariance
    ``sum_g (n_g - 1) * cov_g / (n - G)`` of the imputed data. The
    covariance estimates themselves are discarded; only the elapsed times
    are kept.

    Parameters
    ----------
    X : np.ndarray, shape (n, p)
        Fully observed data matrix.
    y : np.ndarray, shape (n,)
        Class labels; rows of class ``g`` are selected with ``y == g`` for
        ``g`` in ``range(G)``, so labels are expected to be ``0..G-1``.
    run_time : int
        Number of repetitions to average over.
    missing_rate : float
        Missing rate forwarded to ``generate_nan``.

    Returns
    -------
    tuple of float
        Mean running time in seconds, rounded to 3 decimals, for (DPER,
        MICE, MissForest, SoftImpute).
    """
    G = len(np.unique(y))

    def pooled_cov(X_filled):
        # Pooled within-class covariance of the imputed data. The class
        # sizes are counted with NumPy rather than the Python builtin sum,
        # which would loop over every label inside the timed region.
        scatter = sum((np.count_nonzero(y == g) - 1)
                      * np.cov(X_filled[y == g], rowvar=False)
                      for g in range(G))
        return scatter / (len(y) - G)

    # Assume equal covariance matrices
    def run_dper(Xnan):
        return DPERm(Xnan, y)

    def run_mice(Xnan):
        XMice = IterativeImputer(max_iter=10).fit(Xnan).transform(Xnan)
        return pooled_cov(XMice)

    def run_missforest(Xnan):
        Xd = pd.DataFrame.from_records(Xnan)
        XMiss = MissForest().fit_transform(Xd).to_numpy()
        return pooled_cov(XMiss)

    def run_soft(Xnan):
        XSoft = SoftImpute(max_iters=10).fit_transform(Xnan)
        return pooled_cov(XSoft)

    methods = (run_dper, run_mice, run_missforest, run_soft)
    timings = [[] for _ in methods]

    for _ in range(run_time):
        Xnan = generate_nan(X, missing_rate)
        for elapsed, method in zip(timings, methods):
            # perf_counter is monotonic and high-resolution; time.time()
            # follows the wall clock, which can be adjusted mid-measurement.
            start = time.perf_counter()
            method(Xnan)
            elapsed.append(time.perf_counter() - start)

    return tuple(np.mean(elapsed).round(3) for elapsed in timings)

# Without assuming equal covariance matrices

In [None]:
# Load the semicolon-separated data set and split it into continuous
# features (X), categorical features (Z) and the class column (y).
df = pd.read_csv('/content/bank-additional.csv', delimiter=';')
M = df.values

continuous_cols = [0, 10, 11, 12, 13, 15, 16, 17, 18, 19]   # 10 continuous features
categorical_cols = [1, 2, 3, 4, 5, 6, 7, 8, 9, 14]          # 10 categorical features,
                                                            # number of labels: [12,4,8,3,3,3,2,10,5,3]
class_col = 20                                              # class

X = M[:, continuous_cols]
Z = M[:, categorical_cols]
y = M[:, class_col]

# Replace every categorical column, and the class column, by integer codes.
label_encoder = LabelEncoder()
for i in range(Z.shape[1]):
    encoded_column = label_encoder.fit_transform(Z[:, i])
    Z[:, i] = encoded_column
y = label_encoder.fit_transform(y)

# Standardise the continuous features.
X = normalize_data(X)
print(X.shape)

(4119, 10)


In [None]:
# Average running times over `run_time` repetitions for each missing rate;
# one row of `running_time` per rate, in the order listed here.
run_time = 10
missing_rates = (.2, .4, .6, .8)
running_time = np.vstack([measure_time_not_assume(X, y, run_time, rate)
                          for rate in missing_rates])



[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 385
[LightGBM] [Info] Number of data points in the train set: 558, number of used features: 9
[LightGBM] [Info] Start training from score 0.048934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 381
[LightGBM] [Info] Number of data points in the train set: 558, number of used features: 9
[LightGBM] [Info] Start training from score -0.052303
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000019 seconds.
You can set `force_r



[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000021 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 552, number of used features: 9
[LightGBM] [Info] Start training from score 0.006467
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 362
[LightGBM] [Info] Number of data points in the train set: 552, number of used features: 9
[LightGBM] [Info] Start training from score 0.046591
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000019 seconds.
You can set `force_ro



[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 383
[LightGBM] [Info] Number of data points in the train set: 563, number of used features: 9
[LightGBM] [Info] Start training from score 0.071609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000105 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 379
[LightGBM] [Info] Number of data points in the train set: 563, number of used features: 9
[LightGBM] [Info] Start training from score -0.107440
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi



[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 3, number of used features: 0
[LightGBM] [Info] Start training from score 0.201031
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 3, number of used features: 0
[LightGBM] [Info] Start training from score -0.351356
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 3, number of used features: 0
[LightGBM] [Info] Start training from score 0.158908
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 3, number of used features: 0
[LightGBM] [Info] Start training from score 0.036187
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 3, number of used features: 0
[LightGBM] [Info] Start training from score -0.188847
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data poi

In [None]:
# Rows follow the missing rates used to build `running_time` (.2, .4, .6, .8);
# columns are the mean running times in seconds for DPER, MICE, MissForest and
# Soft-Impute, in the order returned by measure_time_not_assume.
print(running_time)

[[0.05397489 5.16724041 3.30919597 0.49021058]
 [0.11564565 5.82778811 1.35159357 0.5671958 ]
 [0.0511586  5.78052719 1.32094076 0.55225761]
 [0.1446218  5.46503675 1.00803263 0.56611912]]
