# Library and functions

In [None]:
!pip install scikit-learn
!pip install fancyimpute
!pip install DistributedMissForest
!pip install MissForest
!pip install pandas
!pip install git+https://github.com/maianhpuco/DIMVImputation.git

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29879 sha256=564211c9a795e574de6252cde9590ce56329aab0db9e97195c6ce706c85b64c5
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-

In [None]:
from DIMVImputation import DIMVImputation
import numpy as np
import pandas as pd
import time
import math

import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split
from sklearn.impute import IterativeImputer

from scipy import stats
from fancyimpute import SoftImpute
from missforest.missforest import MissForest
from numpy.linalg import norm, inv

In [None]:
def error(sig, sig_est):
  """Return the Euclidean norm of (sig_est - sig), divided by the number of elements of sig."""
  residual = sig_est.ravel() - sig.ravel()
  return np.linalg.norm(residual) / sig.size

def normalize_data(X):
  """Fit a fresh StandardScaler on X and return X transformed by it."""
  return StandardScaler().fit(X).transform(X)

def generate_nan(X, missing_rate):
    """Return a float copy of X with a fraction of its entries replaced by NaN.

    The first row is always left complete.  Among the remaining rows exactly
    ``round(missing_rate * number_of_entries)`` distinct positions are set to
    NaN.  Positions are drawn WITHOUT replacement, so the realised missing
    rate equals the requested one (drawing with replacement lets duplicate
    positions collapse and silently lowers the rate).

    Parameters
    ----------
    X : array-like, 2-D
        Input data; it is not modified.
    missing_rate : float
        Fraction (0..1) of the entries below the first row to blank out.

    Returns
    -------
    ndarray of floats with the same shape as X.
    """
    # Float copy: NaN cannot be stored in an integer array.
    X_copy = np.array(X, dtype=float)

    X_non_missing = X_copy[:1, :]
    X_missing = X_copy[1:, :]

    # Clamp so an out-of-range rate cannot request more cells than exist.
    n_missing = int(round(missing_rate * X_missing.size))
    n_missing = max(0, min(n_missing, X_missing.size))

    X_nan = X_missing.flatten()  # flatten() always returns a copy
    if n_missing > 0:
        # Uses the global NumPy RNG, so np.random.seed(...) still controls it.
        na_id = np.random.choice(X_missing.size, n_missing, replace=False)
        X_nan[na_id] = np.nan

    return np.vstack((X_non_missing, X_nan.reshape(X_missing.shape)))

In [None]:
#Single_class
def diag_term(X,i):
  """Return the variance (NumPy default, ddof=0) of the non-NaN entries of column i of X."""
  column = X[:, i].flatten()
  observed = column[np.logical_not(np.isnan(column))]
  return np.var(observed)

def DPER(X):
    """Estimate a covariance matrix from single-class data containing NaNs.

    The diagonal holds each feature's variance over its observed entries
    (``diag_term``).  Every off-diagonal entry is picked among the real roots
    of a cubic whose coefficients are built from the rows in which both
    features are observed.

    Parameters
    ----------
    X : 2-D float ndarray
        Rows are samples, columns are features; NaN marks a missing entry.

    Returns
    -------
    ndarray of shape (p, p), symmetric, where p = X.shape[1].
    """
    epsilon = 1e-5  # roots with |imag| >= epsilon are dropped; |r| < epsilon is snapped to 0
    p = X.shape[1]
    mus = np.nanmean(X, axis=0)
    S = np.diag([diag_term(X, i) for i in range(p)])

    for i in range(p):
        for j in range(i):
            # A feature with zero variance gets zero covariance with everything.
            if S[i, i] == 0. or S[j, j] == 0.:
                S[i, j] = S[j, i] = 0.
                continue

            # Keep only the rows where both features are observed.
            mat = X[:, [i, j]]
            mat = mat[~np.isnan(mat).any(axis=1)]
            A = len(mat)

            s11 = A * np.var(mat[:, 0])
            s22 = A * np.var(mat[:, 1])
            s12 = np.sum((mat[:, 0] - mus[i]) * (mat[:, 1] - mus[j]))
            B = S[i, i] * S[j, j] * A - s22 * S[i, i] - s11 * S[j, j]

            # Candidate covariances: real roots of the cubic.
            r = np.roots([-A, s12, B, s12 * S[i, i] * S[j, j]])
            r = np.real(r[np.abs(np.imag(r)) < epsilon])
            r[np.abs(r) < epsilon] = 0

            if len(r) > 1:
                # Rank the candidates by the score eta and keep the best one(s).
                condi_var = S[j, j] - r**2 / S[i, i]
                eta = -A * np.log(condi_var) - (S[j, j] - 2 * r / S[i, i] * s12 + r**2 / S[i, i]**2 * s11) / condi_var
                # eta is NaN where condi_var < 0; ignore those.  If every score
                # is NaN, fall through to the tie-break instead of crashing.
                valid = ~np.isnan(eta)
                if valid.any():
                    r = r[eta == eta[valid].max()]

            if len(r) > 1:
                if np.all(r == 0.0):
                    r = np.zeros(1)
                else:
                    # Tie-break: the root closest to the sample covariance.
                    w = np.cov(mat, rowvar=False)
                    r = r[[np.abs(r - w[0, 1]).argmin()]]

            # Store a scalar: assigning a size-1 array to a single cell is
            # deprecated in NumPy >= 1.25.
            S[i, j] = S[j, i] = r[0]
    return S

In [None]:
#Multiclass
def diag_term_m(i,X,y):
  """Return the pooled within-class variance of feature i over its observed entries.

  For every class the variance (ddof=0) of the non-NaN values of column i is
  computed, weighted by the number of such values, and the weighted sum is
  divided by the total number of observed values.

  Rows are assigned to classes with boolean masks on ``y``, so ``y`` does NOT
  need to be sorted by class.

  Parameters
  ----------
  i : int, column index.
  X : 2-D float ndarray, NaN marks missing entries.
  y : 1-D array of class labels, one per row of X.
  """
  y = np.asarray(y)
  column = X[:, i]
  observed = ~np.isnan(column)
  values, labels = column[observed], y[observed]

  pooled = 0.0
  for label in np.unique(y):
    in_class = values[labels == label]
    # A class with no observed value for this feature contributes nothing.
    if in_class.size:
      pooled += in_class.size * np.var(in_class)
  return pooled / len(labels)

def DPERm(X,y):
    """Estimate one covariance matrix shared by all classes from data with NaNs.

    The diagonal comes from ``diag_term_m``.  Every off-diagonal entry is
    picked among the real roots of a cubic built from the rows where both
    features are observed, with each row centred by the mean of its own class.

    Rows are matched to classes with boolean masks on ``y``; the rows may be in
    any order (they do not have to be grouped or sorted by class).

    Parameters
    ----------
    X : 2-D float ndarray, rows are samples; NaN marks a missing entry.
    y : 1-D array of class labels, one per row of X.

    Returns
    -------
    ndarray of shape (p, p), symmetric, where p = X.shape[1].
    """
    y = np.asarray(y)
    classes = np.unique(y)
    epsilon = 1e-5  # roots with |imag| >= epsilon are dropped; |r| < epsilon is snapped to 0
    p = X.shape[1]

    # Column k of mus is the NaN-aware mean vector of class classes[k].
    mus = np.array([np.nanmean(X[y == label, :], axis=0) for label in classes]).T

    S = np.diag([diag_term_m(i, X, y) for i in range(p)])

    for i in range(p):
        for j in range(i):
            # Same guard as DPER: zero variance implies zero covariance.
            if S[i, i] == 0. or S[j, j] == 0.:
                S[i, j] = S[j, i] = 0.
                continue

            # Keep only the rows where both features are observed.
            mat = X[:, [i, j]]
            idx = ~np.isnan(mat).any(axis=1)
            mat, y_arr = mat[idx], y[idx]
            A = len(y_arr)

            # Per-class row groups, each centred by its class mean.
            groups = [mat[y_arr == label] for label in classes]
            centred = [grp - mus[[i, j], k] for k, grp in enumerate(groups)]

            s11 = sum(np.dot(c[:, 0], c[:, 0]) for c in centred)
            s22 = sum(np.dot(c[:, 1], c[:, 1]) for c in centred)
            s12 = sum(np.dot(c[:, 0], c[:, 1]) for c in centred)

            B = S[i, i] * S[j, j] * A - s22 * S[i, i] - s11 * S[j, j]

            # Candidate covariances: real roots of the cubic.
            r = np.roots([-A, s12, B, s12 * S[i, i] * S[j, j]])
            r = np.real(r[np.abs(np.imag(r)) < epsilon])
            r[np.abs(r) < epsilon] = 0

            if len(r) > 1:
                condi_var = S[j, j] - r**2 / S[i, i]
                eta = -A * np.log(condi_var) - (S[j, j] - 2 * r / S[i, i] * s12 +
                                                r**2 / S[i, i]**2 * s11) / condi_var
                # eta is NaN where condi_var < 0; those candidates are ignored.
                # If every score is NaN, fall through to the tie-break.
                valid = ~np.isnan(eta)
                if valid.any():
                    r = r[eta == eta[valid].max()]

            if len(r) > 1:
                # Tie-break: the root closest to the count-weighted sum of the
                # per-class sample covariances.
                # NOTE(review): this sum is not divided by A, unlike the
                # single-class DPER tie-break which uses np.cov directly -
                # confirm whether a normalised value was intended.
                w = sum(len(grp) * np.cov(grp, rowvar=False) for grp in groups if len(grp))
                r = r[[np.abs(r - w[0, 1]).argmin()]]

            # Store a scalar: assigning a size-1 array to a single cell is
            # deprecated in NumPy >= 1.25.
            S[i, j] = S[j, i] = r[0]
    return S

In [None]:
def LDA(mus, S, Xtest, ytrain, ytest):
  """Classify Xtest with a linear discriminant rule and return the accuracy.

  Parameters
  ----------
  mus : sequence of G mean vectors; row g is the mean of class g.
  S : (p, p) covariance matrix shared by all classes.
  Xtest : (n, p) array of test samples.
  ytrain : training labels, used for the class priors.  Labels are assumed
      to be the integers 0..G-1 (priors are counted with ``ytrain == g``).
  ytest : true labels of Xtest.

  Returns
  -------
  Fraction of test samples whose predicted label equals ytest.
  """
  mus = np.asarray(mus, dtype=float)
  Xtest = np.asarray(Xtest, dtype=float)
  ytrain = np.asarray(ytrain)

  G = len(np.unique(ytrain))
  pi = np.array([np.sum(ytrain == g) for g in range(G)]) / len(ytrain)  # prior probabilities

  # Invert S once instead of once per (sample, class) pair.
  S_inv = np.linalg.inv(S)
  # np.linalg.det is used explicitly: a bare `det` is never imported in this notebook.
  class_terms = np.log(pi) - np.log(np.linalg.det(S)) / 2

  # diffs[n, g] = Xtest[n] - mus[g]; maha[n, g] is its quadratic form with S_inv.
  diffs = Xtest[:, None, :] - mus[None, :G, :]
  maha = np.einsum('ngp,pq,ngq->ng', diffs, S_inv, diffs)
  scores = class_terms[None, :] - maha / 2

  pred_label = np.argmax(scores, axis=1)
  return np.mean(np.asarray(ytest) == pred_label)

In [None]:
def QDA(mus, S, Xtest, ytrain, ytest):
  """Classify Xtest with a quadratic discriminant rule and return the accuracy.

  Parameters
  ----------
  mus : sequence of G mean vectors; row g is the mean of class g.
  S : sequence/array of G covariance matrices; S[g] belongs to class g.
  Xtest : (n, p) array of test samples.
  ytrain : training labels, used for the class priors.  Labels are assumed
      to be the integers 0..G-1 (priors are counted with ``ytrain == g``).
  ytest : true labels of Xtest.

  Returns
  -------
  Fraction of test samples whose predicted label equals ytest.
  """
  mus = np.asarray(mus, dtype=float)
  S = np.asarray(S, dtype=float)
  Xtest = np.asarray(Xtest, dtype=float)
  ytrain = np.asarray(ytrain)

  G = len(np.unique(ytrain))
  pi = np.array([np.sum(ytrain == g) for g in range(G)]) / len(ytrain)  # prior probabilities

  # Invert every class covariance once (inv/det operate on the stack of
  # matrices) instead of once per (sample, class) pair.
  S_inv = np.linalg.inv(S[:G])
  # np.linalg.det is used explicitly: a bare `det` is never imported in this notebook.
  class_terms = np.log(pi) - np.log(np.linalg.det(S[:G])) / 2

  # diffs[n, g] = Xtest[n] - mus[g]; maha[n, g] is its quadratic form with S_inv[g].
  diffs = Xtest[:, None, :] - mus[None, :G, :]
  maha = np.einsum('ngp,gpq,ngq->ng', diffs, S_inv, diffs)
  scores = class_terms[None, :] - maha / 2

  pred_label = np.argmax(scores, axis=1)
  return np.mean(pred_label == np.asarray(ytest))

# LDA, QDA vs MissForest

In [None]:
# Load the bank-marketing CSV (semicolon separated).
df=pd.read_csv('/content/bank-additional.csv',delimiter=';')
# Column layout (positional):
#   continuous : 0, 10, 11, 12, 13, 15, 16, 17, 18, 19
#   categorical: 1, 2, 3, 4, 5, 6, 7, 8, 9, 14
#   class      : 20
# NOTE(review): an earlier note here quoted class counts of 36548 / 4640
# (41188 rows in total), but this file yields X.shape == (4119, 10), so those
# counts do not describe the file loaded here - confirm the real counts.
M=df.values
X = normalize_data(M[:,[0, 10, 11, 12, 13, 15, 16, 17, 18, 19]]) #10 continuous features
Z = M[:,[1, 2, 3, 4, 5, 6, 7, 8, 9, 14]]      #10 categorical features 12 4 8 3 3 3 2 10 5 3
y = M[:,20]
# Replace every categorical column (and the class column) by integer codes.
label_encoder = LabelEncoder()
for i in range(Z.shape[1]):
    Z[:,i] = label_encoder.fit_transform(Z[:,i])
y = label_encoder.fit_transform(y)
# Class balance of categorical column 6, which the experiment cells use as the label.
print(sum(Z[:,6]==0))
print(sum(Z[:,6]==1))
print(X.shape)

2652
1467
(4119, 10)


In [None]:
def LDA_QDA_MF(X,y,missing_rate,runs):
  """Compare LDA (DPERm), QDA (DPER) and MissForest used as a classifier.

  For each run the training features get missing values injected with
  ``generate_nan``.  LDA/QDA are fitted from NaN-aware class means and the
  DPERm / per-class DPER covariance estimates.  MissForest is given the
  stacked train+test table with the test labels blanked out, and its imputed
  label column is rounded to the nearest valid class.

  Parameters
  ----------
  X : 2-D float ndarray of features.
  y : 1-D ndarray of labels, assumed to be the integers 0..G-1.
  missing_rate : float, passed to ``generate_nan``.
  runs : int, number of repetitions.

  Returns
  -------
  list with one ``[acc_LDA, acc_QDA, acc_mf]`` entry per run.

  Note: ``train_test_split`` is called with ``random_state=0`` in every run, so
  all runs share the same split; only the missingness pattern and any
  randomness inside MissForest differ between runs.
  """
  accuracy = []

  for _ in range(runs):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .4, random_state = 0)
    G = len(np.unique(y_train))
    X_trnan = generate_nan(X_train,missing_rate)

    # MissForest as classifier: append the label as the last column, leave it
    # NaN for the test rows, and read back what the imputer filled in.
    n_test = len(y_test)
    unknown = np.repeat(np.nan, n_test)
    XNaN = np.vstack((np.hstack((X_trnan,y_train.reshape(-1,1))),np.hstack((X_test,unknown.reshape(-1,1)))))
    Xd = pd.DataFrame.from_records(XNaN)
    mf = MissForest()
    Xmf = mf.fit_transform(Xd).to_numpy()
    imputed = Xmf[-n_test:, -1].astype(float)
    # Round half up and clip to the valid label range.  The previous
    # int(2*p) rule sent an imputed value of exactly 1.0 (or more) to label 2
    # and values <= -0.5 to label -1, which can never match a true label.
    pred_mf = np.clip(np.floor(imputed + 0.5), 0, G - 1).astype(int)
    acc_mf = np.mean(pred_mf == y_test)

    mus = [np.nanmean(X_trnan[y_train == g], axis = 0) for g in range(G)]

    # QDA: one covariance estimate per class.
    S = np.array([DPER(X_trnan[y_train == g]) for g in range(G)])
    acc_QDA = QDA(mus, S, X_test, y_train, y_test)

    # LDA: one covariance estimate shared by all classes.
    Sm = DPERm(X_trnan,y_train)
    acc_LDA = LDA(mus, Sm, X_test, y_train, y_test)
    accuracy.append([acc_LDA,acc_QDA,acc_mf])
  return accuracy

In [None]:
# Ten runs of the LDA / QDA / MissForest comparison at each missing rate,
# using categorical column 6 as the class label.
e20, e40, e60, e80 = [LDA_QDA_MF(X, Z[:,6], rate, 10) for rate in (.2, .4, .6, .8)]

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
[LightGBM] [Info] Start training from score 0.309990
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
[LightGBM] [Info] Start training from score -0.367551
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
[LightGBM] [Info] Start training from score -0.441992
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
[LightGBM] [Info] Start training from score 0.250000
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
[LightGBM] [Info] Start training from score -0.132235
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data po

In [None]:
# Format the accuracies as "mean ± std" per missing rate (rows) and method (columns).
rate_labels = ["20%", "40%", "60%", "80%"]
method_labels = ["LDA", "QDA", "MissForest"]
runs_per_rate = (e20, e40, e60, e80)

sign = pd.DataFrame(np.repeat(" ± ", 12).reshape(4, 3),
                    index = rate_labels,
                    columns = method_labels)

mean_rows = [np.mean(run_acc, axis = 0) for run_acc in runs_per_rate]
er = pd.DataFrame(np.vstack(mean_rows).round(3),
                  index = sign.index,
                  columns = sign.columns)

std_rows = [np.std(run_acc, axis = 0) for run_acc in runs_per_rate]
std = pd.DataFrame(np.vstack(std_rows).round(3),
                   index = sign.index,
                   columns = sign.columns)

print(er.astype(str)+sign+std.astype(str))

               LDA            QDA     MissForest
20%  0.917 ± 0.012  0.677 ± 0.135  0.775 ± 0.019
40%  0.872 ± 0.056   0.63 ± 0.115  0.769 ± 0.114
60%  0.827 ± 0.114  0.605 ± 0.079  0.605 ± 0.079
80%  0.766 ± 0.137  0.592 ± 0.085  0.605 ± 0.079


# LDA with DPER, MICE, MissForest, Soft-Impute

In [None]:
# Load the bank-marketing CSV (semicolon separated).
df=pd.read_csv('/content/bank-additional.csv',delimiter=';')
# Column layout (positional):
#   continuous : 0, 10, 11, 12, 13, 15, 16, 17, 18, 19
#   categorical: 1, 2, 3, 4, 5, 6, 7, 8, 9, 14
#   class      : 20
# NOTE(review): an earlier note here quoted class counts of 36548 / 4640
# (41188 rows in total), but this file yields X.shape == (4119, 10), so those
# counts do not describe the file loaded here - confirm the real counts.
M=df.values
X = normalize_data(M[:,[0, 10, 11, 12, 13, 15, 16, 17, 18, 19]]) #10 continuous features
Z = M[:,[1, 2, 3, 4, 5, 6, 7, 8, 9, 14]]      #10 categorical features 12 4 8 3 3 3 2 10 5 3
y = M[:,20]
# Replace every categorical column (and the class column) by integer codes.
label_encoder = LabelEncoder()
for i in range(Z.shape[1]):
    Z[:,i] = label_encoder.fit_transform(Z[:,i])
y = label_encoder.fit_transform(y)
# Class balance of categorical column 6, which the experiment cells use as the label.
print(sum(Z[:,6]==0))
print(sum(Z[:,6]==1))
print(X.shape)

In [None]:
def LDAmiss(X,y,missing_rate,runs):
  """Compare LDA accuracy across four ways of handling missing training data.

  Per run: inject NaNs into the training features, then obtain class means and
  a shared covariance either directly from the incomplete data (NaN-aware
  means + DPERm) or from data imputed by IterativeImputer (MICE), MissForest
  and SoftImpute (pooled within-class sample covariance).

  Returns a list with one [acc_DPER, acc_MICE, acc_MF, acc_SI] entry per run.
  """
  accuracy = []
  for _ in range(runs):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .4, random_state = 0)
    n_classes = len(np.unique(y_train))
    X_trnan = generate_nan(X_train,missing_rate)

    # Imputed copies of the training data, produced in the order MICE, MissForest, Soft-Impute.
    imputed = {}
    imputed["MICE"] = IterativeImputer(max_iter = 10).fit(X_trnan).transform(X_trnan)
    imputed["MissForest"] = MissForest().fit_transform(pd.DataFrame.from_records(X_trnan)).to_numpy()
    imputed["Soft-Impute"] = SoftImpute(max_iters = 10, verbose = False).fit_transform(X_trnan)

    class_masks = [y_train == g for g in range(n_classes)]

    # DPER route: statistics straight from the incomplete data.
    dper_means = [np.nanmean(X_trnan[mask], axis = 0) for mask in class_masks]
    dper_cov = DPERm(X_trnan,y_train)
    run_acc = [LDA(dper_means, dper_cov, X_test, y_train, y_test)]

    # Imputation routes: class means plus pooled (equal-covariance) estimate.
    for X_imp in imputed.values():
      class_means = [np.mean(X_imp[mask], axis = 0) for mask in class_masks]
      pooled_cov = sum([(sum(mask) - 1)*np.cov(X_imp[mask], rowvar = False)
                        for mask in class_masks])/(len(y_train) - n_classes)
      run_acc.append(LDA(class_means, pooled_cov, X_test, y_train, y_test))

    accuracy.append(run_acc)
  return accuracy

In [None]:
# Ten runs of the LDA comparison at each missing rate,
# using categorical column 6 as the class label.
a20, a40, a60, a80 = [LDAmiss(X, Z[:,6], missing_rate=rate, runs=10) for rate in (.2, .4, .6, .8)]

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Start training from score -0.403944
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Start training from score 0.201031
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Start training from score -0.351356
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Start training from score -0.182333
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Start training from score -0.143634
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data p

In [None]:
# Format the accuracies as "mean ± std" per missing rate (rows) and method (columns).
rate_labels = ["20%", "40%", "60%", "80%"]
method_labels = ["DPER", "MICE", "MissForest", "Soft-Impute"]
runs_per_rate = (a20, a40, a60, a80)

pm = pd.DataFrame(np.repeat(" ± ", 16).reshape(4, 4),
                  index = rate_labels,
                  columns = method_labels)
print('Accuracy')

mean_rows = [np.mean(run_acc, axis = 0) for run_acc in runs_per_rate]
acc_mean = pd.DataFrame(np.vstack(mean_rows).round(3),
                        index = rate_labels,
                        columns = method_labels)

std_rows = [np.std(run_acc, axis = 0) for run_acc in runs_per_rate]
acc_std = pd.DataFrame(np.vstack(std_rows).round(3),
                       index = pm.index,
                       columns = pm.columns)

print(acc_mean.astype(str)+pm+acc_std.astype(str))

Accuracy
              DPER           MICE     MissForest    Soft-Impute
20%  0.912 ± 0.006  0.904 ± 0.002  0.904 ± 0.006  0.909 ± 0.004
40%  0.867 ± 0.069  0.888 ± 0.023    0.9 ± 0.022    0.89 ± 0.01
60%  0.831 ± 0.093  0.865 ± 0.025  0.747 ± 0.057  0.814 ± 0.019
80%  0.706 ± 0.105    0.8 ± 0.103  0.686 ± 0.003  0.771 ± 0.022


# QDA with DPER, MICE, MissForest, Soft-Impute

In [None]:
# Load the bank-marketing CSV (semicolon separated).
df=pd.read_csv('/content/bank-additional.csv',delimiter=';')
# Column layout (positional):
#   continuous : 0, 10, 11, 12, 13, 15, 16, 17, 18, 19
#   categorical: 1, 2, 3, 4, 5, 6, 7, 8, 9, 14
#   class      : 20
# NOTE(review): an earlier note here quoted class counts of 36548 / 4640
# (41188 rows in total), but this file yields X.shape == (4119, 10), so those
# counts do not describe the file loaded here - confirm the real counts.
M=df.values
X = normalize_data(M[:,[0, 10, 11, 12, 13, 15, 16, 17, 18, 19]]) #10 continuous features
Z = M[:,[1, 2, 3, 4, 5, 6, 7, 8, 9, 14]]      #10 categorical features 12 4 8 3 3 3 2 10 5 3
y = M[:,20]
# Replace every categorical column (and the class column) by integer codes.
label_encoder = LabelEncoder()
for i in range(Z.shape[1]):
    Z[:,i] = label_encoder.fit_transform(Z[:,i])
y = label_encoder.fit_transform(y)
# Class balance of categorical column 6, which the experiment cells use as the label.
print(sum(Z[:,6]==0))
print(sum(Z[:,6]==1))
print(X.shape)

2652
1467
(4119, 10)


In [None]:
def QDAmiss(X,y,missing_rate,runs):
  """Compare QDA accuracy across four ways of handling missing training data.

  Per run: inject NaNs into the training features, then obtain class means and
  one covariance matrix per class either directly from the incomplete data
  (NaN-aware means + DPER on each class) or from data imputed by
  IterativeImputer (MICE), MissForest and SoftImpute (per-class np.cov).

  Returns a list with one [acc_DPER, acc_MICE, acc_MF, acc_SI] entry per run.
  """
  accuracy = []
  for _ in range(runs):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .4, random_state = 0)
    n_classes = len(np.unique(y_train))
    X_trnan = generate_nan(X_train,missing_rate)

    # Imputed copies of the training data, produced in the order MICE, MissForest, Soft-Impute.
    imputed = {}
    imputed["MICE"] = IterativeImputer(max_iter = 10).fit(X_trnan).transform(X_trnan)
    imputed["MissForest"] = MissForest().fit_transform(pd.DataFrame.from_records(X_trnan)).to_numpy()
    imputed["Soft-Impute"] = SoftImpute(max_iters = 10, verbose = False).fit_transform(X_trnan)

    class_masks = [y_train == g for g in range(n_classes)]

    # DPER route: statistics straight from the incomplete data, one covariance per class.
    dper_means = [np.nanmean(X_trnan[mask], axis = 0) for mask in class_masks]
    dper_covs = np.array([DPER(X_trnan[mask]) for mask in class_masks])
    run_acc = [QDA(dper_means, dper_covs, X_test, y_train, y_test)]

    # Imputation routes: class means plus a separate sample covariance per class.
    for X_imp in imputed.values():
      class_means = [np.mean(X_imp[mask], axis = 0) for mask in class_masks]
      class_covs = np.array([np.cov(X_imp[mask], rowvar = False) for mask in class_masks])
      run_acc.append(QDA(class_means, class_covs, X_test, y_train, y_test))

    accuracy.append(run_acc)
  return accuracy

In [None]:
# Ten runs of the QDA comparison at each missing rate,
# using categorical column 6 as the class label.
# Note: this rebinds a20..a80, which the LDA section also assigns.
a20, a40, a60, a80 = [QDAmiss(X, Z[:,6], missing_rate=rate, runs=10) for rate in (.2, .4, .6, .8)]

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score -0.209228
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 0.201031
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score -0.351356
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score -1.206054
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score -0.871264
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data p

In [None]:
# Format the accuracies as "mean ± std" per missing rate (rows) and method (columns).
rate_labels = ["20%", "40%", "60%", "80%"]
method_labels = ["DPER", "MICE", "MissForest", "Soft-Impute"]
runs_per_rate = (a20, a40, a60, a80)

pm = pd.DataFrame(np.repeat(" ± ", 16).reshape(4, 4),
                  index = rate_labels,
                  columns = method_labels)
print('Accuracy')

mean_rows = [np.mean(run_acc, axis = 0) for run_acc in runs_per_rate]
acc_mean = pd.DataFrame(np.vstack(mean_rows).round(3),
                        index = rate_labels,
                        columns = method_labels)

std_rows = [np.std(run_acc, axis = 0) for run_acc in runs_per_rate]
acc_std = pd.DataFrame(np.vstack(std_rows).round(3),
                       index = pm.index,
                       columns = pm.columns)

print(acc_mean.astype(str)+pm+acc_std.astype(str))

Accuracy
              DPER           MICE     MissForest    Soft-Impute
20%  0.701 ± 0.149  0.801 ± 0.002  0.803 ± 0.002  0.685 ± 0.014
40%  0.614 ± 0.185  0.701 ± 0.091  0.713 ± 0.067  0.653 ± 0.018
60%  0.534 ± 0.122    0.591 ± 0.1  0.678 ± 0.059  0.653 ± 0.017
80%  0.631 ± 0.003  0.607 ± 0.076  0.658 ± 0.069  0.655 ± 0.014
