#Library and functions

In [None]:
!pip install scikit-learn
!pip install fancyimpute
!pip install DistributedMissForest
!pip install MissForest

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29879 sha256=2d978dee01c715eb86838ab6a8f66e758f626813dab0f2a32465e89dd363a5fc
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-

In [None]:
import numpy as np
import pandas as pd
import time
import math
import random

import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split
from sklearn.impute import IterativeImputer

from scipy import stats
from fancyimpute import SoftImpute
from missforest.missforest import MissForest
from numpy.linalg import norm, inv

In [None]:
#Covariance matrix err
def error(sig, sig_est):
  """Return the Euclidean norm of (sig_est - sig) divided by the number of entries.

  Both arrays are flattened before subtraction, so they only need to hold the
  same number of elements. The divisor is `sig.size`.
  """
  difference = sig_est.reshape(-1) - sig.reshape(-1)
  return np.linalg.norm(difference) / sig.size

#normalizing data
def normalize_data(X):
  """Standardize X with a StandardScaler fitted on X itself and return the result."""
  return StandardScaler().fit(X).transform(X)

def generate_nan(X, missing_rate, rng=None):
    """Return a copy of X with a fraction of its entries replaced by NaN.

    The first row is always left complete; NaNs are placed only in rows
    1..end. Exactly ``round(missing_rate * size_of_those_rows)`` distinct
    entries are masked.

    Parameters
    ----------
    X : 2-D numeric array. It is not modified.
    missing_rate : fraction (0..1) of the entries in rows 1..end to mask.
    rng : optional object exposing ``choice(a, size=..., replace=...)``
        (e.g. ``np.random.default_rng(seed)``). When None, the global
        ``np.random`` state is used, so ``np.random.seed`` still applies.

    Returns
    -------
    Array with the same shape as X containing the injected NaNs. Integer or
    boolean input is converted to float because those dtypes cannot hold NaN.
    """
    X_copy = np.copy(X)
    if X_copy.dtype.kind in "biu":
        # NaN cannot be stored in integer/bool arrays.
        X_copy = X_copy.astype(float)

    X_non_missing = X_copy[:1, :]
    X_missing = X_copy[1:, :]

    n_missing = int(round(missing_rate * X_missing.size))
    X_nan = X_missing.flatten()
    if n_missing > 0:
        sampler = np.random if rng is None else rng
        # Sample positions WITHOUT replacement. Sampling with replacement
        # (np.random.randint) yields duplicate positions, so the realised
        # missing rate falls below the requested one, increasingly so as
        # missing_rate grows.
        na_id = sampler.choice(X_missing.size, size=n_missing, replace=False)
        X_nan[na_id] = np.nan
    X_nan = X_nan.reshape(X_missing.shape)

    return np.vstack((X_non_missing, X_nan))

#Algorithms

In [None]:
#Single_class
def diag_term(X,i):
  """Return the population variance (np.var, ddof=0) of the non-NaN entries of column i of X."""
  column = X[:, i].flatten()
  observed = np.logical_not(np.isnan(column))
  return np.var(column[observed])

def DPER(X):
    """Estimate a covariance matrix from data containing NaN entries.

    Parameters
    ----------
    X : 2-D float array (rows are observations, columns are features) in
        which missing values are encoded as NaN.

    Returns
    -------
    S : (p, p) symmetric array. The diagonal holds, for each column, the
        variance (ddof=0) of its observed entries. Each off-diagonal entry
        is a real root of a cubic polynomial built from the pairwise
        complete rows and the two diagonal terms.

    Raises
    ------
    ValueError if no real root is found for some pair of columns.
    """
    mus = np.nanmean(X, axis=0).T
    epsilon = 1e-5  # roots with |value| (or |imaginary part|) below this are treated as 0
    p = X.shape[1]
    S = np.diag([diag_term(X, i) for i in range(p)])
    for i in range(p):
        for j in range(i):
            # A constant column has zero covariance with everything.
            if S[i, i] == 0. or S[j, j] == 0.:
                S[i, j] = S[j, i] = 0.
                continue
            mat = X[:, [i, j]]
            # Keep only rows where both features are observed.
            idx = ~np.isnan(mat).any(axis=1)
            mat = mat[idx]
            A = len(mat)
            s11 = A * np.var(mat[:, 0])
            s22 = A * np.var(mat[:, 1])
            s12 = np.sum((mat[:, 0] - mus[i]) * (mat[:, 1] - mus[j]))
            B = S[i, i] * S[j, j] * A - s22 * S[i, i] - s11 * S[j, j]
            coefficient = [-A, s12, B, s12 * S[i, i] * S[j, j]]
            r = np.roots(coefficient)
            # Keep (numerically) real roots only and snap tiny ones to zero.
            r = r[abs(np.imag(r)) < epsilon]
            r = np.real(r)
            r[abs(r) < epsilon] = 0
            if len(r) > 1:
                # Several real roots: keep the one(s) maximising eta
                # (presumably the log-likelihood criterion of the method;
                # confirm against the reference paper).
                condi_var = S[j, j] - r**2 / S[i, i]
                eta = -A * np.log(condi_var) - (S[j, j] - 2 * r / S[i, i] * s12 + r**2 / S[i, i]**2 * s11) / condi_var
                valid = ~np.isnan(eta)
                # If every eta is NaN there is nothing to maximise; fall
                # through to the tie-break below instead of raising on an
                # empty max().
                if valid.any():
                    r = r[eta == eta[valid].max()]
            if len(r) > 1:
                if np.all(r == 0.0):
                    r = 0.
                else:
                    w = np.cov(mat, rowvar=False)
                    # Tie-break: root closest to the complete-case sample covariance.
                    r = r[np.abs(r - w[0, 1]).argmin()]
            # Extract a true scalar: assigning a size-1 array into a single
            # matrix element is deprecated in recent NumPy releases.
            r = np.ravel(r)
            if r.size == 0:
                raise ValueError("no real root found for features %d and %d" % (i, j))
            S[i, j] = S[j, i] = float(r[0])
    return S

In [None]:
#For multiclass (X,y) where y is a class
#finding root closest CD
def solving(a,b,c,d,del_case):
  """Return the real root of a*x**3 + b*x**2 + c*x + d that is closest to del_case.

  Raises ValueError (from np.argmin) when the polynomial has no real root.
  """
  roots = np.roots([a, b, c, d])
  real_roots = np.real(roots[np.isreal(roots)])
  # argmin returns the first minimiser, so a single real root is returned as is.
  return real_roots[np.argmin(np.abs(real_roots - del_case))]

def sig_estimate_multi(X,mu0,mu1,y):
  """Estimate the pooled variances and covariance of two features across classes.

  Parameters
  ----------
  X : array of shape (2, n); row 0 and row 1 are the two features, with NaN
      marking missing values.
  mu0, mu1 : per-class means of feature 0 / feature 1. Entry k must belong to
      the k-th value of np.unique(y) (sorted order). For labels 0..K-1 this is
      the same as indexing by label.
  y : length-n array of class labels.

  Returns
  -------
  (sig11, sig22, sig12): variance of feature 0, variance of feature 1, and
  their covariance, chosen as the real root of a cubic closest to 0.
  """
  pair = X.T
  y = np.asarray(y)
  # Counters: m = rows with both features observed, n = only feature 0
  # observed, l = only feature 1 observed.
  m = n = l = np.float64(0.0)
  s11 = s12 = s22 = sig11 = sig22 = np.float64(0.0)
  # Iterate over the labels actually present. Using range(number_of_labels)
  # silently skips data (and hits empty slices) whenever the labels are not
  # exactly 0..K-1, e.g. when y is a categorical column restricted to a subset
  # of the rows.
  for k, g in enumerate(np.unique(y)):
    rows = pair[y == g]
    x0, x1 = rows[:, 0], rows[:, 1]
    d0 = x0 - mu0[k]
    d1 = x1 - mu1[k]
    both = np.isfinite(x0) & np.isfinite(x1)
    only0 = np.isfinite(x0) & np.isnan(x1)
    only1 = np.isnan(x0) & np.isfinite(x1)
    m = m + both.sum()
    n = n + only0.sum()
    l = l + only1.sum()
    # Complete-pair sums of squares / cross-products.
    s11 = s11 + np.sum(d0[both]**2)
    s22 = s22 + np.sum(d1[both]**2)
    s12 = s12 + np.sum(d0[both] * d1[both])
    # Per-feature sums use every row where that feature is observed.
    sig11 = sig11 + np.sum(d0[both | only0]**2)
    sig22 = sig22 + np.sum(d1[both | only1]**2)
  sig11 = sig11/(m+n)
  sig22 = sig22/(m+l)
  # NOTE(review): the previous code computed the target as max(0, 0/(m-1)),
  # which is always 0, so the root closest to 0 is selected. If another target
  # (e.g. a complete-case covariance) was intended, it must be supplied here.
  del_case = 0
  sig12 = solving(-m, s12, (m*sig11*sig22 - s22*sig11 - s11*sig22), s12*sig11*sig22, del_case)
  return sig11, sig22, sig12

def DPERm(X,y):            #with assumption of equal covariance matrices
  """Estimate one covariance matrix shared by all classes from data with NaNs.

  Parameters
  ----------
  X : (n, p) float array with NaN for missing values.
  y : length-n array of class labels (any sortable values).

  Returns
  -------
  (p, p) symmetric array. Entries are filled pair by pair with
  sig_estimate_multi; for p == 1 no pair exists and the result stays zero.
  """
  y = np.asarray(y)
  labels = np.unique(y)             # labels actually present, sorted
  p = X.shape[1]
  sig = np.zeros((p, p))            # estimated covariance matrix
  # Per-class feature means, one row per entry of `labels`.
  mu = np.array([np.nanmean(X[y == g], axis=0) for g in labels])
  # Estimation of the covariance matrix, one feature pair at a time.
  for a in range(p):
    for b in range(a):
      temp = sig_estimate_multi(np.array([X[:, b], X[:, a]]), mu[:, b], mu[:, a], y)
      sig[b][b] = temp[0]
      sig[a][a] = temp[1]
      sig[b][a] = sig[a][b] = temp[2]
  return sig

#DPERC

In [None]:
# Load the data set and split its columns into continuous features (X),
# categorical features (Z) and the class label (y).
df = pd.read_csv('/content/bank-additional.csv', delimiter=';')
M = df.values

# Column layout of the file:
#   continuous : 0, 10, 11, 12, 13, 15, 16, 17, 18, 19
#   categorical: 1, 2, 3, 4, 5, 6, 7, 8, 9, 14
#   class      : 20
label_encoder = LabelEncoder()

# 10 categorical features, each integer-encoded column by column.
# Number of labels per column: [12,4,8,3,3,3,2,10,5,3]
Z = M[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 14]]
for i in range(Z.shape[1]):
    Z[:, i] = label_encoder.fit_transform(Z[:, i])

# Class label, integer-encoded.
y = label_encoder.fit_transform(M[:, 20])

# 10 continuous features, standardized.
X = normalize_data(M[:, [0, 10, 11, 12, 13, 15, 16, 17, 18, 19]])
print(X.shape)

(4119, 10)


In [None]:
def experiments(X,y,Z,missing_rate,run_time):
  """Compare per-class DPER estimates with and without categorical refinement.

  For each run, NaNs are injected into X, a covariance matrix is estimated per
  class with DPER, and then, for every class and every categorical column of
  Z, DPERm is evaluated with that column as the grouping label. Within a
  class, the estimate with the smallest error against the covariance of the
  complete data is kept. Note that this selection uses the ground-truth
  covariance.

  Parameters
  ----------
  X : (n, p) complete continuous data.
  y : length-n class labels, expected to be 0..G-1.
  Z : (n, q) array of categorical columns.
  missing_rate : fraction passed to generate_nan.
  run_time : number of repetitions.

  Returns
  -------
  (err1, err2): lists with one entry per run; err1 is the error of the plain
  DPER estimates, err2 the error after the categorical refinement.
  """
  G = len(np.unique(y))
  numZ = Z.shape[1]
  # Reference covariance of each class, computed on the complete data.
  S = np.array([np.cov(X[y==g], rowvar=False) for g in range(G)])
  err1 = []
  err2 = []
  for run in range(run_time):
    Xnan = generate_nan(X, missing_rate)
    D = np.array([DPER(Xnan[y==g]) for g in range(G)])
    err1.append(error(S, D))
    # Work on a copy so the baseline D is never modified. Previously C was an
    # alias of D and the "compare with D[g]" test only behaved as best-so-far
    # because D was being overwritten.
    C = D.copy()
    for g in range(G):
      in_class = (y == g)
      Xg = Xnan[in_class]
      best_err = error(S[g], D[g])
      for i in range(numZ):
        cg = Z[in_class, i]
        Sc = DPERm(Xg, cg)
        candidate_err = error(S[g], Sc)
        if candidate_err <= best_err:
          C[g] = Sc
          best_err = candidate_err
    err2.append(error(S, C))
  return err1, err2

In [None]:
# 20% missing rate, 10 runs: mean/std of both error series and the relative
# improvement (in percent) of DPERC over DPER.
ey20, ec20 = experiments(X, y, Z, .2, 10)
pd.DataFrame(
    [[
        np.mean(ey20).round(5),
        np.std(ey20).round(5),
        np.mean(ec20).round(5),
        np.std(ec20).round(5),
        ((np.mean(ey20) - np.mean(ec20)) / np.mean(ey20) * 100).round(2),
    ]],
    index=["20%"],
    columns=["DPER", "Std DPER", "DPERC", "Std DPERC", "Improved percent"],
)

  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=n

Unnamed: 0,DPER,Std DPER,DPERC,Std DPERC,Improved percent
20%,0.00292,0.00037,0.00275,0.00042,5.84


In [None]:
# 40% missing rate, 10 runs: mean/std of both error series and the relative
# improvement (in percent) of DPERC over DPER.
ey40, ec40 = experiments(X, y, Z, .4, 10)
pd.DataFrame(
    [[
        np.mean(ey40).round(5),
        np.std(ey40).round(5),
        np.mean(ec40).round(5),
        np.std(ec40).round(5),
        ((np.mean(ey40) - np.mean(ec40)) / np.mean(ey40) * 100).round(2),
    ]],
    index=["40%"],
    columns=["DPER", "Std DPER", "DPERC", "Std DPERC", "Improved percent"],
)

  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=n

Unnamed: 0,DPER,Std DPER,DPERC,Std DPERC,Improved percent
40%,0.00514,0.00079,0.00488,0.00065,5.15


In [None]:
# 60% missing rate, 10 runs. The rate passed to experiments() must match the
# "60%" row label; it was previously .4, which repeated the 40% experiment.
ey60, ec60 = experiments(X, y, Z, .6, 10)
pd.DataFrame(
    [[
        np.mean(ey60).round(5),
        np.std(ey60).round(5),
        np.mean(ec60).round(5),
        np.std(ec60).round(5),
        ((np.mean(ey60) - np.mean(ec60)) / np.mean(ey60) * 100).round(2),
    ]],
    index=["60%"],
    columns=["DPER", "Std DPER", "DPERC", "Std DPERC", "Improved percent"],
)

  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=n

Unnamed: 0,DPER,Std DPER,DPERC,Std DPERC,Improved percent
60%,0.00504,0.00094,0.0047,0.00076,6.82


In [None]:
# 80% missing rate, 10 runs. The rate passed to experiments() must match the
# "80%" row label; it was previously .4, which repeated the 40% experiment.
ey80, ec80 = experiments(X, y, Z, .8, 10)
pd.DataFrame(
    [[
        np.mean(ey80).round(5),
        np.std(ey80).round(5),
        np.mean(ec80).round(5),
        np.std(ec80).round(5),
        ((np.mean(ey80) - np.mean(ec80)) / np.mean(ey80) * 100).round(2),
    ]],
    index=["80%"],
    columns=["DPER", "Std DPER", "DPERC", "Std DPERC", "Improved percent"],
)

  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=np.array([np.nanmean(X[y==g],axis = 0) for g in range(numlabel)])
  mu=n

Unnamed: 0,DPER,Std DPER,DPERC,Std DPERC,Improved percent
80%,0.00489,0.00086,0.00456,0.00071,6.58
