## libraries and function 

In [1]:
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from scipy import stats
import numpy as np
import pandas as pd
import time 
import math

# RESI

In [2]:
def compute_attribute_weights(A):
  from sklearn.preprocessing import minmax_scale
  y = minmax_scale(A, axis = 0)
  p = y.copy()
  ysum = y.sum(axis = 0)
  for i in range(y.shape[1]):
    if (ysum[i] == 0):
      p[:, i] = 1
    else:
      p[:, i] = y[:, i] / ysum[i]
  from scipy.special import xlogy
  E = - xlogy(p,p).sum(axis=0) / math.log(p.shape[0])
  w = (1 - E)/(len(E) - E.sum())
  if np.isnan(w).any():
    return None
  else:
    return w                            
def generate_tuple_partition(CT, ICT, m):
  w = compute_attribute_weights(CT)
  r = np.ones(ICT.shape[0])
  for i in range(ICT.shape[0]):
    for j in range(ICT.shape[1]):
      if np.isnan(ICT[i,j]):
        r[i] -= w[j] #If NoneType then insufficient CT set has been used
  ICT = ICT[r.argsort()[::-1],:]
  return np.array_split(ICT, m)
def resi(Xtr_nan, m, n_neighbors):
  CT = [Xtr_nan[~np.isnan(Xtr_nan).any(axis=1)]]
  Tp = []
  T = generate_tuple_partition(CT[0],Xtr_nan[np.isnan(Xtr_nan).any(axis=1)], m)
  from sklearn.impute import KNNImputer
  for i in range(m):
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer.fit(CT[-1])
    Tp.append(imputer.transform(T[i]))
    CT.append(np.concatenate((CT[-1],Tp[-1])))
  Tpp = []
  for i in range(m-1):
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer.fit(CT[-1])
    Tpp.append(imputer.transform(T[i]))
  CT = [CT[0]]
  for i in range(m-1):
    CT.append(np.concatenate((CT[-1],np.mean(np.array([Tp[i], Tpp[i]]), axis=0 ))))
  CT.append(np.concatenate((CT[-1],Tp[-1])))
  return CT[-1]

### compute_err function 

In [3]:
def err(mus, S, mus_est, S_est):
  er = [np.linalg.norm(mus_est-mus)/mus.size,
        np.linalg.norm(S_est.flatten().flatten()-S.flatten())/S.size]  
  return np.mean(er)  
 
def generate_nan(Xtrain, missing_rate, minimum_complete_samples):
  ct_id = np.random.choice(range(Xtrain.shape[0]), size = minimum_complete_samples, replace = False)
  CT = Xtrain[ct_id]
  ICT = Xtrain[[i for i in range(Xtrain.shape[0]) if i not in ct_id]]
  ICTshape = ICT.shape
  na_id = np.random.randint(0,ICT.size,round(missing_rate*ICT.size))
  ICT = ICT.flatten()
  ICT[na_id] = np.nan
  return np.concatenate((CT,ICT.reshape(ICTshape)))
 
def compute_err(Xtrain, ytrain, G, missing_rate, minimum_complete_samples = 5, runs = 10):  
  e_rate = []
  running_time = []
  for i in range(runs):
    Xtr_nan = generate_nan(Xtrain, missing_rate, minimum_complete_samples)

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    
    print('rank',np.linalg.matrix_rank(np.nan_to_num(Xtr_nan, copy=True, nan=0.0)))
 
    # estimate parameters from full data
    # each row is a mean of a class
    mus = np.array([np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)])
    S = np.array([np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)])
    print('original mus',mus)
    print('S', S)
 
    #RESI approach
    start = time.time()
    Xtr_resi = resi(Xtr_nan, 3, 3) #Parameters: (Dataset, m, n_neighbors) 
    mus_resi = np.asarray([np.mean(Xtr_resi[ytrain==g,:], axis=0) for g in np.arange(G)])
    S_resi = np.asarray([(sum(ytrain==g))*np.cov(Xtr_resi[ytrain==g,:], rowvar =False)
             for g in np.arange(G)])
    print('mus RESI-KNNI', mus_resi)
    print('S RESI-KNNI', S_resi)    
    resi_err = err(mus, S, mus_resi, S_resi)
    resi_time = time.time()-start
    e_rate.append(resi_err)
    running_time.append(resi_time)
  e_rate = np.asarray(e_rate)
  running_time = np.asarray(running_time)
  return np.mean(e_rate), np.std(e_rate), np.mean(running_time)

# Digits

In [4]:
digits = datasets.load_digits()
X,y = digits.data, digits.target.ravel() 
print(X.shape)
rmid = np.where(sum(X!=0)<10)
X = np.delete(X, rmid,axis = 1)
X.shape

(1797, 64)


(1797, 54)

In [5]:
G = 10
e20 = compute_err(X, y, G, 0.2, runs = 10)
e35 = compute_err(X, y, G,0.35, runs = 10)
e50 = compute_err(X, y, G,0.5, runs = 10)
e65 = compute_err(X, y, G,0.65, runs = 10)
e80 = compute_err(X, y, G,0.8, 3, runs = 10)
RESI_res = np.asarray(list((e20, e35, e50, e65, e80)))
print('RESI result: \n', RESI_res)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  [-2.72263547e+00  1.38833764e+01  1.16640473e+01 ...  7.30586575e+01
    1.38409352e+02  7.31473065e+01]
  [-8.11329683e+00  1.81083759e+01  1.18266622e+01 ...  3.79205839e+01
    7.31473065e+01  1.40437619e+02]]

 [[ 1.96569469e+02  9.95488055e+01  3.87666844e+01 ...  1.33254896e+01
    8.66830146e+00 -1.05553216e+01]
  [ 9.95488055e+01  1.53136088e+02  6.75531155e+01 ...  4.15509193e+00
    1.00811080e+01  1.11732392e+01]
  [ 3.87666844e+01  6.75531155e+01  1.28694363e+02 ... -1.69987935e+00
    7.85601580e+00  6.41836706e+00]
  ...
  [ 1.33254896e+01  4.15509193e+00 -1.69987935e+00 ...  1.15303876e+02
    7.40156205e+01  2.57487668e+01]
  [ 8.66830146e+00  1.00811080e+01  7.85601580e+00 ...  7.40156205e+01
    1.30731977e+02  5.91326869e+01]
  [-1.05553216e+01  1.11732392e+01  6.41836706e+00 ...  2.57487668e+01
    5.91326869e+01  1.34851581e+02]]

 [[ 9.74880071e+01  4.76353073e+01  1.61176180e+01 ... -1.39368930e+0