## Libraries and functions

In [1]:
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from scipy import stats
import numpy as np
import pandas as pd
import time 
import math

# RESI

In [2]:
def compute_attribute_weights(A):
  """Derive one weight per column of A from the normalised entropy of that column.

  Steps: min-max scale every column, turn each column into shares that sum
  to one, compute the entropy of the shares divided by log(number of rows),
  and return (1 - E) / (number of columns - sum(E)).

  Parameters
  ----------
  A : 2-D array-like, rows are samples and columns are attributes.

  Returns
  -------
  1-D array with one weight per column, or None when any weight is NaN.
  """
  from sklearn.preprocessing import minmax_scale
  from scipy.special import xlogy

  scaled = minmax_scale(A, axis = 0)
  col_totals = scaled.sum(axis = 0)

  shares = scaled.copy()
  zero_total = col_totals == 0
  # A column whose scaled values sum to zero gets a share of 1 in every row.
  shares[:, zero_total] = 1
  shares[:, ~zero_total] = scaled[:, ~zero_total] / col_totals[~zero_total]

  entropy = - xlogy(shares, shares).sum(axis=0) / math.log(shares.shape[0])
  weights = (1 - entropy)/(len(entropy) - entropy.sum())
  return None if np.isnan(weights).any() else weights
def generate_tuple_partition(CT, ICT, m, return_index=False):
  """Rank incomplete rows by weighted completeness and split them into m chunks.

  Every row of ICT starts with a score of 1; the weight of each attribute
  that is NaN in that row is subtracted. Rows are then sorted by decreasing
  score and split into m consecutive chunks with np.array_split.

  Parameters
  ----------
  CT : 2-D array of complete rows, used only to compute attribute weights.
  ICT : 2-D array of rows that contain NaN.
  m : number of chunks.
  return_index : when True, also return the permutation that was applied to
    the rows of ICT, so callers can map chunk rows back to their source rows.

  Returns
  -------
  list of m arrays, or (list, permutation) when return_index is True.

  Raises
  ------
  ValueError: if the attribute weights are undefined while ICT contains NaN.
  """
  w = compute_attribute_weights(CT)
  missing = np.isnan(ICT)
  if w is None and missing.any():
    raise ValueError(
        'Attribute weights could not be computed from the complete tuples '
        '(insufficient CT set has been used).')
  r = np.ones(ICT.shape[0])
  # One pass per column instead of one per cell. Each row still subtracts its
  # weights in increasing column order, so the scores are unchanged.
  for j in np.flatnonzero(missing.any(axis=0)):
    r[missing[:, j]] -= w[j]
  order = r.argsort()[::-1]
  parts = np.array_split(ICT[order, :], m)
  if return_index:
    return parts, order
  return parts


def resi(Xtr_nan, m, n_neighbors):
  """Impute NaNs with a two-pass, chunk-wise KNN scheme.

  Pass 1: incomplete rows are split into m chunks by generate_tuple_partition.
  Each chunk is imputed by a KNNImputer fitted on the complete rows plus all
  previously imputed chunks.
  Pass 2: the first m-1 chunks are imputed again by a KNNImputer fitted on the
  fully imputed data, and averaged with their pass-1 values. The last chunk
  keeps its pass-1 values.

  Parameters
  ----------
  Xtr_nan : 2-D numeric array, NaN marks missing values.
  m : number of chunks (>= 1).
  n_neighbors : neighbours used by every KNNImputer.

  Returns
  -------
  Array of the same shape as Xtr_nan without NaN. Row i of the result
  corresponds to row i of Xtr_nan, so label vectors stay aligned.

  Raises
  ------
  ValueError: if m < 1, or if Xtr_nan has incomplete rows but no complete row.
  """
  from sklearn.impute import KNNImputer

  if m < 1:
    raise ValueError('m must be at least 1')

  has_nan = np.isnan(Xtr_nan).any(axis=1)
  if not has_nan.any():
    # Nothing to impute; KNNImputer would reject the empty chunks.
    return np.array(Xtr_nan, copy=True)
  complete_idx = np.flatnonzero(~has_nan)
  incomplete_idx = np.flatnonzero(has_nan)
  if complete_idx.size == 0:
    raise ValueError('resi needs at least one row without NaN')

  complete = Xtr_nan[complete_idx]
  T, order = generate_tuple_partition(
      complete, Xtr_nan[incomplete_idx], m, return_index=True)

  def _transform(imputer, chunk):
    # array_split yields empty chunks when m exceeds the number of rows.
    return chunk if chunk.shape[0] == 0 else imputer.transform(chunk)

  # Pass 1: grow the reference pool chunk by chunk.
  pool = complete
  Tp = []
  for chunk in T:
    imputer = KNNImputer(n_neighbors=n_neighbors).fit(pool)
    Tp.append(_transform(imputer, chunk))
    pool = np.concatenate((pool, Tp[-1]))

  # Pass 2: the pool no longer changes, so a single fit is enough.
  full_imputer = KNNImputer(n_neighbors=n_neighbors).fit(pool)
  final = [np.mean(np.array([Tp[i], _transform(full_imputer, T[i])]), axis=0)
           for i in range(m-1)]
  final.append(Tp[-1])

  # Scatter the rows back to the positions they had in Xtr_nan.
  stacked = np.concatenate([complete] + final)
  source_rows = np.concatenate((complete_idx, incomplete_idx[order]))
  restored = np.empty_like(stacked)
  restored[source_rows] = stacked
  return restored

### Error metric, missing-data generation, and `compute_err`

In [3]:
def err(mus, S, mus_est, S_est):
  """Average of the size-normalised estimation errors of the means and covariances.

  Each term is the norm of (estimate - reference) divided by the number of
  elements in the reference array; the two terms are then averaged.
  """
  mean_term = np.linalg.norm(mus_est-mus)/mus.size
  cov_gap = S_est.flatten() - S.flatten()
  cov_term = np.linalg.norm(cov_gap)/S.size
  return np.mean([mean_term, cov_term])
 
def generate_nan(Xtrain, missing_rate, minimum_complete_samples):
  """Return a copy of Xtrain with values removed completely at random.

  minimum_complete_samples randomly chosen rows are protected and never
  receive NaN. Among the cells of the remaining rows, exactly
  round(missing_rate * number_of_cells) distinct cells are set to NaN.

  The row order of Xtrain is preserved, so label vectors indexed like Xtrain
  remain valid for the result. Xtrain itself is not modified.

  Parameters
  ----------
  Xtrain : numeric array, first axis indexes samples.
  missing_rate : fraction of cells (outside the protected rows) to blank;
    values above 1 are treated as 1.
  minimum_complete_samples : number of rows guaranteed to stay complete.

  Returns
  -------
  Floating-point array with the same shape as Xtrain.

  Raises
  ------
  ValueError: if missing_rate is negative, or if more protected rows are
    requested than Xtrain has rows (raised by np.random.choice).
  """
  if missing_rate < 0:
    raise ValueError('missing_rate must be non-negative')

  data = np.array(Xtrain, copy=True)
  if not np.issubdtype(data.dtype, np.floating):
    data = data.astype(float)  # integer arrays cannot store NaN

  n_rows = data.shape[0]
  ct_id = np.random.choice(n_rows, size = minimum_complete_samples, replace = False)
  may_corrupt = np.ones(n_rows, dtype=bool)
  may_corrupt[ct_id] = False
  rows = np.flatnonzero(may_corrupt)

  block = data[rows]  # fancy indexing returns a copy
  n_missing = min(round(missing_rate*block.size), block.size)
  if n_missing > 0:
    # Sampling without replacement makes the realised missing rate match the
    # requested one; duplicates would silently lower it.
    na_id = np.random.choice(block.size, size = n_missing, replace = False)
    flat = block.reshape(-1)
    flat[na_id] = np.nan
    data[rows] = flat.reshape(block.shape)
  return data
 
def compute_err(Xtrain, ytrain, G, missing_rate, minimum_complete_samples = 5, runs = 10,
                m = 3, n_neighbors = 3):
  """Measure how well RESI imputation recovers per-class means and covariances.

  For every run: values are removed from Xtrain with generate_nan, the data
  are standardised with a scaler fitted on the incomplete matrix, the missing
  values are imputed with resi, and the class means / covariances of the
  imputed data are compared (via err) with those of the complete data in the
  same standardised space.

  Parameters
  ----------
  Xtrain : 2-D array of complete data.
  ytrain : 1-D label array; classes are taken to be 0 .. G-1.
  G : number of classes.
  missing_rate : passed to generate_nan.
  minimum_complete_samples : passed to generate_nan.
  runs : number of independent repetitions.
  m, n_neighbors : passed to resi (number of chunks, KNN neighbours).

  Returns
  -------
  (mean, standard deviation) of the error over the runs.

  Notes
  -----
  ytrain is used to index the rows of the corrupted and the imputed matrices,
  so generate_nan and resi must return rows in the same order as Xtrain.
  """
  e_rate = []
  class_rows = [ytrain==g for g in np.arange(G)]
  for _ in range(runs):
    Xtr_nan = generate_nan(Xtrain, missing_rate, minimum_complete_samples)

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    # Keep the caller's Xtrain untouched: rebinding it here would make every
    # later run standardise data that were already standardised.
    X_full = scaler.transform(Xtrain)

    print('rank',np.linalg.matrix_rank(np.nan_to_num(Xtr_nan, copy=True, nan=0.0)))

    # estimate parameters from full data
    # each row is a mean of a class
    mus = np.array([np.mean(X_full[rows,:], axis=0) for rows in class_rows])
    S = np.array([np.cov(X_full[rows,:], rowvar =False) for rows in class_rows])
    print('original mus',mus)
    print('S', S)

    #RESI approach
    Xtr_resi = resi(Xtr_nan, m, n_neighbors)
    mus_resi = np.asarray([np.mean(Xtr_resi[rows,:], axis=0) for rows in class_rows])
    # Plain covariance, exactly like the reference S above, so that err
    # compares like with like.
    S_resi = np.asarray([np.cov(Xtr_resi[rows,:], rowvar =False) for rows in class_rows])
    print('mus RESI-KNNI', mus_resi)
    print('S RESI-KNNI', S_resi)
    e_rate.append(err(mus, S, mus_resi, S_resi))
  e_rate = np.asarray(e_rate)
  return np.mean(e_rate), np.std(e_rate)

### MNIST

In [4]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Create the MNIST dataset builder by its registered name
mnist = tfds.builder('mnist')

# Download the data, prepare it, and write it to disk
mnist.download_and_prepare()

# Load data from disk as tf.data.Datasets.
# The dict is not called `datasets`, because that name is already bound to
# the `sklearn.datasets` module imported at the top of the notebook.
mnist_splits = mnist.as_dataset()
train_dataset, test_dataset = mnist_splits['train'], mnist_splits['test']

[1mDownloading and preparing dataset mnist/3.0.1 (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...[0m


local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



Dl Completed...:   0%|          | 0/4 [00:00<?, ? file/s]


[1mDataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.[0m


In [5]:
def _split_to_numpy(dataset):
  """Convert one TFDS split into (flattened float images, labels) NumPy arrays."""
  images, labels = [], []
  for example in tfds.as_numpy(dataset):
    images.append(example['image'].flatten())
    labels.append(example['label'])
  return np.asarray(images).astype(float), np.asarray(labels)

# convert the train and test sets to NumPy arrays and flatten the data
Xtrain, ytrain = _split_to_numpy(train_dataset)
Xtest, ytest = _split_to_numpy(test_dataset)

# pool both splits into a single data set
X = np.vstack((Xtrain,Xtest))
y = np.hstack((ytrain,ytest))

# set random seed and shuffle the data
np.random.seed(1)
idx = np.arange(len(y))
np.random.shuffle(idx)
X, y = X[idx,:], y[idx]

X.shape, y.shape

((70000, 784), (70000,))

In [6]:
# keep a pixel column only if it is non-zero in more than 10 training images
keep_cols = (Xtrain != 0).sum(axis=0) > 10
# number of columns that are (almost) always zero and will be dropped
print(Xtrain.shape[1] - keep_cols.sum())
# NOTE: X is overwritten here, so re-running this cell on its own raises an
# IndexError (the mask no longer matches the number of columns of X).
X = X[:, keep_cols]

135


## Run

In [None]:
G = 10
# One entry per experiment, in execution order: (missing rate, extra keyword
# arguments for compute_err). Only the 0.8 setting overrides the number of
# complete samples.
experiments = [
    (0.2, {}),
    (0.35, {}),
    (0.5, {}),
    (0.65, {}),
    (0.8, {'minimum_complete_samples': 3}),
]
e20, e35, e50, e65, e80 = [
    compute_err(X, y, G, rate, runs = 10, **extra) for rate, extra in experiments
]
# rows follow the order of `experiments`; columns are (mean error, std of error)
RESI_res = np.asarray([e20, e35, e50, e65, e80])
print('RESI result: \n', RESI_res)