## Libraries and functions

In [None]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
import pandas as pd
import time 
!pip install missingpy
from missingpy import MissForest
import math



RESI

In [None]:
def compute_attribute_weights(A):
  """Return one entropy-based weight per column of A.

  Every column is min-max scaled and divided by its column total, the
  entropy of the resulting shares is taken relative to the log of the
  number of rows, and each weight is (1 - entropy) divided by the sum of
  (1 - entropy) over all columns.

  Parameters
  ----------
  A : 2-D array-like handed to sklearn's minmax_scale.

  Returns
  -------
  1-D array of column weights, or None if any weight is NaN.
  """
  from sklearn.preprocessing import minmax_scale
  from scipy.special import xlogy

  scaled = minmax_scale(A, axis=0)
  col_totals = scaled.sum(axis=0)
  zero_total = col_totals == 0

  share = scaled.copy()
  # A column whose scaled values sum to zero cannot be normalised; every
  # share in it is set to 1, which gives that column an entropy of 0.
  # NOTE(review): this makes such a column receive the largest weight --
  # confirm that is intended.
  share[:, zero_total] = 1
  share[:, ~zero_total] = scaled[:, ~zero_total] / col_totals[~zero_total]

  n_rows = share.shape[0]
  entropy = -xlogy(share, share).sum(axis=0) / math.log(n_rows)
  weights = (1 - entropy) / (len(entropy) - entropy.sum())
  return None if np.isnan(weights).any() else weights
def generate_tuple_partition(CT, ICT, m):
  """Sort incomplete rows by weighted completeness and split them into m parts.

  Each row of ICT starts with a score of 1 and loses the weight of every
  column in which it is missing; the column weights come from
  compute_attribute_weights(CT). Rows are then ordered from the highest
  score to the lowest and cut into m consecutive chunks.

  Parameters
  ----------
  CT : 2-D array of complete rows used to derive the column weights.
  ICT : 2-D NumPy array of rows containing NaNs.
  m : number of chunks, forwarded to np.array_split.

  Returns
  -------
  List of m arrays (chunks of the reordered ICT).

  Raises
  ------
  ValueError
    If ICT has missing values but no usable weights could be derived
    from CT (compute_attribute_weights returned None).
  """
  w = compute_attribute_weights(CT)
  missing = np.isnan(ICT)
  if w is None and missing.any():
    # Fail with an explicit message instead of an opaque
    # "'NoneType' object is not subscriptable" further down.
    raise ValueError(
        "attribute weights could not be computed from CT; "
        "an insufficient set of complete rows has been used")

  r = np.ones(ICT.shape[0])
  # Subtract column by column so that each row accumulates its penalty in
  # the same left-to-right order as a cell-by-cell loop would.
  for j in range(ICT.shape[1]):
    col_missing = missing[:, j]
    if col_missing.any():
      r[col_missing] -= w[j]

  ICT = ICT[r.argsort()[::-1], :]
  return np.array_split(ICT, m)
def resi(Xtr_nan, m, n_neighbors):
  """Impute the NaNs of Xtr_nan with the RESI scheme, keeping row order.

  Incomplete rows are scored exactly as in generate_tuple_partition (1 minus
  the weights of their missing columns, weights taken from the complete
  rows), ordered from highest to lowest score and cut into m chunks. The
  ordering is computed on row indices here, rather than by calling
  generate_tuple_partition, so that every imputed row can be written back to
  the position it had in Xtr_nan.

  First pass: the chunks are imputed one after another with a KNNImputer
  fitted on the complete rows plus all chunks imputed so far.
  Second pass: the first m-1 chunks are imputed again with a KNNImputer
  fitted on the fully imputed data, and the two results are averaged; the
  last chunk keeps its first-pass values.

  Parameters
  ----------
  Xtr_nan : 2-D float array with NaN marking missing entries.
  m : number of chunks the incomplete rows are split into.
  n_neighbors : forwarded to sklearn's KNNImputer.

  Returns
  -------
  Array of the same shape as Xtr_nan, rows in the same order, with the
  missing entries filled in.

  Raises
  ------
  ValueError
    If there are incomplete rows but no complete row to start from, or
    if no attribute weights can be derived from the complete rows.
  """
  from sklearn.impute import KNNImputer

  Xtr_nan = np.asarray(Xtr_nan)
  missing = np.isnan(Xtr_nan)
  incomplete = missing.any(axis=1)
  result = Xtr_nan.copy()
  if not incomplete.any():
    # Nothing to impute.
    return result

  complete_rows = Xtr_nan[~incomplete]
  if complete_rows.shape[0] == 0:
    raise ValueError("resi needs at least one complete row to start from")

  w = compute_attribute_weights(complete_rows)
  if w is None:
    raise ValueError(
        "attribute weights could not be computed; "
        "an insufficient set of complete rows has been used")

  # Score the incomplete rows and order their *indices* by that score.
  ict_idx = np.flatnonzero(incomplete)
  ict_missing = missing[ict_idx]
  r = np.ones(ict_idx.size)
  for j in range(Xtr_nan.shape[1]):
    r[ict_missing[:, j]] -= w[j]
  order = ict_idx[r.argsort()[::-1]]
  parts = np.array_split(order, m)

  # First pass: grow the reference pool chunk by chunk.
  pool = complete_rows
  first_pass = []
  for idx in parts:
    if idx.size == 0:
      # Fewer incomplete rows than chunks; nothing to impute here.
      first_pass.append(None)
      continue
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer.fit(pool)
    filled = imputer.transform(Xtr_nan[idx])
    first_pass.append(filled)
    pool = np.concatenate((pool, filled))

  # Second pass: one imputer fitted on everything, reused for all chunks
  # except the last (the fit data is the same for each of them).
  refit = None
  if m > 1:
    refit = KNNImputer(n_neighbors=n_neighbors)
    refit.fit(pool)

  for k, idx in enumerate(parts):
    if idx.size == 0:
      continue
    filled = first_pass[k]
    if k < m - 1:
      second = refit.transform(Xtr_nan[idx])
      filled = np.mean(np.array([filled, second]), axis=0)
    result[idx] = filled
  return result

### compute_err function 

In [None]:
def err(mus, S, mus_est, S_est):
  """Average of the size-normalised errors of the mean and covariance estimates.

  The mean error is the norm of (mus_est - mus) divided by the number of
  entries in mus; the covariance error is the norm of the flattened
  difference (S_est - S) divided by the number of entries in S. The
  return value is the mean of those two numbers.
  """
  mean_error = np.linalg.norm(mus_est - mus) / mus.size
  cov_gap = S_est.reshape(-1) - S.reshape(-1)
  cov_error = np.linalg.norm(cov_gap) / S.size
  return np.mean([mean_error, cov_error])
 
def generate_nan(Xtrain, missing_rate, minimum_complete_samples):
  """Return a copy of Xtrain with entries blanked out to NaN at random.

  minimum_complete_samples randomly chosen rows are protected and never
  receive a NaN. Among the cells of the remaining rows,
  round(missing_rate * number_of_cells) distinct cells are set to NaN.

  Rows stay in the order they have in Xtrain, so a label vector that is
  aligned with Xtrain is also aligned with the result.

  Parameters
  ----------
  Xtrain : 2-D array of fully observed data (not modified).
  missing_rate : fraction of the unprotected cells to blank out.
  minimum_complete_samples : number of rows guaranteed to stay complete.

  Returns
  -------
  Float array of the same shape as Xtrain containing the NaNs.
  """
  Xtrain = np.asarray(Xtrain)
  # NaN cannot be stored in an integer array, so fall back to float64.
  if np.issubdtype(Xtrain.dtype, np.floating):
    X_nan = Xtrain.copy()
  else:
    X_nan = Xtrain.astype(np.float64)

  n_rows = Xtrain.shape[0]
  ct_id = np.random.choice(n_rows, size=minimum_complete_samples, replace=False)
  unprotected = np.ones(n_rows, dtype=bool)
  unprotected[ct_id] = False

  ICT = X_nan[unprotected]
  n_missing = min(int(round(missing_rate * ICT.size)), ICT.size)
  if n_missing > 0:
    # Draw positions without replacement so that exactly n_missing cells
    # are blanked; drawing with replacement would hit some cells twice
    # and fall short of the requested rate.
    na_id = np.random.choice(ICT.size, size=n_missing, replace=False)
    flat = ICT.flatten()
    flat[na_id] = np.nan
    X_nan[unprotected] = flat.reshape(ICT.shape)
  return X_nan
 
def _class_means_and_pooled_cov(X, y, G):
  """Return the per-class means (one row per class) and the pooled covariance of X."""
  mus = np.asarray([np.mean(X[y == g, :], axis=0) for g in range(G)])
  S = [np.sum(y == g) * np.cov(X[y == g, :], rowvar=False) for g in range(G)]
  return mus, np.sum(S, axis=0) / len(y)


def compute_err_resi(Xtrain, ytrain, G, missing_rate, runs = 10, minimum_complete_samples = 2):
  """Estimate how well RESI imputation recovers class means and covariance.

  For each run, missing values are injected into Xtrain with generate_nan,
  the data are standardised with a StandardScaler fitted on the data with
  missing values, the missing values are imputed with resi (3 chunks), and
  the class means / pooled covariance of the imputed data are compared via
  err with those of the fully observed, equally scaled data.

  Parameters
  ----------
  Xtrain : 2-D array of fully observed features (left unchanged).
  ytrain : 1-D array of class labels taking the values 0 .. G-1.
  G : number of classes.
  missing_rate : forwarded to generate_nan.
  runs : number of repetitions.
  minimum_complete_samples : forwarded to generate_nan.

  Returns
  -------
  (mean, standard deviation) of the error over the runs.
  """
  ytrain = np.asarray(ytrain)

  # Number of neighbours for the KNN imputer, bumped to the next odd value.
  # NOTE(review): this is derived from the number of distinct labels, not
  # from the number of samples -- confirm that is intended.
  k = round(math.sqrt(len(np.unique(ytrain))))
  if k % 2 == 0:
    k += 1

  e_rate = []
  for _ in range(runs):
    Xtr_nan = generate_nan(Xtrain, missing_rate, minimum_complete_samples)

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    # Keep the scaled copy in its own variable: overwriting Xtrain would
    # feed already-scaled data into the next run and scale it again.
    X_full = scaler.transform(Xtrain)

    # Reference parameters from the fully observed data.
    mus, S = _class_means_and_pooled_cov(X_full, ytrain, G)

    # RESI approach. Parameters: (dataset, m, n_neighbors).
    # NOTE(review): the class masks built from ytrain are applied to the
    # imputed data, which is only valid if generate_nan and resi return
    # their rows in the same order as Xtrain -- verify.
    Xtr_resi = resi(Xtr_nan, 3, k)
    mus_resi, S_resi = _class_means_and_pooled_cov(Xtr_resi, ytrain, G)

    e_rate.append(err(mus, S, mus_resi, S_resi))

  e_rate = np.asarray(e_rate)
  return np.mean(e_rate), np.std(e_rate)

# Heart

In [None]:
# Load the SPECTF train and test files and pool them into a single data set.
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.train',
                     sep=',', header=None)
print(data.head())
test = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.test',
                     sep=',', header=None)
data = pd.concat([data, test]).to_numpy()

# Column 0 is used as the class label, the remaining columns as features.
y = data[:, 0]
X = data[:, 1:].astype(np.float32)
G = len(np.unique(y))
print(np.shape(X))

# Number of samples in each class.
for g in range(G):
  print((y == g).sum())

   0   1   2   3   4   5   6   7   8   ...  36  37  38  39  40  41  42  43  44
0   1  59  52  70  67  73  66  72  61  ...  56  62  56  72  62  74  74  64  67
1   1  72  62  69  67  78  82  74  65  ...  71  63  60  69  73  67  71  56  58
2   1  71  62  70  64  67  64  79  65  ...  70  66  65  64  55  61  41  51  46
3   1  69  71  70  78  61  63  67  65  ...  61  66  65  72  73  68  68  59  63
4   1  70  66  61  66  61  58  69  69  ...  69  70  66  70  64  60  55  49  41

[5 rows x 45 columns]
(267, 44)
55
212


In [None]:
# One (mean, std) error pair per missing rate, 10 runs each.
resi_err = np.array([
  compute_err_resi(X, y, G, rate, runs=10)
  for rate in (.2, .35, .5, .65, .8)
])
resi_err.round(3)

array([[0.018, 0.002],
       [0.017, 0.003],
       [0.019, 0.001],
       [0.02 , 0.003],
       [0.023, 0.006]])

# Ionosphere

In [None]:
# Load the UCI ionosphere data over HTTPS, like the other data sets in this file.
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
                   sep=",", header=None)
data = data.to_numpy()

# The first 34 columns are the features, column 34 is the class label.
X, y = data[:, :34].astype(np.float64), data[:, 34]

# Encode the class labels as integers.
le2 = LabelEncoder()
y = le2.fit_transform(y)
print(len(y))

# Drop the first two feature columns before the analysis.
X = np.delete(X, [0, 1], axis=1)
print(X.shape)
print(np.linalg.matrix_rank(X))

351
(351, 32)
32


In [None]:
# Two classes; one (mean, std) error pair per missing rate, 10 runs each.
G = 2
resi_err = np.array([
  compute_err_resi(X, y, G, rate, runs=10)
  for rate in (.2, .35, .5, .65, .8)
])
resi_err.round(3)

array([[0.017, 0.001],
       [0.017, 0.002],
       [0.019, 0.002],
       [0.021, 0.002],
       [0.023, 0.003]])

# seeds 

In [None]:
# Load the whitespace-separated UCI seeds data. The separator is a regular
# expression, so it is written as a raw string to avoid an invalid "\s"
# escape sequence.
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt',
                     sep=r'\s+', header=None)
data = data.to_numpy()
# First 7 columns are the features; shift the labels in column 7 down by
# one so that they start from 0.
X, y = data[:, :7], data[:, 7] - 1
print(X.shape)
print(np.linalg.matrix_rank(X))

(210, 7)
7


In [None]:
# Three classes; one (mean, std) error pair per missing rate, 10 runs each.
G = 3
resi_err = np.array([
  compute_err_resi(X, y, G, rate, runs=10)
  for rate in (.2, .35, .5, .65, .8)
])
resi_err.round(3)

array([[0.134, 0.007],
       [0.133, 0.005],
       [0.128, 0.005],
       [0.125, 0.008],
       [0.12 , 0.007]])

# wine
The data set is also available in scikit-learn, as noted on the package's website, so we load it directly from sklearn.

In [None]:
# Load the wine data bundled with scikit-learn and report its shape and rank.
wine = datasets.load_wine()
X = wine.data
y = wine.target.ravel()
print(X.shape)
print(np.linalg.matrix_rank(X))

(178, 13)
13


In [None]:
# Three classes; one (mean, std) error pair per missing rate, 10 runs each.
G = 3
resi_err = np.array([
  compute_err_resi(X, y, G, rate, runs=10)
  for rate in (.2, .35, .5, .65, .8)
])
resi_err.round(3)

array([[0.067, 0.004],
       [0.066, 0.003],
       [0.065, 0.003],
       [0.064, 0.002],
       [0.065, 0.002]])

# Iris
The data set is also available in scikit-learn, as noted on the package's website, so we load it directly from sklearn.

In [None]:
# Load the iris data bundled with scikit-learn and run the same error sweep:
# one (mean, std) error pair per missing rate, 10 runs each.
iris = datasets.load_iris()
X = iris.data
y = iris.target.ravel()
G = 3
resi_err = np.array([
  compute_err_resi(X, y, G, rate, runs=10)
  for rate in (.2, .35, .5, .65, .8)
])
resi_err.round(3)

array([[0.181, 0.014],
       [0.203, 0.009],
       [0.214, 0.01 ],
       [0.196, 0.008],
       [0.187, 0.011]])