## libraries and function 

In [3]:
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
import pandas as pd
import time 
import math

RESI

In [4]:
def compute_attribute_weights(A):
  from sklearn.preprocessing import minmax_scale
  y = minmax_scale(A, axis = 0)
  p = y.copy()
  ysum = y.sum(axis = 0)
  for i in range(y.shape[1]):
    if (ysum[i] == 0):
      p[:, i] = 1
    else:
      p[:, i] = y[:, i] / ysum[i]
  from scipy.special import xlogy
  E = - xlogy(p,p).sum(axis=0) / math.log(p.shape[0])
  w = (1 - E)/(len(E) - E.sum())
  if np.isnan(w).any():
    return None
  else:
    return w                                            
def generate_tuple_partition(CT, ICT, m):
  w = compute_attribute_weights(CT)
  r = np.ones(ICT.shape[0])
  for i in range(ICT.shape[0]):
    for j in range(ICT.shape[1]):
      if np.isnan(ICT[i,j]):
        r[i] -= w[j] #If NoneType then insufficient CT set has been used
  ICT = ICT[r.argsort()[::-1],:]
  return np.array_split(ICT, m)
def resi(Xtr_nan, m, n_neighbors):
  CT = [Xtr_nan[~np.isnan(Xtr_nan).any(axis=1)]]
  Tp = []
  T = generate_tuple_partition(CT[0],Xtr_nan[np.isnan(Xtr_nan).any(axis=1)], m)
  from sklearn.impute import KNNImputer
  for i in range(m):
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer.fit(CT[-1])
    Tp.append(imputer.transform(T[i]))
    CT.append(np.concatenate((CT[-1],Tp[-1])))
  Tpp = []
  for i in range(m):
    train = CT[0]
    for j in range(1,m):
      if j != i:
        train = np.concatenate((train,T[j]))
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer.fit(train)
    Tpp.append(imputer.transform(T[i]))
  imputer = KNNImputer(n_neighbors=n_neighbors)
  imputer.fit(CT[0])
  Tpp.append(imputer.transform(T[m-1]))
  CT = [CT[0]]
  for i in range(m):
    CT.append(np.concatenate((CT[-1],np.mean(np.array([Tp[i], Tpp[i]]), axis=0 ))))
  return CT[-1]

### compute_err function 

In [5]:
def err(mus, S, mus_est, S_est):
  er = [np.linalg.norm(mus_est-mus)/mus.size,
         np.linalg.norm(S_est.flatten()-S.flatten())/S.size]
  return np.mean(er) 
 
def generate_nan(Xtrain,ytr, missing_rate):
  minimum_complete_samples = 3
  ct_id = np.random.choice(range(Xtrain.shape[0]), size = minimum_complete_samples, replace = False)
  CT = Xtrain[ct_id]
  ICT = Xtrain[[i for i in range(Xtrain.shape[0]) if i not in ct_id]]
  ICTshape = ICT.shape
  na_id = np.random.randint(0,ICT.size, round(missing_rate*ICT.size))
  ICT = ICT.flatten()
  ICT[na_id] = np.nan
  xxx = np.concatenate((CT,ICT.reshape(ICTshape)))
  ytrain = np.hstack((ytr[ct_id], np.array([ytr[i] for i in range(Xtrain.shape[0]) if i not in ct_id])))
  return xxx, ytrain

def compute_err_resi(Xtrain, ytrain, G, missing_rate, runs = 10):  
  e_rate = []
  for i in  range(runs):  
    Xtr_nan, ytrain = generate_nan(Xtrain, ytrain, missing_rate)
    
    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    
    # estimate parameters from full data
    mus = [np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus) # each row is a mean of a class
    S = [sum(ytrain==g)*np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)]
    S = np.sum(S, axis = 0)/len(ytrain)
 
    #RESI approach
    start = time.time()
    k = round(math.sqrt(len(np.unique(ytrain))))
    if (k%2 == 0):
      k += 1
    Xtr_resi = resi(Xtr_nan, 3, k) #Parameters: (Dataset, m, n_neighbors) 
    mus_resi = np.asarray([np.mean(Xtr_resi[ytrain==g,:], axis=0
                                   ) for g in np.arange(G)])
    S_resi = np.asarray([(sum(ytrain==g))*np.cov(Xtr_resi[ytrain==g,:], rowvar = False) 
             for g in np.arange(G)])
    S_resi = np.sum(S_resi, axis = 0)/len(ytrain) 
    resi_err = err(mus, S, mus_resi, S_resi)
    resi_time = time.time()-start

    e_rate.append(resi_err)
  e_rate = np.asarray(e_rate)
  return np.mean(e_rate), np.std(e_rate)  

# Heart

In [6]:
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.train', header = None,sep=',')
print(data.head())
test = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.test',
                     header=None, sep = ',')
data = pd.concat([data, test])
data = data.to_numpy()
X,y = data[:,1:], data[:,0]
X = X.astype(np.float32)
G = len(np.unique(y)) 
print(np.shape(X))
for g in range(G):
  print(sum(y==g))

   0   1   2   3   4   5   6   7   8   9   ...  35  36  37  38  39  40  41  \
0   1  59  52  70  67  73  66  72  61  58  ...  66  56  62  56  72  62  74   
1   1  72  62  69  67  78  82  74  65  69  ...  65  71  63  60  69  73  67   
2   1  71  62  70  64  67  64  79  65  70  ...  73  70  66  65  64  55  61   
3   1  69  71  70  78  61  63  67  65  59  ...  61  61  66  65  72  73  68   
4   1  70  66  61  66  61  58  69  69  72  ...  67  69  70  66  70  64  60   

   42  43  44  
0  74  64  67  
1  71  56  58  
2  41  51  46  
3  68  59  63  
4  55  49  41  

[5 rows x 45 columns]
(267, 44)
55
212


In [7]:
resi_err = np.array([compute_err_resi(X, y, G, .2, runs = 10),
                    compute_err_resi(X, y, G, .35, runs = 10),
                    compute_err_resi(X, y, G, .5, runs = 10),
                    compute_err_resi(X, y, G, .65, runs = 10),
                    compute_err_resi(X, y, G, .8, runs = 10)])
resi_err.round(3)

array([[0.012, 0.005],
       [0.013, 0.005],
       [0.012, 0.003],
       [0.015, 0.003],
       [0.014, 0.003]])

# Inosphere

In [8]:
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
                  sep = ",", header = None)
# print(data.head())
data = pd.DataFrame.to_numpy(data)
X, y = data[:,:34].astype(np.float64), data[:,34]
le2 = LabelEncoder()
y = le2.fit_transform(y)
print(len(y))
X = np.delete(X,[0,1], axis = 1)
print(X.shape)
print(np.linalg.matrix_rank(X))

351
(351, 32)
32


In [9]:
G = 2
resi_err = np.array([compute_err_resi(X, y, G, .2, runs = 10),
                    compute_err_resi(X, y, G, .35, runs = 10),
                    compute_err_resi(X, y, G, .5, runs = 10),
                    compute_err_resi(X, y, G, .65, runs = 10),
                    compute_err_resi(X, y, G, .8, runs = 10)])
resi_err.round(3)

array([[0.008, 0.002],
       [0.008, 0.002],
       [0.009, 0.001],
       [0.01 , 0.002],
       [0.011, 0.002]])

# seeds 

In [10]:
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt',
                     sep = '\s+', header = None)
data = pd.DataFrame.to_numpy(data)
X,y = data[:,:7], data[:,7]-1 # reset the labels to go start from 0  
print(X.shape)
print(np.linalg.matrix_rank(X))

(210, 7)
7


In [11]:
G = 3
resi_err = np.array([compute_err_resi(X, y, G, .2, runs = 10),
                    compute_err_resi(X, y, G, .35, runs = 10),
                    compute_err_resi(X, y, G, .5, runs = 10),
                    compute_err_resi(X, y, G, .65, runs = 10),
                    compute_err_resi(X, y, G, .8, runs = 10)])
resi_err.round(3)

array([[0.114, 0.012],
       [0.105, 0.017],
       [0.107, 0.013],
       [0.103, 0.007],
       [0.093, 0.01 ]])

# wine
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

In [12]:
wine = datasets.load_wine()
X,y = wine.data, wine.target.ravel()
print(X.shape)
print(np.linalg.matrix_rank(X))
# sum(y==0), sum(y==1), sum(y==2)

(178, 13)
13


In [13]:
G = 3
resi_err = np.array([compute_err_resi(X, y, G, .2, runs = 10),
                    compute_err_resi(X, y, G, .35, runs = 10),
                    compute_err_resi(X, y, G, .5, runs = 10),
                    compute_err_resi(X, y, G, .65, runs = 10),
                    compute_err_resi(X, y, G, .8, runs = 10)])
resi_err.round(3)

array([[0.051, 0.008],
       [0.052, 0.006],
       [0.051, 0.007],
       [0.053, 0.005],
       [0.053, 0.005]])

# Iris
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

In [14]:
iris = datasets.load_iris()
X,y = iris.data, iris.target.ravel() 
G = 3
resi_err = np.array([compute_err_resi(X, y, G, .2, runs = 10),
                    compute_err_resi(X, y, G, .35, runs = 10),
                    compute_err_resi(X, y, G, .5, runs = 10),
                    compute_err_resi(X, y, G, .65, runs = 10),
                    compute_err_resi(X, y, G, .8, runs = 10)])
resi_err.round(3)

array([[0.153, 0.022],
       [0.15 , 0.035],
       [0.148, 0.032],
       [0.109, 0.032],
       [0.115, 0.03 ]])