## libraries and function 

In [None]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
import pandas as pd
import time 
!pip install missingpy
from missingpy import MissForest
import math

Collecting impyute
  Downloading impyute-0.0.8-py2.py3-none-any.whl (31 kB)
Installing collected packages: impyute
Successfully installed impyute-0.0.8
Collecting missingpy
  Downloading missingpy-0.2.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 6.0 MB/s 
[?25hInstalling collected packages: missingpy
Successfully installed missingpy-0.2.0




### compute_err function 

In [None]:
def err(mus, S, mus_est, S_est):
  er = [np.linalg.norm(mus_est-mus)/mus.size,
        np.linalg.norm(S_est.flatten().flatten()-S.flatten())/S.size]  
  return np.mean(er)  

def generate_nan(Xtrain, missing_rate):
  Xshape = Xtrain.shape
  na_id = np.random.randint(0,Xtrain.size,round(missing_rate*Xtrain.size))
  Xtr_nan = Xtrain.flatten()
  Xtr_nan[na_id] = np.nan 
  return Xtr_nan.reshape(Xshape) 

In [None]:
def compute_err_nuclear(Xtrain, ytrain, G, missing_rate, runs = 10):
  e_rate = []
  for i in  range(runs):
    Xtr_nan = generate_nan(Xtrain, missing_rate)
    
    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    
    # estimate parameters from full data
    # each row is a mean of a class
    mus = np.array([np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)])
    S = np.array([np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)])

    start = time.time()
    Xtr_nuclear = NuclearNormMinimization(max_iters=100).fit_transform(Xtr_nan)
    mus_nuclear = np.asarray([np.mean(Xtr_nuclear[ytrain==g,:], axis=0
                                      ) for g in np.arange(G)])
    S_nuclear = np.asarray([(sum(ytrain==g))*np.cov(Xtr_nuclear[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    nuclear_err = err(mus, S, mus_nuclear, S_nuclear)
    nuclear_time = time.time()-start

    e_rate.append(nuclear_err)

  e_rate = np.asarray(e_rate)
  return np.mean(e_rate), np.std(e_rate)

In [None]:
def compute_err_miss(Xtrain, ytrain, G, missing_rate, runs = 10):
  e_rate = []
  for i in  range(runs):
    Xtr_nan = generate_nan(Xtrain, missing_rate)
    
    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    
    # estimate parameters from full data
    # each row is a mean of a class
    mus = np.array([np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)])
    S = np.array([np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)])

    start = time.time()
    Xtr_mforest = MissForest(random_state=0).fit_transform(Xtr_nan)
    mus_mforest = np.asarray([np.mean(Xtr_mforest[ytrain==g,:], axis=0
                                      ) for g in np.arange(G)])
    S_mforest = np.asarray([(sum(ytrain==g))*np.cov(Xtr_mforest[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    mforest_err = err(mus, S, mus_mforest, S_mforest)
    mforest_time = time.time()-start

    e_rate.append(mforest_err)

  e_rate = np.asarray(e_rate)
  return np.mean(e_rate), np.std(e_rate)

# Test bivariate normality

In [None]:
!pip install scipy 



In [None]:
# !pip install pingouin
from pingouin import multivariate_normality

def non_normal_rate(X, alpha = 0.05):
  # permute through all pairs of features:
  d = X.shape[1]
  n_non_normal = 0
  for i in range(d):
    for j in range(i):
      res = multivariate_normality(X[:,[i,j]], alpha=.05)
      if res.pval < alpha:
        n_non_normal += 1
  return n_non_normal/d/(d-1)/2

# Heart

In [None]:
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.train', header = None,sep=',')
print(data.head())
test = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.test',
                     header=None, sep = ',')
data = pd.concat([data, test])
data = data.to_numpy()
X,y = data[:,1:], data[:,0]
X = X.astype(np.float32)
G = len(np.unique(y)) 
print(np.shape(X))
for g in range(G):
  print(sum(y==g))

   0   1   2   3   4   5   6   7   8   ...  36  37  38  39  40  41  42  43  44
0   1  59  52  70  67  73  66  72  61  ...  56  62  56  72  62  74  74  64  67
1   1  72  62  69  67  78  82  74  65  ...  71  63  60  69  73  67  71  56  58
2   1  71  62  70  64  67  64  79  65  ...  70  66  65  64  55  61  41  51  46
3   1  69  71  70  78  61  63  67  65  ...  61  66  65  72  73  68  68  59  63
4   1  70  66  61  66  61  58  69  69  ...  69  70  66  70  64  60  55  49  41

[5 rows x 45 columns]
(267, 44)
55
212


In [None]:
non_normal_rate(X)

0.25

In [None]:
G = 2
miss_err = np.array([compute_err_miss(X, y, G, .2, runs = 10),
                    compute_err_miss(X, y, G, .35, runs = 10),
                    compute_err_miss(X, y, G, .5, runs = 10),
                    compute_err_miss(X, y, G, .65, runs = 10),
                    compute_err_miss(X, y, G, .8, runs = 10)])
miss_err.round(3)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0

array([[0.488, 0.006],
       [0.485, 0.009],
       [0.473, 0.01 ],
       [0.483, 0.014],
       [0.49 , 0.019]])

# Inosphere

In [None]:
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
                  sep = ",", header = None)
# print(data.head())
data = pd.DataFrame.to_numpy(data)
X, y = data[:,:34].astype(np.float64), data[:,34]
le2 = LabelEncoder()
y = le2.fit_transform(y)
print(len(y))
X = np.delete(X,[0,1], axis = 1)
print(X.shape)
print(np.linalg.matrix_rank(X))

351
(351, 32)
32


In [None]:
non_normal_rate(X)

0.25

In [None]:
G = 2
nuclear_err = np.array([compute_err_nuclear(X, y, G, .2, runs = 10),
                    compute_err_nuclear(X, y, G, .35, runs = 10),
                    compute_err_nuclear(X, y, G, .5, runs = 10),
                    compute_err_nuclear(X, y, G, .65, runs = 10),
                    compute_err_nuclear(X, y, G, .8, runs = 10)])
nuclear_err.round(3)

In [None]:
G = 2
miss_err = np.array([compute_err_miss(X, y, G, .2, runs = 10),
                    compute_err_miss(X, y, G, .35, runs = 10),
                    compute_err_miss(X, y, G, .5, runs = 10),
                    compute_err_miss(X, y, G, .65, runs = 10),
                    compute_err_miss(X, y, G, .8, runs = 10)])
miss_err.round(3)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6

array([[0.716, 0.003],
       [0.701, 0.006],
       [0.691, 0.008],
       [0.688, 0.01 ],
       [0.693, 0.009]])

# seeds 

In [None]:
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt',
                     sep = '\s+', header = None)
data = pd.DataFrame.to_numpy(data)
X,y = data[:,:7], data[:,7]-1 # reset the labels to go start from 0  
print(X.shape)
print(np.linalg.matrix_rank(X))

(210, 7)
7


In [None]:
non_normal_rate(X)

0.25

In [None]:
G = 3
nuclear_err = np.array([compute_err_nuclear(X, y, G, .2, runs = 10),
                    compute_err_nuclear(X, y, G, .35, runs = 10),
                    compute_err_nuclear(X, y, G, .5, runs = 10),
                    compute_err_nuclear(X, y, G, .65, runs = 10),
                    compute_err_nuclear(X, y, G, .8, runs = 10)])
nuclear_err.round(3)

In [None]:
G = 3
miss_err = np.array([compute_err_miss(X, y, G, .2, runs = 10),
                    compute_err_miss(X, y, G, .35, runs = 10),
                    compute_err_miss(X, y, G, .5, runs = 10),
                    compute_err_miss(X, y, G, .65, runs = 10),
                    compute_err_miss(X, y, G, .8, runs = 10)])
miss_err.round(3)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 0

array([[0.526, 0.009],
       [0.518, 0.016],
       [0.532, 0.028],
       [0.577, 0.036],
       [0.571, 0.054]])

# wine
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

In [None]:
wine = datasets.load_wine()
X,y = wine.data, wine.target.ravel()
print(X.shape)
print(np.linalg.matrix_rank(X))
# sum(y==0), sum(y==1), sum(y==2)

(178, 13)
13


In [None]:
1-non_normal_rate(X)

0.7596153846153846

In [None]:
G = 3
nuclear_err = np.array([compute_err_nuclear(X, y, G, .2, runs = 10),
                    compute_err_nuclear(X, y, G, .35, runs = 10),
                    compute_err_nuclear(X, y, G, .5, runs = 10),
                    compute_err_nuclear(X, y, G, .65, runs = 10),
                    compute_err_nuclear(X, y, G, .8, runs = 10)])
nuclear_err.round(3)

In [None]:
G = 3
miss_err = np.array([compute_err_miss(X, y, G, .2, runs = 10),
                    compute_err_miss(X, y, G, .35, runs = 10),
                    compute_err_miss(X, y, G, .5, runs = 10),
                    compute_err_miss(X, y, G, .65, runs = 10),
                    compute_err_miss(X, y, G, .8, runs = 10)])
miss_err.round(3)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4

array([[0.289, 0.004],
       [0.275, 0.01 ],
       [0.272, 0.013],
       [0.277, 0.011],
       [0.301, 0.021]])

# Iris
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

In [None]:
iris = datasets.load_iris()
X,y = iris.data, iris.target.ravel() 

In [None]:
non_normal_rate(X)

0.25

In [None]:
G = 3
nuclear_err = np.array([compute_err_nuclear(X, y, G, .2, runs = 10),
                    compute_err_nuclear(X, y, G, .35, runs = 10),
                    compute_err_nuclear(X, y, G, .5, runs = 10),
                    compute_err_nuclear(X, y, G, .65, runs = 10),
                    compute_err_nuclear(X, y, G, .8, runs = 10)])
nuclear_err.round(3)

In [None]:
G = 3
miss_err = np.array([compute_err_miss(X, y, G, .2, runs = 10),
                    compute_err_miss(X, y, G, .35, runs = 10),
                    compute_err_miss(X, y, G, .5, runs = 10),
                    compute_err_miss(X, y, G, .65, runs = 10),
                    compute_err_miss(X, y, G, .8, runs = 10)])
miss_err.round(3)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4

array([[0.754, 0.038],
       [0.736, 0.043],
       [0.84 , 0.12 ],
       [0.913, 0.096],
       [0.96 , 0.117]])