## libraries and function 

In [None]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization, BiScaler
import pandas as pd
import time



The function `mle` allows us to compute the MLEs from training data with monotone missing data.

We denote
$$n = \begin{pmatrix}
n_1^{(1)} & n_1^{(2)} &...&n_1^{(K)}\\
\vdots & \vdots &\ddots&\vdots\\
n_G^{(1)} & n_G^{(2)} &...&n_G^{(K)}
\end{pmatrix}$$
$$p = (p_1,p_2,...,p_K)$$
G is the number of classes.

### MLE function 

In [None]:
import numpy as np
def mle(Xtrain, n, p, G):
    '''
    Xtrain: list of input. The ith element of the list contains the sample from
    the ith class.
    '''
    if p[0]==1:
        # the array that contains the means of each block for the 1st block
        mus = [np.mean(Xtrain[g][:,0]) for g in np.arange(G)]
        S = [(n[g,0]-1)*np.var(Xtrain[g][:,0]) for g in np.arange(G)]
    else:
        mus = [np.mean(Xtrain[g][:,0:p[0]], axis = 0) for g  in np.arange(G)]
        S = [(n[g,0]-1)*np.cov(Xtrain[g][:,0:p[0]],rowvar =False) 
             for g in np.arange(G)]
    
    mus = np.asarray(mus).T # so that each column is the mean of a class
    S = sum(S)/(sum(n[:,0])) 
    S = S.reshape((p[0],-1))
    for i in np.arange(1,len(p)):
        W = [(n[g,i]-1)*np.cov(Xtrain[g][0:n[g,i],0:p[i]],
                              rowvar=False) for g in np.arange(G)]
        W = sum(W)
        
        P = np.matmul(W[(p[i-1]):p[i], 0:p[i-1]],
                      np.linalg.inv(W[0:p[i-1],0:p[i-1]]))
        Q = (W[p[i-1]:p[i],p[i-1]:p[i]]-
            np.matmul(P, W[0:p[i-1],p[i-1]:p[i]]))/sum(n[:,i])
        xmeans = [np.mean(Xtrain[g][0:n[g,i],0:p[i]], axis = 0) 
                  for g in np.arange(G)]
        
        xmeans = np.asarray(xmeans)
        xmeans = xmeans.T
        mus = np.vstack((mus, xmeans[p[i-1]:p[i],:]
                       - np.matmul(P, xmeans[0:p[i-1]]-mus)))
        S21 = np.matmul(P, S)
        S = np.vstack((np.hstack((S, S21.T)),
                       np.hstack((S21, Q+np.matmul(P, S21.T)))))
    return [mus, S]

### nan function 


In [None]:
'''
function that create data list that contain missing values
The input X is a numpy array, y is the label
the function return a list where the ith element of 
the list belongs to the ith class
'''

def make_nan_list(X,y,G, n, p):
    # note that the label should go from 0 to G-1
    data = []
    for g in np.arange(G):
        data.append(X[y==g,:])
        for k in np.arange(len(p)-1):
            data[g][n[g,k+1]:n[g,k], p[k]:] = np.nan
    return data


In [None]:
def missing_rate(Xtrain, ytrain, n, p, G):
    # function that compute the missing rate of a given pattern    
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))
    return per_missing

### compute_err function 

In [None]:
def err(mus, S, mus_est, S_est):
  err_rate = (np.linalg.norm(mus_est-mus))/mus.size 
  err_rate += (np.linalg.norm(S_est-S))/S.size 
  return err_rate

def compute_err(Xtrain, ytrain, n, p, G):      
    # make NAs
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    for g in range(G):
      Xtr_nan_list[g] = scaler.transform(Xtr_nan_list[g])

    mus = [np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus) # each row is a mean of a class
    S = [(sum(ytrain==g)-1)*np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)]
    S = np.asarray(S)/len(ytrain)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))
    # MLEs approach
    start = time.time()
    mus_mle, S_mle = mle(Xtr_nan_list, n, p, G)
    mle_err = err(mus, S, mus_mle.T, S_mle)
    mle_time = time.time()-start
    
    start = time.time()
    Xtr_em = impy.em(Xtr_nan)
    mus_em = np.array([np.mean(Xtr_em[ytrain==g,:], axis=0) for g in np.arange(G)])
    S_em = np.array([(sum(ytrain==g)-1)*np.cov(Xtr_em[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_em = S_em/len(ytrain)
    em_err = err(mus, S, mus_em, S_em)
    em_time = time.time()-start       

    start = time.time()
    Xtr_knn = impy.fast_knn(Xtr_nan)
    mus_knn = np.asarray([np.mean(Xtr_knn[ytrain==g,:], axis=0) for g in np.arange(G)])
    S_knn = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_knn[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_knn = S_knn/len(ytrain)
    knn_err =  err(mus, S, mus_knn, S_knn)
    knn_time = time.time()-start 

    start = time.time()
    Xtr_softimpute = SoftImpute(max_iters = 100).fit_transform(Xtr_nan)
    mus_softimpute = np.asarray([np.mean(Xtr_softimpute[ytrain==g,:], axis=0
                                         ) for g in np.arange(G)])
    S_softimpute = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_softimpute[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_softimpute = S_softimpute/len(ytrain)
    softimpute_err =  err(mus, S, mus_softimpute, S_softimpute)
    softimpute_time = time.time()-start

    start = time.time()
    Xtr_mice = IterativeImputer(max_iter=100).fit(Xtr_nan).transform(Xtr_nan)
    mus_mice = np.asarray([np.mean(Xtr_mice[ytrain==g,:], axis=0
                                   ) for g in np.arange(G)])
    S_mice = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_mice[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_mice = S_mice/len(ytrain)
    mice_err = err(mus, S, mus_mice, S_mice)
    mice_time = time.time()-start

    start = time.time()
    Xtr_nuclear = NuclearNormMinimization(max_iters=100).fit_transform(Xtr_nan)
    mus_nuclear = np.asarray([np.mean(Xtr_nuclear[ytrain==g,:], axis=0
                                      ) for g in np.arange(G)])
    S_nuclear = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_nuclear[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_nuclear = S_nuclear/len(ytrain)
    nuclear_err = err(mus, S, mus_nuclear, S_nuclear)
    nuclear_time = time.time()-start 
    
    err_rate = np.array([mle_err, knn_err, mice_err, softimpute_err, em_err,
           nuclear_err,per_missing])
    return err_rate

### compute_time

In [None]:
def compute_time(Xtrain, ytrain, n, p, G):  
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan    
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1,G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    for g in range(G):
      Xtr_nan_list[g] = scaler.transform(Xtr_nan_list[g])

    mus = [np.mean(Xtrain[ytrain==g,:], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus) # each row is a mean of a class
    S = [(sum(ytrain==g)-1)*np.cov(Xtrain[ytrain==g,:],rowvar =False) 
             for g in np.arange(G)]
    S = np.asarray(S)/len(ytrain)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))
    # MLEs approach
    start = time.time()
    mus_mle, S_mle = mle(Xtr_nan_list, n, p, G)
    mle_err = err(mus, S, mus_mle.T, S_mle)
    mle_time = time.time()-start
    
    start = time.time()
    Xtr_em = impy.em(Xtr_nan)
    mus_em = np.array([np.mean(Xtr_em[ytrain==g,:], axis=0) for g in np.arange(G)])
    S_em = np.array([(sum(ytrain==g)-1)*np.cov(Xtr_em[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_em = S_em/len(ytrain)
    em_err = err(mus, S, mus_em, S_em)
    em_time = time.time()-start      

    start = time.time()
    Xtr_knn = impy.fast_knn(Xtr_nan)
    mus_knn = np.asarray([np.mean(Xtr_knn[ytrain==g,:], axis=0) for g in np.arange(G)])
    S_knn = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_knn[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_knn = S_knn/len(ytrain)
    knn_err =  err(mus, S, mus_knn, S_knn)
    knn_time = time.time()-start 

    start = time.time()
    Xtr_softimpute = SoftImpute(max_iters = 100).fit_transform(Xtr_nan)
    mus_softimpute = np.asarray([np.mean(Xtr_softimpute[ytrain==g,:], axis=0
                                         ) for g in np.arange(G)])
    S_softimpute = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_softimpute[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_softimpute = S_softimpute/len(ytrain)
    softimpute_err =  err(mus, S, mus_softimpute, S_softimpute)
    softimpute_time = time.time()-start

    start = time.time()
    Xtr_mice = IterativeImputer(max_iter=100).fit(Xtr_nan).transform(Xtr_nan)
    mus_mice = np.asarray([np.mean(Xtr_mice[ytrain==g,:], axis=0
                                   ) for g in np.arange(G)])
    S_mice = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_mice[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_mice = S_mice/len(ytrain)
    mice_err = err(mus, S, mus_mice, S_mice)
    mice_time = time.time()-start

    start = time.time()
    Xtr_nuclear = NuclearNormMinimization(max_iters=100).fit_transform(Xtr_nan)
    mus_nuclear = np.asarray([np.mean(Xtr_nuclear[ytrain==g,:], axis=0
                                      ) for g in np.arange(G)])
    S_nuclear = np.asarray([(sum(ytrain==g)-1)*np.cov(Xtr_nuclear[ytrain==g,:], rowvar =False) 
             for g in np.arange(G)])
    S_nuclear = S_nuclear/len(ytrain)
    nuclear_err = err(mus, S, mus_nuclear, S_nuclear)
    nuclear_time = time.time()-start
  
    running_time = np.array([mle_time, knn_time,mice_time,
                             softimpute_time,em_time, nuclear_time,
                            ])
    return running_time

## Inosphere

In [None]:
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
                  sep = ",", header = None)
print(data.head())
data = pd.DataFrame.to_numpy(data)
X, y = data[:,:34].astype(np.float64), data[:,34]
le2 = LabelEncoder()
y = le2.fit_transform(y)
print(len(y))
4/5*np.array([sum(y==0), sum(y==1)])

   0   1        2        3        4   ...       30       31       32       33  34
0   1   0  0.99539 -0.05889  0.85243  ...  0.42267 -0.54487  0.18641 -0.45300   g
1   1   0  1.00000 -0.18829  0.93035  ... -0.16626 -0.06288 -0.13738 -0.02447   b
2   1   0  1.00000 -0.03365  1.00000  ...  0.60436 -0.24180  0.56045 -0.38238   g
3   1   0  1.00000 -0.45161  1.00000  ...  0.25682  1.00000 -0.32382  1.00000   b
4   1   0  1.00000 -0.02401  0.94140  ... -0.05707 -0.59573 -0.04608 -0.65697   g

[5 rows x 35 columns]
351


array([100.8, 180. ])

In [None]:
X = np.delete(X,[0,1], axis = 1)
X.shape

(351, 32)

In [None]:
# set random seed and shuffle the data
np.random.seed(1)
idx = np.arange(len(y))
np.random.shuffle(idx)
X, y = X[idx,:], y[idx]  

### 20%

In [None]:
    n = np.array([[sum(y==0),95,80,70],
                  [sum(y==1),120,110,100]])
    p = np.array([17,25,30,32])
    res20 = compute_err(X,y,n,p,2)

[SoftImpute] Max Singular Value of X_init = 48.959812
[SoftImpute] Iter 1: observed MAE=0.039539 rank=32
[SoftImpute] Iter 2: observed MAE=0.039578 rank=32
[SoftImpute] Iter 3: observed MAE=0.039611 rank=32
[SoftImpute] Iter 4: observed MAE=0.039639 rank=32
[SoftImpute] Iter 5: observed MAE=0.039662 rank=32
[SoftImpute] Iter 6: observed MAE=0.039681 rank=32
[SoftImpute] Iter 7: observed MAE=0.039697 rank=32
[SoftImpute] Iter 8: observed MAE=0.039710 rank=32
[SoftImpute] Iter 9: observed MAE=0.039719 rank=32
[SoftImpute] Iter 10: observed MAE=0.039727 rank=32
[SoftImpute] Iter 11: observed MAE=0.039734 rank=32
[SoftImpute] Iter 12: observed MAE=0.039739 rank=32
[SoftImpute] Iter 13: observed MAE=0.039743 rank=32
[SoftImpute] Iter 14: observed MAE=0.039746 rank=32
[SoftImpute] Iter 15: observed MAE=0.039749 rank=32
[SoftImpute] Iter 16: observed MAE=0.039750 rank=32
[SoftImpute] Iter 17: observed MAE=0.039752 rank=32
[SoftImpute] Iter 18: observed MAE=0.039752 rank=32
[SoftImpute] Iter 1

In [None]:
res20

array([0.00943456, 0.03284996, 0.03243629, 0.03272348, 0.03342111,
       0.03270583,        nan, 0.20076567])

### 30%

In [None]:
    n = np.array([[sum(y==0),85,80,75],
                  [sum(y==1),100,90,85]])
    p = np.array([12,27,30,32])  
    res30 = compute_err(X,y,n,p,2)

[SoftImpute] Max Singular Value of X_init = 43.073685
[SoftImpute] Iter 1: observed MAE=0.037598 rank=32
[SoftImpute] Iter 2: observed MAE=0.037641 rank=32
[SoftImpute] Iter 3: observed MAE=0.037679 rank=32
[SoftImpute] Iter 4: observed MAE=0.037711 rank=32
[SoftImpute] Iter 5: observed MAE=0.037738 rank=32
[SoftImpute] Iter 6: observed MAE=0.037761 rank=32
[SoftImpute] Iter 7: observed MAE=0.037781 rank=32
[SoftImpute] Iter 8: observed MAE=0.037798 rank=32
[SoftImpute] Iter 9: observed MAE=0.037813 rank=32
[SoftImpute] Iter 10: observed MAE=0.037825 rank=32
[SoftImpute] Iter 11: observed MAE=0.037835 rank=32
[SoftImpute] Iter 12: observed MAE=0.037844 rank=32
[SoftImpute] Iter 13: observed MAE=0.037850 rank=32
[SoftImpute] Iter 14: observed MAE=0.037856 rank=32
[SoftImpute] Iter 15: observed MAE=0.037860 rank=32
[SoftImpute] Iter 16: observed MAE=0.037863 rank=32
[SoftImpute] Iter 17: observed MAE=0.037866 rank=32
[SoftImpute] Iter 18: observed MAE=0.037867 rank=32
[SoftImpute] Iter 1

### 40%

In [None]:
    n = np.array([[sum(y==0),85,70,55],
                  [sum(y==1),100,80,75]])
    p = np.array([10,15,20,32])  
    res40 = compute_err(X,y,n,p,2)

[SoftImpute] Max Singular Value of X_init = 39.508962
[SoftImpute] Iter 1: observed MAE=0.036746 rank=32
[SoftImpute] Iter 2: observed MAE=0.036808 rank=32
[SoftImpute] Iter 3: observed MAE=0.036865 rank=32
[SoftImpute] Iter 4: observed MAE=0.036915 rank=32
[SoftImpute] Iter 5: observed MAE=0.036961 rank=32
[SoftImpute] Iter 6: observed MAE=0.037002 rank=32
[SoftImpute] Iter 7: observed MAE=0.037037 rank=32
[SoftImpute] Iter 8: observed MAE=0.037068 rank=32
[SoftImpute] Iter 9: observed MAE=0.037096 rank=32
[SoftImpute] Iter 10: observed MAE=0.037120 rank=32
[SoftImpute] Iter 11: observed MAE=0.037141 rank=32
[SoftImpute] Iter 12: observed MAE=0.037160 rank=32
[SoftImpute] Iter 13: observed MAE=0.037177 rank=32
[SoftImpute] Iter 14: observed MAE=0.037193 rank=32
[SoftImpute] Iter 15: observed MAE=0.037208 rank=32
[SoftImpute] Iter 16: observed MAE=0.037220 rank=32
[SoftImpute] Iter 17: observed MAE=0.037231 rank=32
[SoftImpute] Iter 18: observed MAE=0.037241 rank=32
[SoftImpute] Iter 1

### 50%




In [None]:
    n = np.array([[sum(y==0),65,55,45],
                  [sum(y==1),80,70,65]])
    p = np.array([8,12,14,32])
    missing_rate(X,y,n,p,2)  

0.4998219373219373

In [None]:
res50 = compute_err(X,y,n,p,2)

[SoftImpute] Max Singular Value of X_init = 34.966795
[SoftImpute] Iter 1: observed MAE=0.035461 rank=32
[SoftImpute] Iter 2: observed MAE=0.035533 rank=32
[SoftImpute] Iter 3: observed MAE=0.035599 rank=32
[SoftImpute] Iter 4: observed MAE=0.035662 rank=32
[SoftImpute] Iter 5: observed MAE=0.035722 rank=32
[SoftImpute] Iter 6: observed MAE=0.035778 rank=32
[SoftImpute] Iter 7: observed MAE=0.035829 rank=32
[SoftImpute] Iter 8: observed MAE=0.035876 rank=32
[SoftImpute] Iter 9: observed MAE=0.035918 rank=32
[SoftImpute] Iter 10: observed MAE=0.035957 rank=32
[SoftImpute] Iter 11: observed MAE=0.035992 rank=32
[SoftImpute] Iter 12: observed MAE=0.036024 rank=32
[SoftImpute] Iter 13: observed MAE=0.036054 rank=32
[SoftImpute] Iter 14: observed MAE=0.036080 rank=32
[SoftImpute] Iter 15: observed MAE=0.036105 rank=32
[SoftImpute] Iter 16: observed MAE=0.036127 rank=32
[SoftImpute] Iter 17: observed MAE=0.036147 rank=32
[SoftImpute] Iter 18: observed MAE=0.036166 rank=32
[SoftImpute] Iter 1

In [None]:
res50

array([0.0165614 , 0.0335176 , 0.03340561, 0.03336637, 0.0338332 ,
       0.03334309, 0.49982194])

### results

In [None]:
inosphere = np.vstack((res20, res30, res40))
inosphere

array([[0.00943456, 0.03284996, 0.03243629, 0.03272348, 0.03319178,
        0.03270583, 0.20076567],
       [0.01167547, 0.03340321, 0.03303713, 0.03328126, 0.03365646,
        0.03324693, 0.30404202],
       [0.01493101, 0.03294414, 0.03336559, 0.03304057, 0.03360423,
        0.03301359, 0.39948362]])

# seeds 

In [None]:
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt',
                     sep = '\s+', header = None)
print(data.head())
data = pd.DataFrame.to_numpy(data)
# reset the labels to go start from 0  
X,y = data[:,:7], data[:,7]-1 

       0      1       2      3      4      5      6  7
0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220  1
1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956  1
2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825  1
3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805  1
4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175  1


In [None]:
# set random seed and shuffle the data
np.random.seed(1)
idx = np.arange(len(y))
np.random.shuffle(idx)
X, y = X[idx,:], y[idx]  

## 20%

In [None]:
    p = np.array([3,5,7])
    n = np.array([[sum(y==0),50,40], [sum(y==1),50,40],
                  [sum(y==2),50,40]])    
    res20 = compute_err(X,y,n,p,3)

[SoftImpute] Max Singular Value of X_init = 28.545760
[SoftImpute] Iter 1: observed MAE=0.031347 rank=6
[SoftImpute] Iter 2: observed MAE=0.031495 rank=6
[SoftImpute] Iter 3: observed MAE=0.031634 rank=6
[SoftImpute] Iter 4: observed MAE=0.031764 rank=6
[SoftImpute] Iter 5: observed MAE=0.031887 rank=6
[SoftImpute] Iter 6: observed MAE=0.031997 rank=6
[SoftImpute] Iter 7: observed MAE=0.032095 rank=6
[SoftImpute] Iter 8: observed MAE=0.032185 rank=6
[SoftImpute] Iter 9: observed MAE=0.032261 rank=6
[SoftImpute] Iter 10: observed MAE=0.032321 rank=6
[SoftImpute] Iter 11: observed MAE=0.032373 rank=6
[SoftImpute] Iter 12: observed MAE=0.032416 rank=6
[SoftImpute] Iter 13: observed MAE=0.032452 rank=6
[SoftImpute] Iter 14: observed MAE=0.032484 rank=6
[SoftImpute] Iter 15: observed MAE=0.032510 rank=6
[SoftImpute] Iter 16: observed MAE=0.032534 rank=6
[SoftImpute] Iter 17: observed MAE=0.032555 rank=6
[SoftImpute] Iter 18: observed MAE=0.032576 rank=6
[SoftImpute] Iter 19: observed MAE=0.

In [None]:
res20

array([0.01553881, 0.19534817, 0.19680394, 0.19630229, 0.19618772,
       0.19631132, 0.20408163])

## 30%

In [None]:
    p = np.array([2,3,5,7])
    n = np.array([[sum(y==0),60,40,35], [sum(y==1),60,40,30],
                  [sum(y==2),60,40, 30]])    
    res30 = compute_err(X,y,n,p,3)

[SoftImpute] Max Singular Value of X_init = 26.551899
[SoftImpute] Iter 1: observed MAE=0.032275 rank=7
[SoftImpute] Iter 2: observed MAE=0.032432 rank=7
[SoftImpute] Iter 3: observed MAE=0.032584 rank=7
[SoftImpute] Iter 4: observed MAE=0.032723 rank=7
[SoftImpute] Iter 5: observed MAE=0.032847 rank=7
[SoftImpute] Iter 6: observed MAE=0.032955 rank=7
[SoftImpute] Iter 7: observed MAE=0.033056 rank=7
[SoftImpute] Iter 8: observed MAE=0.033146 rank=7
[SoftImpute] Iter 9: observed MAE=0.033223 rank=7
[SoftImpute] Iter 10: observed MAE=0.033287 rank=7
[SoftImpute] Iter 11: observed MAE=0.033342 rank=7
[SoftImpute] Iter 12: observed MAE=0.033389 rank=7
[SoftImpute] Iter 13: observed MAE=0.033428 rank=7
[SoftImpute] Iter 14: observed MAE=0.033462 rank=7
[SoftImpute] Iter 15: observed MAE=0.033491 rank=7
[SoftImpute] Iter 16: observed MAE=0.033518 rank=7
[SoftImpute] Iter 17: observed MAE=0.033542 rank=7
[SoftImpute] Iter 18: observed MAE=0.033565 rank=7
[SoftImpute] Iter 19: observed MAE=0.

In [None]:
    res30

array([0.01748592, 0.19381187, 0.19554525, 0.19533633, 0.19545069,
       0.19538507, 0.29931973])

## 40%

In [None]:
    p = np.array([2,3,5,7])
    n = np.array([[sum(y==0),39,30,28], [sum(y==1),39,30,28],
                  [sum(y==2),37,30, 28]])  
    missing_rate(X,y,n,p,3)

0.39931972789115644

In [None]:
    res40 = compute_err(X,y,n,p,3)

[SoftImpute] Max Singular Value of X_init = 24.515657
[SoftImpute] Iter 1: observed MAE=0.032039 rank=7
[SoftImpute] Iter 2: observed MAE=0.032244 rank=7
[SoftImpute] Iter 3: observed MAE=0.032434 rank=7
[SoftImpute] Iter 4: observed MAE=0.032597 rank=7
[SoftImpute] Iter 5: observed MAE=0.032728 rank=7
[SoftImpute] Iter 6: observed MAE=0.032828 rank=7
[SoftImpute] Iter 7: observed MAE=0.032901 rank=7
[SoftImpute] Iter 8: observed MAE=0.032957 rank=7
[SoftImpute] Iter 9: observed MAE=0.033000 rank=7
[SoftImpute] Iter 10: observed MAE=0.033029 rank=7
[SoftImpute] Iter 11: observed MAE=0.033050 rank=7
[SoftImpute] Iter 12: observed MAE=0.033065 rank=7
[SoftImpute] Iter 13: observed MAE=0.033074 rank=7
[SoftImpute] Iter 14: observed MAE=0.033080 rank=7
[SoftImpute] Iter 15: observed MAE=0.033083 rank=7
[SoftImpute] Iter 16: observed MAE=0.033084 rank=7
[SoftImpute] Iter 17: observed MAE=0.033083 rank=7
[SoftImpute] Iter 18: observed MAE=0.033083 rank=7
[SoftImpute] Iter 19: observed MAE=0.

In [None]:
res40

array([0.0224862 , 0.18818759, 0.19397734, 0.19138403, 0.18459814,
       0.19221158, 0.39931973])

## 50% 

In [None]:
    p = np.array([2,3,4,7])
    n = np.array([[sum(y==0),23,22,20], [sum(y==1),23,22,20],
                  [sum(y==2),22,21, 20]])  
    missing_rate(X,y,n,p,3)

0.501360544217687

In [None]:
    res50 = compute_err(X,y,n,p,3)
    res50

[SoftImpute] Max Singular Value of X_init = 22.508752
[SoftImpute] Iter 1: observed MAE=0.032230 rank=7
[SoftImpute] Iter 2: observed MAE=0.032343 rank=7
[SoftImpute] Iter 3: observed MAE=0.032446 rank=7
[SoftImpute] Iter 4: observed MAE=0.032537 rank=7
[SoftImpute] Iter 5: observed MAE=0.032618 rank=7
[SoftImpute] Iter 6: observed MAE=0.032690 rank=7
[SoftImpute] Iter 7: observed MAE=0.032758 rank=7
[SoftImpute] Iter 8: observed MAE=0.032819 rank=7
[SoftImpute] Iter 9: observed MAE=0.032874 rank=7
[SoftImpute] Iter 10: observed MAE=0.032921 rank=7
[SoftImpute] Iter 11: observed MAE=0.032962 rank=7
[SoftImpute] Iter 12: observed MAE=0.032999 rank=7
[SoftImpute] Iter 13: observed MAE=0.033029 rank=7
[SoftImpute] Iter 14: observed MAE=0.033054 rank=7
[SoftImpute] Iter 15: observed MAE=0.033074 rank=7
[SoftImpute] Iter 16: observed MAE=0.033090 rank=7
[SoftImpute] Iter 17: observed MAE=0.033102 rank=7
[SoftImpute] Iter 18: observed MAE=0.033111 rank=7
[SoftImpute] Iter 19: observed MAE=0.

array([0.02216568, 0.18534898, 0.19231731, 0.18722826, 0.18771439,
       0.18842451, 0.50136054])

## result

In [None]:
seeds = np.vstack((res20,res30,res40))
seeds

array([[0.01553881, 0.19534817, 0.19680394, 0.19630229, 0.19756143,
        0.19631132, 0.20408163],
       [0.01748592, 0.19381187, 0.19554525, 0.19533633, 0.19545069,
        0.19538507, 0.29931973],
       [0.0224862 , 0.18818759, 0.19397734, 0.19138403, 0.18459814,
        0.19221158, 0.39931973]])

# Parkinson dataset

https://archive.ics.uci.edu/ml/datasets/Parkinsons


In [None]:
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data',
                  sep = ",")
data = data.drop(['name'], axis = 1)
X, y = data.drop(['status'], axis = 1), data['status']
X = pd.DataFrame.to_numpy(X)
np.array([sum(y==0), sum(y==1)])                   

array([ 48, 147])

In [None]:
# set random seed and shuffle the data
np.random.seed(1)
idx = np.arange(len(y))
np.random.shuffle(idx)
X, y = X[idx,:], y[idx]  

## 20%

In [None]:
    n = np.array([[sum(y==0),40,35, 30],
                  [sum(y==1),120,110,100]])
    p = np.array([6,10,15,22])
    res20 = compute_err(X,y, n, p, 2)

[SoftImpute] Max Singular Value of X_init = 43.858609
[SoftImpute] Iter 1: observed MAE=0.047508 rank=20
[SoftImpute] Iter 2: observed MAE=0.047595 rank=19
[SoftImpute] Iter 3: observed MAE=0.047654 rank=19
[SoftImpute] Iter 4: observed MAE=0.047692 rank=19
[SoftImpute] Iter 5: observed MAE=0.047715 rank=19
[SoftImpute] Iter 6: observed MAE=0.047729 rank=19
[SoftImpute] Iter 7: observed MAE=0.047732 rank=19
[SoftImpute] Iter 8: observed MAE=0.047729 rank=19
[SoftImpute] Iter 9: observed MAE=0.047721 rank=19
[SoftImpute] Iter 10: observed MAE=0.047709 rank=19
[SoftImpute] Iter 11: observed MAE=0.047694 rank=19
[SoftImpute] Iter 12: observed MAE=0.047673 rank=19
[SoftImpute] Iter 13: observed MAE=0.047644 rank=19
[SoftImpute] Iter 14: observed MAE=0.047612 rank=19
[SoftImpute] Iter 15: observed MAE=0.047583 rank=19
[SoftImpute] Iter 16: observed MAE=0.047554 rank=19
[SoftImpute] Iter 17: observed MAE=0.047509 rank=18
[SoftImpute] Iter 18: observed MAE=0.047465 rank=18
[SoftImpute] Iter 1



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 51993
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 32233, constraints m = 40813
Cones:	primal zero / dual free vars: 4290
	linear vars: 12870
	sd vars: 23653, sd blks: 1
Setup time: 4.36e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 2.23e+21  2.09e+21  1.00e+00 -9.85e+23  2.04e+23  3.35e+23  4.21e-02 
   100| 1.86e-04  1.72e-04  2.54e-05  1.71e+02  1.71e+02  6.82e-15  3.81e+00 
-------------------------------------------------------

In [None]:
res20

array([0.01541584, 0.06242301, 0.05919733, 0.06078882, 0.06369383,
       0.0607681 , 0.1969697 ])

## 30%

In [None]:
    n = np.array([[sum(y==0),30,27,25],
                  [sum(y==1),100,90,85]])
    p = np.array([5,10,17,22])
    res30 = compute_err(X,y, n, p, 2)

[SoftImpute] Max Singular Value of X_init = 38.482354
[SoftImpute] Iter 1: observed MAE=0.042989 rank=19
[SoftImpute] Iter 2: observed MAE=0.043168 rank=19
[SoftImpute] Iter 3: observed MAE=0.043298 rank=19
[SoftImpute] Iter 4: observed MAE=0.043399 rank=19
[SoftImpute] Iter 5: observed MAE=0.043478 rank=19
[SoftImpute] Iter 6: observed MAE=0.043540 rank=19
[SoftImpute] Iter 7: observed MAE=0.043593 rank=19
[SoftImpute] Iter 8: observed MAE=0.043640 rank=19
[SoftImpute] Iter 9: observed MAE=0.043681 rank=19
[SoftImpute] Iter 10: observed MAE=0.043716 rank=19
[SoftImpute] Iter 11: observed MAE=0.043747 rank=19
[SoftImpute] Iter 12: observed MAE=0.043776 rank=19
[SoftImpute] Iter 13: observed MAE=0.043802 rank=19
[SoftImpute] Iter 14: observed MAE=0.043825 rank=19
[SoftImpute] Iter 15: observed MAE=0.043848 rank=19
[SoftImpute] Iter 16: observed MAE=0.043870 rank=19
[SoftImpute] Iter 17: observed MAE=0.043891 rank=19
[SoftImpute] Iter 18: observed MAE=0.043910 rank=19
[SoftImpute] Iter 1



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 51091
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 32233, constraints m = 40813
Cones:	primal zero / dual free vars: 4290
	linear vars: 12870
	sd vars: 23653, sd blks: 1
Setup time: 4.45e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 2.25e+21  2.11e+21  1.00e+00 -9.28e+23  1.93e+23  3.06e+23  2.57e-02 
   100| 1.20e-03  1.27e-03  1.59e-04  1.64e+02  1.64e+02  4.93e-14  3.78e+00 
-------------------------------------------------------

In [None]:
    res30

array([0.01718869, 0.06574795, 0.06367668, 0.06340785, 0.06796594,
       0.06322138, 0.3020979 ])

## 40%

In [None]:
    n = np.array([[sum(y==0),30,25,21],
                  [sum(y==1),90,70,50]])
    p = np.array([5,10,17,22])
    res40 = compute_err(X,y, n, p, 2)

[SoftImpute] Max Singular Value of X_init = 34.067043
[SoftImpute] Iter 1: observed MAE=0.040371 rank=20
[SoftImpute] Iter 2: observed MAE=0.040520 rank=20
[SoftImpute] Iter 3: observed MAE=0.040634 rank=20
[SoftImpute] Iter 4: observed MAE=0.040730 rank=20
[SoftImpute] Iter 5: observed MAE=0.040807 rank=20
[SoftImpute] Iter 6: observed MAE=0.040874 rank=20
[SoftImpute] Iter 7: observed MAE=0.040930 rank=20
[SoftImpute] Iter 8: observed MAE=0.040976 rank=20
[SoftImpute] Iter 9: observed MAE=0.041013 rank=20
[SoftImpute] Iter 10: observed MAE=0.041046 rank=20
[SoftImpute] Iter 11: observed MAE=0.041072 rank=20
[SoftImpute] Iter 12: observed MAE=0.041096 rank=20
[SoftImpute] Iter 13: observed MAE=0.041117 rank=20
[SoftImpute] Iter 14: observed MAE=0.041137 rank=20
[SoftImpute] Iter 15: observed MAE=0.041155 rank=20
[SoftImpute] Iter 16: observed MAE=0.041170 rank=20
[SoftImpute] Iter 17: observed MAE=0.041185 rank=20
[SoftImpute] Iter 18: observed MAE=0.041197 rank=20
[SoftImpute] Iter 1



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 50293
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 32233, constraints m = 40813
Cones:	primal zero / dual free vars: 4290
	linear vars: 12870
	sd vars: 23653, sd blks: 1
Setup time: 3.98e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 2.27e+21  2.12e+21  1.00e+00 -8.72e+23  1.81e+23  2.79e+23  2.98e-02 
   100| 3.89e-04  6.05e-04  1.36e-04  1.58e+02  1.58e+02  2.86e-14  3.75e+00 
-------------------------------------------------------

In [None]:
parkinson = np.vstack((res20,res30,res40))
parkinson

array([[0.01541584, 0.06242301, 0.05919733, 0.06078882, 0.06369383,
        0.0607681 , 0.1969697 ],
       [0.01718869, 0.06574795, 0.06367668, 0.06340785, 0.06796594,
        0.06322138, 0.3020979 ],
       [0.02186154, 0.07250771, 0.07307222, 0.06974819, 0.07422517,
        0.06905229, 0.3951049 ]])

## 50%

In [None]:
    n = np.array([[sum(y==0),30,25,20],
                  [sum(y==1),70,45,40]])
    p = np.array([5,8,12,22])
    missing_rate(X,y, n,p,2)

0.4976689976689977

In [None]:
res50 = compute_err(X,y, n, p, 2)
res50

[SoftImpute] Max Singular Value of X_init = 28.440608
[SoftImpute] Iter 1: observed MAE=0.036377 rank=20
[SoftImpute] Iter 2: observed MAE=0.036566 rank=20
[SoftImpute] Iter 3: observed MAE=0.036595 rank=19
[SoftImpute] Iter 4: observed MAE=0.036556 rank=19
[SoftImpute] Iter 5: observed MAE=0.036516 rank=19
[SoftImpute] Iter 6: observed MAE=0.036480 rank=19
[SoftImpute] Iter 7: observed MAE=0.036448 rank=19
[SoftImpute] Iter 8: observed MAE=0.036420 rank=19
[SoftImpute] Iter 9: observed MAE=0.036412 rank=19
[SoftImpute] Iter 10: observed MAE=0.036415 rank=19
[SoftImpute] Iter 11: observed MAE=0.036420 rank=19
[SoftImpute] Iter 12: observed MAE=0.036427 rank=19
[SoftImpute] Iter 13: observed MAE=0.036432 rank=19
[SoftImpute] Iter 14: observed MAE=0.036434 rank=19
[SoftImpute] Iter 15: observed MAE=0.036434 rank=19
[SoftImpute] Iter 16: observed MAE=0.036433 rank=19
[SoftImpute] Iter 17: observed MAE=0.036431 rank=19
[SoftImpute] Iter 18: observed MAE=0.036430 rank=19
[SoftImpute] Iter 1



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 49413
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 32233, constraints m = 40813
Cones:	primal zero / dual free vars: 4290
	linear vars: 12870
	sd vars: 23653, sd blks: 1
Setup time: 6.57e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 2.29e+21  2.14e+21  1.00e+00 -8.02e+23  1.66e+23  2.48e+23  2.36e-02 
   100| 3.58e-04  4.43e-04  1.30e-04  1.46e+02  1.46e+02  2.38e-14  3.59e+00 
-------------------------------------------------------

array([0.03036777, 0.08348728, 0.07765944, 0.08100744, 0.08647955,
       0.08014575, 0.497669  ])

# wine
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

In [None]:
wine = datasets.load_wine()
X,y = wine.data, wine.target.ravel() 
sum(y==0), sum(y==1), sum(y==2)

(59, 71, 48)

In [None]:
# set random seed and shuffle the data
np.random.seed(1)
idx = np.arange(len(y))
np.random.shuffle(idx)
X, y = X[idx,:], y[idx]  

## 20%

In [None]:
    n = np.array([[sum(y==0),50,40,35], [sum(y==1),60,55,50],
                  [sum(y==2),40,35,30]])
    p = np.array([3,5, 12,13])    
    res20 = compute_err(X, y, n, p, 3)
    res20

[SoftImpute] Max Singular Value of X_init = 24.757295
[SoftImpute] Iter 1: observed MAE=0.032345 rank=13
[SoftImpute] Iter 2: observed MAE=0.032369 rank=13
[SoftImpute] Iter 3: observed MAE=0.032390 rank=13
[SoftImpute] Iter 4: observed MAE=0.032410 rank=13
[SoftImpute] Iter 5: observed MAE=0.032429 rank=13
[SoftImpute] Iter 6: observed MAE=0.032446 rank=13
[SoftImpute] Iter 7: observed MAE=0.032461 rank=13
[SoftImpute] Iter 8: observed MAE=0.032476 rank=13
[SoftImpute] Iter 9: observed MAE=0.032489 rank=13
[SoftImpute] Iter 10: observed MAE=0.032501 rank=13
[SoftImpute] Iter 11: observed MAE=0.032511 rank=13
[SoftImpute] Iter 12: observed MAE=0.032521 rank=13
[SoftImpute] Iter 13: observed MAE=0.032530 rank=13
[SoftImpute] Iter 14: observed MAE=0.032538 rank=13
[SoftImpute] Iter 15: observed MAE=0.032545 rank=13
[SoftImpute] Iter 16: observed MAE=0.032551 rank=13
[SoftImpute] Iter 17: observed MAE=0.032557 rank=13
[SoftImpute] Iter 18: observed MAE=0.032562 rank=13
[SoftImpute] Iter 1

array([0.0137993 , 0.11041898, 0.10881307, 0.10994632, 0.1106833 ,
       0.10981267, 0.19662921])

## 30%

In [None]:
    n = np.array([[sum(y==0),45,40,30], [sum(y==1),45,40,35],
                  [sum(y==2),39,35,30]])
    p = np.array([3,5,8,13])      
    res30 = compute_err(X, y, n, p, 3)
    res30

[SoftImpute] Max Singular Value of X_init = 22.856361
[SoftImpute] Iter 1: observed MAE=0.031798 rank=13
[SoftImpute] Iter 2: observed MAE=0.031824 rank=13
[SoftImpute] Iter 3: observed MAE=0.031848 rank=13
[SoftImpute] Iter 4: observed MAE=0.031870 rank=13
[SoftImpute] Iter 5: observed MAE=0.031890 rank=13
[SoftImpute] Iter 6: observed MAE=0.031911 rank=13
[SoftImpute] Iter 7: observed MAE=0.031931 rank=13
[SoftImpute] Iter 8: observed MAE=0.031950 rank=13
[SoftImpute] Iter 9: observed MAE=0.031967 rank=13
[SoftImpute] Iter 10: observed MAE=0.031983 rank=13
[SoftImpute] Iter 11: observed MAE=0.031999 rank=13
[SoftImpute] Iter 12: observed MAE=0.032014 rank=13
[SoftImpute] Iter 13: observed MAE=0.032027 rank=13
[SoftImpute] Iter 14: observed MAE=0.032040 rank=13
[SoftImpute] Iter 15: observed MAE=0.032052 rank=13
[SoftImpute] Iter 16: observed MAE=0.032063 rank=13
[SoftImpute] Iter 17: observed MAE=0.032073 rank=13
[SoftImpute] Iter 18: observed MAE=0.032083 rank=13
[SoftImpute] Iter 1

array([0.01573624, 0.11183001, 0.10827662, 0.11034529, 0.11737417,
       0.1101785 , 0.30337079])

## 40%

In [None]:
    n = np.array([[sum(y==0),35,30,27], [sum(y==1),37,32,27],
                  [sum(y==2),32,27,24]])
    p = np.array([3,5,8,13])    
    res40 = compute_err(X,y, n, p, 3)

[SoftImpute] Max Singular Value of X_init = 20.664846
[SoftImpute] Iter 1: observed MAE=0.030691 rank=13
[SoftImpute] Iter 2: observed MAE=0.030711 rank=13
[SoftImpute] Iter 3: observed MAE=0.030729 rank=13
[SoftImpute] Iter 4: observed MAE=0.030747 rank=13
[SoftImpute] Iter 5: observed MAE=0.030765 rank=13
[SoftImpute] Iter 6: observed MAE=0.030782 rank=13
[SoftImpute] Iter 7: observed MAE=0.030797 rank=13
[SoftImpute] Iter 8: observed MAE=0.030812 rank=13
[SoftImpute] Iter 9: observed MAE=0.030826 rank=13
[SoftImpute] Iter 10: observed MAE=0.030838 rank=13
[SoftImpute] Iter 11: observed MAE=0.030850 rank=13
[SoftImpute] Iter 12: observed MAE=0.030860 rank=13
[SoftImpute] Iter 13: observed MAE=0.030870 rank=13
[SoftImpute] Iter 14: observed MAE=0.030879 rank=13
[SoftImpute] Iter 15: observed MAE=0.030887 rank=13
[SoftImpute] Iter 16: observed MAE=0.030894 rank=13
[SoftImpute] Iter 17: observed MAE=0.030900 rank=13
[SoftImpute] Iter 18: observed MAE=0.030907 rank=13
[SoftImpute] Iter 1

## 50%

In [None]:
    n = np.array([[sum(y==0),35,27,22], [sum(y==1),35,27,22],
                  [sum(y==2),30,25,20]])
    p = np.array([2,4,5,13])    
    missing_rate(X,y,n,p,3)

0.5043215211754538

In [None]:
    res50 = compute_err(X,y, n, p, 3)
    res50

[SoftImpute] Max Singular Value of X_init = 18.613550
[SoftImpute] Iter 1: observed MAE=0.030634 rank=13
[SoftImpute] Iter 2: observed MAE=0.030655 rank=13
[SoftImpute] Iter 3: observed MAE=0.030673 rank=13
[SoftImpute] Iter 4: observed MAE=0.030689 rank=13
[SoftImpute] Iter 5: observed MAE=0.030702 rank=13
[SoftImpute] Iter 6: observed MAE=0.030715 rank=13
[SoftImpute] Iter 7: observed MAE=0.030727 rank=13
[SoftImpute] Iter 8: observed MAE=0.030738 rank=13
[SoftImpute] Iter 9: observed MAE=0.030747 rank=13
[SoftImpute] Iter 10: observed MAE=0.030756 rank=13
[SoftImpute] Iter 11: observed MAE=0.030763 rank=13
[SoftImpute] Iter 12: observed MAE=0.030771 rank=13
[SoftImpute] Iter 13: observed MAE=0.030778 rank=13
[SoftImpute] Iter 14: observed MAE=0.030785 rank=13
[SoftImpute] Iter 15: observed MAE=0.030791 rank=13
[SoftImpute] Iter 16: observed MAE=0.030798 rank=13
[SoftImpute] Iter 17: observed MAE=0.030804 rank=13
[SoftImpute] Iter 18: observed MAE=0.030810 rank=13
[SoftImpute] Iter 1

array([0.02202531, 0.11176594, 0.10991095, 0.11121482, 0.11705189,
       0.11109865, 0.50432152])

## results

In [None]:
wine = np.vstack((res20, res30, res40))
wine

array([[0.0137993 , 0.11041898, 0.10881307, 0.10994632, 0.1106833 ,
        0.10981267, 0.19662921],
       [0.01573624, 0.11183001, 0.10827662, 0.11034529, 0.11737417,
        0.1101785 , 0.30337079],
       [0.01860003, 0.11285862, 0.10915379, 0.11180951, 0.11221969,
        0.11161653, 0.39541919]])

# Iris
The data set is also available in sklearn, as noted in the package's website. So, we load it directly from sklearn

## 20%

In [None]:
iris = datasets.load_iris()
X,y = iris.data, iris.target.ravel() 

In [None]:
# set random seed and shuffle the data
np.random.seed(1)
idx = np.arange(len(y))
np.random.shuffle(idx)
X, y = X[idx,:], y[idx]  

In [None]:
n = np.array([[sum(y==0),40,30], [sum(y==1),40,30],
                  [sum(y==2),40,30]])
p = np.array([1,3,4])    
res20 =  compute_err(X,y, n, p, 3)
res20

[SoftImpute] Max Singular Value of X_init = 17.877569
[SoftImpute] Iter 1: observed MAE=0.024402 rank=4
[SoftImpute] Iter 2: observed MAE=0.024467 rank=4
[SoftImpute] Iter 3: observed MAE=0.024532 rank=4
[SoftImpute] Iter 4: observed MAE=0.024597 rank=4
[SoftImpute] Iter 5: observed MAE=0.024662 rank=4
[SoftImpute] Iter 6: observed MAE=0.024728 rank=4
[SoftImpute] Iter 7: observed MAE=0.024790 rank=4
[SoftImpute] Iter 8: observed MAE=0.024850 rank=4
[SoftImpute] Iter 9: observed MAE=0.024906 rank=4
[SoftImpute] Iter 10: observed MAE=0.024958 rank=4
[SoftImpute] Iter 11: observed MAE=0.025011 rank=4
[SoftImpute] Iter 12: observed MAE=0.025067 rank=4
[SoftImpute] Iter 13: observed MAE=0.025124 rank=4
[SoftImpute] Iter 14: observed MAE=0.025177 rank=4
[SoftImpute] Iter 15: observed MAE=0.025229 rank=4
[SoftImpute] Iter 16: observed MAE=0.025279 rank=4
[SoftImpute] Iter 17: observed MAE=0.025329 rank=4
[SoftImpute] Iter 18: observed MAE=0.025375 rank=4
[SoftImpute] Iter 19: observed MAE=0.

array([0.02617632, 0.28231714, 0.28823217, 0.28583875, 0.28528997,
       0.28588525, 0.2       ])

## 30%

In [None]:
n = np.array([[sum(y==0),33,25], [sum(y==1),33,25],
                  [sum(y==2),33,25]])
p = np.array([1,3,4])
res30 = compute_err(X,y, n, p, 3)

[SoftImpute] Max Singular Value of X_init = 16.673944
[SoftImpute] Iter 1: observed MAE=0.024106 rank=4
[SoftImpute] Iter 2: observed MAE=0.024158 rank=4
[SoftImpute] Iter 3: observed MAE=0.024210 rank=4
[SoftImpute] Iter 4: observed MAE=0.024261 rank=4
[SoftImpute] Iter 5: observed MAE=0.024312 rank=4
[SoftImpute] Iter 6: observed MAE=0.024364 rank=4
[SoftImpute] Iter 7: observed MAE=0.024415 rank=4
[SoftImpute] Iter 8: observed MAE=0.024467 rank=4
[SoftImpute] Iter 9: observed MAE=0.024519 rank=4
[SoftImpute] Iter 10: observed MAE=0.024571 rank=4
[SoftImpute] Iter 11: observed MAE=0.024620 rank=4
[SoftImpute] Iter 12: observed MAE=0.024670 rank=4
[SoftImpute] Iter 13: observed MAE=0.024719 rank=4
[SoftImpute] Iter 14: observed MAE=0.024766 rank=4
[SoftImpute] Iter 15: observed MAE=0.024812 rank=4
[SoftImpute] Iter 16: observed MAE=0.024859 rank=4
[SoftImpute] Iter 17: observed MAE=0.024909 rank=4
[SoftImpute] Iter 18: observed MAE=0.024958 rank=4
[SoftImpute] Iter 19: observed MAE=0.

In [None]:
res30

array([0.03395849, 0.27457804, 0.28883123, 0.28049099, 0.29109052,
       0.28089626, 0.295     ])

## 40%

In [None]:
n = np.array([[sum(y==0),28,20], [sum(y==1),28,20],
                  [sum(y==2),20,16]])
p = np.array([1,3,4])
res40 = compute_err(X, y, n, p, 3)
res40 

[SoftImpute] Max Singular Value of X_init = 15.236390
[SoftImpute] Iter 1: observed MAE=0.024184 rank=4
[SoftImpute] Iter 2: observed MAE=0.024225 rank=4
[SoftImpute] Iter 3: observed MAE=0.024266 rank=4
[SoftImpute] Iter 4: observed MAE=0.024308 rank=4
[SoftImpute] Iter 5: observed MAE=0.024351 rank=4
[SoftImpute] Iter 6: observed MAE=0.024395 rank=4
[SoftImpute] Iter 7: observed MAE=0.024437 rank=4
[SoftImpute] Iter 8: observed MAE=0.024480 rank=4
[SoftImpute] Iter 9: observed MAE=0.024523 rank=4
[SoftImpute] Iter 10: observed MAE=0.024563 rank=4
[SoftImpute] Iter 11: observed MAE=0.024601 rank=4
[SoftImpute] Iter 12: observed MAE=0.024637 rank=4
[SoftImpute] Iter 13: observed MAE=0.024671 rank=4
[SoftImpute] Iter 14: observed MAE=0.024704 rank=4
[SoftImpute] Iter 15: observed MAE=0.024740 rank=4
[SoftImpute] Iter 16: observed MAE=0.024775 rank=4
[SoftImpute] Iter 17: observed MAE=0.024807 rank=4
[SoftImpute] Iter 18: observed MAE=0.024839 rank=4
[SoftImpute] Iter 19: observed MAE=0.

array([0.05280262, 0.26091433, 0.27716668, 0.26623964, 0.26205479,
       0.26709203, 0.40333333])

In [None]:
iris = np.vstack((res20, res30, res40))
iris

array([[0.02617632, 0.28231714, 0.28823217, 0.28583875, 0.28528997,
        0.28588525, 0.2       ],
       [0.03395849, 0.27457804, 0.28883123, 0.28049099, 0.29109052,
        0.28089626, 0.295     ],
       [0.05280262, 0.26091433, 0.27716668, 0.26623964, 0.26205479,
        0.26709203, 0.40333333]])

## 50\%


In [None]:
n = np.array([[sum(y==0),18,15], [sum(y==1),18,15],
                  [sum(y==2),18,15]])
p = np.array([1,3,4])
missing_rate(X,y,n,p,3)

0.495

In [None]:
res50 = compute_err(X, y, n, p, 3)
res50 

[SoftImpute] Max Singular Value of X_init = 13.991957
[SoftImpute] Iter 1: observed MAE=0.023908 rank=4
[SoftImpute] Iter 2: observed MAE=0.023949 rank=4
[SoftImpute] Iter 3: observed MAE=0.023988 rank=4
[SoftImpute] Iter 4: observed MAE=0.024024 rank=4
[SoftImpute] Iter 5: observed MAE=0.024058 rank=4
[SoftImpute] Iter 6: observed MAE=0.024090 rank=4
[SoftImpute] Iter 7: observed MAE=0.024119 rank=4
[SoftImpute] Iter 8: observed MAE=0.024147 rank=4
[SoftImpute] Iter 9: observed MAE=0.024175 rank=4
[SoftImpute] Iter 10: observed MAE=0.024200 rank=4
[SoftImpute] Iter 11: observed MAE=0.024227 rank=4
[SoftImpute] Iter 12: observed MAE=0.024257 rank=4
[SoftImpute] Iter 13: observed MAE=0.024286 rank=4
[SoftImpute] Iter 14: observed MAE=0.024315 rank=4
[SoftImpute] Iter 15: observed MAE=0.024344 rank=4
[SoftImpute] Iter 16: observed MAE=0.024371 rank=4
[SoftImpute] Iter 17: observed MAE=0.024395 rank=4
[SoftImpute] Iter 18: observed MAE=0.024415 rank=4
[SoftImpute] Iter 19: observed MAE=0.

array([0.05290464, 0.26600708, 0.28223469, 0.26908135, 0.26718232,
       0.26983943, 0.495     ])

# Digits

In [None]:
digits = datasets.load_digits()
X,y = digits.data, digits.target.ravel() 
print(X.shape)
rmid = np.where(sum(X!=0)<10)
X = np.delete(X, rmid,axis = 1)
X.shape

(1797, 64)


(1797, 54)

In [None]:
# set random seed and shuffle the data
np.random.seed(1)
idx = np.arange(len(y))
np.random.shuffle(idx)
X, y = X[idx,:], y[idx]  

## 20%

In [None]:
    ng = np.asarray([sum(y==i) for i in np.arange(10)])
    n = np.hstack((ng.reshape((-1,1)), np.tile([150,140,130, 120],
                                 10).reshape((10,-1))))
    p = np.array([10,19, 25,56])    
    res20 = compute_err(X,y, n, p, 10) 
    res20  

[SoftImpute] Max Singular Value of X_init = 100.751609
[SoftImpute] Iter 1: observed MAE=0.037778 rank=54
[SoftImpute] Iter 2: observed MAE=0.037813 rank=54
[SoftImpute] Iter 3: observed MAE=0.037843 rank=54
[SoftImpute] Iter 4: observed MAE=0.037870 rank=54
[SoftImpute] Iter 5: observed MAE=0.037893 rank=54
[SoftImpute] Iter 6: observed MAE=0.037914 rank=54
[SoftImpute] Iter 7: observed MAE=0.037932 rank=54
[SoftImpute] Iter 8: observed MAE=0.037947 rank=54
[SoftImpute] Iter 9: observed MAE=0.037962 rank=54
[SoftImpute] Iter 10: observed MAE=0.037974 rank=54
[SoftImpute] Iter 11: observed MAE=0.037985 rank=54
[SoftImpute] Iter 12: observed MAE=0.037995 rank=54
[SoftImpute] Iter 13: observed MAE=0.038004 rank=54
[SoftImpute] Iter 14: observed MAE=0.038011 rank=54
[SoftImpute] Iter 15: observed MAE=0.038018 rank=54
[SoftImpute] Iter 16: observed MAE=0.038025 rank=54
[SoftImpute] Iter 17: observed MAE=0.038030 rank=54
[SoftImpute] Iter 18: observed MAE=0.038035 rank=54
[SoftImpute] Iter 

array([0.00193255, 0.02589197, 0.02600932, 0.02596283, 0.02595153,
       0.02596615, 0.20062244])

In [None]:
res20

array([0.00193255, 0.02589197, 0.02600932, 0.02596283, 0.02595153,
       0.02596615, 0.20062244])

## 30%

In [None]:
    ng = np.asarray([sum(y==i) for i in np.arange(10)])
    ng = np.asarray([sum(y==i) for i in np.arange(10)])
    n = np.hstack((ng.reshape((-1,1)), np.tile([120,115,110, 85],
                                 10).reshape((10,-1))))
    p = np.array([11,19, 25,56])     
    res30 = compute_err(X,y, n, p, 10) 
    res30  

[SoftImpute] Max Singular Value of X_init = 94.367583
[SoftImpute] Iter 1: observed MAE=0.037695 rank=54
[SoftImpute] Iter 2: observed MAE=0.037725 rank=54
[SoftImpute] Iter 3: observed MAE=0.037751 rank=54
[SoftImpute] Iter 4: observed MAE=0.037774 rank=54
[SoftImpute] Iter 5: observed MAE=0.037794 rank=54
[SoftImpute] Iter 6: observed MAE=0.037812 rank=54
[SoftImpute] Iter 7: observed MAE=0.037828 rank=54
[SoftImpute] Iter 8: observed MAE=0.037842 rank=54
[SoftImpute] Iter 9: observed MAE=0.037855 rank=54
[SoftImpute] Iter 10: observed MAE=0.037866 rank=54
[SoftImpute] Iter 11: observed MAE=0.037876 rank=54
[SoftImpute] Iter 12: observed MAE=0.037885 rank=54
[SoftImpute] Iter 13: observed MAE=0.037893 rank=54
[SoftImpute] Iter 14: observed MAE=0.037900 rank=54
[SoftImpute] Iter 15: observed MAE=0.037907 rank=54
[SoftImpute] Iter 16: observed MAE=0.037913 rank=54
[SoftImpute] Iter 17: observed MAE=0.037918 rank=54
[SoftImpute] Iter 18: observed MAE=0.037923 rank=54
[SoftImpute] Iter 1

array([0.00222284, 0.025957  , 0.0261318 , 0.02603002, 0.02599244,
       0.02603767, 0.29752262])

## 40%

In [None]:
    ng = np.asarray([sum(y==i) for i in np.arange(10)])
    n = np.hstack((ng.reshape((-1,1)),  np.tile([120,110,90, 50],
                                 10).reshape((10,-1))))    
    p = np.array([5,19, 25,56]) 
    res40 = compute_err(X,y, n, p, 10) 
    res40  

[SoftImpute] Max Singular Value of X_init = 86.674251
[SoftImpute] Iter 1: observed MAE=0.037383 rank=54
[SoftImpute] Iter 2: observed MAE=0.037415 rank=54
[SoftImpute] Iter 3: observed MAE=0.037443 rank=54
[SoftImpute] Iter 4: observed MAE=0.037468 rank=54
[SoftImpute] Iter 5: observed MAE=0.037490 rank=54
[SoftImpute] Iter 6: observed MAE=0.037510 rank=54
[SoftImpute] Iter 7: observed MAE=0.037527 rank=54
[SoftImpute] Iter 8: observed MAE=0.037543 rank=54
[SoftImpute] Iter 9: observed MAE=0.037556 rank=54
[SoftImpute] Iter 10: observed MAE=0.037569 rank=54
[SoftImpute] Iter 11: observed MAE=0.037579 rank=54
[SoftImpute] Iter 12: observed MAE=0.037589 rank=54
[SoftImpute] Iter 13: observed MAE=0.037597 rank=54
[SoftImpute] Iter 14: observed MAE=0.037605 rank=54
[SoftImpute] Iter 15: observed MAE=0.037612 rank=54
[SoftImpute] Iter 16: observed MAE=0.037618 rank=54
[SoftImpute] Iter 17: observed MAE=0.037624 rank=54
[SoftImpute] Iter 18: observed MAE=0.037629 rank=54
[SoftImpute] Iter 1

array([0.00253218, 0.02593142, 0.02619103, 0.0260126 , 0.02592944,
       0.02602602, 0.39729797])

### 50%

In [None]:
    ng = np.asarray([sum(y==i) for i in np.arange(10)])
    n = np.hstack((ng.reshape((-1,1)),  np.tile([110,90,75, 50],
                                 10).reshape((10,-1))))    
    p = np.array([5,10, 15,56]) 
    missing_rate(X,y,n,p,10)

0.5029266885137781

In [None]:
    res50 = compute_err(X,y, n, p, 10) 
    res50  

[SoftImpute] Max Singular Value of X_init = 79.751278
[SoftImpute] Iter 1: observed MAE=0.037620 rank=54
[SoftImpute] Iter 2: observed MAE=0.037649 rank=54
[SoftImpute] Iter 3: observed MAE=0.037675 rank=54
[SoftImpute] Iter 4: observed MAE=0.037698 rank=54
[SoftImpute] Iter 5: observed MAE=0.037719 rank=54
[SoftImpute] Iter 6: observed MAE=0.037738 rank=54
[SoftImpute] Iter 7: observed MAE=0.037755 rank=54
[SoftImpute] Iter 8: observed MAE=0.037770 rank=54
[SoftImpute] Iter 9: observed MAE=0.037783 rank=54
[SoftImpute] Iter 10: observed MAE=0.037796 rank=54
[SoftImpute] Iter 11: observed MAE=0.037807 rank=54
[SoftImpute] Iter 12: observed MAE=0.037817 rank=54
[SoftImpute] Iter 13: observed MAE=0.037826 rank=54
[SoftImpute] Iter 14: observed MAE=0.037834 rank=54
[SoftImpute] Iter 15: observed MAE=0.037842 rank=54
[SoftImpute] Iter 16: observed MAE=0.037848 rank=54
[SoftImpute] Iter 17: observed MAE=0.037855 rank=54
[SoftImpute] Iter 18: observed MAE=0.037860 rank=54
[SoftImpute] Iter 1

array([0.00314743, 0.02580076, 0.02601458, 0.02586622, 0.02579196,
       0.02587796, 0.50292669])

In [None]:
digits = np.vstack((res20,res30,res40))
digits

array([[0.00193255, 0.02589197, 0.02600932, 0.02596283, 0.02595153,
        0.02596615, 0.20062244],
       [0.00222284, 0.025957  , 0.0261318 , 0.02603002, 0.02599244,
        0.02603767, 0.29752262],
       [0.00253218, 0.02593142, 0.02619103, 0.0260126 , 0.02592944,
        0.02602602, 0.39729797]])