## Libraries and functions


In [1]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization
import pandas as pd
import time

Collecting impyute
  Downloading https://files.pythonhosted.org/packages/37/28/86829f67c9affb847facaab94687761d3555539ec675f7577778c5b2680a/impyute-0.0.8-py2.py3-none-any.whl
Installing collected packages: impyute
Successfully installed impyute-0.0.8


Using TensorFlow backend.


The function `mle` allows us to compute the MLEs from training data with monotone missing data.

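We denote
$$n = \begin{pmatrix}
n_1^{(1)} & n_1^{(2)} & \cdots & n_1^{(K)}\\
\vdots & \vdots & \ddots & \vdots\\
n_G^{(1)} & n_G^{(2)} & \cdots & n_G^{(K)}
\end{pmatrix}$$
$$p = (p_1,p_2,\dots,p_K)$$
Here $G$ is the number of classes and $K$ is the number of feature blocks. $p_k$ is the number of leading features covered by the first $k$ blocks (so $p_K$ is the total number of features), and $n_g^{(k)}$ is the number of observations from class $g$ for which the first $p_k$ features are observed.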

### MLE function 

In [2]:
import numpy as np
def mle(Xtrain, n, p, G):
    '''
    Compute the MLEs of the class means and of the common covariance matrix
    from training data with a monotone missing pattern.

    Xtrain: list of input. The ith element of the list contains the sample from
    the ith class. For block k, only rows 0:n[g,k] and columns 0:p[k] of
    Xtrain[g] are read, so those entries must be observed.
    n: integer array of shape (G, K); n[g,k] is the number of leading rows of
    class g used for block k. Entries larger than the number of rows of
    Xtrain[g] are clipped to that number.
    p: increasing sequence of length K; p[k] is the number of leading features
    covered by blocks 0..k.
    G: number of classes.

    Returns [mus, S]: mus has one column per class (class mean), S is the
    estimated common covariance matrix of shape (p[-1], p[-1]).
    '''
    classes = np.arange(G)
    # The slices below silently stop at the last row of each class, so a count
    # larger than the class size would only inflate the weights. Clip it.
    rows = np.array([len(Xtrain[g]) for g in classes])
    n = np.minimum(np.asarray(n), rows[:, None])

    if p[0]==1:
        # the array that contains the means of each block for the 1st block
        mus = [np.mean(Xtrain[g][:,0]) for g in classes]
        # ddof=1 so that (n-1)*var is the within-class sum of squares,
        # exactly like (n-1)*np.cov in the multivariate branch below
        S = [(n[g,0]-1)*np.var(Xtrain[g][:,0], ddof=1) for g in classes]
    else:
        mus = [np.mean(Xtrain[g][:,0:p[0]], axis = 0) for g in classes]
        S = [(n[g,0]-1)*np.cov(Xtrain[g][:,0:p[0]],rowvar =False)
             for g in classes]

    mus = np.asarray(mus).T # so that each column is the mean of a class
    S = sum(S)/(sum(n[:,0]))
    S = np.reshape(S, (p[0],-1))
    for i in np.arange(1,len(p)):
        # pooled within-class scatter of the first p[i] features, computed
        # from the rows on which all of them are observed
        W = [(n[g,i]-1)*np.cov(Xtrain[g][0:n[g,i],0:p[i]],
                              rowvar=False) for g in classes]
        W = sum(W)

        # P: regression coefficients of the new block on the previous features
        P = np.matmul(W[(p[i-1]):p[i], 0:p[i-1]],
                      np.linalg.inv(W[0:p[i-1],0:p[i-1]]))
        # Q: residual covariance of the new block given the previous features
        Q = (W[p[i-1]:p[i],p[i-1]:p[i]]-
            np.matmul(P, W[0:p[i-1],p[i-1]:p[i]]))/sum(n[:,i])
        xmeans = [np.mean(Xtrain[g][0:n[g,i],0:p[i]], axis = 0)
                  for g in classes]

        xmeans = np.asarray(xmeans)
        xmeans = xmeans.T
        # correct the new block's sample means by the difference between the
        # sub-sample means and the current estimates of the previous features
        mus = np.vstack((mus, xmeans[p[i-1]:p[i],:]
                       - np.matmul(P, xmeans[0:p[i-1]]-mus)))
        # grow S by one block row/column
        S21 = np.matmul(P, S)
        S = np.vstack((np.hstack((S, S21.T)),
                       np.hstack((S21, Q+np.matmul(P, S21.T)))))
    return [mus, S]

### LDA and nan function 


In [3]:
# function that returns the misclassification rate
# of LDA built from the MLEs computed on data with missing values
def lda_miss(mus, S, Xtest, ytrain, ytest, G):
    '''
    Misclassification rate of the LDA rule built from given estimates.

    mus: 2-D array whose g-th column is the mean of class g.
    S: common covariance matrix (square, invertible).
    Xtest: 2-D array of test observations, one per row.
    ytrain: training labels; only used for the class priors mean(ytrain==g).
    ytest: true labels of the rows of Xtest.
    G: number of classes; labels are expected to be 0..G-1.

    Returns the fraction of test rows whose predicted label differs from ytest.
    '''
    # Invert S once; it does not depend on the class or on the test row.
    S_inv = np.linalg.inv(S)
    class_means = mus[:, :G]
    # column g is S_inv.T @ mu_g, so that x @ proj[:, g] == mu_g.T @ S_inv @ x
    proj = np.matmul(S_inv.T, class_means)

    log_priors = np.log(np.array([np.mean(ytrain == g) for g in np.arange(G)]))
    # constant part of each discriminant: log prior - mu_g' S^-1 mu_g / 2
    offsets = log_priors - np.sum(class_means * proj, axis=0) / 2

    # scores[i, g] is the discriminant of class g evaluated at test row i
    scores = np.matmul(Xtest, proj) + offsets
    pred_label = np.argmax(scores, axis=1)
    return np.mean(pred_label.flatten() != ytest)

def make_nan_list(X,y,G, n, p):
    '''
    Create the per-class data list that contains missing values.

    X is a numpy array of observations (one per row) and y is the label
    array; the labels should go from 0 to G-1.
    For every class g and every block boundary k, rows n[g,k+1]:n[g,k] of the
    class sample get NaN in columns p[k] onwards.

    Returns a list where the ith element holds the (partly NaN) sample that
    belongs to the ith class. X itself is not written to, because the rows
    are taken with a boolean mask.
    '''
    n_blocks = len(p)
    per_class = []
    for label in np.arange(G):
        sample = X[y==label,:]
        for k in range(1, n_blocks):
            lo, hi = n[label,k], n[label,k-1]
            sample[lo:hi, p[k-1]:] = np.nan
        per_class.append(sample)
    return per_class

### compute_err function 

In [4]:
def missing_rate(Xtrain, ytrain, Xtest, ytest, n, p, G):
    '''
    Fraction of missing entries in the training data after the monotone
    missing pattern described by n and p has been applied.

    Xtest and ytest are accepted so that the signature matches compute_err;
    they are not used.
    '''
    # make NA data (one array per class), then stack the classes row-wise
    nan_blocks = make_nan_list(Xtrain, ytrain, G, n, p)
    stacked = np.vstack(nan_blocks)

    # percentage of missing values
    return np.mean(np.isnan(stacked))

In [5]:
def _lda_test_error(Xtr_imputed, ytr, Xtest, ytest):
    '''
    Fit scikit-learn LDA on an imputed training set and return its
    misclassification rate on the test set.
    '''
    clf = skLDA().fit(Xtr_imputed, ytr)
    return np.mean(clf.predict(Xtest).flatten() != ytest)


def compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G):
    '''
    Apply the monotone missing pattern (n, p) to the training data and
    compare LDA built from the MLEs with LDA fitted after imputation.

    Xtrain, ytrain: training observations and labels (labels 0..G-1).
    Xtest, ytest: test observations and labels.
    n, p: missing pattern, as expected by make_nan_list and mle. Counts in n
    that exceed the number of training rows of a class are clipped to that
    number before they are passed to mle.
    G: number of classes.

    Returns a numpy array
    [mle_err, knn_err, mice_err, softimpute_err, em_err, nuclear_err,
     per_missing]
    where the first six entries are test misclassification rates and the
    last one is the fraction of missing training entries.
    '''
    # make NA data
    Xtr_nan_list = make_nan_list(Xtrain,ytrain,G, n, p)
    class_sizes = np.array([len(block) for block in Xtr_nan_list])
    # n is usually hard-coded by the caller while the class sizes depend on
    # the random fold; never let a count exceed the rows actually available.
    n = np.minimum(np.asarray(n), class_sizes[:, None])

    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan
    Xtr_nan = np.vstack(Xtr_nan_list)
    ytr = np.repeat(np.arange(G), class_sizes)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    # scale with the statistics of the (partly missing) training data
    scaler = MinMaxScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)
    Xtr_nan_list2 = [scaler.transform(block) for block in Xtr_nan_list]

    # MLEs approach
    mus, S = mle(Xtr_nan_list2, n, p, G)
    mle_err = lda_miss(mus, S, Xtest, ytrain, ytest, G)

    # impute, classify and get the error rates for imputation approaches.
    # The call order is kept as is, in case an imputer draws from the
    # global random state.
    em_err = _lda_test_error(impy.em(Xtr_nan), ytr, Xtest, ytest)
    knn_err = _lda_test_error(impy.fast_knn(Xtr_nan), ytr, Xtest, ytest)
    softimpute_err = _lda_test_error(
        SoftImpute(max_iters = 100).fit_transform(Xtr_nan), ytr, Xtest, ytest)
    mice_err = _lda_test_error(
        IterativeImputer(max_iter=100).fit(Xtr_nan).transform(Xtr_nan),
        ytr, Xtest, ytest)
    nuclear_err = _lda_test_error(
        NuclearNormMinimization(max_iters=100).fit_transform(Xtr_nan),
        ytr, Xtest, ytest)

    err_rate = np.array([mle_err, knn_err, mice_err, softimpute_err, em_err,
           nuclear_err, per_missing])
    return err_rate

## Ionosphere

In [6]:
# Ionosphere data: columns 0-33 are the features, column 34 is the label.
data = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
    sep=",", header=None)
print(data.head())
data = data.to_numpy()
X = data[:, :34].astype(np.float64)
y = data[:, 34]
# encode the string labels as integers
le2 = LabelEncoder()
y = le2.fit_transform(y)
print(len(y))
# 4/5 of each class count
4/5*np.array([(y == 0).sum(), (y == 1).sum()])

   0   1        2        3        4   ...       30       31       32       33  34
0   1   0  0.99539 -0.05889  0.85243  ...  0.42267 -0.54487  0.18641 -0.45300   g
1   1   0  1.00000 -0.18829  0.93035  ... -0.16626 -0.06288 -0.13738 -0.02447   b
2   1   0  1.00000 -0.03365  1.00000  ...  0.60436 -0.24180  0.56045 -0.38238   g
3   1   0  1.00000 -0.45161  1.00000  ...  0.25682  1.00000 -0.32382  1.00000   b
4   1   0  1.00000 -0.02401  0.94140  ... -0.05707 -0.59573 -0.04608 -0.65697   g

[5 rows x 35 columns]
351


array([100.8, 180. ])

In [7]:
# Feature column 1 has zero variance (see the describe output).
# X is rebuilt from `data` instead of from X itself, so that re-running this
# cell does not drop two more columns each time.
print(stats.describe(data[:, 1].astype(np.float64)))
X = np.delete(data[:, :34].astype(np.float64), [0, 1], axis=1)
X.shape

DescribeResult(nobs=351, minmax=(0.0, 0.0), mean=0.0, variance=0.0, skewness=0.0, kurtosis=-3.0)


(351, 32)

### 20%

In [17]:
def fold_k_err(X,y,k,G):
    '''
    Missing rate of the training part of fold k for the pattern (n, p)
    defined below. Fold labels 0..4 are drawn at random after seeding
    numpy with 0; fold k is the test part.

    NOTE(review): the error-rate cell for this setting seeds with 1 and uses
    76 instead of 75 in n - confirm that this check matches the intended
    configuration.
    '''
    np.random.seed(0)
    fold_id = np.random.choice(np.arange(5), len(y))
    in_train = fold_id != k
    in_test = fold_id == k
    Xtrain, Xtest = X[in_train, :], X[in_test, :]
    ytrain, ytest = y[in_train], y[in_test]
    # p: cumulative feature counts; n: rows per class with the first p[k]
    # features observed (first column = class sizes of the training part)
    p = np.array([5,10,20,32])
    n = np.array([[(ytrain == 0).sum(),100,90,75],
                  [(ytrain == 1).sum(),180,150,90]])
    return missing_rate(Xtrain, ytrain, Xtest, ytest, n, p, G)
# average missing rate over the five folds
res = [fold_k_err(X,y,fold,2) for fold in np.arange(5)]
np.asarray(res).mean(axis=0)

0.20234954407973257

In [20]:
def fold_k_err(X,y,k,G):
    '''
    Output of compute_err for fold k with the pattern (n, p) defined below.
    Fold labels 0..4 are drawn at random after seeding numpy with 1;
    fold k is the test part.
    '''
    np.random.seed(1)
    fold_id = np.random.choice(np.arange(5), len(y))
    in_train = fold_id != k
    in_test = fold_id == k
    Xtrain, Xtest = X[in_train, :], X[in_test, :]
    ytrain, ytest = y[in_train], y[in_test]
    # p: cumulative feature counts; n: rows per class with the first p[k]
    # features observed (first column = class sizes of the training part)
    p = np.array([5,10,20,32])
    n = np.array([[(ytrain == 0).sum(),100,90,76],
                  [(ytrain == 1).sum(),180,150,90]])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# cross-validation error: average over the five folds
res = [fold_k_err(X,y,fold,2) for fold in np.arange(5)]
res20 = np.asarray(res).mean(axis=0)

[SoftImpute] Max Singular Value of X_init = 51.728363
[SoftImpute] Iter 1: observed MAE=0.048477 rank=32
[SoftImpute] Iter 2: observed MAE=0.048656 rank=32
[SoftImpute] Iter 3: observed MAE=0.048757 rank=32
[SoftImpute] Iter 4: observed MAE=0.048820 rank=32
[SoftImpute] Iter 5: observed MAE=0.048858 rank=32
[SoftImpute] Iter 6: observed MAE=0.048883 rank=32
[SoftImpute] Iter 7: observed MAE=0.048900 rank=32
[SoftImpute] Iter 8: observed MAE=0.048913 rank=32
[SoftImpute] Iter 9: observed MAE=0.048922 rank=32
[SoftImpute] Iter 10: observed MAE=0.048931 rank=32
[SoftImpute] Iter 11: observed MAE=0.048938 rank=32
[SoftImpute] Iter 12: observed MAE=0.048943 rank=32
[SoftImpute] Iter 13: observed MAE=0.048948 rank=32
[SoftImpute] Iter 14: observed MAE=0.048954 rank=32
[SoftImpute] Iter 15: observed MAE=0.048958 rank=32
[SoftImpute] Iter 16: observed MAE=0.048962 rank=32
[SoftImpute] Iter 17: observed MAE=0.048966 rank=32
[SoftImpute] Iter 18: observed MAE=0.048968 rank=31
[SoftImpute] Iter 1

### 30%

In [24]:
def fold_k_err(X,y,k,G):
    '''
    Output of compute_err for fold k with the pattern (n, p) defined below.
    Fold labels 0..4 are drawn at random after seeding numpy with 0;
    fold k is the test part.
    '''
    np.random.seed(0)
    fold_id = np.random.choice(np.arange(5), len(y))
    in_train = fold_id != k
    in_test = fold_id == k
    Xtrain, Xtest = X[in_train, :], X[in_test, :]
    ytrain, ytest = y[in_train], y[in_test]
    # p: cumulative feature counts; n: rows per class with the first p[k]
    # features observed (first column = class sizes of the training part)
    p = np.array([5,10,15,32])
    n = np.array([[(ytrain == 0).sum(),100,75,67],
                  [(ytrain == 1).sum(),150,120,90]])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# cross-validation error: average over the five folds
res = [fold_k_err(X,y,fold,2) for fold in np.arange(5)]
res30 = np.asarray(res).mean(axis=0)

[SoftImpute] Max Singular Value of X_init = 48.418082
[SoftImpute] Iter 1: observed MAE=0.048273 rank=31
[SoftImpute] Iter 2: observed MAE=0.048462 rank=31
[SoftImpute] Iter 3: observed MAE=0.048600 rank=31
[SoftImpute] Iter 4: observed MAE=0.048695 rank=31
[SoftImpute] Iter 5: observed MAE=0.048759 rank=31
[SoftImpute] Iter 6: observed MAE=0.048806 rank=31
[SoftImpute] Iter 7: observed MAE=0.048838 rank=31
[SoftImpute] Iter 8: observed MAE=0.048862 rank=31
[SoftImpute] Iter 9: observed MAE=0.048878 rank=31
[SoftImpute] Iter 10: observed MAE=0.048888 rank=31
[SoftImpute] Iter 11: observed MAE=0.048894 rank=31
[SoftImpute] Iter 12: observed MAE=0.048898 rank=31
[SoftImpute] Iter 13: observed MAE=0.048901 rank=31
[SoftImpute] Iter 14: observed MAE=0.048902 rank=31
[SoftImpute] Iter 15: observed MAE=0.048902 rank=31
[SoftImpute] Iter 16: observed MAE=0.048900 rank=31
[SoftImpute] Iter 17: observed MAE=0.048899 rank=31
[SoftImpute] Iter 18: observed MAE=0.048897 rank=31
[SoftImpute] Iter 1

### 40%

In [22]:
def fold_k_err(X,y,k,G):
    '''
    Output of compute_err for fold k with the pattern (n, p) defined below.
    Fold labels 0..4 are drawn at random after seeding numpy with 0;
    fold k is the test part.
    '''
    np.random.seed(0)
    fold_id = np.random.choice(np.arange(5), len(y))
    in_train = fold_id != k
    in_test = fold_id == k
    Xtrain, Xtest = X[in_train, :], X[in_test, :]
    ytrain, ytest = y[in_train], y[in_test]
    # p: cumulative feature counts; n: rows per class with the first p[k]
    # features observed (first column = class sizes of the training part)
    p = np.array([11,12,15,32])
    n = np.array([[(ytrain == 0).sum(),66,55,45],
                  [(ytrain == 1).sum(),70,65,60]])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# cross-validation error: average over the five folds
res = [fold_k_err(X,y,fold,2) for fold in np.arange(5)]
res40 = np.asarray(res).mean(axis=0)
res40

[SoftImpute] Max Singular Value of X_init = 43.760492
[SoftImpute] Iter 1: observed MAE=0.046555 rank=31
[SoftImpute] Iter 2: observed MAE=0.046805 rank=31
[SoftImpute] Iter 3: observed MAE=0.046981 rank=31
[SoftImpute] Iter 4: observed MAE=0.047104 rank=31
[SoftImpute] Iter 5: observed MAE=0.047178 rank=30
[SoftImpute] Iter 6: observed MAE=0.047197 rank=29
[SoftImpute] Iter 7: observed MAE=0.047200 rank=29
[SoftImpute] Iter 8: observed MAE=0.047210 rank=29
[SoftImpute] Iter 9: observed MAE=0.047221 rank=29
[SoftImpute] Iter 10: observed MAE=0.047231 rank=29
[SoftImpute] Iter 11: observed MAE=0.047238 rank=29
[SoftImpute] Iter 12: observed MAE=0.047243 rank=29
[SoftImpute] Iter 13: observed MAE=0.047247 rank=29
[SoftImpute] Iter 14: observed MAE=0.047250 rank=29
[SoftImpute] Iter 15: observed MAE=0.047250 rank=29
[SoftImpute] Iter 16: observed MAE=0.047249 rank=29
[SoftImpute] Iter 17: observed MAE=0.047246 rank=29
[SoftImpute] Iter 18: observed MAE=0.047244 rank=29
[SoftImpute] Iter 1

array([0.15138467, 0.15297176, 0.16442566, 0.1695362 , 0.19133985,
       0.1645362 , 0.40202223])

### results

In [26]:
# One row per missing-rate setting (20%, 30%, 40%); columns follow compute_err:
# mle, knn, mice, softimpute, em, nuclear, fraction of missing entries.
inosphere = np.vstack([res20, res30, res40])
inosphere

array([[0.15535907, 0.15866975, 0.16091544, 0.15874908, 0.16866752,
        0.15874908, 0.20081745],
       [0.13938352, 0.1514952 , 0.13388334, 0.14573273, 0.15923935,
        0.14823273, 0.29875282],
       [0.15138467, 0.15297176, 0.16442566, 0.1695362 , 0.19133985,
        0.1645362 , 0.40202223]])

## seeds 

In [None]:
# Seeds data: whitespace-separated, columns 0-6 are the features and
# column 7 is the class label (1, 2, 3 in the file).
# The separator is a raw string: '\s' is not a valid escape sequence in a
# normal string literal.
data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt',
                     sep = r'\s+', header = None)
print(data.head())
data = data.to_numpy()
# reset the labels so that they start from 0
X,y = data[:,:7], data[:,7]-1

       0      1       2      3      4      5      6  7
0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220  1
1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956  1
2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825  1
3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805  1
4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175  1


In [None]:
def fold_k_err(X,y,k,G):
    '''
    Output of compute_err for fold k with the pattern (n, p) defined below.
    Fold labels 0..4 are drawn at random after seeding numpy with 15;
    fold k is the test part.
    '''
    np.random.seed(15)
    fold_id = np.random.choice(np.arange(5), len(y))
    in_train = fold_id != k
    in_test = fold_id == k
    Xtrain, Xtest = X[in_train, :], X[in_test, :]
    ytrain, ytest = y[in_train], y[in_test]
    # p: cumulative feature counts; n: rows per class with the first p[k]
    # features observed (first column = class sizes of the training part)
    p = np.array([2,3,7])
    n = np.array([[(ytrain == 0).sum(),50,40],
                  [(ytrain == 1).sum(),50,35],
                  [(ytrain == 2).sum(),50,38]])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# cross-validation error: average over the five folds
res = [fold_k_err(X,y,fold,3) for fold in np.arange(5)]
res20 = np.asarray(res).mean(axis=0)
res20

[SoftImpute] Max Singular Value of X_init = 14.506631
[SoftImpute] Iter 1: observed MAE=0.018580 rank=6
[SoftImpute] Iter 2: observed MAE=0.018548 rank=6
[SoftImpute] Iter 3: observed MAE=0.018533 rank=6
[SoftImpute] Iter 4: observed MAE=0.018531 rank=6
[SoftImpute] Iter 5: observed MAE=0.018539 rank=6
[SoftImpute] Iter 6: observed MAE=0.018557 rank=6
[SoftImpute] Iter 7: observed MAE=0.018579 rank=6
[SoftImpute] Iter 8: observed MAE=0.018600 rank=6
[SoftImpute] Iter 9: observed MAE=0.018619 rank=6
[SoftImpute] Iter 10: observed MAE=0.018636 rank=6
[SoftImpute] Iter 11: observed MAE=0.018651 rank=6
[SoftImpute] Iter 12: observed MAE=0.018665 rank=6
[SoftImpute] Iter 13: observed MAE=0.018677 rank=6
[SoftImpute] Iter 14: observed MAE=0.018686 rank=6
[SoftImpute] Iter 15: observed MAE=0.018693 rank=6
[SoftImpute] Iter 16: observed MAE=0.018699 rank=6
[SoftImpute] Iter 17: observed MAE=0.018704 rank=6
[SoftImpute] Iter 18: observed MAE=0.018707 rank=6
[SoftImpute] Iter 19: observed MAE=0.

array([0.03373977, 0.050962  , 0.04235088, 0.0523999 , 0.0858347 ,
       0.0523999 , 0.20211604])

## 30%

In [None]:
def fold_k_err(X,y,k,G):
    '''
    Output of compute_err for fold k with the pattern (n, p) defined below.
    Fold labels 0..4 are drawn at random after seeding numpy with 15;
    fold k is the test part.
    '''
    np.random.seed(15)
    fold_id = np.random.choice(np.arange(5), len(y))
    in_train = fold_id != k
    in_test = fold_id == k
    Xtrain, Xtest = X[in_train, :], X[in_test, :]
    ytrain, ytest = y[in_train], y[in_test]
    # p: cumulative feature counts; n: rows per class with the first p[k]
    # features observed (first column = class sizes of the training part)
    p = np.array([2,3,7])
    n = np.array([[(ytrain == 0).sum(),40,30],
                  [(ytrain == 1).sum(),45,30],
                  [(ytrain == 2).sum(),40,30]])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# cross-validation error: average over the five folds
res = [fold_k_err(X,y,fold,3) for fold in np.arange(5)]
res30 = np.asarray(res).mean(axis=0)
res30

[SoftImpute] Max Singular Value of X_init = 13.472783
[SoftImpute] Iter 1: observed MAE=0.018825 rank=6
[SoftImpute] Iter 2: observed MAE=0.018743 rank=6
[SoftImpute] Iter 3: observed MAE=0.018692 rank=6
[SoftImpute] Iter 4: observed MAE=0.018668 rank=6
[SoftImpute] Iter 5: observed MAE=0.018675 rank=6
[SoftImpute] Iter 6: observed MAE=0.018691 rank=6
[SoftImpute] Iter 7: observed MAE=0.018713 rank=6
[SoftImpute] Iter 8: observed MAE=0.018736 rank=6
[SoftImpute] Iter 9: observed MAE=0.018762 rank=6
[SoftImpute] Iter 10: observed MAE=0.018786 rank=6
[SoftImpute] Iter 11: observed MAE=0.018809 rank=6
[SoftImpute] Iter 12: observed MAE=0.018829 rank=6
[SoftImpute] Iter 13: observed MAE=0.018845 rank=6
[SoftImpute] Iter 14: observed MAE=0.018858 rank=6
[SoftImpute] Iter 15: observed MAE=0.018869 rank=6
[SoftImpute] Iter 16: observed MAE=0.018878 rank=6
[SoftImpute] Iter 17: observed MAE=0.018886 rank=6
[SoftImpute] Iter 18: observed MAE=0.018893 rank=6
[SoftImpute] Iter 19: observed MAE=0.

array([0.03790644, 0.05106301, 0.06244892, 0.06207356, 0.08555693,
       0.06207356, 0.30151866])

## 40%

In [None]:
def fold_k_err(X,y,k,G):
    '''
    Output of compute_err for fold k with the pattern (n, p) defined below.
    Fold labels 0..4 are drawn at random after seeding numpy with 15;
    fold k is the test part.
    '''
    np.random.seed(15)
    fold_id = np.random.choice(np.arange(5), len(y))
    in_train = fold_id != k
    in_test = fold_id == k
    Xtrain, Xtest = X[in_train, :], X[in_test, :]
    ytrain, ytest = y[in_train], y[in_test]
    # n: rows per class with the first p[k] features observed (first column =
    # class sizes of the training part); p: cumulative feature counts
    n = np.array([[(ytrain == 0).sum(),35,24],
                  [(ytrain == 1).sum(),35,22],
                  [(ytrain == 2).sum(),35,21]])
    p = np.array([2,3,7])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# cross-validation error: average over the five folds
res = [fold_k_err(X,y,fold,3) for fold in np.arange(5)]
res40 = np.asarray(res).mean(axis=0)
res40

[SoftImpute] Max Singular Value of X_init = 12.247909
[SoftImpute] Iter 1: observed MAE=0.018835 rank=7
[SoftImpute] Iter 2: observed MAE=0.018853 rank=7
[SoftImpute] Iter 3: observed MAE=0.018874 rank=7
[SoftImpute] Iter 4: observed MAE=0.018819 rank=6
[SoftImpute] Iter 5: observed MAE=0.018756 rank=6
[SoftImpute] Iter 6: observed MAE=0.018714 rank=6
[SoftImpute] Iter 7: observed MAE=0.018692 rank=6
[SoftImpute] Iter 8: observed MAE=0.018685 rank=6
[SoftImpute] Iter 9: observed MAE=0.018686 rank=6
[SoftImpute] Iter 10: observed MAE=0.018688 rank=6
[SoftImpute] Iter 11: observed MAE=0.018694 rank=6
[SoftImpute] Iter 12: observed MAE=0.018708 rank=6
[SoftImpute] Iter 13: observed MAE=0.018722 rank=6
[SoftImpute] Iter 14: observed MAE=0.018737 rank=6
[SoftImpute] Iter 15: observed MAE=0.018750 rank=6
[SoftImpute] Iter 16: observed MAE=0.018761 rank=6
[SoftImpute] Iter 17: observed MAE=0.018771 rank=6
[SoftImpute] Iter 18: observed MAE=0.018780 rank=6
[SoftImpute] Iter 19: observed MAE=0.

array([0.03790644, 0.07133781, 0.07790393, 0.07383827, 0.09444582,
       0.06795592, 0.39683806])

In [None]:
# One row per missing-rate setting (20%, 30%, 40%); columns follow compute_err:
# mle, knn, mice, softimpute, em, nuclear, fraction of missing entries.
seeds = np.vstack([res20, res30, res40])
seeds

array([[0.03373977, 0.050962  , 0.04235088, 0.0523999 , 0.0858347 ,
        0.0523999 , 0.20211604],
       [0.03790644, 0.05106301, 0.06244892, 0.06207356, 0.08555693,
        0.06207356, 0.30151866],
       [0.03790644, 0.07133781, 0.07790393, 0.07383827, 0.09444582,
        0.06795592, 0.39683806]])

# Parkinson dataset

https://archive.ics.uci.edu/ml/datasets/Parkinsons


In [None]:
# Parkinsons data: 'name' is dropped, 'status' is the class label and every
# remaining column is a feature.
data = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data',
    sep=",")
data = data.drop(columns=['name'])
y = data['status']
X = data.drop(columns=['status']).to_numpy()
# 4/5 of each class count
4/5*np.array([(y == 0).sum(), (y == 1).sum()])

array([ 38.4, 117.6])

## 20%

In [None]:
def fold_k_err(X,y,k,G):
    '''
    Output of compute_err for fold k with the pattern (n, p) defined below.
    Fold labels 0..4 are drawn at random after seeding numpy with 1;
    fold k is the test part.
    '''
    np.random.seed(1)
    fold_id = np.random.choice(np.arange(5), len(y))
    in_train = fold_id != k
    in_test = fold_id == k
    Xtrain, Xtest = X[in_train, :], X[in_test, :]
    ytrain, ytest = y[in_train], y[in_test]
    # n: rows per class with the first p[k] features observed (first column =
    # class sizes of the training part); p: cumulative feature counts
    n = np.array([[(ytrain == 0).sum(),35,30,29],
                  [(ytrain == 1).sum(),100,88, 70]])
    p = np.array([5,10,15,22])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# cross-validation error: average over the five folds
res = [fold_k_err(X,y,fold,2) for fold in np.arange(5)]
res_20 = np.asarray(res).mean(axis=0)
res_20

[SoftImpute] Max Singular Value of X_init = 16.131710
[SoftImpute] Iter 1: observed MAE=0.017482 rank=16
[SoftImpute] Iter 2: observed MAE=0.017603 rank=16
[SoftImpute] Iter 3: observed MAE=0.017677 rank=16
[SoftImpute] Iter 4: observed MAE=0.017720 rank=16
[SoftImpute] Iter 5: observed MAE=0.017747 rank=16
[SoftImpute] Iter 6: observed MAE=0.017763 rank=16
[SoftImpute] Iter 7: observed MAE=0.017773 rank=16
[SoftImpute] Iter 8: observed MAE=0.017780 rank=16
[SoftImpute] Iter 9: observed MAE=0.017785 rank=16
[SoftImpute] Iter 10: observed MAE=0.017788 rank=16
[SoftImpute] Iter 11: observed MAE=0.017791 rank=16
[SoftImpute] Iter 12: observed MAE=0.017794 rank=16
[SoftImpute] Iter 13: observed MAE=0.017797 rank=16
[SoftImpute] Iter 14: observed MAE=0.017800 rank=16
[SoftImpute] Iter 15: observed MAE=0.017802 rank=16
[SoftImpute] Iter 16: observed MAE=0.017804 rank=16
[SoftImpute] Iter 17: observed MAE=0.017805 rank=16
[SoftImpute] Iter 18: observed MAE=0.017807 rank=16
[SoftImpute] Iter 1



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 37087
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 21695, constraints m = 28339
Cones:	primal zero / dual free vars: 3322
	linear vars: 9966
	sd vars: 15051, sd blks: 1
Setup time: 3.62e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.75e+21  1.68e+21  1.00e+00 -2.29e+23  4.91e+22  8.28e+22  2.23e-02 
   100| 1.46e-03  1.46e-03  1.24e-04  4.56e+01  4.57e+01  7.46e-15  2.39e+00 
--------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 37676
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 22132, constraints m = 28864
Cones:	primal zero / dual free vars: 3366
	linear vars: 10098
	sd vars: 15400, sd blks: 1
Setup time: 3.23e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.77e+21  1.70e+21  1.00e+00 -2.31e+23  4.95e+22  8.31e+22  2.02e-02 
   100| 1.48e-03  2.19e-03  7.50e-05  4.60e+01  4.60e+01  3.48e-15  2.43e+00 
-------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 40376
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 24148, constraints m = 31276
Cones:	primal zero / dual free vars: 3564
	linear vars: 10692
	sd vars: 17020, sd blks: 1
Setup time: 3.62e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.87e+21  1.79e+21  1.00e+00 -2.57e+23  5.45e+22  9.01e+22  2.38e-02 
   100| 1.44e-03  1.64e-03  3.05e-05  4.70e+01  4.70e+01  6.27e-15  2.61e+00 
-------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 40376
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 24148, constraints m = 31276
Cones:	primal zero / dual free vars: 3564
	linear vars: 10692
	sd vars: 17020, sd blks: 1
Setup time: 3.23e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.87e+21  1.79e+21  1.00e+00 -2.54e+23  5.40e+22  8.94e+22  2.38e-02 
   100| 2.22e-03  2.86e-03  1.92e-04  4.59e+01  4.59e+01  1.59e-14  2.60e+00 
-------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 37381
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 21913, constraints m = 28601
Cones:	primal zero / dual free vars: 3344
	linear vars: 10032
	sd vars: 15225, sd blks: 1
Setup time: 3.40e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.77e+21  1.69e+21  1.00e+00 -2.34e+23  5.01e+22  8.43e+22  3.06e-02 
   100| 1.93e-03  2.36e-03  3.85e-04  4.65e+01  4.64e+01  1.29e-14  2.46e+00 
-------------------------------------------------------

array([0.14596295, 0.15959932, 0.1824323 , 0.17788684, 0.18233162,
       0.18697775, 0.20164826])

## 30%

In [None]:
def fold_k_err(X,y,k,G):
    '''
    Output of compute_err for fold k with the pattern (n, p) defined below.
    Fold labels 0..4 are drawn at random after seeding numpy with 1;
    fold k is the test part.
    '''
    np.random.seed(1)
    fold_id = np.random.choice(np.arange(5), len(y))
    in_train = fold_id != k
    in_test = fold_id == k
    Xtrain, Xtest = X[in_train, :], X[in_test, :]
    ytrain, ytest = y[in_train], y[in_test]
    # n: rows per class with the first p[k] features observed (first column =
    # class sizes of the training part); p: cumulative feature counts
    n = np.array([[(ytrain == 0).sum(),35,30,25],
                  [(ytrain == 1).sum(),80,70, 55]])
    p = np.array([5,10,15,22])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# cross-validation error: average over the five folds
res = [fold_k_err(X,y,fold,2) for fold in np.arange(5)]
res_30 = np.asarray(res).mean(axis=0)
res_30

[SoftImpute] Max Singular Value of X_init = 14.573724
[SoftImpute] Iter 1: observed MAE=0.017023 rank=17
[SoftImpute] Iter 2: observed MAE=0.017181 rank=17
[SoftImpute] Iter 3: observed MAE=0.017274 rank=17
[SoftImpute] Iter 4: observed MAE=0.017334 rank=16
[SoftImpute] Iter 5: observed MAE=0.017373 rank=16
[SoftImpute] Iter 6: observed MAE=0.017410 rank=16
[SoftImpute] Iter 7: observed MAE=0.017443 rank=16
[SoftImpute] Iter 8: observed MAE=0.017472 rank=16
[SoftImpute] Iter 9: observed MAE=0.017488 rank=16
[SoftImpute] Iter 10: observed MAE=0.017476 rank=16
[SoftImpute] Iter 11: observed MAE=0.017437 rank=15
[SoftImpute] Iter 12: observed MAE=0.017403 rank=15
[SoftImpute] Iter 13: observed MAE=0.017377 rank=15
[SoftImpute] Iter 14: observed MAE=0.017356 rank=15
[SoftImpute] Iter 15: observed MAE=0.017337 rank=15
[SoftImpute] Iter 16: observed MAE=0.017319 rank=15
[SoftImpute] Iter 17: observed MAE=0.017302 rank=14
[SoftImpute] Iter 18: observed MAE=0.017288 rank=14
[SoftImpute] Iter 1



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 36441
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 21695, constraints m = 28339
Cones:	primal zero / dual free vars: 3322
	linear vars: 9966
	sd vars: 15051, sd blks: 1
Setup time: 3.04e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.77e+21  1.70e+21  1.00e+00 -2.17e+23  4.64e+22  7.59e+22  1.90e-02 
   100| 1.39e-03  1.66e-03  2.14e-04  4.32e+01  4.32e+01  4.54e-15  2.40e+00 
--------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 37030
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 22132, constraints m = 28864
Cones:	primal zero / dual free vars: 3366
	linear vars: 10098
	sd vars: 15400, sd blks: 1
Setup time: 3.01e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.79e+21  1.72e+21  1.00e+00 -2.17e+23  4.65e+22  7.58e+22  2.00e-02 
   100| 1.16e-03  1.15e-03  1.97e-04  4.31e+01  4.31e+01  5.47e-16  2.44e+00 
-------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 39730
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 24148, constraints m = 31276
Cones:	primal zero / dual free vars: 3564
	linear vars: 10692
	sd vars: 17020, sd blks: 1
Setup time: 3.45e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.88e+21  1.81e+21  1.00e+00 -2.45e+23  5.19e+22  8.36e+22  2.16e-02 
   100| 1.18e-03  1.08e-03  1.49e-04  4.52e+01  4.52e+01  1.88e-15  2.60e+00 
-------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 39730
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 24148, constraints m = 31276
Cones:	primal zero / dual free vars: 3564
	linear vars: 10692
	sd vars: 17020, sd blks: 1
Setup time: 3.13e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.88e+21  1.81e+21  1.00e+00 -2.41e+23  5.12e+22  8.25e+22  2.10e-02 
   100| 1.92e-03  2.83e-03  1.41e-04  4.44e+01  4.44e+01  9.71e-16  2.60e+00 
-------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 36735
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 21913, constraints m = 28601
Cones:	primal zero / dual free vars: 3344
	linear vars: 10032
	sd vars: 15225, sd blks: 1
Setup time: 3.31e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.78e+21  1.71e+21  1.00e+00 -2.17e+23  4.65e+22  7.60e+22  2.13e-02 
   100| 8.57e-04  1.53e-03  1.50e-04  4.36e+01  4.36e+01  2.39e-14  2.39e+00 
-------------------------------------------------------

array([0.15214437, 0.18730494, 0.19012383, 0.18233666, 0.16837813,
       0.20116279, 0.29585588])

## 40%

In [None]:
def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (Parkinson, "40%" section).

    NumPy's global generator is re-seeded with 1 on every call, so the fold
    labels (drawn from 0..4, one per sample) are the same for each ``k``; the
    re-seeding also resets the global random state for whatever runs next.
    ``n`` and ``p`` are fixed for this section and passed through unchanged.
    Following the notebook's notation, n[g, i] is presumably the class-g
    sample count for block i and p the cumulative feature counts.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # Column 0 holds the class sizes of the training split; the remaining
    # columns are the fixed counts chosen for this section.
    n = np.array([
        [sum(ytrain == 0), 30, 25, 21],
        [sum(ytrain == 1), 59, 50, 45],
    ])
    p = np.array([5, 10, 15, 22])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 2) for fold_id in range(5)]
res_40 = np.asarray(res).mean(axis=0)
res_40

[SoftImpute] Max Singular Value of X_init = 14.023882
[SoftImpute] Iter 1: observed MAE=0.017928 rank=17
[SoftImpute] Iter 2: observed MAE=0.017928 rank=17
[SoftImpute] Iter 3: observed MAE=0.017939 rank=17
[SoftImpute] Iter 4: observed MAE=0.017949 rank=17
[SoftImpute] Iter 5: observed MAE=0.017959 rank=17
[SoftImpute] Iter 6: observed MAE=0.017970 rank=17
[SoftImpute] Iter 7: observed MAE=0.017976 rank=17
[SoftImpute] Iter 8: observed MAE=0.017979 rank=17
[SoftImpute] Iter 9: observed MAE=0.017982 rank=17
[SoftImpute] Iter 10: observed MAE=0.017985 rank=17
[SoftImpute] Iter 11: observed MAE=0.017988 rank=17
[SoftImpute] Iter 12: observed MAE=0.017991 rank=17
[SoftImpute] Iter 13: observed MAE=0.017993 rank=17
[SoftImpute] Iter 14: observed MAE=0.017995 rank=17
[SoftImpute] Iter 15: observed MAE=0.017996 rank=17
[SoftImpute] Iter 16: observed MAE=0.017997 rank=17
[SoftImpute] Iter 17: observed MAE=0.017999 rank=17
[SoftImpute] Iter 18: observed MAE=0.018000 rank=17
[SoftImpute] Iter 1



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 35735
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 21695, constraints m = 28339
Cones:	primal zero / dual free vars: 3322
	linear vars: 9966
	sd vars: 15051, sd blks: 1
Setup time: 3.11e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.79e+21  1.72e+21  1.00e+00 -2.13e+23  4.57e+22  7.23e+22  2.20e-02 
   100| 2.12e-03  3.02e-03  1.37e-04  4.46e+01  4.46e+01  7.79e-15  2.42e+00 
--------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 36324
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 22132, constraints m = 28864
Cones:	primal zero / dual free vars: 3366
	linear vars: 10098
	sd vars: 15400, sd blks: 1
Setup time: 3.34e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.81e+21  1.74e+21  1.00e+00 -2.13e+23  4.56e+22  7.20e+22  2.06e-02 
   100| 2.32e-03  3.37e-03  1.93e-05  4.37e+01  4.37e+01  2.88e-15  2.41e+00 
-------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 39024
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 24148, constraints m = 31276
Cones:	primal zero / dual free vars: 3564
	linear vars: 10692
	sd vars: 17020, sd blks: 1
Setup time: 3.22e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.90e+21  1.83e+21  1.00e+00 -2.40e+23  5.09e+22  7.95e+22  2.22e-02 
   100| 1.97e-03  2.50e-03  5.38e-04  4.56e+01  4.56e+01  1.37e-14  2.54e+00 
-------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 39024
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 24148, constraints m = 31276
Cones:	primal zero / dual free vars: 3564
	linear vars: 10692
	sd vars: 17020, sd blks: 1
Setup time: 3.34e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.90e+21  1.83e+21  1.00e+00 -2.40e+23  5.10e+22  7.95e+22  2.05e-02 
   100| 7.16e-04  9.62e-04  7.44e-05  4.57e+01  4.57e+01  6.43e-15  2.54e+00 
-------------------------------------------------------



----------------------------------------------------------------------------
	SCS v2.1.2 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012
----------------------------------------------------------------------------
Lin-sys: sparse-direct, nnz in A = 36029
eps = 1.00e-04, alpha = 1.50, max_iters = 100, normalize = 1, scale = 1.00
acceleration_lookback = 10, rho_x = 1.00e-03
Variables n = 21913, constraints m = 28601
Cones:	primal zero / dual free vars: 3344
	linear vars: 10032
	sd vars: 15225, sd blks: 1
Setup time: 2.63e-02s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0| 1.80e+21  1.73e+21  1.00e+00 -2.11e+23  4.52e+22  7.15e+22  1.94e-02 
   100| 1.70e-03  2.32e-03  5.64e-05  4.39e+01  4.39e+01  6.95e-15  2.40e+00 
-------------------------------------------------------

array([0.18698782, 0.2111195 , 0.19932045, 0.17790194, 0.17790194,
       0.17325078, 0.39881343])

In [None]:
# Stack the three averaged result vectors row-wise, one row per setting
# (res_20, res_30, res_40 in that order).
parkinson = np.vstack((res_20,res_30,res_40))
parkinson

array([[0.14596295, 0.15959932, 0.1824323 , 0.17788684, 0.18233162,
        0.18697775, 0.20164826],
       [0.15214437, 0.18730494, 0.19012383, 0.18233666, 0.16837813,
        0.20116279, 0.29585588],
       [0.18698782, 0.2111195 , 0.19932045, 0.17790194, 0.17790194,
        0.17325078, 0.39881343]])

# wine
The data set is also available in sklearn, as noted on the package's website, so we load it directly from sklearn.

In [None]:
# Load the wine data from sklearn and print the number of samples per class.
wine = datasets.load_wine()
X = wine.data
y = wine.target.ravel()
print(*(sum(y == label) for label in (0, 1, 2)))

59 71 48


((178, 13), array([47.2, 56.8, 38.4]))

## 20%

In [None]:
def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (wine, "20%" section).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` are hard-coded for this
    section and forwarded to ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # One row per class: training-split class size followed by fixed counts.
    fixed_counts = [[40, 35, 27], [45, 40, 35], [30, 25, 22]]
    n = np.array([[sum(ytrain == label)] + counts
                  for label, counts in enumerate(fixed_counts)])
    p = np.array([5, 7, 9, 13])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 3) for fold_id in range(5)]
res20 = np.asarray(res).mean(axis=0)
res20

[SoftImpute] Max Singular Value of X_init = 16.079100
[SoftImpute] Iter 1: observed MAE=0.022542 rank=13
[SoftImpute] Iter 2: observed MAE=0.022609 rank=13
[SoftImpute] Iter 3: observed MAE=0.022663 rank=13
[SoftImpute] Iter 4: observed MAE=0.022707 rank=13
[SoftImpute] Iter 5: observed MAE=0.022745 rank=13
[SoftImpute] Iter 6: observed MAE=0.022779 rank=13
[SoftImpute] Iter 7: observed MAE=0.022808 rank=13
[SoftImpute] Iter 8: observed MAE=0.022832 rank=13
[SoftImpute] Iter 9: observed MAE=0.022852 rank=13
[SoftImpute] Iter 10: observed MAE=0.022869 rank=13
[SoftImpute] Iter 11: observed MAE=0.022885 rank=13
[SoftImpute] Iter 12: observed MAE=0.022900 rank=13
[SoftImpute] Iter 13: observed MAE=0.022912 rank=13
[SoftImpute] Iter 14: observed MAE=0.022924 rank=13
[SoftImpute] Iter 15: observed MAE=0.022934 rank=13
[SoftImpute] Iter 16: observed MAE=0.022942 rank=13
[SoftImpute] Iter 17: observed MAE=0.022950 rank=13
[SoftImpute] Iter 18: observed MAE=0.022958 rank=13
[SoftImpute] Iter 1

array([0.01145161, 0.02504608, 0.01790323, 0.01290323, 0.03004608,
       0.01290323, 0.20108013])

## 30%

In [None]:
def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (wine, "30%" section).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` are hard-coded for this
    section and forwarded to ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # One row per class: training-split class size followed by fixed counts.
    fixed_counts = [[40, 35, 25], [45, 40, 30], [35, 25, 22]]
    n = np.array([[sum(ytrain == label)] + counts
                  for label, counts in enumerate(fixed_counts)])
    p = np.array([2, 4, 7, 13])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 3) for fold_id in range(5)]
res30 = np.asarray(res).mean(axis=0)
res30

[SoftImpute] Max Singular Value of X_init = 15.206606
[SoftImpute] Iter 1: observed MAE=0.022682 rank=13
[SoftImpute] Iter 2: observed MAE=0.022728 rank=13
[SoftImpute] Iter 3: observed MAE=0.022769 rank=13
[SoftImpute] Iter 4: observed MAE=0.022805 rank=13
[SoftImpute] Iter 5: observed MAE=0.022838 rank=13
[SoftImpute] Iter 6: observed MAE=0.022868 rank=13
[SoftImpute] Iter 7: observed MAE=0.022896 rank=13
[SoftImpute] Iter 8: observed MAE=0.022923 rank=13
[SoftImpute] Iter 9: observed MAE=0.022947 rank=13
[SoftImpute] Iter 10: observed MAE=0.022969 rank=13
[SoftImpute] Iter 11: observed MAE=0.022987 rank=13
[SoftImpute] Iter 12: observed MAE=0.023004 rank=13
[SoftImpute] Iter 13: observed MAE=0.023019 rank=13
[SoftImpute] Iter 14: observed MAE=0.023032 rank=13
[SoftImpute] Iter 15: observed MAE=0.023044 rank=13
[SoftImpute] Iter 16: observed MAE=0.023055 rank=13
[SoftImpute] Iter 17: observed MAE=0.023065 rank=13
[SoftImpute] Iter 18: observed MAE=0.023074 rank=13
[SoftImpute] Iter 1

array([0.01145161, 0.0414977 , 0.01790323, 0.01645161, 0.05175411,
       0.01645161, 0.30420463])

## 40%

In [None]:
def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (wine, "40%" section).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` are hard-coded for this
    section and forwarded to ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # One row per class: training-split class size followed by fixed counts.
    fixed_counts = [[30, 25, 20], [35, 27, 24], [26, 22, 20]]
    n = np.array([[sum(ytrain == label)] + counts
                  for label, counts in enumerate(fixed_counts)])
    p = np.array([2, 5, 9, 13])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 3) for fold_id in range(5)]
res40 = np.asarray(res).mean(axis=0)
res40

[SoftImpute] Max Singular Value of X_init = 13.872467
[SoftImpute] Iter 1: observed MAE=0.022019 rank=13
[SoftImpute] Iter 2: observed MAE=0.022079 rank=13
[SoftImpute] Iter 3: observed MAE=0.022138 rank=13
[SoftImpute] Iter 4: observed MAE=0.022189 rank=13
[SoftImpute] Iter 5: observed MAE=0.022236 rank=13
[SoftImpute] Iter 6: observed MAE=0.022277 rank=13
[SoftImpute] Iter 7: observed MAE=0.022315 rank=13
[SoftImpute] Iter 8: observed MAE=0.022351 rank=13
[SoftImpute] Iter 9: observed MAE=0.022385 rank=13
[SoftImpute] Iter 10: observed MAE=0.022417 rank=13
[SoftImpute] Iter 11: observed MAE=0.022447 rank=13
[SoftImpute] Iter 12: observed MAE=0.022476 rank=13
[SoftImpute] Iter 13: observed MAE=0.022501 rank=13
[SoftImpute] Iter 14: observed MAE=0.022523 rank=13
[SoftImpute] Iter 15: observed MAE=0.022544 rank=13
[SoftImpute] Iter 16: observed MAE=0.022561 rank=13
[SoftImpute] Iter 17: observed MAE=0.022577 rank=13
[SoftImpute] Iter 18: observed MAE=0.022590 rank=13
[SoftImpute] Iter 1

array([0.01145161, 0.05099374, 0.01657982, 0.02859447, 0.11274784,
       0.02859447, 0.39993818])

## results

In [None]:
# Stack the three averaged result vectors row-wise (res20, res30, res40).
# NOTE(review): this rebinds `wine`, which held the object returned by
# datasets.load_wine() above; the dataset is no longer reachable by that name.
wine = np.vstack((res20, res30, res40))
wine

array([[0.01145161, 0.02504608, 0.01790323, 0.01290323, 0.03004608,
        0.01290323, 0.20108013],
       [0.01145161, 0.0414977 , 0.01790323, 0.01645161, 0.05175411,
        0.01645161, 0.30420463],
       [0.01145161, 0.05099374, 0.01657982, 0.02859447, 0.11274784,
        0.02859447, 0.39993818]])

# Iris
The data set is also available in sklearn, as noted on the package's website, so we load it directly from sklearn.

In [None]:
# Load the iris data from sklearn: X is the feature matrix, y the flattened targets.
iris = datasets.load_iris()
X = iris.data
y = iris.target.ravel()

def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (first iris setting).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` are hard-coded for this
    setting and forwarded to ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # One row per class: training-split class size followed by fixed counts.
    fixed_counts = [[30, 28], [30, 27], [30, 27]]
    n = np.array([[sum(ytrain == label)] + counts
                  for label, counts in enumerate(fixed_counts)])
    p = np.array([1, 3, 4])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 3) for fold_id in range(5)]
res20 = np.asarray(res).mean(axis=0)

[SoftImpute] Max Singular Value of X_init = 9.360055
[SoftImpute] Iter 1: observed MAE=0.013556 rank=4
[SoftImpute] Iter 2: observed MAE=0.013579 rank=4
[SoftImpute] Iter 3: observed MAE=0.013607 rank=4
[SoftImpute] Iter 4: observed MAE=0.013639 rank=4
[SoftImpute] Iter 5: observed MAE=0.013688 rank=4
[SoftImpute] Iter 6: observed MAE=0.013742 rank=4
[SoftImpute] Iter 7: observed MAE=0.013803 rank=4
[SoftImpute] Iter 8: observed MAE=0.013869 rank=4
[SoftImpute] Iter 9: observed MAE=0.013940 rank=4
[SoftImpute] Iter 10: observed MAE=0.014010 rank=4
[SoftImpute] Iter 11: observed MAE=0.014079 rank=4
[SoftImpute] Iter 12: observed MAE=0.014144 rank=4
[SoftImpute] Iter 13: observed MAE=0.014205 rank=4
[SoftImpute] Iter 14: observed MAE=0.014266 rank=4
[SoftImpute] Iter 15: observed MAE=0.014323 rank=4
[SoftImpute] Iter 16: observed MAE=0.014373 rank=4
[SoftImpute] Iter 17: observed MAE=0.014416 rank=4
[SoftImpute] Iter 18: observed MAE=0.014452 rank=4
[SoftImpute] Iter 19: observed MAE=0.0

In [None]:
# Display the fold-averaged result vector computed in the previous cell.
res20

array([0.02443182, 0.12466649, 0.06280594, 0.11697419, 0.16634852,
       0.11697419, 0.20309912])

## 30%

In [None]:
# Load the iris data from sklearn: X is the feature matrix, y the flattened targets.
iris = datasets.load_iris()
X = iris.data
y = iris.target.ravel()

def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (iris, "30%" section).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` are hard-coded for this
    section and forwarded to ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # One row per class: training-split class size followed by fixed counts.
    fixed_counts = [[25, 22], [25, 21], [25, 22]]
    n = np.array([[sum(ytrain == label)] + counts
                  for label, counts in enumerate(fixed_counts)])
    p = np.array([1, 3, 4])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 3) for fold_id in range(5)]
res30 = np.asarray(res).mean(axis=0)

[SoftImpute] Max Singular Value of X_init = 8.523870
[SoftImpute] Iter 1: observed MAE=0.013399 rank=4
[SoftImpute] Iter 2: observed MAE=0.013404 rank=4
[SoftImpute] Iter 3: observed MAE=0.013410 rank=4
[SoftImpute] Iter 4: observed MAE=0.013417 rank=4
[SoftImpute] Iter 5: observed MAE=0.013425 rank=4
[SoftImpute] Iter 6: observed MAE=0.013440 rank=4
[SoftImpute] Iter 7: observed MAE=0.013462 rank=4
[SoftImpute] Iter 8: observed MAE=0.013488 rank=4
[SoftImpute] Iter 9: observed MAE=0.013514 rank=4
[SoftImpute] Iter 10: observed MAE=0.013539 rank=4
[SoftImpute] Iter 11: observed MAE=0.013571 rank=4
[SoftImpute] Iter 12: observed MAE=0.013607 rank=4
[SoftImpute] Iter 13: observed MAE=0.013648 rank=4
[SoftImpute] Iter 14: observed MAE=0.013687 rank=4
[SoftImpute] Iter 15: observed MAE=0.013726 rank=4
[SoftImpute] Iter 16: observed MAE=0.013766 rank=4
[SoftImpute] Iter 17: observed MAE=0.013802 rank=4
[SoftImpute] Iter 18: observed MAE=0.013835 rank=4
[SoftImpute] Iter 19: observed MAE=0.0

In [None]:
# Display the fold-averaged result vector computed in the previous cell.
res30

array([0.03212413, 0.15534831, 0.06280594, 0.12625449, 0.15329451,
       0.1353454 , 0.3012073 ])

## 40%

In [None]:
# Load the iris data from sklearn: X is the feature matrix, y the flattened targets.
iris = datasets.load_iris()
X = iris.data
y = iris.target.ravel()

def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (iris, "40%" section).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` are hard-coded for this
    section and forwarded to ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # One row per class: training-split class size followed by fixed counts.
    fixed_counts = [[20, 15], [20, 15], [20, 16]]
    n = np.array([[sum(ytrain == label)] + counts
                  for label, counts in enumerate(fixed_counts)])
    p = np.array([1, 3, 4])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 3) for fold_id in range(5)]
res40 = np.asarray(res).mean(axis=0)

[SoftImpute] Max Singular Value of X_init = 7.596066
[SoftImpute] Iter 1: observed MAE=0.013298 rank=4
[SoftImpute] Iter 2: observed MAE=0.013297 rank=4
[SoftImpute] Iter 3: observed MAE=0.013294 rank=4
[SoftImpute] Iter 4: observed MAE=0.013290 rank=4
[SoftImpute] Iter 5: observed MAE=0.013286 rank=4
[SoftImpute] Iter 6: observed MAE=0.013280 rank=4
[SoftImpute] Iter 7: observed MAE=0.013273 rank=4
[SoftImpute] Iter 8: observed MAE=0.013265 rank=4
[SoftImpute] Iter 9: observed MAE=0.013258 rank=4
[SoftImpute] Iter 10: observed MAE=0.013252 rank=4
[SoftImpute] Iter 11: observed MAE=0.013255 rank=4
[SoftImpute] Iter 12: observed MAE=0.013262 rank=4
[SoftImpute] Iter 13: observed MAE=0.013274 rank=4
[SoftImpute] Iter 14: observed MAE=0.013287 rank=4
[SoftImpute] Iter 15: observed MAE=0.013302 rank=4
[SoftImpute] Iter 16: observed MAE=0.013318 rank=4
[SoftImpute] Iter 17: observed MAE=0.013338 rank=4
[SoftImpute] Iter 18: observed MAE=0.013357 rank=4
[SoftImpute] Iter 19: observed MAE=0.0

In [None]:
# Display the fold-averaged result vector computed in the previous cell.
res40

array([0.03697552, 0.15329451, 0.07511655, 0.13250449, 0.16382773,
       0.13250449, 0.40349028])

In [None]:
# Stack the three averaged result vectors row-wise (res20, res30, res40).
# NOTE(review): this rebinds `iris`, which held the object returned by
# datasets.load_iris() above; the dataset is no longer reachable by that name.
iris = np.vstack((res20, res30, res40))
iris

array([[0.02443182, 0.12466649, 0.06280594, 0.11697419, 0.16634852,
        0.11697419, 0.20309912],
       [0.03212413, 0.15534831, 0.06280594, 0.12625449, 0.15329451,
        0.1353454 , 0.3012073 ],
       [0.03697552, 0.15329451, 0.07511655, 0.13250449, 0.16382773,
        0.13250449, 0.40349028]])

# Digits

In [None]:
# Load the digits data from sklearn and print the shape of the feature matrix.
digits = datasets.load_digits()
X = digits.data
y = digits.target.ravel()
print(X.shape)
# The builtin sum adds the rows of the boolean matrix, giving the number of
# non-zero entries in each column; rmid marks columns with fewer than 10.
nonzero_per_column = sum(X != 0)
rmid = np.where(nonzero_per_column < 10)

(1797, 64)


In [None]:
# Drop the columns listed in rmid. The filtered matrix is built from the
# untouched digits.data (which X was bound to when rmid was computed) rather
# than from X itself, so re-running this cell cannot delete a second batch of
# columns using indices that refer to the original layout.
X = np.delete(digits.data, rmid, axis=1)
X.shape

(1797, 54)

## 20%

In [None]:
def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (digits, "20%" section).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` are hard-coded for this
    section and forwarded to ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # Column 0: training-split size of each of the ten classes; the other
    # columns repeat the same fixed counts for every class.
    class_sizes = np.asarray([sum(ytrain == digit) for digit in range(10)])
    fixed_counts = np.tile([125, 120, 100, 90], (10, 1))
    n = np.hstack((class_sizes.reshape((-1, 1)), fixed_counts))
    # NOTE(review): the notebook prints X.shape == (1797, 54) after column
    # removal, yet p ends at 56, and n has one more column than p has entries
    # (unlike the other data sets) - confirm both are intended.
    p = np.array([10, 19, 25, 56])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 10) for fold_id in range(5)]
res20 = np.asarray(res).mean(axis=0)

In [None]:
# Display the fold-averaged result vector computed in the previous cell.
res20

array([0.05752231, 0.07124877, 0.07087817, 0.07022853, 0.06020288,
       0.0696683 , 0.07087817, 0.06966812, 0.20354437])

## 30%

In [None]:
def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (digits, "30%" section).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` are hard-coded for this
    section and forwarded to ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # Column 0: training-split size of each of the ten classes; the other
    # columns repeat the same fixed counts for every class.
    class_sizes = np.asarray([sum(ytrain == digit) for digit in range(10)])
    fixed_counts = np.tile([130, 115, 90, 80, 70, 60], (10, 1))
    n = np.hstack((class_sizes.reshape((-1, 1)), fixed_counts))
    # NOTE(review): the notebook prints X.shape == (1797, 54) after column
    # removal, yet p ends at 56, and n has one more column than p has entries
    # (unlike the other data sets) - confirm both are intended.
    p = np.array([10, 15, 25, 30, 40, 56])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 10) for fold_id in range(5)]
res30 = np.asarray(res).mean(axis=0)

In [None]:
# Display the fold-averaged result vector computed in the previous cell.
res30

array([0.05783407, 0.08195374, 0.06966223, 0.07142643, 0.06357481,
       0.07195977, 0.06856867, 0.07519163, 0.29565362])

## 40%

In [None]:
def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (digits, "40%" section).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` are hard-coded for this
    section and forwarded to ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # Column 0: training-split size of each of the ten classes; the other
    # columns repeat the same fixed counts for every class.
    class_sizes = np.asarray([sum(ytrain == digit) for digit in range(10)])
    fixed_counts = np.tile([110, 100, 80, 70, 55, 50], (10, 1))
    n = np.hstack((class_sizes.reshape((-1, 1)), fixed_counts))
    # NOTE(review): the notebook prints X.shape == (1797, 54) after column
    # removal, yet p ends at 56, and n has one more column than p has entries
    # (unlike the other data sets) - confirm both are intended.
    p = np.array([10, 12, 20, 27, 35, 56])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 10) for fold_id in range(5)]
res40 = np.asarray(res).mean(axis=0)

In [None]:
# Stack the three averaged result vectors row-wise (res20, res30, res40).
# NOTE(review): this rebinds `digits`, which held the object returned by
# datasets.load_digits() above; later code that reads digits.data or
# digits.target will see this array instead of the dataset.
digits = np.vstack((res20,res30,res40))
digits

# semeion

In [None]:
# `digits` now names the stacked results array, which has no `.target`
# attribute, so the data set is reloaded under its own name instead of being
# read from `digits`.
# NOTE(review): this section is titled "semeion" but the code reads sklearn's
# digits data - confirm which data set is intended here.
digits_bunch = datasets.load_digits()
X, y = digits_bunch.data, digits_bunch.target.ravel()
print(X.shape)
# Columns with fewer than 10 non-zero entries (builtin sum adds the rows).
rmid = np.where(sum(X!=0)<10)

In [None]:
def fold_k_err(X, y, k, G):
    """Run ``compute_err`` with fold ``k`` held out (semeion section).

    Fold labels in 0..4 are drawn per sample right after re-seeding NumPy's
    global generator with 1, so every call rebuilds the same partition (and
    resets the global random state).  ``n`` and ``p`` carry the same
    hard-coded values as the digits "20%" cell and are forwarded to
    ``compute_err`` as they are.
    """
    np.random.seed(1)
    fold_ids = np.random.choice(np.arange(5), len(y))
    test_mask = fold_ids == k
    train_mask = fold_ids != k
    Xtest, ytest = X[test_mask, :], y[test_mask]
    Xtrain, ytrain = X[train_mask, :], y[train_mask]
    # Column 0: training-split size of each of the ten classes; the other
    # columns repeat the same fixed counts for every class.
    class_sizes = np.asarray([sum(ytrain == digit) for digit in range(10)])
    fixed_counts = np.tile([125, 120, 100, 90], (10, 1))
    n = np.hstack((class_sizes.reshape((-1, 1)), fixed_counts))
    # NOTE(review): n has one more column than p has entries, unlike the
    # parkinson/wine/iris cells - confirm this is intended.
    p = np.array([10, 19, 25, 56])
    return compute_err(Xtrain, ytrain, Xtest, ytest, n, p, G)
# Average the per-fold outputs of fold_k_err over the five folds.
res = [fold_k_err(X, y, fold_id, 10) for fold_id in range(5)]
res20 = np.asarray(res).mean(axis=0)