## Libraries and functions


In [1]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization
import pandas as pd
import time

Collecting impyute
  Downloading https://files.pythonhosted.org/packages/37/28/86829f67c9affb847facaab94687761d3555539ec675f7577778c5b2680a/impyute-0.0.8-py2.py3-none-any.whl
Installing collected packages: impyute
Successfully installed impyute-0.0.8




The function `mle` allows us to compute the MLEs from training data with monotone missing data.

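We denote
$$n = \begin{pmatrix}
n_1^{(1)} & n_1^{(2)} & \cdots & n_1^{(K)}\\
\vdots & \vdots & \ddots & \vdots\\
n_G^{(1)} & n_G^{(2)} & \cdots & n_G^{(K)}
\end{pmatrix}$$
$$p = (p_1, p_2, \ldots, p_K)$$
Here $G$ is the number of classes and $K$ is the number of nested feature blocks: $n_g^{(k)}$ is the number of observations of class $g$ for which the first $p_k$ features are observed, so $n_g^{(1)}$ is the class size and $p_K$ is the total number of features.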

In [2]:
def make_nan_list(X, y, G, n, p):
    '''
    Split X by class and blank out entries to create monotone missing data.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
    y : numpy array of labels; the labels must run from 0 to G-1
    G : number of classes
    n : integer array indexed as n[g, k]; for class g, the rows
        n[g, k+1]:n[g, k] keep only their first p[k] features
    p : sequence of column cut-offs; only p[0], ..., p[len(p)-2] are used here

    Returns
    -------
    list of G arrays; the g-th element holds the rows of X with label g
    (in their original order) with the masked entries set to np.nan.
    X itself is not modified.
    '''
    data = []
    for g in np.arange(G):
        # boolean indexing returns a copy, so X is left untouched
        block = X[y == g, :]
        # NaN cannot be stored in integer/boolean arrays; promote those to float
        if block.dtype.kind in 'biu':
            block = block.astype(float)
        for k in np.arange(len(p) - 1):
            block[n[g, k + 1]:n[g, k], p[k]:] = np.nan
        data.append(block)
    return data

### `missing_rate` and `compute_err_SOFT` functions

In [3]:
def missing_rate(Xtrain, ytrain, n, p, G):
    '''
    Fraction of entries that are missing once the pattern (n, p) is applied.

    Xtrain, ytrain, G, n and p are passed unchanged to make_nan_list; the
    per-class arrays it returns are stacked and the proportion of NaN
    entries in the stacked array is returned.
    '''
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)
    # stack all classes in one call instead of growing the array class by
    # class; the labels are not needed to count missing entries
    Xtr_nan = np.vstack(Xtr_nan_list)

    # percentage of missing values
    return np.mean(np.isnan(Xtr_nan))

In [4]:
def compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n, p, G):
    '''
    Test error and run time of SoftImpute followed by LDA.

    The missing pattern (n, p) is applied to the training data through
    make_nan_list, the features are rescaled with a MinMaxScaler fitted on
    the incomplete training data, the missing entries are filled in by
    SoftImpute, and an LDA classifier trained on the imputed data is
    scored on Xtest.

    Returns
    -------
    (softimpute_err, softimpute_time) : the misclassification rate on
    (Xtest, ytest) and the elapsed seconds spent on imputation, LDA
    fitting and prediction.
    '''
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)
    # make_nan_list groups the rows by class, so the labels are rebuilt to
    # match the stacked order: class 0 first, then class 1, ...
    Xtr_nan = np.vstack(Xtr_nan_list)
    ytr = np.repeat(np.arange(G), [len(block) for block in Xtr_nan_list])

    scaler = MinMaxScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)

    # perf_counter is monotonic, so the interval is not affected by
    # system clock adjustments
    start = time.perf_counter()
    Xtr_softimpute = SoftImpute(max_iters=100).fit_transform(Xtr_nan)
    clf_softimpute = skLDA().fit(Xtr_softimpute, ytr)
    softimpute_err = np.mean(clf_softimpute.predict(Xtest).flatten() != ytest)
    softimpute_time = time.perf_counter() - start

    return softimpute_err, softimpute_time

## Import MNIST

In [5]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Build the MNIST dataset builder by its registered name
mnist = tfds.builder('mnist')

# Download the data, prepare it, and write it to disk
mnist.download_and_prepare()

# Load data from disk as tf.data.Datasets
# NOTE: `datasets` rebinds the name imported by `from sklearn import datasets`
datasets = mnist.as_dataset()
train_dataset, test_dataset = datasets['train'], datasets['test']

[1mDownloading and preparing dataset mnist/3.0.1 (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...[0m


local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



HBox(children=(FloatProgress(value=0.0, description='Dl Completed...', max=4.0, style=ProgressStyle(descriptio…



[1mDataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.[0m


In [None]:
# display the metadata of the dataset builder
mnist.info

tfds.core.DatasetInfo(
    name='mnist',
    version=3.0.1,
    description='The MNIST database of handwritten digits.',
    homepage='http://yann.lecun.com/exdb/mnist/',
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=tf.uint8),
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=10),
    }),
    total_num_examples=70000,
    splits={
        'test': 10000,
        'train': 60000,
    },
    supervised_keys=('image', 'label'),
    citation="""@article{lecun2010mnist,
      title={MNIST handwritten digit database},
      author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
      journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
      volume={2},
      year={2010}
    }""",
    redistribution_info=,
)

In [6]:
# Turn the training tf.data.Dataset into NumPy: one flattened image vector
# and one label per example
Xtrain_np, ytrain_np = [], []
for example in tfds.as_numpy(train_dataset):
    Xtrain_np.append(example['image'].flatten())
    ytrain_np.append(example['label'])

Xtrain = np.asarray(Xtrain_np).astype(float)
ytrain = np.asarray(ytrain_np)

# Shuffle the examples with a fixed seed so the order is reproducible
np.random.seed(1)
idx = np.arange(len(ytrain))
np.random.shuffle(idx)
Xtrain = Xtrain[idx, :]
ytrain = ytrain[idx]

Xtrain.shape, ytrain.shape

((60000, 784), (60000,))

In [7]:
# Same conversion for the test split: flattened images and their labels,
# collected in lists first and then turned into NumPy arrays
Xtest, ytest = [], []
for example in tfds.as_numpy(test_dataset):
    Xtest.append(example['image'].flatten())
    ytest.append(example['label'])

Xtest = np.asarray(Xtest).astype(float)
ytest = np.asarray(ytest)

In [8]:
# flag the columns that are non-zero in more than 10 training rows
# NOTE: `id` shadows the Python builtin; the name is kept because a later
# cell indexes Xtrain and Xtest with it
id = np.count_nonzero(Xtrain, axis=0) > 10
# number of columns with at most 10 non-zero entries
print(Xtrain.shape[1] - np.sum(id))
# number of columns with more than 10 non-zero entries
np.sum(id)

135


649

In [9]:
# keep only the columns flagged in `id`, in both splits (overwrites Xtrain and Xtest)
Xtrain, Xtest = Xtrain[:,id], Xtest[:,id]

In [10]:
# shapes of the training and test feature matrices
Xtrain.shape, Xtest.shape

((60000, 649), (10000, 649))

In [11]:
    # number of samples per class (labels 0-9) in the training data;
    # bincount tallies every label in one vectorized pass
    ng = np.bincount(ytrain, minlength=10)[:10]
    ng

array([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949])

## 20%

In [None]:
    # column 0 of n is the class size; the remaining columns are the row
    # cut-offs, identical for all 10 classes. p holds the column cut-offs.
    n = np.hstack((ng[:, None], np.tile([4500, 4200, 3700, 3500], (10, 1))))
    p = np.array([300, 320, 400, 500, 649])
    missing_rate(Xtrain, ytrain, n, p, G=10)

0.19940934771443247

In [None]:
# returns (test error rate, elapsed seconds) for the current n and p
compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n=n, p=p, G=10)

[SoftImpute] Max Singular Value of X_init = 1281.262364
[SoftImpute] Iter 1: observed MAE=0.041620 rank=215
[SoftImpute] Iter 2: observed MAE=0.041728 rank=210
[SoftImpute] Iter 3: observed MAE=0.041799 rank=209
[SoftImpute] Iter 4: observed MAE=0.041848 rank=208
[SoftImpute] Iter 5: observed MAE=0.041882 rank=208
[SoftImpute] Iter 6: observed MAE=0.041908 rank=208
[SoftImpute] Iter 7: observed MAE=0.041928 rank=208
[SoftImpute] Iter 8: observed MAE=0.041944 rank=207
[SoftImpute] Iter 9: observed MAE=0.041957 rank=207
[SoftImpute] Iter 10: observed MAE=0.041968 rank=207
[SoftImpute] Iter 11: observed MAE=0.041978 rank=207
[SoftImpute] Iter 12: observed MAE=0.041986 rank=207
[SoftImpute] Iter 13: observed MAE=0.041994 rank=207
[SoftImpute] Iter 14: observed MAE=0.042000 rank=207
[SoftImpute] Iter 15: observed MAE=0.042006 rank=207
[SoftImpute] Iter 16: observed MAE=0.042012 rank=207
[SoftImpute] Iter 17: observed MAE=0.042017 rank=207
[SoftImpute] Iter 18: observed MAE=0.042022 rank=207

(0.1395, 958.3851203918457)

## 30%

In [None]:
    # column 0 of n is the class size; the remaining columns are the row
    # cut-offs, identical for all 10 classes. p holds the column cut-offs.
    n = np.hstack((ng[:, None], np.tile([4400, 4000, 3400, 3000], (10, 1))))
    p = np.array([200, 290, 300, 400, 649])
    missing_rate(Xtrain, ytrain, n, p, G=10)

0.3007190549563431

In [None]:
# returns (test error rate, elapsed seconds) for the current n and p
compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n=n, p=p, G=10)

[SoftImpute] Max Singular Value of X_init = 1162.477247
[SoftImpute] Iter 1: observed MAE=0.040079 rank=220
[SoftImpute] Iter 2: observed MAE=0.040171 rank=216
[SoftImpute] Iter 3: observed MAE=0.040236 rank=214
[SoftImpute] Iter 4: observed MAE=0.040282 rank=213
[SoftImpute] Iter 5: observed MAE=0.040316 rank=213
[SoftImpute] Iter 6: observed MAE=0.040341 rank=212
[SoftImpute] Iter 7: observed MAE=0.040361 rank=212
[SoftImpute] Iter 8: observed MAE=0.040377 rank=212
[SoftImpute] Iter 9: observed MAE=0.040391 rank=212
[SoftImpute] Iter 10: observed MAE=0.040403 rank=212
[SoftImpute] Iter 11: observed MAE=0.040413 rank=212
[SoftImpute] Iter 12: observed MAE=0.040421 rank=212
[SoftImpute] Iter 13: observed MAE=0.040429 rank=212
[SoftImpute] Iter 14: observed MAE=0.040436 rank=212
[SoftImpute] Iter 15: observed MAE=0.040442 rank=212
[SoftImpute] Iter 16: observed MAE=0.040447 rank=212
[SoftImpute] Iter 17: observed MAE=0.040452 rank=212
[SoftImpute] Iter 18: observed MAE=0.040457 rank=212

(0.1512, 1026.3849368095398)

## 40%

In [None]:
    # column 0 of n is the class size; the remaining columns are the row
    # cut-offs, identical for all 10 classes. p holds the column cut-offs.
    n = np.hstack((ng[:, None], np.tile([3600, 3400, 3000, 2500], (10, 1))))
    p = np.array([150, 220, 300, 400, 649])
    missing_rate(Xtrain, ytrain, n, p, G=10)

0.39740626605033386

In [None]:
# returns (test error rate, elapsed seconds) for the current n and p
compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n=n, p=p, G=10)

[SoftImpute] Max Singular Value of X_init = 1056.302670
[SoftImpute] Iter 1: observed MAE=0.038729 rank=222
[SoftImpute] Iter 2: observed MAE=0.038825 rank=218
[SoftImpute] Iter 3: observed MAE=0.038893 rank=217
[SoftImpute] Iter 4: observed MAE=0.038940 rank=216
[SoftImpute] Iter 5: observed MAE=0.038975 rank=215
[SoftImpute] Iter 6: observed MAE=0.039002 rank=215
[SoftImpute] Iter 7: observed MAE=0.039024 rank=215
[SoftImpute] Iter 8: observed MAE=0.039042 rank=215
[SoftImpute] Iter 9: observed MAE=0.039057 rank=214
[SoftImpute] Iter 10: observed MAE=0.039070 rank=214
[SoftImpute] Iter 11: observed MAE=0.039082 rank=214
[SoftImpute] Iter 12: observed MAE=0.039091 rank=214
[SoftImpute] Iter 13: observed MAE=0.039100 rank=214
[SoftImpute] Iter 14: observed MAE=0.039108 rank=214
[SoftImpute] Iter 15: observed MAE=0.039115 rank=214
[SoftImpute] Iter 16: observed MAE=0.039121 rank=214
[SoftImpute] Iter 17: observed MAE=0.039127 rank=214
[SoftImpute] Iter 18: observed MAE=0.039132 rank=214

(0.1513, 1027.803083896637)

## 50%

In [12]:
# column 0 of n is the class size; the remaining columns are the row
# cut-offs, identical for all 10 classes. p holds the column cut-offs.
n = np.hstack((ng[:, None], np.tile([2900, 2800, 2600, 2500], (10, 1))))
p = np.array([90, 140, 155, 170, 649])
missing_rate(Xtrain, ytrain, n, p, G=10)

0.4957627118644068

In [13]:
# returns (test error rate, elapsed seconds) for the current n and p
compute_err_SOFT(Xtrain, ytrain, Xtest, ytest, n=n, p=p, G=10)

[SoftImpute] Max Singular Value of X_init = 981.752595
[SoftImpute] Iter 1: observed MAE=0.038407 rank=211
[SoftImpute] Iter 2: observed MAE=0.038479 rank=209
[SoftImpute] Iter 3: observed MAE=0.038529 rank=208
[SoftImpute] Iter 4: observed MAE=0.038565 rank=207
[SoftImpute] Iter 5: observed MAE=0.038591 rank=207
[SoftImpute] Iter 6: observed MAE=0.038610 rank=207
[SoftImpute] Iter 7: observed MAE=0.038626 rank=207
[SoftImpute] Iter 8: observed MAE=0.038639 rank=207
[SoftImpute] Iter 9: observed MAE=0.038649 rank=207
[SoftImpute] Iter 10: observed MAE=0.038658 rank=207
[SoftImpute] Iter 11: observed MAE=0.038666 rank=207
[SoftImpute] Iter 12: observed MAE=0.038672 rank=207
[SoftImpute] Iter 13: observed MAE=0.038678 rank=207
[SoftImpute] Iter 14: observed MAE=0.038683 rank=207
[SoftImpute] Iter 15: observed MAE=0.038688 rank=207
[SoftImpute] Iter 16: observed MAE=0.038692 rank=207
[SoftImpute] Iter 17: observed MAE=0.038695 rank=207
[SoftImpute] Iter 18: observed MAE=0.038698 rank=207


(0.1547, 798.8074636459351)