## Libraries and functions


In [1]:
!pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
import impyute as impy
from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization
import pandas as pd
import time

Collecting impyute
  Downloading https://files.pythonhosted.org/packages/37/28/86829f67c9affb847facaab94687761d3555539ec675f7577778c5b2680a/impyute-0.0.8-py2.py3-none-any.whl
Installing collected packages: impyute
Successfully installed impyute-0.0.8




### LDA and NaN functions


In [2]:
def make_nan_list(X, y, G, n, p):
    """Split X by class and blank out a staircase of entries in every class block.

    For each class label g in 0..G-1 the rows ``X[y == g]`` are copied
    (boolean-mask indexing returns a copy, so X itself is left untouched).
    Then, for k = 0 .. len(p)-2, the rows ``n[g, k+1]:n[g, k]`` of that copy
    have the columns ``p[k]:`` set to NaN.

    Parameters
    ----------
    X : numpy array of observations (rows) by features (columns).
    y : label array aligned with the rows of X; labels must run from 0 to G-1.
    G : number of classes.
    n : 2-D integer array indexed as n[g, k], holding the row breakpoints
        of class g.
    p : sequence of column breakpoints; the last entry is never used as a
        column start because the loop stops at len(p)-2.

    Returns
    -------
    list of G arrays; the i-th element holds the (partly NaN) rows of class i.
    """
    blocks = []
    n_steps = len(p) - 1
    for label in np.arange(G):
        block = X[y == label, :]
        for step in range(n_steps):
            first_row, last_row = n[label, step + 1], n[label, step]
            block[first_row:last_row, p[step]:] = np.nan
        blocks.append(block)
    return blocks

### `missing_rate` and `compute_err_kNN` functions

In [3]:
def missing_rate(Xtrain, ytrain, n, p, G):
    """Return the fraction of entries that are NaN after applying the (n, p) pattern.

    The training data is passed through ``make_nan_list`` and the result is
    (number of NaN entries) / (total number of entries) over all class blocks.

    Parameters
    ----------
    Xtrain : 2-D array of training observations, forwarded to make_nan_list.
    ytrain : training labels, forwarded to make_nan_list.
    n, p : row and column breakpoints, forwarded to make_nan_list.
    G : number of classes.

    Returns
    -------
    numpy.float64 : proportion of missing values in the masked training data.
    """
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)

    # Count per class block rather than stacking everything into one big
    # matrix first: the stacked copy (and a label vector) are not needed
    # to compute a proportion.
    n_missing = sum(np.count_nonzero(np.isnan(block)) for block in Xtr_nan_list)
    n_entries = sum(block.size for block in Xtr_nan_list)

    # np.float64 keeps the return type of the previous np.mean-based version.
    return np.float64(n_missing) / n_entries

In [4]:
def compute_err_kNN(Xtrain, ytrain, Xtest, ytest, n, p, G):
    """Impute missing training values with 1-NN, fit LDA, and report test error.

    Steps:
      1. Create missing values in the training data with ``make_nan_list``.
      2. Min-max scale train and test using a scaler fitted on the
         NaN-containing training matrix.
      3. Impute with ``impy.fast_knn(..., k=1)``, fit ``skLDA`` on the
         imputed data, and predict on the scaled test set.

    Parameters
    ----------
    Xtrain, ytrain : training observations and labels (labels 0..G-1).
    Xtest, ytest : test observations and labels.
    n, p : row and column breakpoints, forwarded to make_nan_list.
    G : number of classes.

    Returns
    -------
    (knn_err, knn_time) : misclassification rate on the test set, and the
        wall-clock seconds spent on imputation + LDA fit + prediction.
    """
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)

    # make_nan_list groups the rows by class, which changes the order of the
    # observations, so the label vector is rebuilt in that same order.
    Xtr_nan = np.vstack(Xtr_nan_list)
    ytr = np.repeat(np.arange(G), [len(block) for block in Xtr_nan_list])

    scaler = MinMaxScaler()
    Xtr_nan = scaler.fit_transform(Xtr_nan)
    Xtest_scaled = scaler.transform(Xtest)

    # The timer covers imputation, model fitting and prediction.
    start = time.time()
    Xtr_knn = impy.fast_knn(Xtr_nan, k=1)
    print("Finished imputing")
    clf_knn = skLDA().fit(Xtr_knn, ytr)
    knn_err = np.mean(clf_knn.predict(Xtest_scaled).flatten() != ytest)
    knn_time = time.time() - start

    return knn_err, knn_time

## Import MNIST

In [5]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Create the MNIST dataset builder by its registered name
mnist = tfds.builder('mnist')

# Download the data, prepare it, and write it to disk
mnist.download_and_prepare()

# Load data from disk as tf.data.Datasets
# NOTE: `datasets` rebinds the name imported by `from sklearn import datasets`
# in the first cell; the name is kept so that later cells are unaffected.
datasets = mnist.as_dataset()
train_dataset, test_dataset = datasets['train'], datasets['test']

[1mDownloading and preparing dataset mnist/3.0.1 (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...[0m


local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



HBox(children=(FloatProgress(value=0.0, description='Dl Completed...', max=4.0, style=ProgressStyle(descriptio…



[1mDataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.[0m


In [6]:
# Materialise the training split as NumPy arrays, one flattened image per row
train_images, train_labels = [], []
for record in tfds.as_numpy(train_dataset):
    train_images.append(record['image'].flatten())
    train_labels.append(record['label'])

Xtrain = np.asarray(train_images).astype(float)
ytrain = np.asarray(train_labels)

# Shuffle rows and labels together, with a fixed seed for reproducibility
np.random.seed(1)
idx = np.arange(len(ytrain))
np.random.shuffle(idx)
Xtrain, ytrain = Xtrain[idx, :], ytrain[idx]

Xtrain.shape, ytrain.shape

((60000, 784), (60000,))

In [7]:
# Materialise the test split as NumPy arrays, one flattened image per row
test_images, test_labels = [], []
for record in tfds.as_numpy(test_dataset):
    test_images.append(record['image'].flatten())
    test_labels.append(record['label'])

Xtest = np.asarray(test_images).astype(float)
ytest = np.asarray(test_labels)

In [8]:
# Boolean mask over pixel columns: True where the column is non-zero in more
# than 10 training images (computed for every column in one vectorized pass).
# NOTE: `id` shadows the Python builtin; the name is kept because the next
# cell indexes Xtrain/Xtest with it.
id = np.count_nonzero(Xtrain, axis=0) > 10
# number of columns that are zero in all but at most 10 images
print(Xtrain.shape[1] - np.sum(id))
# number of columns with more than 10 non-zero entries
np.sum(id)

135


649

In [9]:
# Keep only the columns selected by the boolean mask `id` in both splits.
# NOTE(review): Xtrain/Xtest are rebound here, so this cell is presumably not
# safe to run twice without re-running the loading cells first -- confirm.
Xtrain, Xtest = Xtrain[:,id], Xtest[:,id]

In [None]:
# Sanity check: both splits should now have the same number of columns
Xtrain.shape, Xtest.shape

((60000, 649), (10000, 649))

In [10]:
# number of samples per class (labels 0-9) in the training data;
# np.count_nonzero counts the matches in one vectorized pass per label
ng = np.array([np.count_nonzero(ytrain == label) for label in range(10)])
ng

array([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949])

## 20% missing values

In [None]:
# Row breakpoints per class: column 0 is the class size, followed by the
# same four cut-offs for every one of the 10 classes.
n = np.column_stack((ng, np.tile([4500, 4200, 3700, 3500], (10, 1))))
# Column breakpoints
p = np.asarray([300, 320, 400, 500, 649])
missing_rate(Xtrain, ytrain, n=n, p=p, G=10)

0.19940934771443247

In [None]:
# NOTE: this run did not complete -- the kernel crashed after running out of
# RAM (25 GB), so knn20 is presumably left undefined in that session.
# compute_err_kNN returns (test error rate, elapsed seconds).
knn20 = compute_err_kNN(Xtrain, ytrain, Xtest, ytest, n, p, 10)

## 30% missing values

In [None]:
# Row breakpoints per class: column 0 is the class size, followed by the
# same four cut-offs for every one of the 10 classes.
n = np.column_stack((ng, np.tile([4500, 4200, 3700, 3400], (10, 1))))
# Column breakpoints
p = np.asarray([100, 250, 350, 400, 649])
missing_rate(Xtrain, ytrain, n=n, p=p, G=10)

0.2997945557267591

In [None]:
# kNN imputation + LDA with the current n, p; returns (test error rate, elapsed seconds)
knn30 = compute_err_kNN(Xtrain, ytrain, Xtest, ytest, n, p, 10)

## 40% missing values


In [None]:
# Row breakpoints per class: column 0 is the class size, followed by the
# same four cut-offs for every one of the 10 classes.
n = np.column_stack((ng, np.tile([4000, 3500, 3000, 2600], (10, 1))))
# Column breakpoints
p = np.asarray([100, 250, 350, 400, 649])
missing_rate(Xtrain, ytrain, n=n, p=p, G=10)

0.39717514124293785

In [None]:
# kNN imputation + LDA with the current n, p; returns (test error rate, elapsed seconds)
knn40 = compute_err_kNN(Xtrain, ytrain, Xtest, ytest, n, p, 10)

In [None]:
# One row per run (20%, 30%, 40%), columns = (test error rate, elapsed seconds)
result = np.vstack((knn20, knn30, knn40))
# numpy is imported as `np` in this notebook; the bare name `numpy` is undefined
np.savetxt("mnist_knn.csv", result, delimiter=",")

## 50% missing values

In [23]:
# Row breakpoints per class: column 0 is the class size, followed by the
# same four cut-offs for every one of the 10 classes.
n = np.column_stack((ng, np.tile([2900, 2800, 2600, 2500], (10, 1))))
# Column breakpoints
p = np.asarray([90, 140, 155, 170, 649])
missing_rate(Xtrain, ytrain, n=n, p=p, G=10)

0.4957627118644068