In [6]:
import numpy as np
import tensorflow as tf
from contextlib import ExitStack
import glob
import os
from sklearn.mixture import GaussianMixture
from umap import UMAP
from IPython.display import clear_output

In [7]:
def create_example_protobuff(image, label):
    """Wrap one (image, label) pair in a ``tf.train.Example`` protobuf.

    Args:
        image: Tensor-like image; it is serialized with
            ``tf.io.serialize_tensor`` and stored as a bytes feature.
        label: Integer class id, stored as a single-element int64 feature.

    Returns:
        A ``tf.train.Example`` with the features ``'image'`` and ``'label'``.
    """
    # Example protobufs cannot hold tensors directly, so the image is first
    # turned into its binary string representation.
    serialized_image = tf.io.serialize_tensor(image).numpy()

    image_feature = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[serialized_image])
    )
    label_feature = tf.train.Feature(
        int64_list=tf.train.Int64List(value=[label])
    )

    feature_map = {'image': image_feature, 'label': label_feature}
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

In [8]:
def write_tfrecords(name, dataset, output_dir='doodle-full-clean', max_examples=None):
    """Serialize (image, label) pairs into ``{output_dir}/{name}.tfrecord``.

    Args:
        name: Base name of the output file (without extension).
        dataset: Iterable of ``(image, label)`` pairs. When ``max_examples``
            is given it must also provide ``take()``, as ``tf.data.Dataset``
            does.
        output_dir: Directory that receives the file; created when missing.
        max_examples: If not ``None``, only the first ``max_examples`` pairs
            are written. The former "small" variant of each file corresponds
            to ``output_dir='doodle-small-clean', max_examples=3_000``.
    """
    os.makedirs(output_dir, exist_ok=True)
    full_path = f'{output_dir}/{name}.tfrecord'

    if max_examples is not None:
        dataset = dataset.take(max_examples)

    with tf.io.TFRecordWriter(full_path) as writer:
        # create example protobuffs from instances
        for image, label in dataset:
            # The label is stored as int64, so convert it with int(): a
            # narrower type such as uint8 wraps around for class ids >= 256
            # and makes distinct classes share a label.
            example = create_example_protobuff(image, int(label))
            writer.write(example.SerializeToString())

In [9]:
def clean_data(X, anomaly_percentile=10, random_state=None):
    """Remove the rows of ``X`` that lie in the lowest-density region.

    ``X`` is embedded into two dimensions with UMAP, a single-component
    Gaussian mixture is fitted to the embedding, and every row whose score
    is not strictly above the ``anomaly_percentile``-th percentile of all
    scores is treated as an anomaly and dropped.

    Args:
        X: 2-D array with one sample per row.
        anomaly_percentile: Percentage (0-100) of samples to treat as
            anomalies. The default of 10 follows
            https://koaning.io/til/moar-bad-labels/.
        random_state: Seed forwarded to both UMAP and GaussianMixture. The
            default ``None`` leaves both unseeded, so the selected rows can
            differ between runs; pass an int for reproducible cleaning.

    Returns:
        The rows of ``X`` that were kept, in their original order.
    """
    embedding = UMAP(n_components=2, random_state=random_state).fit_transform(X)

    gm = GaussianMixture(n_components=1, n_init=10, random_state=random_state)
    gm.fit(embedding)

    # score_samples returns the model's log-density at each embedded point;
    # any instance located in a low-density region is considered an anomaly.
    densities = gm.score_samples(embedding)
    density_threshold = np.percentile(densities, anomaly_percentile)
    non_anomalies_idxs = np.flatnonzero(densities > density_threshold)

    return X[non_anomalies_idxs]

In [10]:
# Every source file, lower-cased and sorted so the processing order is stable.
all_files = sorted([path.lower() for path in glob.glob("doodle_data_npy/*")])

# Map each TFRecord that already exists back to the source path it came from.
# os.path handles any path separator and class names that contain a dot,
# which a manual split on "/" and "." does not. Names are lower-cased so the
# comparison matches the lower-cased entries of all_files.
files_already_done = [
    os.path.join(
        "doodle_data_npy",
        os.path.splitext(os.path.basename(file))[0].lower() + ".npy",
    )
    for file in glob.glob("doodle-full-clean/*")
]

# Keep only the files that still need processing; a set keeps the filter linear.
_done_lookup = set(files_already_done)
all_files = [file for file in all_files if file not in _done_lookup]

print(len(all_files))

def load_data():
    """Clean every pending ``.npy`` class file and write it out as a TFRecord.

    Iterates over the module-level ``all_files`` list. Each array is loaded,
    filtered with ``clean_data``, reshaped to ``(n, 28, 28)`` ``uint8``
    images, paired with an integer class label, and passed to
    ``write_tfrecords`` under the file's base name (``'fork.npy'`` ->
    ``'fork'``). Progress is printed after each file.

    The label is the file's position in the sorted listing of everything in
    ``doodle_data_npy/``, not its position in ``all_files``. ``all_files``
    omits files that were already converted, so numbering it directly would
    restart at 0 on a resumed run and reuse labels already given to other
    classes.
    """
    source_listing = sorted(path.lower() for path in glob.glob("doodle_data_npy/*"))
    label_by_path = {path: label for label, path in enumerate(source_listing)}

    for num_files, file in enumerate(all_files, start=1):
        data = np.load(file)

        # anomaly detection: drop the low-density samples of this class
        data = clean_data(data)

        # rows are flat (28 * 28 values); reshape to 28x28 images for the CNN
        data = data.reshape((data.shape[0], 28, 28)).astype(np.uint8)
        labels = np.full(data.shape[0], label_by_path[file])

        # convert numpy arrays to a Tensorflow Dataset object
        dataset = tf.data.Dataset.from_tensor_slices((data, labels))

        # class name is the file name without its extension
        class_name, _ = os.path.splitext(os.path.basename(file))

        # write Dataset to file
        write_tfrecords(f"{class_name}", dataset)

        # logging
        clear_output(wait=True)
        print(f'{num_files} file npy to tfrecord', flush=True)
            
# Convert every file that is still listed in all_files.
load_data()

345 file npy to tfrecord
