## Data Loading and Prepeocessing
Script for batch dataloading and preprocessing

In [0]:
import numpy as np
import tensorflow as tf
import tarfile
import io
from tensorflow import keras
from google.cloud import storage
from tensorflow.keras.models import Model 
from keras import backend as K
from PIL import Image
from matplotlib import pylab as plt

Using TensorFlow backend.


#### Connect with GCS bucket

In [0]:
bucket_name = "dataproc-staging-us-central1-759291875656-wohgf1sk"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
file_prefix = "data/"
blobs = bucket.list_blobs(prefix=file_prefix, delimiter = '/')

In [0]:
# list files in folder
for blob in blobs:
    print(blob.name)

data/
data/filelist_places365-standard.tar
data/test_data.tar
data/train_data.tar
data/val_data.tar


#### Download file from blob

In [0]:
file_name = "val_data.tar"
blob = bucket.get_blob(file_prefix + file_name)
blob.download_to_filename(file_name)

#### Functions for Loading and Preprocessing

In [0]:
def batch_preprocess_imgs(archive, preprocess_func, 
                          preprocess_args, start, end):
    """
    batch preprocess images from an archive by reading and 
    preprocessing files within the start and end of the batch
    Args:
        archive: (Tarfile) an archive of image files
        preprocess_func: (func) functions for preprocessing
        start: (int) starting index
        end: (int) ending index (non-inclusive)
    Returns:
        result: (ndarray) an array of the preprocesed batch
    """
    result = []
    idx = start
    
    while idx < end:
        member = archive.next()
        f = tar_file.extractfile(member)
        if f is not None:
            content = f.read()
            result.append(preprocess_func(content, preprocess_args))
            idx += 1
    return result

In [0]:
def img_to_ndarray(img_data, img_dim):
    """ 
    Resize image so that it has height dim_h and flatten the image
    Args:
        img_data: (bytes) image data
        img_dim: typle(width, height)
    Returns:
        img: (np array) the resized and flattened image
    """
    img = Image.open(io.BytesIO(img_data))
    img = img.resize(img_dim).convert(mode = 'L') # convert to grey scale
    return np.array(img) / 255.0

#### Batch Loading and Preprocessing

In [0]:
import time
start_time = time.time()

# setup
with tarfile.open(file_name) as tar_file:
    data_size = sum([1 if f.isfile() else 0 for f in tar_file.getmembers()])
print(f'data size = {data_size}')

img_size = (32, 32)
tar_file = tarfile.open(file_name)
BATCH_SIZE = 1000

# save out batches to fp
filename = 'img_array.pkl'
fp = np.memmap(filename, dtype='float32', mode='w+',
               shape=(data_size, img_size[0], img_size[1]))

# batch loading and preprocessing
for i in range(data_size // BATCH_SIZE):
    start, end = i*BATCH_SIZE, min((i+1)*BATCH_SIZE, data_size)
    result = batch_preprocess_imgs(archive=tar_file,
                                   preprocess_func=img_to_ndarray,
                                   preprocess_args=img_size,
                                   start=start, end=end)
    fp[start:end] = np.array(result)
    time_elapsed = round(time.time()-start_time, 2)
    print(f'loading batch ({i+1} of {data_size // BATCH_SIZE})')
    print(f'time elapsed: {time_elapsed//60} minutes {round(time_elapsed%60, 2)} seconds')
    
tar_file.close()
end_time = time.time()
print(f'Total preprocessing time: {round((end_time-start_time)/60, 2)} minutes')

data size = 36500
loading batch (1 of 36)
time elapsed: 0.0 minutes 10.49 seconds
loading batch (2 of 36)
time elapsed: 0.0 minutes 18.36 seconds
loading batch (3 of 36)
time elapsed: 0.0 minutes 26.14 seconds
loading batch (4 of 36)
time elapsed: 0.0 minutes 33.9 seconds
loading batch (5 of 36)
time elapsed: 0.0 minutes 41.69 seconds
loading batch (6 of 36)
time elapsed: 0.0 minutes 49.47 seconds
loading batch (7 of 36)
time elapsed: 0.0 minutes 57.17 seconds
loading batch (8 of 36)
time elapsed: 1.0 minutes 4.89 seconds
loading batch (9 of 36)
time elapsed: 1.0 minutes 12.68 seconds
loading batch (10 of 36)
time elapsed: 1.0 minutes 20.47 seconds
loading batch (11 of 36)
time elapsed: 1.0 minutes 28.41 seconds
loading batch (12 of 36)
time elapsed: 1.0 minutes 36.26 seconds
loading batch (13 of 36)
time elapsed: 1.0 minutes 44.09 seconds
loading batch (14 of 36)
time elapsed: 1.0 minutes 51.9 seconds
loading batch (15 of 36)
time elapsed: 1.0 minutes 59.68 seconds
loading batch (16 o

#### Verify and Visualize Output

In [0]:
img_array = fp
print(img_array.shape)
plt.imshow(img_array[0], cmap='Greys')