In [1]:
import os

import tensorflow as tf
from tensorflow import keras

AUTOTUNE = tf.data.experimental.AUTOTUNE

import numpy as np

import matplotlib.pyplot as plt

import matplotlib.image as mpimg

import kaggle

from definitions import *

# Preprocess files from raw dataset

### Download data

In [2]:
# Authenticate against the Kaggle API, then download the dataset into
# raw_data_dir and unzip it there (progress output is kept on).
# NOTE(review): dataset_name / raw_data_dir presumably come from `definitions` - confirm.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    dataset_name,
    path=raw_data_dir,
    unzip=True,
    quiet=False,
)

common-mobile-web-app-icons.zip: Skipping, found more recently modified local copy (use --force to force download)


### Create list of files and labels

In [3]:
# Every immediate subdirectory of raw_data_dir is treated as one class;
# the directory name is used as the label of the files inside it.
subdirs = [f.name for f in os.scandir(raw_data_dir) if f.is_dir()]

files = []
for subdir in subdirs:
    subdir_path = os.path.join(raw_data_dir, subdir)
    files += [os.path.join(subdir_path, f.name) for f in os.scandir(subdir_path) if f.is_file()]

# Remove non-image files from the dataset. Filtering directly on the extension
# avoids a quadratic "is this path in the reject list" membership test.
not_jpg = [f for f in files if not f.endswith(".jpg")]
files = [f for f in files if f.endswith(".jpg")]

# Sort files so the order (and therefore the later seeded split) is stable
files = sorted(files)

# The label is the name of the directory that directly contains the file.
# os.path is used instead of splitting on "\\" so this also works on systems
# whose path separator is "/" (where the old split raised IndexError).
labels = [os.path.basename(os.path.dirname(file)) for file in files]

### Encode labels

In [None]:
# Give each distinct label an integer index, assigned in sorted label order,
# then translate the per-file label list into those indices.
label2index = {name: position for position, name in enumerate(sorted(set(labels)))}
encoded_labels = [label2index[name] for name in labels]

### Split files into training, testing and validation

In [None]:
# Total number of image files kept, and number of distinct classes.
NUMBER_OF_FILES = len(files)
NUMBER_OF_LABELS = len(label2index)

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out the test set from the full file list.
train_files, test_files, train_labels, test_labels = train_test_split(
    files,
    encoded_labels,
    test_size=TEST_SPLIT_FACTOR,
    random_state=1969,
)

# Step 2: carve the validation set out of the remaining training data, so
# VAL_SPLIT_FACTOR is applied to what is left after step 1, not to the
# whole dataset. The same fixed seed is used for both splits.
train_files, val_files, train_labels, val_labels = train_test_split(
    train_files,
    train_labels,
    test_size=VAL_SPLIT_FACTOR,
    random_state=1969,
)

### Prepare images in dataset

In [None]:
def prepare_image(image):
    """Re-encode one JPEG as a 3-channel, IMG_SIZE x IMG_SIZE JPEG.

    Args:
        image: Encoded JPEG contents, as accepted by ``tf.image.decode_jpeg``.

    Returns:
        The result of ``tf.image.encode_jpeg`` on the resized image.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    resized = tf.image.resize(decoded, [IMG_SIZE, IMG_SIZE])
    # Cast back to uint8 before re-encoding.
    # NOTE(review): tf.cast presumably truncates the fractional part rather
    # than rounding - confirm this is acceptable for the stored images.
    pixels = tf.cast(resized, tf.uint8)
    return tf.image.encode_jpeg(
        pixels,
        optimize_size=True,
        x_density=96,
        y_density=96,
    )

def load_and_prepare_image(path):
    """Read the file at ``path`` and return it processed by ``prepare_image``.

    Args:
        path: Location of the image file, as accepted by ``tf.io.read_file``.

    Returns:
        Whatever ``prepare_image`` returns for the file's raw contents.
    """
    return prepare_image(tf.io.read_file(path))

# Create TFRecord files

### Create split datasets

In [None]:
# One dataset of file paths per split ...
train_images_ds, test_images_ds, val_images_ds = (
    tf.data.Dataset.from_tensor_slices(split_paths)
    for split_paths in (train_files, test_files, val_files)
)

# ... and a matching dataset of encoded labels per split.
train_labels_ds, test_labels_ds, val_labels_ds = (
    tf.data.Dataset.from_tensor_slices(split_labels)
    for split_labels in (train_labels, test_labels, val_labels)
)

### Apply preprocessing to images datasets

In [None]:
# Turn each path dataset into a dataset of prepared (resized, re-encoded) images.
# AUTOTUNE lets tf.data choose the parallelism level instead of a hard-coded 4.
# NOTE: this cell rebinds the *_images_ds names, so re-running it without first
# re-running the cell that creates the path datasets would apply the map twice.
train_images_ds = train_images_ds.map(load_and_prepare_image, num_parallel_calls=AUTOTUNE)
test_images_ds  =  test_images_ds.map(load_and_prepare_image, num_parallel_calls=AUTOTUNE)
val_images_ds   =   val_images_ds.map(load_and_prepare_image, num_parallel_calls=AUTOTUNE)

### Save images datasets to binary files

In [None]:
# Serialize every prepared image tensor so it can be stored as a TFRecord entry.
# AUTOTUNE lets tf.data choose the parallelism level instead of a hard-coded 4.
train_images_ds = train_images_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)
test_images_ds  =  test_images_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)
val_images_ds   =   val_images_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)

# Create one writer per split; the datasets are only written when
# `<writer>.write(<dataset>)` is called (see the guarded cell that follows).
# NOTE(review): the *_images_file paths presumably come from `definitions` - confirm.
train_writer = tf.data.experimental.TFRecordWriter(train_images_file)
test_writer  = tf.data.experimental.TFRecordWriter( test_images_file)
val_writer   = tf.data.experimental.TFRecordWriter(  val_images_file)

# <span style="color:red">Do not use the commented code below!!!</span>
### ...unless you're 100% sure you know why you are doing this
This will overwrite our dataset, and it will no longer be consistent with the previous version.

In [None]:
# train_writer.write(train_images_ds)
# test_writer. write(test_images_ds)
# val_writer.  write(val_images_ds)

### Save labels datasets to binary files

In [12]:
# Serialize every encoded label so it can be stored as a TFRecord entry.
# AUTOTUNE lets tf.data choose the parallelism level instead of a hard-coded 4.
train_labels_ds = train_labels_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)
test_labels_ds  =  test_labels_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)
val_labels_ds   =   val_labels_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)

# Rebind the writer names to the label files (they previously pointed at the
# image files); nothing is written until `<writer>.write(<dataset>)` is called.
# NOTE(review): the *_labels_file paths presumably come from `definitions` - confirm.
train_writer = tf.data.experimental.TFRecordWriter(train_labels_file)
test_writer  = tf.data.experimental.TFRecordWriter( test_labels_file)
val_writer   = tf.data.experimental.TFRecordWriter(  val_labels_file)

# <span style="color:red">Do not use the commented code below!!!</span>
### ...unless you're 100% sure you know why you are doing this
This will overwrite our dataset, and it will no longer be consistent with the previous version.

In [13]:
# train_writer.write(train_labels_ds)
# test_writer. write(test_labels_ds)
# val_writer.  write(val_labels_ds)