<a href="https://colab.research.google.com/github/svantepihl/Thesis-MaskDetection/blob/master/Images_to_TFRecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Credits**

https://codelabs.developers.google.com/codelabs/keras-flowers-tpu#4

## Imports

In [None]:
import os, sys, math
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
print("Tensorflow version " + tf.__version__)
AUTO = tf.data.experimental.AUTOTUNE # used in tf.data.Dataset API

## Configuration

In [None]:
# --- Input / output locations on Google Cloud Storage ---
GCS_FOLDER = 'gs://facemask-detection-thesis-images-jpeg/'
GCS_PATTERN = GCS_FOLDER + '**/*.jp*'
GCS_OUTPUT = 'gs://facemask-detection-thesis-32-tfrecords-jpeg-224x224/'  # prefix for output file names

# Number of .tfrec files the dataset is split into.
SHARDS = 32

# Output image dimensions in pixels.
TARGET_WIDTH = 224
TARGET_HEIGHT = 224
# NOTE: stored as [width, height]. tf.image.resize expects [height, width],
# so do not hand this list to it unchanged when the target is not square.
TARGET_SIZE = [TARGET_WIDTH, TARGET_HEIGHT]

# Class names. Do not change: they map to the labels in the data (folder names).
STR_CLASSES = ['MaskCorrect', 'MaskOnChin', 'MaskOnlyOnMouth', 'NoMask']
# Byte-string form of the same names, in the same order. Derived from
# STR_CLASSES so the two lists cannot drift apart.
CLASSES = [class_name.encode('utf-8') for class_name in STR_CLASSES]

In [None]:
# Plotting helper: preview the first nine (image, label) pairs of a dataset.
def display_9_images_from_dataset(dataset):
  """Show up to nine images from `dataset` in a 3x3 grid, titled with their labels.

  Each element of `dataset` must unpack into `(image, label)`, both exposing
  `.numpy()`. Pixel values are cast to uint8 for display and the label is
  decoded as UTF-8 for the title. Iteration stops after the ninth element.
  """
  last_index = 8  # zero-based index of the ninth element
  plt.figure(figsize=(13, 13))
  for index, (picture, name) in enumerate(dataset):
    # 331, 332, ... 339 address the nine cells of a 3x3 grid.
    plt.subplot(331 + index)
    plt.axis('off')
    pixels = picture.numpy().astype(np.uint8)
    plt.imshow(pixels)
    caption = name.numpy().decode("utf-8")
    plt.title(caption, fontsize=16)
    if index == last_index:
      break
  plt.tight_layout()
  plt.subplots_adjust(wspace=0.1, hspace=0.1)
  plt.show()

## GCP auth

In [None]:
# Authenticate the user only when the notebook is running inside Colab,
# detected by the google.colab module already being loaded.
running_in_colab = 'google.colab' in sys.modules
if running_in_colab:
  from google.colab import auth
  auth.authenticate_user()

## Read images and labels

In [None]:
# Count the images that will be converted, in total and per class.
# The per-class patterns use the same '*.jp*' extension filter as GCS_PATTERN,
# so the per-class counts can be compared with the total.
nb_images = len(tf.io.gfile.glob(GCS_PATTERN))
nb_images_in_classes = 0
for label in STR_CLASSES:
  class_pattern = GCS_FOLDER+str(label)+'/*.jp*'
  nb_in_class = len(tf.io.gfile.glob(class_pattern))
  nb_images_in_classes += nb_in_class
  print(class_pattern)
  print(str(label) + " images: " + str(nb_in_class))

# Fail early with a clear message: zero images would otherwise produce a
# shard size of 0, which cannot be used as a batch size further down.
if nb_images == 0:
  raise ValueError("No images match the pattern " + GCS_PATTERN)
if nb_images_in_classes != nb_images:
  print("WARNING: per-class counts add up to {} but the pattern matches {} images.".format(nb_images_in_classes, nb_images))

# Number of images per output file, rounded up so that SHARDS files hold everything.
shard_size = math.ceil(nb_images / SHARDS)
print("Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each.".format(nb_images, SHARDS, shard_size))

def decode_jpeg_and_label(filename):
  """Read one JPEG file and derive its class label from its path.

  Args:
    filename: scalar string tensor with the path of a JPEG file. The
      second-to-last '/'-separated component (the containing directory)
      is used as the label.

  Returns:
    (image, label): the decoded image with three channels, and the
    containing directory name as a scalar string tensor.
  """
  bits = tf.io.read_file(filename)
  # Force three channels so that grayscale JPEGs come out with the same
  # channel count as colour ones.
  image = tf.image.decode_jpeg(bits, channels=3)
  # parse class name from containing directory
  label = tf.strings.split(tf.expand_dims(filename, axis=-1), sep='/')
  label = label.values[-2]
  return image, label

# List every file matching the pattern (this also shuffles the images; the
# seed is fixed), then decode each file into an (image, label) pair in parallel.
filenames = tf.data.Dataset.list_files(file_pattern=GCS_PATTERN, seed=35155)
dataset1 = filenames.map(
    map_func=decode_jpeg_and_label,
    num_parallel_calls=AUTO,
)

In [None]:
# Visual sanity check: preview decoded images with their folder-derived labels.
display_9_images_from_dataset(dataset1)

## Resize and crop images to common size

In [None]:
def resize_image_fill(image, label):
  """Scale `image` until it covers the target box, then centre-crop to it.

  "Fill" strategy: the result is always cut out of the source image so that
  it fills TARGET_SIZE entirely, with no black bars and a preserved aspect
  ratio. `label` is returned unchanged.
  """
  # Source extent along axis 0 and axis 1 of the image tensor.
  src_dim0 = tf.shape(image)[0]
  src_dim1 = tf.shape(image)[1]
  # Target extent along the same two axes.
  dst_dim0 = TARGET_SIZE[1]
  dst_dim1 = TARGET_SIZE[0]

  def scale_to_dim0():
    # Scale by dst_dim0 / src_dim0.
    return tf.image.resize(image, [src_dim0*dst_dim0/src_dim0, src_dim1*dst_dim0/src_dim0])

  def scale_to_dim1():
    # Scale by dst_dim1 / src_dim1.
    return tf.image.resize(image, [src_dim0*dst_dim1/src_dim1, src_dim1*dst_dim1/src_dim1])

  # Ratio below 1 selects the axis-0 scale factor, otherwise the axis-1 one.
  ratio = (src_dim0 * dst_dim1) / (src_dim1 * dst_dim0)
  image = tf.cond(ratio < 1, scale_to_dim0, scale_to_dim1)

  # Cut the target box out of the middle of the scaled image.
  scaled_dim0 = tf.shape(image)[0]
  scaled_dim1 = tf.shape(image)[1]
  offset_dim0 = (scaled_dim0 - dst_dim0) // 2
  offset_dim1 = (scaled_dim1 - dst_dim1) // 2
  image = tf.image.crop_to_bounding_box(image, offset_dim0, offset_dim1, dst_dim0, dst_dim1)
  return image, label

In [None]:
def resize_image_tensorflow(image, label):
  """Resize `image` to TARGET_HEIGHT x TARGET_WIDTH, ignoring its aspect ratio.

  Args:
    image: image tensor to resize.
    label: passed through unchanged.

  Returns:
    (image, label) with the image resized using area interpolation.
  """
  image = tf.image.resize(
      image,
      # tf.image.resize takes the size as [new_height, new_width], while
      # TARGET_SIZE is stored as [width, height]; pass the two constants
      # explicitly so non-square targets come out the right way round.
      [TARGET_HEIGHT, TARGET_WIDTH],
      method=tf.image.ResizeMethod.AREA,
      preserve_aspect_ratio=False,
      antialias=True)
  return image, label

In [None]:
# Bring every image to the common target size, in parallel.
dataset2 = dataset1.map(
    map_func=resize_image_tensorflow,
    num_parallel_calls=AUTO,
)

In [None]:
# Visual sanity check: preview the resized images.
display_9_images_from_dataset(dataset2)

## Recompress the images

In [None]:
def recompress_image(image, label):
  """Encode `image` as JPEG bytes; `label` is passed through unchanged.

  Args:
    image: image tensor; converted to uint8 before encoding.
    label: returned as is.

  Returns:
    (jpeg_bytes, label), where jpeg_bytes is a scalar string tensor holding
    the JPEG encoding (size-optimised, no chroma downsampling).
  """
  # saturate_cast clamps to the uint8 range before converting, so values
  # outside [0, 255] cannot wrap around.
  image = tf.saturate_cast(image, tf.uint8)
  image = tf.image.encode_jpeg(image, optimize_size=True, chroma_downsampling=False)
  return image, label
# Re-encode every image as JPEG, then group the records into batches of
# `shard_size`.
dataset3 = (
    dataset2
    .map(recompress_image, num_parallel_calls=AUTO)
    .batch(shard_size)  # sharding: there will be one "batch" of images per file
)

## Write dataset to TFRecord files

In [None]:
# Three types of data can be stored in TFRecords: bytestrings, integers and floats
# They are always stored as lists, a single data element will be a list of size 1

def _bytestring_feature(list_of_bytestrings):
  """Wrap a list of byte strings in a `tf.train.Feature`."""
  payload = tf.train.BytesList(value=list_of_bytestrings)
  return tf.train.Feature(bytes_list=payload)

def _int_feature(list_of_ints):
  """Wrap a list of integers in a `tf.train.Feature` (stored as int64)."""
  payload = tf.train.Int64List(value=list_of_ints)
  return tf.train.Feature(int64_list=payload)

def _float_feature(list_of_floats):
  """Wrap a list of floats in a `tf.train.Feature` (stored as float32)."""
  payload = tf.train.FloatList(value=list_of_floats)
  return tf.train.Feature(float_list=payload)
  

def to_tfrecord(tfrec_filewriter, img_bytes, label,):
  """Build the `tf.train.Example` describing one image.

  Args:
    tfrec_filewriter: not used by this function; kept so existing callers
      keep working.
    img_bytes: the encoded image as a byte string.
    label: class name as bytes (a str is encoded as UTF-8 first); must be
      one of CLASSES.

  Returns:
    A `tf.train.Example` with the features "image" (bytes), "class" (int64
    index into CLASSES) and "one_hot_class" (len(CLASSES) floats).

  Raises:
    ValueError: if `label` is not one of CLASSES.
  """
  if isinstance(label, str):
    label = label.encode("utf-8")
  # Look the label up explicitly: np.argmax over an all-False comparison
  # returns 0, which would silently store an unknown label as the first class.
  try:
    class_num = CLASSES.index(label)  # order defined in CLASSES
  except ValueError:
    raise ValueError("Unknown label {!r}; expected one of {}".format(label, CLASSES)) from None
  one_hot_class = [1.0 if index == class_num else 0.0 for index in range(len(CLASSES))]

  features = {
      "image": _bytestring_feature([img_bytes]), # one image in the list
      "class": _int_feature([class_num]),        # one class in the list
      "one_hot_class": _float_feature(one_hot_class) # list of floats, n=len(CLASSES)
  }
  return tf.train.Example(features=tf.train.Features(feature=features))
  
print("Writing TFRecords")
for shard, (image, label) in enumerate(dataset3):
  # Convert each batch to numpy once, instead of once per record.
  image_batch = image.numpy()
  label_batch = label.numpy()
  # The batch size is the number of records in this shard. It gets its own
  # name so the global `shard_size` (the batch size used to build dataset3)
  # is not overwritten, which would break re-running the batching cell.
  records_in_shard = image_batch.shape[0]
  # good practice to have the number of records in the filename
  filename = GCS_OUTPUT + "{:02d}-{}.tfrec".format(shard, records_in_shard)

  with tf.io.TFRecordWriter(filename) as out_file:
    for i in range(records_in_shard):
      example = to_tfrecord(out_file,
                            image_batch[i], # re-compressed image: already a byte string
                            label_batch[i])
      out_file.write(example.SerializeToString())
    print("Wrote file {} containing {} records".format(filename, records_in_shard))

In [None]:
# Completion marker: prints a message when this final cell is executed.
print("Done!")