<a href="https://colab.research.google.com/github/tomasmuzas/MasterThesis/blob/main/CreateTFDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.initializers import VarianceScaling
import tensorflow.keras
from tensorflow.keras import optimizers
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.resnet import ResNet101
from tqdm.notebook import tqdm
import numpy as np

In [2]:
# Authenticate the current Colab user.
# NOTE(review): presumably required so later cells can write to the gs://
# bucket named in GCS_OUTPUT -- confirm.
from google.colab import auth
auth.authenticate_user()

In [9]:
# Name of the dataset; used for the zip archive name, the local image folder
# and the output folder under the bucket.
DATASET_NAME = "GZ1_Unique_Validation"
# Destination prefix for the generated .tfrec shards.
GCS_OUTPUT = f'gs://tomasmuzasmaster2021/paper/tfrecords/{DATASET_NAME}'
# Glob for the input images. Despite the name, this pattern has no gs:// prefix,
# so it is a relative local path of the form <dataset>/<label>/<file>.jpg.
GCS_PATTERN = f'{DATASET_NAME}/*/*.jpg'
# Lets tf.data choose the parallelism of the map() calls.
AUTO = tf.data.experimental.AUTOTUNE
# Class names as bytes; the position in this list is the integer class id
# written to the TFRecords. Keep exactly one CLASSES definition active.
CLASSES = [b'Spiral', b'Elliptical'] # -- binary
# CLASSES = [b'Er', b'Ei', b'Ec'] # Elliptical only
# CLASSES = [b'SB', b'S'] # Spiral binary only
# CLASSES = [b'Er', b'Ei', b'Ec', b'SB', b'S'] # All combined

In [4]:
# Mount Google Drive at /content/drive.
# NOTE(review): the %%bash cell below reads the dataset archive from a relative
# drive/MyDrive/... path, which assumes the working directory is /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
%%bash -s "$DATASET_NAME"
# $1 is the dataset name passed in through -s on the magic line above.
# Copy the archive out of the Drive folder, then unpack it quietly; unzip only
# runs if rsync succeeded.
# "$1" is quoted everywhere so a dataset name containing spaces or glob
# characters is not word-split by the shell.
rsync -ah --progress "drive/MyDrive/MTD/PaperDatasets/$1.zip" "$1.zip" && unzip -q "$1.zip"

sending incremental file list
GZ1_Unique_Validation.zip
         32.77K   0%    0.00kB/s    0:00:00           16.81M   8%   16.00MB/s    0:00:11           40.14M  20%   19.11MB/s    0:00:08          135.82M  68%   43.15MB/s    0:00:01          198.28M 100%   53.12MB/s    0:00:03 (xfr#1, to-chk=0/1)


In [11]:
import math

def decode_jpeg_and_label(filename):
  """Read one JPEG file and take its label from the parent directory name.

  Args:
    filename: string tensor with the path of the JPEG file.

  Returns:
    An `(image, label)` pair: the decoded image tensor and the second-to-last
    '/'-separated component of `filename` as a string tensor.
  """
  raw_bytes = tf.io.read_file(filename)
  # Split the path on '/' and flatten the ragged result into its components.
  path_parts = tf.strings.split(tf.expand_dims(filename, axis=-1), sep='/').values
  # The component just before the file name is the class directory.
  return tf.image.decode_jpeg(raw_bytes), path_parts[-2]

def recompress_image(image, label):
  """Re-encode a decoded image as JPEG and report its first two dimensions.

  Args:
    image: decoded image tensor.
    label: label belonging to the image; passed through unchanged.

  Returns:
    A tuple `(jpeg_bytes, label, height, width)` where `height` and `width`
    are the first and second entries of `tf.shape(image)`.
  """
  dims = tf.shape(image)
  # Chroma downsampling is disabled; size optimisation is enabled.
  jpeg_bytes = tf.image.encode_jpeg(
      image, optimize_size=True, chroma_downsampling=False)
  return jpeg_bytes, label, dims[0], dims[1]

def _bytestring_feature(list_of_bytestrings):
  """Wrap a list of byte strings in a `tf.train.Feature` (bytes_list)."""
  payload = tf.train.BytesList(value=list_of_bytestrings)
  return tf.train.Feature(bytes_list=payload)

def _int_feature(list_of_ints):
  """Wrap a list of integers in a `tf.train.Feature` (int64_list)."""
  payload = tf.train.Int64List(value=list_of_ints)
  return tf.train.Feature(int64_list=payload)

def _float_feature(list_of_floats):
  """Wrap a list of floats in a `tf.train.Feature` (float_list, float32)."""
  payload = tf.train.FloatList(value=list_of_floats)
  return tf.train.Feature(float_list=payload)

def to_tfrecord(tfrec_filewriter, img_bytes, label, height, width):
  """Build the `tf.train.Example` describing one image.

  Args:
    tfrec_filewriter: unused; kept so existing callers keep working.
    img_bytes: JPEG-encoded image as a byte string.
    label: class name as bytes; must be one of the entries of `CLASSES`.
    height: image height, stored as the first entry of the "size" feature.
    width: image width, stored as the second entry of the "size" feature.

  Returns:
    A `tf.train.Example` with the features "image", "class", "label", "size"
    and "one_hot_class".

  Raises:
    ValueError: if `label` is not present in `CLASSES`.
  """
  # Index of the first entry of CLASSES equal to the label. An unknown label is
  # rejected explicitly: np.argmax over an all-False mask would return 0 and
  # silently file the image under the first class.
  matches = np.flatnonzero(np.array(CLASSES) == label)
  if matches.size == 0:
    raise ValueError(
        "Label {!r} is not one of CLASSES {!r}".format(label, CLASSES))
  class_num = int(matches[0])
  # e.g. [0., 1.] for class 1 of 2 (order defined in CLASSES)
  one_hot_class = np.eye(len(CLASSES))[class_num]

  feature = {
      "image": _bytestring_feature([img_bytes]), # one image in the list
      "class": _int_feature([class_num]),        # one class in the list

      # additional (not very useful) fields to demonstrate TFRecord writing/reading of different types of data
      "label":         _bytestring_feature([label]),          # fixed length (1) list of strings, the text label
      "size":          _int_feature([height, width]),         # fixed length (2) list of ints
      "one_hot_class": _float_feature(one_hot_class.tolist()) # list of floats, n=len(CLASSES)
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))

In [12]:
def create_tf_record_dataset(filenames, name, items_per_record):
  """Write the images listed in `filenames` to sharded TFRecord files.

  Each file is decoded, re-encoded as JPEG and serialised with `to_tfrecord`.
  Records are grouped into batches of `items_per_record`; every batch becomes
  one file named `<GCS_OUTPUT>/<name>/<shard>-<record count>.tfrec`.

  Args:
    filenames: dataset of image file paths (it is consumed with `.map`).
    name: sub-folder under `GCS_OUTPUT` that receives the shards.
    items_per_record: maximum number of images stored in one shard file.
  """
  decoded = filenames.map(decode_jpeg_and_label, num_parallel_calls=AUTO)
  recompressed = decoded.map(recompress_image, num_parallel_calls=AUTO)
  # sharding: there will be one "batch" of images per file
  shards = recompressed.batch(items_per_record)

  print("Writing TFRecords")
  for shard, (image, label, height, width) in enumerate(shards):
    # Convert each batched tensor to NumPy once per shard instead of once per
    # record inside the inner loop.
    np_image = image.numpy()
    np_label = label.numpy()
    np_height = height.numpy()
    np_width = width.numpy()

    # batch size used as shard size here (the last batch may be smaller)
    shard_size = np_image.shape[0]
    # good practice to have the number of records in the filename
    filename = GCS_OUTPUT + "/{}/{:02d}-{}.tfrec".format(name, shard, shard_size)

    with tf.io.TFRecordWriter(filename) as out_file:
      for i in range(shard_size):
        example = to_tfrecord(out_file,
                              np_image[i], # re-compressed image: already a byte string
                              np_label[i],
                              np_height[i],
                              np_width[i])
        out_file.write(example.SerializeToString())
    # Report only after the writer has been closed.
    print("Wrote file {} containing {} records".format(filename, shard_size))
  print("Done.")

In [13]:
# Report how many image files match the input pattern.
number_of_items = len(tf.io.gfile.glob(GCS_PATTERN))
print(number_of_items)

# Dataset of the matching file paths, listed with a fixed seed.
filenames = tf.data.Dataset.list_files(GCS_PATTERN, seed=777)

# Write everything into the 'all' sub-folder, 1024 images per shard file.
create_tf_record_dataset(filenames, name='all', items_per_record=1024)

101119
Writing TFRecords
Wrote file gs://tomasmuzasmaster2021/paper/tfrecords/GZ1_Unique_Validation/all/00-1024.tfrec containing 1024 records
Wrote file gs://tomasmuzasmaster2021/paper/tfrecords/GZ1_Unique_Validation/all/01-1024.tfrec containing 1024 records
Wrote file gs://tomasmuzasmaster2021/paper/tfrecords/GZ1_Unique_Validation/all/02-1024.tfrec containing 1024 records
Wrote file gs://tomasmuzasmaster2021/paper/tfrecords/GZ1_Unique_Validation/all/03-1024.tfrec containing 1024 records
Wrote file gs://tomasmuzasmaster2021/paper/tfrecords/GZ1_Unique_Validation/all/04-1024.tfrec containing 1024 records
Wrote file gs://tomasmuzasmaster2021/paper/tfrecords/GZ1_Unique_Validation/all/05-1024.tfrec containing 1024 records
Wrote file gs://tomasmuzasmaster2021/paper/tfrecords/GZ1_Unique_Validation/all/06-1024.tfrec containing 1024 records
Wrote file gs://tomasmuzasmaster2021/paper/tfrecords/GZ1_Unique_Validation/all/07-1024.tfrec containing 1024 records
Wrote file gs://tomasmuzasmaster2021/pa