In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tqdm.notebook import tqdm
import numpy as np

In [2]:
# Authenticate this Colab session's user; done before any gs:// path is written to.
from google.colab import auth
auth.authenticate_user()

In [22]:
# Name of the dataset; used for the zip archive, the extracted folder and the output prefix.
DATASET_NAME = "CombinedDataset_Test"
# Destination prefix under which the TFRecord shard files are written.
GCS_OUTPUT = f'gs://tomasmuzasmaster2021/dataset/{DATASET_NAME}'
# NOTE: despite the GCS_ prefix this is a relative path pattern (<dataset>/<label>/<objid>.jpg);
# it is what gets globbed to find the input images.
GCS_PATTERN = f'{DATASET_NAME}/*/*.jpg'
# Passed as num_parallel_calls to the dataset map() stages.
AUTO = tf.data.experimental.AUTOTUNE
# Class names as bytes; a label's position in this list is its integer class id.
CLASSES = [b'Spiral', b'Elliptical']

In [23]:
# Mount Google Drive at /content/drive so the dataset archive can be copied from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
%%bash -s "$DATASET_NAME"
# $1 is the dataset name handed over by the cell magic line above.
# Copy the archive from the mounted Drive folder into the working directory, then
# extract it. Every expansion of $1 is quoted so a name containing spaces or glob
# characters is still passed as a single argument.
rsync -ah --progress "drive/MyDrive/MasterThesis/Dataset/$1.zip" "$1.zip" && unzip -q "$1.zip"

sending incremental file list
CombinedDataset_Test.zip
         32.77K   0%    0.00kB/s    0:00:00           24.94M  32%   23.42MB/s    0:00:02           57.44M  75%   25.67MB/s    0:00:00           75.85M 100%   30.99MB/s    0:00:02 (xfr#1, to-chk=0/1)


In [25]:
import math

def decode_jpeg_and_label(filename):
  """Read one JPEG file and derive its label and object id from its path.

  The path is split on '/': the second-to-last component (the parent
  directory name) becomes the label, and the last component with its
  trailing ".jpg" removed becomes the object id.

  Args:
    filename: path of the JPEG file as a string tensor (presumably a scalar,
      as produced by tf.data.Dataset.list_files -- confirm against callers).

  Returns:
    Tuple (image, label, objid): the decoded image tensor, the label string
    tensor and the object-id string tensor.
  """
  bits = tf.io.read_file(filename)
  image = tf.image.decode_jpeg(bits)
  # expand_dims turns the path into a 1-element batch so split() yields a
  # ragged result whose flat .values are the path components.
  vals = tf.strings.split(tf.expand_dims(filename, axis=-1), sep='/')
  label = vals.values[-2]
  # Raw string: "\." in a normal string literal is an invalid escape sequence.
  # The "$" anchor strips only the extension, not a ".jpg" inside the name.
  objid = tf.strings.regex_replace(vals.values[-1], r"\.jpg$", "")
  return image, label, objid

def recompress_image(image, label, objid):
  """Re-encode a decoded image as JPEG, passing label and objid through unchanged.

  Encoding uses optimize_size=True and chroma_downsampling=False.

  Returns:
    Tuple (jpeg_bytes, label, objid).
  """
  jpeg_bytes = tf.image.encode_jpeg(
      image,
      optimize_size=True,
      chroma_downsampling=False,
  )
  return jpeg_bytes, label, objid

def _bytestring_feature(list_of_bytestrings):
  """Wrap a list of byte strings in a tf.train.Feature backed by a BytesList."""
  bytes_list = tf.train.BytesList(value=list_of_bytestrings)
  return tf.train.Feature(bytes_list=bytes_list)

def _int_feature(list_of_ints):
  """Wrap a list of integers in a tf.train.Feature backed by an Int64List."""
  int64_list = tf.train.Int64List(value=list_of_ints)
  return tf.train.Feature(int64_list=int64_list)

def _float_feature(list_of_floats):
  """Wrap a list of floats in a tf.train.Feature backed by a FloatList."""
  float_list = tf.train.FloatList(value=list_of_floats)
  return tf.train.Feature(float_list=float_list)

def to_tfrecord(tfrec_filewriter, img_bytes, label, objid):
  """Build the tf.train.Example describing one image.

  Args:
    tfrec_filewriter: not used by this function; kept in the signature so
      existing callers keep working.
    img_bytes: encoded image as a byte string, stored under "image".
    label: class name as bytes; must be one of the entries of CLASSES.
    objid: object identifier as bytes, stored under "objid".

  Returns:
    A tf.train.Example with the features "image", "class", "label", "objid"
    and "one_hot_class".

  Raises:
    ValueError: if label is not present in CLASSES.
  """
  matches = np.flatnonzero(np.array(CLASSES) == label)
  if matches.size == 0:
    # np.argmax over an all-False mask returns 0, which would silently store
    # an unknown label as CLASSES[0]; fail loudly instead.
    raise ValueError(
        "Unknown label {!r}; expected one of {!r}".format(label, CLASSES))
  class_num = int(matches[0])                      # index of the label in CLASSES
  one_hot_class = np.eye(len(CLASSES))[class_num]  # e.g. [0., 1.] for class #1

  feature = {
      "image": _bytestring_feature([img_bytes]), # one image in the list
      "class": _int_feature([class_num]),        # one class in the list

      "label":         _bytestring_feature([label]),          # fixed length (1) list of strings, the text label
      "objid":         _bytestring_feature([objid]),          # fixed length (1) list of strings, the object id
      "one_hot_class": _float_feature(one_hot_class.tolist()) # list of floats, n=len(CLASSES)
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))

In [26]:
def create_tf_record_dataset(filenames, items_per_record):
  """Decode, re-compress and write the images as sharded TFRecord files.

  Each batch of `items_per_record` images becomes one file named
  "<GCS_OUTPUT>/<shard index>-<record count>.tfrec"; the last shard holds
  whatever remains.

  Args:
    filenames: tf.data.Dataset yielding image file paths.
    items_per_record: maximum number of images written to one TFRecord file.
  """
  decoded = filenames.map(decode_jpeg_and_label, num_parallel_calls=AUTO)
  recompressed = decoded.map(recompress_image, num_parallel_calls=AUTO)
  # sharding: there will be one "batch" of images per file
  shards = recompressed.batch(items_per_record)

  print("Writing TFRecords")
  for shard, (image, label, objid) in enumerate(shards):
    # Convert each tensor to numpy once and reuse it for sizing and indexing.
    np_image = image.numpy()
    np_label = label.numpy()
    np_objid = objid.numpy()

    # batch size used as shard size here
    shard_size = np_image.shape[0]
    # good practice to have the number of records in the filename
    filename = GCS_OUTPUT + "/{:02d}-{}.tfrec".format(shard, shard_size)

    with tf.io.TFRecordWriter(filename) as out_file:
      for i in range(shard_size):
        example = to_tfrecord(out_file,
                              np_image[i], # re-compressed image: already a byte string
                              np_label[i],
                              np_objid[i])
        out_file.write(example.SerializeToString())
    # Report only after the writer has been closed, so the message is not
    # printed for a shard whose final flush/close failed.
    print("Wrote file {} containing {} records".format(filename, shard_size))
  print("Done.")

In [27]:
# Count the extracted JPEGs matching the pattern and print it as a sanity check.
number_of_items = len(tf.io.gfile.glob(GCS_PATTERN))
print(number_of_items)
# The fixed seed is presumably there to make list_files' file order reproducible
# (list_files shuffles by default -- confirm for the TF version in use).
filenames = tf.data.Dataset.list_files(GCS_PATTERN, seed=777)

# 4096 = images per TFRecord shard; the last shard gets the remainder.
create_tf_record_dataset(filenames, 4096)

31594
Writing TFRecords
Wrote file gs://tomasmuzasmaster2021/dataset/CombinedDataset_Test/00-4096.tfrec containing 4096 records
Wrote file gs://tomasmuzasmaster2021/dataset/CombinedDataset_Test/01-4096.tfrec containing 4096 records
Wrote file gs://tomasmuzasmaster2021/dataset/CombinedDataset_Test/02-4096.tfrec containing 4096 records
Wrote file gs://tomasmuzasmaster2021/dataset/CombinedDataset_Test/03-4096.tfrec containing 4096 records
Wrote file gs://tomasmuzasmaster2021/dataset/CombinedDataset_Test/04-4096.tfrec containing 4096 records
Wrote file gs://tomasmuzasmaster2021/dataset/CombinedDataset_Test/05-4096.tfrec containing 4096 records
Wrote file gs://tomasmuzasmaster2021/dataset/CombinedDataset_Test/06-4096.tfrec containing 4096 records
Wrote file gs://tomasmuzasmaster2021/dataset/CombinedDataset_Test/07-2922.tfrec containing 2922 records
Done.
