<a href="https://colab.research.google.com/github/zatihakim/github-demo/blob/master/Convert_Image_to_TFRecord.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q kaggle
!wget http://sawat.odellobrien.com/kaggle.json
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d dashhax/vggface21kmasked
!unzip -qq vggface21kmasked.zip
!mv vggface2sub_train_cropped_masked data

--2020-11-23 13:25:44--  http://sawat.odellobrien.com/kaggle.json
Resolving sawat.odellobrien.com (sawat.odellobrien.com)... 173.82.86.154
Connecting to sawat.odellobrien.com (sawat.odellobrien.com)|173.82.86.154|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 63 [application/json]
Saving to: ‘kaggle.json’


2020-11-23 13:25:44 (8.27 MB/s) - ‘kaggle.json’ saved [63/63]

Downloading vggface21kmasked.zip to /content
100% 5.52G/5.52G [01:59<00:00, 87.7MB/s]
100% 5.52G/5.52G [01:59<00:00, 49.8MB/s]


In [None]:
import os, sys, math
import numpy as np
import random
from glob import glob
from matplotlib import pyplot as plt
import tensorflow as tf
from PIL import Image
from tqdm import tqdm
import time

# Short alias for tf.data's autotune sentinel (passed as num_parallel_calls
# further down), followed by a report of the TensorFlow build in use.
AUTO = tf.data.experimental.AUTOTUNE
print("Tensorflow version {}".format(tf.__version__))

Tensorflow version 2.3.0


In [None]:
# Authenticate the current Colab user.
# NOTE(review): presumably required so the later cells can read from and write
# to the gs:// paths configured below — confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Glob pattern handed to tf.data.Dataset.list_files; the parent directory name
# of each matched file is used as its label.
GCS_PATTERN = "gs://--private-url--/vggface2sub_train_cropped_masked/*/*.jpg"
# Path prefix of every output file; the writer cells append a shard suffix
# ending in ".tfrec".
GCS_OUTPUT = "gs://--private-url--/tfrecords/vggface2sub_masked_"
# Number of shards the image list is divided into when computing shard_size.
SHARDS = 32
# Size argument passed to tf.image.resize for every image.
TARGET_SIZE = [224, 224]

In [None]:
# Count the local images and derive how many records go into each shard.
total_images = len(glob("data/*/*.jpg"))
if total_images == 0:
  # Without this check shard_size becomes 0 and the failure only shows up
  # later as an unrelated-looking ZeroDivisionError when batching.
  raise FileNotFoundError("No images matched data/*/*.jpg; check that the dataset download and unzip steps succeeded.")

# Round up so that SHARDS files are always enough to hold every image; the
# last shard simply ends up smaller when the count is not a multiple of SHARDS.
shard_size = math.ceil(total_images / SHARDS)
print("Processing {0} images into {1} .tfrec files with {2} images each.".format(total_images, SHARDS, shard_size))

Processing 561306 images into 32 .tfrec files with 17541 images each.


In [None]:
# Seed for the shard shuffle so that re-running the notebook produces the
# same shard contents.
SHUFFLE_SEED = 520732

# glob returns paths in filesystem order, which is not stable across
# machines; sort first so the seeded shuffle below is fully reproducible.
images_files = sorted(glob("data/*/*.jpg"))
# A dedicated Random instance avoids touching the global random state.
random.Random(SHUFFLE_SEED).shuffle(images_files)

# Consecutive slices of at most shard_size paths; the last one may be shorter.
images_files_batches = [images_files[start:start + shard_size] for start in range(0, len(images_files), shard_size)]

print("Batches: {0}".format(len(images_files_batches)))

Batches: 32


In [None]:
# Class names are the sub-directory names under data/. They are sorted because
# the position of a name in this list is its class index (and the hot position
# of its one-hot vector); glob order is filesystem-dependent and would make
# that mapping differ from run to run. Plain files that may sit next to the
# class directories are skipped so they do not become bogus classes.
labels = sorted(os.path.basename(path) for path in glob("data/*") if os.path.isdir(path))

print(labels[0:10])
# Persist the ordering so the index -> name mapping can be recovered later.
np.save("labels.npy", np.asarray(labels))

['n003459', 'n008150', 'n009195', 'n007356', 'n002760', 'n002377', 'n000014', 'n007034', 'n007702', 'n000899']


In [None]:
def to_tfrecord(img, label):
  """Build a tf.train.Example for one encoded image and its class label.

  Args:
    img: encoded image as a byte string, stored under the "img" key.
    label: UTF-8 encoded class name (bytes), stored under the "label" key.
      It must be one of the entries of the module-level `labels` list.

  Returns:
    A tf.train.Example with the features "img" (bytes), "label" (bytes) and
    "onehot" (floats, one entry per class with 1.0 at the label's index).

  Raises:
    ValueError: if the decoded label is not present in `labels`.
  """
  str_label = label.decode("utf-8")
  try:
    # index() raises for an unknown label; the previous argmax-on-a-mask lookup
    # returned 0 in that case and silently filed the image under class 0.
    class_num = list(labels).index(str_label)
  except ValueError:
    raise ValueError("Unknown label {!r}: not present in labels".format(str_label))

  # Build only the row that is needed instead of an N x N identity matrix
  # for every single image.
  one_hot = [0.0] * len(labels)
  one_hot[class_num] = 1.0

  feature = {
      "img": tf.train.Feature(bytes_list=tf.train.BytesList(value=[img])),
      "label": tf.train.Feature(bytes_list=tf.train.BytesList(value=[label])),
      "onehot": tf.train.Feature(float_list=tf.train.FloatList(value=one_hot))
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
# Write one TFRecord file per shard: every image is read from local disk,
# resized to TARGET_SIZE, re-encoded as JPEG and serialized with its label.
print("Writing TFRecords to Google Cloud Storage...")

for batch_no, batch_files in enumerate(images_files_batches):
  print("Writing batch no {0}".format(batch_no + 1))

  # File names are numbered from 00 while the progress messages count from 1.
  filename = GCS_OUTPUT + "{:02d}.tfrec".format(batch_no)
  starting = time.time()

  with tf.io.TFRecordWriter(filename) as out_file:
    for file in tqdm(batch_files):
      # The class name is the directory that directly contains the image.
      label = bytes(file.split("/")[-2], 'utf-8')

      with open(file, "rb") as local_file:
        bits = local_file.read()

      # Force three channels so grayscale JPEGs do not end up as 1-channel
      # records that cannot be batched together with the RGB ones.
      img = tf.image.decode_jpeg(bits, channels=3)
      img = tf.image.resize(img, TARGET_SIZE)
      # resize returns float32; round before casting because a bare cast
      # truncates and systematically darkens the image.
      img = tf.cast(tf.round(img), tf.uint8)
      img = tf.image.encode_jpeg(img, optimize_size=True, chroma_downsampling=False)

      record = to_tfrecord(img.numpy(), label)
      out_file.write(record.SerializeToString())

  # Measured after the writer is closed, so it includes flushing the shard.
  delta = time.time() - starting

  print(" >> Batch {0} complete in {1}s!".format(batch_no + 1, delta))

  0%|          | 23/17541 [00:00<01:23, 210.56it/s]

Writing TFRecords to Google Cloud Storage...
Writing batch no 1


100%|██████████| 17541/17541 [01:17<00:00, 225.92it/s]
  0%|          | 24/17541 [00:00<01:13, 239.11it/s]

 >> Batch 1 complete in 87.77685761451721s!
Writing batch no 2


100%|██████████| 17541/17541 [01:18<00:00, 222.88it/s]
  0%|          | 20/17541 [00:00<01:28, 198.77it/s]

 >> Batch 2 complete in 88.48204469680786s!
Writing batch no 3


100%|██████████| 17541/17541 [01:20<00:00, 217.06it/s]
  0%|          | 22/17541 [00:00<01:21, 213.97it/s]

 >> Batch 3 complete in 90.15770292282104s!
Writing batch no 4


100%|██████████| 17541/17541 [01:22<00:00, 212.20it/s]
  0%|          | 21/17541 [00:00<01:36, 182.29it/s]

 >> Batch 4 complete in 92.12684154510498s!
Writing batch no 5


100%|██████████| 17541/17541 [01:17<00:00, 226.90it/s]
  0%|          | 22/17541 [00:00<01:19, 219.82it/s]

 >> Batch 5 complete in 87.2783784866333s!
Writing batch no 6


100%|██████████| 17541/17541 [01:16<00:00, 229.26it/s]
  0%|          | 25/17541 [00:00<01:12, 242.86it/s]

 >> Batch 6 complete in 85.91506767272949s!
Writing batch no 7


100%|██████████| 17541/17541 [01:16<00:00, 228.91it/s]
  0%|          | 19/17541 [00:00<01:35, 184.43it/s]

 >> Batch 7 complete in 86.75346803665161s!
Writing batch no 8


100%|██████████| 17541/17541 [01:19<00:00, 220.37it/s]
  0%|          | 25/17541 [00:00<01:10, 247.35it/s]

 >> Batch 8 complete in 89.53097486495972s!
Writing batch no 9


100%|██████████| 17541/17541 [01:18<00:00, 224.29it/s]
  0%|          | 23/17541 [00:00<01:16, 229.82it/s]

 >> Batch 9 complete in 88.33728265762329s!
Writing batch no 10


100%|██████████| 17541/17541 [01:16<00:00, 228.96it/s]
  0%|          | 23/17541 [00:00<01:16, 229.10it/s]

 >> Batch 10 complete in 86.54008054733276s!
Writing batch no 11


100%|██████████| 17541/17541 [01:16<00:00, 228.05it/s]
  0%|          | 23/17541 [00:00<01:16, 228.16it/s]

 >> Batch 11 complete in 86.94231939315796s!
Writing batch no 12


100%|██████████| 17541/17541 [01:17<00:00, 227.47it/s]
  0%|          | 23/17541 [00:00<01:16, 227.96it/s]

 >> Batch 12 complete in 86.92574524879456s!
Writing batch no 13


100%|██████████| 17541/17541 [01:18<00:00, 223.00it/s]
  0%|          | 22/17541 [00:00<01:19, 219.00it/s]

 >> Batch 13 complete in 88.52751159667969s!
Writing batch no 14


100%|██████████| 17541/17541 [01:17<00:00, 226.26it/s]
  0%|          | 23/17541 [00:00<01:17, 224.99it/s]

 >> Batch 14 complete in 87.49604964256287s!
Writing batch no 15


100%|██████████| 17541/17541 [01:17<00:00, 226.68it/s]
  0%|          | 24/17541 [00:00<01:13, 237.26it/s]

 >> Batch 15 complete in 87.20063924789429s!
Writing batch no 16


100%|██████████| 17541/17541 [01:16<00:00, 228.21it/s]
  0%|          | 24/17541 [00:00<01:14, 234.76it/s]

 >> Batch 16 complete in 87.03564286231995s!
Writing batch no 17


100%|██████████| 17541/17541 [01:18<00:00, 224.38it/s]
  0%|          | 22/17541 [00:00<01:22, 211.36it/s]

 >> Batch 17 complete in 88.05421805381775s!
Writing batch no 18


100%|██████████| 17541/17541 [01:20<00:00, 218.68it/s]
  0%|          | 23/17541 [00:00<01:19, 221.55it/s]

 >> Batch 18 complete in 90.1061155796051s!
Writing batch no 19


100%|██████████| 17541/17541 [01:18<00:00, 222.12it/s]
  0%|          | 25/17541 [00:00<01:10, 246.73it/s]

 >> Batch 19 complete in 89.27714228630066s!
Writing batch no 20


100%|██████████| 17541/17541 [01:16<00:00, 227.95it/s]
  0%|          | 22/17541 [00:00<01:21, 215.18it/s]

 >> Batch 20 complete in 86.88122367858887s!
Writing batch no 21


100%|██████████| 17541/17541 [01:17<00:00, 227.34it/s]
  0%|          | 22/17541 [00:00<01:20, 218.24it/s]

 >> Batch 21 complete in 86.43573236465454s!
Writing batch no 22


100%|██████████| 17541/17541 [01:18<00:00, 222.20it/s]
  0%|          | 20/17541 [00:00<01:29, 196.29it/s]

 >> Batch 22 complete in 88.9230089187622s!
Writing batch no 23


100%|██████████| 17541/17541 [01:28<00:00, 198.04it/s]
  0%|          | 23/17541 [00:00<01:17, 224.91it/s]

 >> Batch 23 complete in 98.40546584129333s!
Writing batch no 24


100%|██████████| 17541/17541 [01:19<00:00, 219.73it/s]
  0%|          | 18/17541 [00:00<01:37, 179.75it/s]

 >> Batch 24 complete in 89.8019232749939s!
Writing batch no 25


100%|██████████| 17541/17541 [01:18<00:00, 222.97it/s]
  0%|          | 23/17541 [00:00<01:17, 225.07it/s]

 >> Batch 25 complete in 88.56018328666687s!
Writing batch no 26


100%|██████████| 17541/17541 [01:18<00:00, 223.18it/s]
  0%|          | 22/17541 [00:00<01:22, 211.11it/s]

 >> Batch 26 complete in 88.41051650047302s!
Writing batch no 27


100%|██████████| 17541/17541 [01:20<00:00, 217.46it/s]
  0%|          | 23/17541 [00:00<01:17, 225.35it/s]

 >> Batch 27 complete in 90.65263986587524s!
Writing batch no 28


100%|██████████| 17541/17541 [01:20<00:00, 218.96it/s]
  0%|          | 23/17541 [00:00<01:16, 229.39it/s]

 >> Batch 28 complete in 90.45940923690796s!
Writing batch no 29


100%|██████████| 17541/17541 [01:18<00:00, 224.32it/s]
  0%|          | 20/17541 [00:00<01:28, 198.53it/s]

 >> Batch 29 complete in 88.02453780174255s!
Writing batch no 30


100%|██████████| 17541/17541 [01:19<00:00, 221.25it/s]
  0%|          | 22/17541 [00:00<01:22, 211.47it/s]

 >> Batch 30 complete in 88.4550392627716s!
Writing batch no 31


100%|██████████| 17541/17541 [01:19<00:00, 221.87it/s]
  0%|          | 21/17535 [00:00<01:23, 209.99it/s]

 >> Batch 31 complete in 89.13739562034607s!
Writing batch no 32


100%|██████████| 17535/17535 [01:21<00:00, 216.41it/s]


 >> Batch 32 complete in 90.7984230518341s!


In [None]:
def get_data(filename):
  """Load one image file and derive its label from the file path.

  Args:
    filename: scalar string tensor holding the path of a JPEG file.

  Returns:
    A tuple (img, label): img is the image resized to TARGET_SIZE as float32
    divided by 255.0, label is the second-to-last '/'-separated component of
    the path (the directory that contains the file).
  """
  # channels=3 gives every image the same depth; without it a grayscale JPEG
  # decodes to one channel and batching the mapped dataset fails on the
  # shape mismatch.
  img = tf.image.decode_jpeg(tf.io.read_file(filename), channels=3)
  img = tf.image.resize(img, TARGET_SIZE)
  img = tf.cast(img, tf.float32) / 255.0
  # Splitting a length-1 vector yields a ragged tensor whose flat values are
  # the path components of this one file.
  label = tf.strings.split(tf.expand_dims(filename, axis=-1), sep='/')
  label = label.values[-2]

  return img, label

In [None]:
def _bytestring_feature(list_of_bytestrings):
  """Wrap a list of byte strings in a tf.train.Feature backed by a BytesList."""
  bytes_list = tf.train.BytesList(value=list_of_bytestrings)
  return tf.train.Feature(bytes_list=bytes_list)

def _int_feature(list_of_ints): # int64
  """Wrap a list of integers in a tf.train.Feature backed by an Int64List."""
  int64_list = tf.train.Int64List(value=list_of_ints)
  return tf.train.Feature(int64_list=int64_list)

def _float_feature(list_of_floats): # float32
  """Wrap a list of floats in a tf.train.Feature backed by a FloatList."""
  float_list = tf.train.FloatList(value=list_of_floats)
  return tf.train.Feature(float_list=float_list)

def to_tfrecord(tfrec_filewriter, img_bytes, label): 
  """Build a tf.train.Example holding one image and its text label.

  Note: this definition reuses the name of the two-argument
  to_tfrecord(img, label) defined earlier in the notebook and replaces it
  once this cell has run.

  Args:
    tfrec_filewriter: accepted but not used by the body.
    img_bytes: byte string stored under the "image" key.
    label: byte string stored under the "label" key.

  Returns:
    A tf.train.Example with the two bytes features "image" and "label".
  """
  image_feature = _bytestring_feature([img_bytes])  # one image in the list
  label_feature = _bytestring_feature([label])      # fixed length (1) list holding the text label
  features = tf.train.Features(feature={"image": image_feature, "label": label_feature})
  return tf.train.Example(features=features)

# tf.data variant of the shard writer: list the source files on GCS, decode
# them in parallel with get_data and write one TFRecord file per batch.
print("Getting all images for TFRecord generation")
filenames = tf.data.Dataset.list_files(GCS_PATTERN, seed=520732)
dataset1 = filenames.map(get_data, num_parallel_calls=AUTO)
# NOTE(review): each batch materializes shard_size float32 images at once;
# with the 17541-image shards reported earlier and 224x224x3 images that is
# roughly 10 GB per batch — confirm the runtime has enough memory.
dataset1 = dataset1.batch(shard_size)

print("Writing TFRecord...")

for shard_index, (img_batch, label_batch) in enumerate(dataset1):
  # The last batch can hold fewer than shard_size records, so use the real count.
  records_in_shard = img_batch.shape[0]
  # The file name used to reference an undefined name `shard` (NameError).
  filename = GCS_OUTPUT + "{:02d}-{}.tfrec".format(shard_index, records_in_shard)

  with tf.io.TFRecordWriter(filename) as out_file:
    for i in range(records_in_shard):
      # get_data yields float32 pixels scaled by 1/255; convert back to uint8
      # and JPEG-encode so the "image" feature really holds encoded bytes.
      pixels = tf.image.convert_image_dtype(img_batch[i], tf.uint8, saturate=True)
      img_bytes = tf.image.encode_jpeg(pixels, optimize_size=True, chroma_downsampling=False).numpy()
      example = to_tfrecord(out_file, img_bytes, label_batch[i].numpy())
      out_file.write(example.SerializeToString())

  # Reported only after the writer has been closed and flushed.
  print("Created file {0} with {1} records!".format(filename, records_in_shard))



Getting all images for TFRecord generation
Writing TFRecord...
