# TFRecords generation

In [None]:
import numpy as np, pandas as pd, os
import matplotlib.pyplot as plt
import tensorflow as tf, re, math
import cv2  # OpenCV: used by the OpenCV-based TFRecord writer cell
from tqdm import tqdm

In [None]:
# Notebook-wide configuration: everything a reader might want to tune lives here.
IMAGE_SIZE = 736 # Side length (pixels) of the square images stored in the tfrecords
RESIZE = True # Read by the OpenCV writer cell: resize images to IMAGE_SIZE x IMAGE_SIZE before encoding
N_GROUPS = 12 # Num of groups in each fold, each group captures an interval in the distribution of the dataset
N_FOLDS = 10 # Num of folds
N_TFRs = N_GROUPS*N_FOLDS # Num of tfrecords
SUBSET = True  # Keep SUBSET=True while debugging (Faster Execution)
SUBSET_SIZE = 10000
BATCH_SIZE = 32
FOLDS = list(range(N_FOLDS))
GROUPS = list(range(N_GROUPS))
# Sanity checks: every selected fold / group index must be a valid one.
assert max(FOLDS)<N_FOLDS, "ELEMENTS OF FOLDS can't be greater than N_FOLDS"
assert max(GROUPS)<N_GROUPS, "ELEMENTS OF GROUPS can't be greater than N_GROUPS"

In [None]:
# Build the training metadata table: per-landmark image counts, a fold and a
# group for every image, and a dense re-indexing of the landmark ids.
train_df = pd.read_csv('/content/data/train.csv')
train_df['original_landmark_id'] = train_df.landmark_id
print(train_df.shape)

# 'order' is the 0-based position of an image among the images of its landmark:
# the running row number is ranked inside each landmark group (ranks start at 1).
train_df['order'] = np.arange(train_df.shape[0])
train_df['order'] = train_df.groupby('landmark_id').order.rank()-1
landmark_counts = train_df.landmark_id.value_counts()
train_df['landmark_counts'] = landmark_counts.loc[train_df.landmark_id.values].values
# Images of a landmark are dealt out to the folds in round-robin fashion.
train_df['fold'] = (train_df['order']%N_FOLDS).astype(int)

# Group i holds the images whose landmark count is at or above the i/N_GROUPS
# quantile. Thresholds are visited in increasing order, so later (larger)
# thresholds overwrite the group set by earlier ones.
all_groups = [(1/N_GROUPS)*x for x in range(N_GROUPS)]
group_thresholds = train_df.landmark_counts.quantile(all_groups)
print(group_thresholds)
for i,partition_val in enumerate(group_thresholds.values):
    train_df.loc[train_df.landmark_counts>=partition_val,'group'] = i

# Re-index landmarks densely: landmarks are listed by ascending image count and
# numbered in reverse, so the last one listed (largest count) gets id 0.
# The number of landmarks is taken from the data instead of being hard-coded.
landmark_map = train_df.sort_values(by='landmark_counts').landmark_id.drop_duplicates().reset_index(drop=True)
n_landmarks = len(landmark_map)
landmark_dict = {landmark_id: n_landmarks-1-position for position,landmark_id in enumerate(landmark_map)}
train_df['landmark_id'] = train_df.original_landmark_id.map(landmark_dict)

# Shuffle the rows (unseeded, so the row order differs between runs) and persist the table.
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df.to_csv('train_meta_data.csv',index=False)
train_df.sample(10)

In [None]:
# Total number of missing values over every column of the metadata table
train_df.isnull().sum().sum()

In [None]:
# Smallest and largest per-landmark image count found in each group
train_df.groupby('group')['landmark_counts'].agg(['min', 'max'])

In [None]:
# Number of images per (re-indexed) landmark id
train_df['landmark_id'].value_counts()

In [None]:
# Number of images for each distinct per-landmark image count
train_df['landmark_counts'].value_counts()

In [None]:
# Number of images assigned to each fold
train_df['fold'].value_counts()

In [None]:
# Number of images assigned to each group
train_df['group'].value_counts()

In [None]:
# Number of distinct landmarks present in each fold
train_df.groupby('fold')['landmark_id'].nunique()

In [None]:
# Number of images in every (fold, group) pair, i.e. per tfrecord file
train_df.groupby(['fold', 'group'])['id'].count()

In [None]:
# helper functions to write tfrecords
def _bytes_feature(value):
  """Wrap a string / bytes value in a tf.train.Feature holding a BytesList.

  Eager tensors are converted with .numpy() first, because BytesList cannot
  unpack a string from an EagerTensor.
  """
  eager_tensor_type = type(tf.constant(0))
  raw = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float / double in a tf.train.Feature holding a FloatList."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature holding an Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def serialize_example(image,image_id,landmark_id):
    """Serialize one training sample into the bytes of a tf.train.Example.

    Args:
        image: encoded image, stored as a bytes feature under 'image'.
        image_id: image identifier, stored as a bytes feature under 'image_id'.
        landmark_id: integer label, stored as an int64 feature under 'landmark_id'.

    Returns:
        The serialized tf.train.Example.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'image_id': _bytes_feature(image_id),
        'landmark_id': _int64_feature(landmark_id),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write one TFRecord file per (fold, group) pair, re-encoding every image with OpenCV.
# This cell reads RESIZE, which has to be defined before it runs.
# The TensorFlow-based writer cell that follows uses the same output filenames,
# so running both cells overwrites the files written here.
if SUBSET:
    # Cap the sample size: sample() raises when asked for more rows than exist.
    train_df = train_df.sample(min(SUBSET_SIZE, len(train_df)))
# Make sure the output directory exists before any writer is opened.
os.makedirs('/content/tfrecords/train', exist_ok=True)
for fold in FOLDS:
    for group in GROUPS:
        tfr_filename = '/content/tfrecords/train/train-{}-{}.tfrec'.format(fold,group)
        print("Writing",tfr_filename)
        with tf.io.TFRecordWriter(tfr_filename) as writer:
            indices = train_df[(train_df.fold==fold) & (train_df.group==group)].index.to_list()
            for index in tqdm(indices):
                image_id = train_df.loc[index,'id']
                landmark_id = train_df.loc[index,'landmark_id']
                # Images are stored in directories named after the first three characters of the id.
                image_path = "/content/data/train/{}/{}/{}/{}.jpg".format(image_id[0],image_id[1],image_id[2],image_id)
                image = cv2.imread(image_path)
                if image is None:
                    # cv2.imread signals a missing / unreadable file by returning None.
                    raise OSError("Could not read image: {}".format(image_path))
                if RESIZE:
                    image = cv2.resize(image, (IMAGE_SIZE,IMAGE_SIZE))
                # tobytes() replaces ndarray.tostring(), which is deprecated and removed in NumPy 2.0.
                image = cv2.imencode('.jpg', image, (cv2.IMWRITE_JPEG_QUALITY, 100))[1].tobytes()
                image_id = str.encode(image_id)
                sample = serialize_example(image,image_id,landmark_id)
                writer.write(sample)

In [None]:
# Write one TFRecord file per (fold, group) pair, decoding, resizing and
# re-encoding every image with TensorFlow ops. Images are always resized to
# IMAGE_SIZE x IMAGE_SIZE here.
# The filenames are the same as those used by the OpenCV-based writer cell, so
# files written by that cell are overwritten.
if SUBSET:
    # Cap the sample size: sample() raises when asked for more rows than exist.
    train_df = train_df.sample(min(SUBSET_SIZE, len(train_df)))
# Make sure the output directory exists before any writer is opened.
os.makedirs('/content/tfrecords/train', exist_ok=True)
for fold in FOLDS:
    for group in GROUPS:
        tfr_filename = '/content/tfrecords/train/train-{}-{}.tfrec'.format(fold,group)
        print("Writing",tfr_filename)
        with tf.io.TFRecordWriter(tfr_filename) as writer:
            indices = train_df[(train_df.fold==fold) & (train_df.group==group)].index.to_list()
            for index in tqdm(indices):
                image_id = train_df.loc[index,'id']
                landmark_id = train_df.loc[index,'landmark_id']
                # Images are stored in directories named after the first three characters of the id.
                image_path = "/content/data/train/{}/{}/{}/{}.jpg".format(image_id[0],image_id[1],image_id[2],image_id)
                image = tf.io.read_file(image_path)
                image = tf.image.decode_jpeg(image, channels=3)
                image = tf.image.resize(image, [IMAGE_SIZE, IMAGE_SIZE])
                # The resized image is cast back to uint8 because encode_jpeg is given a uint8 tensor.
                image = tf.io.encode_jpeg(tf.cast(image, dtype=tf.uint8))
                image_id = str.encode(image_id)
                sample = serialize_example(image,image_id,landmark_id)
                writer.write(sample)

# TFRecords checking

In [None]:
IMAGE_SIZE_ = [IMAGE_SIZE,IMAGE_SIZE]
AUTO = tf.data.experimental.AUTOTUNE

def _decode_image(image_data):
    """Decode JPEG bytes into a float32 image scaled to [0, 1].

    The result is reshaped to IMAGE_SIZE_ + [3], so the stored images must
    already have that size.
    """
    image = tf.image.decode_jpeg(image_data, channels=3)
    image = tf.cast(image, tf.float32) / 255.0
    return tf.reshape(image, [*IMAGE_SIZE_, 3])

def _parse_example(example, labeled=True):
    """Parse one serialized example using the keys written by serialize_example.

    Returns (image, landmark_id) when labeled is True, (image, image_id) otherwise.
    """
    feature_description = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_id': tf.io.FixedLenFeature([], tf.string),
        'landmark_id': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_description)
    image = _decode_image(parsed['image'])
    if labeled:
        return image, parsed['landmark_id']
    return image, parsed['image_id']

def load_dataset(filenames, labeled=True):
    """Build a tf.data.Dataset of decoded samples from a list of TFRecord files.

    Args:
        filenames: paths of the TFRecord files to read.
        labeled: yield (image, landmark_id) pairs when True, (image, image_id) pairs otherwise.

    Returns:
        An unbatched tf.data.Dataset.
    """
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    return records.map(lambda example: _parse_example(example, labeled), num_parallel_calls=AUTO)

TRAINING_FILENAMES = tf.io.gfile.glob('/content/tfrecords/train/train*.tfrec')
print(TRAINING_FILENAMES)
if not TRAINING_FILENAMES:
    # Fail early with a clear message instead of building a dataset with no input files.
    raise FileNotFoundError("No TFRecord files found in /content/tfrecords/train")
dataset = load_dataset(TRAINING_FILENAMES, labeled=True)
dataset = dataset.repeat()
dataset = dataset.shuffle(2048)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(AUTO) #This dataset can directly be passed to keras.fit method

In [None]:
# NumPy printing defaults: summarize arrays with more than 15 elements, wrap lines at 80 characters
np.set_printoptions(linewidth=80, threshold=15)
CLASSES = [0, 1]

def batch_to_numpy_images_and_labels(data):
    """Convert an (images, labels) pair of tensors into their NumPy counterparts.

    Args:
        data: a 2-item sequence whose items both expose a .numpy() method.

    Returns:
        A tuple (numpy_images, numpy_labels).
    """
    image_tensor, label_tensor = data
    return image_tensor.numpy(), label_tensor.numpy()

def display_single_sample(image, label, subplot, red=False, titlesize=16):
    """Draw one image, titled with its label, into the given subplot slot.

    Args:
        image: image passed to plt.imshow.
        label: value shown (via str) as the title.
        subplot: (rows, cols, position) triple selecting the slot.
        red: draw the title in red with a slightly smaller font.
        titlesize: base font size of the title.

    Returns:
        The (rows, cols, position + 1) triple addressing the next slot.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    caption = str(label)
    if caption:
        if red:
            font_size, font_color = int(titlesize/1.2), 'red'
        else:
            font_size, font_color = int(titlesize), 'black'
        plt.title(caption, fontsize=font_size, color=font_color,
                  fontdict={'verticalalignment':'center'}, pad=int(titlesize/1.5))
    n_rows, n_cols, position = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, position+1)
    
def display_batch_of_images(databatch):
    """
    Display a single batch of images in a square-ish grid, one label per title.

    Args:
        databatch: (images, labels) pair accepted by batch_to_numpy_images_and_labels.

    Images that do not fit into the rows x cols grid are dropped. An empty
    batch draws nothing.
    """
    # data
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None for _ in enumerate(images)]

    n_images = len(images)
    if n_images == 0:
        # Nothing to draw; also avoids dividing by rows == 0 below.
        return

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(n_images))
    cols = n_images//rows
    shown_labels = labels[:rows*cols]

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot=(rows,cols,1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE,FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols,FIGSIZE))

    # display
    # The title size only depends on the grid shape, so it is computed once.
    dynamic_titlesize = FIGSIZE*SPACING/max(rows,cols)*40+3 # magic formula tested to work from 1x1 to 10x10 images
    for image, label in zip(images[:rows*cols], shown_labels):
        subplot = display_single_sample(image, label, subplot, False, titlesize=dynamic_titlesize)

    #layout
    plt.tight_layout()
    # Use zero spacing only when no displayed image has a label. (The original
    # condition also referenced an undefined name, 'predictions'.)
    if all(label is None for label in shown_labels):
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [None]:
# Pull one batch out of the TFRecord dataset and show it as an image grid
train_batch = iter(dataset)
first_batch = next(train_batch)
display_batch_of_images(first_batch)