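# Dump images to TFRecords

Resize the Shopee product-detection train and test images, re-encode them as JPEG, and write them to sharded TFRecord files.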

In [1]:
import os, sys, math, argparse

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

import tensorflow as tf

from concurrent.futures import ThreadPoolExecutor

# --- Notebook-wide configuration ---
print(f"Tensorflow version {tf.__version__}")

# Let tf.data choose the parallelism level for Dataset.map calls.
AUTO = tf.data.experimental.AUTOTUNE

# Seed for the stratified K-fold split.
SEED = 1029

# Root of the competition data; class names are the sub-directory names of the
# training image folder, sorted so that a label index maps to a stable name.
DATA_PATH = '../input/shopee-product-detection-open/'
CLASSES = sorted(os.listdir(DATA_PATH + 'train/train/train'))

# Square output resolution of the dumped images (alternatives: 320, 384, 448, 512).
IM_SZ = 224
IMAGE_SIZE = [IM_SZ] * 2

# Number of folds; also used to derive the number of records per training shard.
N_SPLITS = 20

Tensorflow version 2.2.0


In [2]:
def train_val_split(labels, n_splits=N_SPLITS):
    """Build shuffled, stratified K-fold index pairs for ``labels``.

    Args:
        labels: Sequence of class labels, one per sample; used for stratification.
        n_splits: Number of folds (defaults to ``N_SPLITS``).

    Returns:
        A list with one ``(train_indices, val_indices)`` pair per fold.
    """
    sample_positions = range(len(labels))
    splitter = StratifiedKFold(shuffle=True, random_state=SEED, n_splits=n_splits)
    return [fold for fold in splitter.split(sample_positions, labels)]

def format_path(fn, data_split='train'):
    """Return the path of image ``fn`` inside the dataset folder of a split.

    Args:
        fn: File name relative to the split's image folder (for the training
            split the callers pass ``"<class dir>/<file name>"``).
        data_split: Either ``'train'`` or ``'test'``.

    Returns:
        ``DATA_PATH`` + the split's image folder + ``fn``.

    Raises:
        ValueError: If ``data_split`` is neither ``'train'`` nor ``'test'``.
            Previously this case silently returned ``None``, which only
            surfaced later as an obscure failure when the path was read.
    """
    split_dirs = {
        'train': 'train/train/train/',
        'test': 'test/test/test/',
    }
    if data_split not in split_dirs:
        raise ValueError(
            "data_split must be 'train' or 'test', got {!r}".format(data_split)
        )
    return DATA_PATH + split_dirs[data_split] + fn

def decode_image(filename, label=None):
    """Read a JPEG file, decode it to 3 channels and resize it to ``IMAGE_SIZE``.

    Args:
        filename: Path of the image file to read.
        label: Optional label to pass through alongside the image.

    Returns:
        ``(image, label)`` when a label is given, otherwise ``(image, filename)``
        so that unlabelled samples stay identifiable by their file name.
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.image.resize(
        tf.image.decode_jpeg(raw_bytes, channels=3),
        IMAGE_SIZE,
    )
    companion = filename if label is None else label
    return pixels, companion

def recompress_image(image, label):
    """Re-encode an image as JPEG bytes, passing ``label`` through unchanged.

    The image is cast to ``uint8`` first; the JPEG is written with size
    optimisation enabled and chroma downsampling disabled.
    """
    as_uint8 = tf.cast(image, tf.uint8)
    jpeg_bytes = tf.image.encode_jpeg(
        as_uint8,
        optimize_size=True,
        chroma_downsampling=False,
    )
    return jpeg_bytes, label
    
def get_training_dataset(filenames, labels):
    """Build the dataset of re-encoded training images, batched one shard at a time.

    Each element is decoded and resized by ``decode_image`` and re-encoded as
    JPEG by ``recompress_image``. Batches hold ``len(filenames) // N_SPLITS``
    records; when the division leaves a remainder, ``batch`` emits one extra,
    smaller final batch.

    Args:
        filenames: Paths of the training images.
        labels: Labels aligned with ``filenames``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(jpeg_bytes, labels)`` batches.
    """
    # With fewer files than N_SPLITS the floor division is 0, which
    # Dataset.batch rejects; fall back to one record per batch in that case.
    shard_size = max(1, len(filenames) // N_SPLITS)
    return (
        tf.data.Dataset
        .from_tensor_slices((filenames, labels))
        .map(decode_image, num_parallel_calls=AUTO)
        .map(recompress_image, num_parallel_calls=AUTO)
        .batch(shard_size)
    )

def get_test_dataset(filenames):
    """Build the dataset of re-encoded test images, batched into two halves.

    Each element is decoded and resized by ``decode_image`` (which returns the
    file name in place of a label) and re-encoded as JPEG by
    ``recompress_image``. Batches hold ``len(filenames) // 2`` records; an odd
    number of files yields one extra single-record batch.

    Args:
        filenames: Paths of the test images.

    Returns:
        A ``tf.data.Dataset`` yielding ``(jpeg_bytes, filenames)`` batches.
    """
    # A single file would give a batch size of 0, which Dataset.batch rejects.
    shard_size = max(1, len(filenames) // 2)
    return (
        tf.data.Dataset
        .from_tensor_slices(filenames)
        .map(decode_image, num_parallel_calls=AUTO)
        .map(recompress_image, num_parallel_calls=AUTO)
        .batch(shard_size)
    )

In [3]:
# Load the competition metadata.
train_df = pd.read_csv(DATA_PATH + "train.csv")
# Shuffle all rows. The shuffle is seeded so that a Restart & Run All produces
# the same row order, and therefore the same shard contents, every time.
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
test_df = pd.read_csv(DATA_PATH + "test.csv")

cv_indices = train_val_split(train_df.category.values)

# Concatenate the validation part of every fold, fold after fold, so that the
# samples end up ordered by the fold they belong to.
train_filenames = np.concatenate([train_df.iloc[ind].filename.values for _,ind in cv_indices])
train_labels = np.concatenate([train_df.iloc[ind].category.values for _,ind in cv_indices])

# Training images live in one sub-directory per class; test images are flat.
train_filenames = [format_path(CLASSES[lb]+'/'+fn) for lb, fn in zip(train_labels, train_filenames)]
test_filenames = [format_path(fn, 'test') for fn in test_df.filename]

train_ds = get_training_dataset(train_filenames, train_labels)
test_ds = get_test_dataset(test_filenames)

In [4]:
# Output folders for the TFRecord shards, relative to the working directory.
train_dir = "train"
test_dir = "test"

# exist_ok=True keeps this cell re-runnable: plain os.mkdir raises
# FileExistsError once the folders have been created by a previous run.
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

In [5]:
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a ``tf.train.Feature`` holding a ``BytesList``."""
    payload = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=payload)

def _int_feature(list_of_ints):
    """Wrap a list of integers in a ``tf.train.Feature`` holding an ``Int64List``."""
    payload = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=payload)

def _float_feature(list_of_floats):
    """Wrap a list of floats in a ``tf.train.Feature`` holding a ``FloatList`` (float32)."""
    payload = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=payload)

def to_tfrecord(img_bytes, image_filename=None, label=None, data_split='train'):
    """Build the ``tf.train.Example`` for one encoded image.

    Args:
        img_bytes: Encoded image bytes, stored under the ``'image'`` key.
        image_filename: File name as bytes; stored under ``'filename'`` for the
            test split, unused for the training split.
        label: Integer class index; stored under ``'label'`` and, one-hot
            encoded over ``CLASSES``, under ``'one_hot_label'`` for the
            training split. Unused for the test split.
        data_split: Either ``'train'`` or ``'test'``.

    Returns:
        A ``tf.train.Example`` with the features of the requested split.

    Raises:
        ValueError: If ``data_split`` is neither ``'train'`` nor ``'test'``.
            Previously this case fell through both branches and failed with an
            ``UnboundLocalError`` on ``feature``.
    """
    if data_split not in ('train', 'test'):
        raise ValueError(
            "data_split must be 'train' or 'test', got {!r}".format(data_split)
        )

    if data_split == 'train':
        one_hot_label = np.eye(len(CLASSES))[label]  # [0, 0, 1, 0, 0] for class #2
        feature = {
            'image': _bytestring_feature([img_bytes]),
            'label': _int_feature([label]),
            'one_hot_label': _float_feature(one_hot_label.tolist()),
        }
    else:  # 'test'
        feature = {
            'image': _bytestring_feature([img_bytes]),
            'filename': _bytestring_feature([image_filename]),
        }
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [6]:
print("Writing training TFRecords...")

# One TFRecord file per batch of train_ds; the file name carries the shard
# index, the image resolution and the number of records in the shard.
for shard, (images, labels) in enumerate(train_ds):
    # Convert the batch to NumPy once and index the arrays below, instead of
    # converting it only for its shape and then slicing the tensors per record.
    image_bytes = images.numpy()
    label_values = labels.numpy()
    shard_size = image_bytes.shape[0]
    filename = os.path.join(train_dir, f"{shard:02d}-{IM_SZ}x{IM_SZ}-{shard_size}.tfrec")

    with tf.io.TFRecordWriter(filename) as out_file:
        for i in range(shard_size):
            example = to_tfrecord(image_bytes[i],
                                  label=label_values[i])
            out_file.write(example.SerializeToString())
    # Report only after the writer has been closed.
    print("Wrote file {} containing {} records".format(filename, shard_size))

Writing training TFRecords...
Wrote file train/00-224x224-5269.tfrec containing 5269 records
Wrote file train/01-224x224-5269.tfrec containing 5269 records
Wrote file train/02-224x224-5269.tfrec containing 5269 records
Wrote file train/03-224x224-5269.tfrec containing 5269 records
Wrote file train/04-224x224-5269.tfrec containing 5269 records
Wrote file train/05-224x224-5269.tfrec containing 5269 records
Wrote file train/06-224x224-5269.tfrec containing 5269 records
Wrote file train/07-224x224-5269.tfrec containing 5269 records
Wrote file train/08-224x224-5269.tfrec containing 5269 records
Wrote file train/09-224x224-5269.tfrec containing 5269 records
Wrote file train/10-224x224-5269.tfrec containing 5269 records
Wrote file train/11-224x224-5269.tfrec containing 5269 records
Wrote file train/12-224x224-5269.tfrec containing 5269 records
Wrote file train/13-224x224-5269.tfrec containing 5269 records
Wrote file train/14-224x224-5269.tfrec containing 5269 records
Wrote file train/15-224x2

In [7]:
print("Writing test TFRecords...")

# One TFRecord file per batch of test_ds; each record stores the encoded image
# together with the file name it was read from.
for shard, (images, img_fns) in enumerate(test_ds):
    # Convert the batch to NumPy once and index the arrays below, instead of
    # converting it only for its shape and then slicing the tensors per record.
    image_bytes = images.numpy()
    filename_bytes = img_fns.numpy()
    shard_size = image_bytes.shape[0]
    filename = os.path.join(test_dir, f"{shard:02d}-{IM_SZ}x{IM_SZ}-{shard_size}.tfrec")

    with tf.io.TFRecordWriter(filename) as out_file:
        for i in range(shard_size):
            example = to_tfrecord(image_bytes[i],
                                  image_filename=filename_bytes[i],
                                  data_split='test')
            out_file.write(example.SerializeToString())
    # Report only after the writer has been closed.
    print("Wrote file {} containing {} records".format(filename, shard_size))

Writing test TFRecords...
Wrote file test/00-224x224-6093.tfrec containing 6093 records
Wrote file test/01-224x224-6093.tfrec containing 6093 records
