<a href="https://colab.research.google.com/github/tqtg/is712-2019/blob/master/sample_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Licensed under the Apache 2.0 License.*

# IS712-2019 Course Project Sample Solution

## Setup

We use Google Drive to store data, model checkpoints, etc. Thus, we need to mount it.

In [0]:
# Mount Google Drive at /content/drive; later cells read the data archives
# from it and write checkpoints and the submission archive to it.
from google.colab import drive
drive.mount('/content/drive')

Unzip data from Google Drive into local runtime host:

In [0]:
# Extract the training and public-test archives quietly (-qq) into ./data.
!unzip -qq /content/drive/My\ Drive/IS712/data/train.zip -d ./data
!unzip -qq /content/drive/My\ Drive/IS712/data/public_test.zip -d ./data

Make sure GPU is available:

In [0]:
# Show the NVIDIA GPUs visible to this runtime.
!nvidia-smi

## Global Settings and Imports

In [0]:
import os
import glob
import random
from datetime import datetime
from itertools import combinations

import numpy as np
import tensorflow as tf
from tqdm import tnrange

# Reduce TensorFlow log noise (both the C++ backend and the Python logger).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Seed every random source the notebook relies on. Python's `random` and
# NumPy drive the data split/sampling; the TensorFlow graph-level seed covers
# weight initialization, dataset shuffling and random augmentation.
SEED = 712
random.seed(SEED)
np.random.seed(SEED)
tf.compat.v1.set_random_seed(SEED)

print("Numpy version:", np.__version__)
print("TensorFlow version:", tf.__version__)

## Data Preparation

We split the given data into training and test sets for model validation.

Datasets are constructed by sampling. For each entity, all pairwise combinations of its images serve as positive examples. For each positive pair, a corresponding negative pair is sampled in two steps: first a different (negative) entity is chosen, then an image is drawn from that entity.

In [0]:
# Root folder of the training images; gen_data treats every sub-folder as one
# entity and collects its *.jpg files.
DATA_DIR = "./data/train"
# Fraction of each folder's images that gen_data assigns to the test split.
TEST_SIZE = 0.5
# read_img resizes every image to IMG_SIZE x IMG_SIZE pixels.
IMG_SIZE = 32

In [0]:
### DATA SAMPLING ###

def sample(folder_img_paths):
    """Build a shuffled array of labelled image pairs.

    For every folder (entity), each 2-combination of its images becomes a
    positive example (label 1). For each positive pair one negative example
    (label 0) is drawn: a random image of this folder paired with a random
    image of a randomly chosen *other* folder.

    Args:
        folder_img_paths: dict mapping a folder to the list of image paths
            that belong to it.

    Returns:
        ``np.asarray`` of the shuffled ``(path1, path2, label)`` tuples.
        Paths and labels share a single array, so callers should convert the
        label column to an integer type before using it.

    Raises:
        ValueError: if a folder produces positive pairs but no other folder
            has any image from which a negative could be drawn.
    """
    # Only folders that actually hold images can supply a negative image. A
    # folder may be empty here (e.g. a one-image folder after the train/test
    # split), and np.random.choice raises on an empty sequence.
    non_empty_folders = [f for f, paths in folder_img_paths.items()
                         if len(paths) > 0]

    data = []
    for folder, img_paths in folder_img_paths.items():
        # positive pairs
        pos_pairs = list(combinations(img_paths, 2))
        if not pos_pairs:
            # Fewer than two images: nothing to pair, hence no negatives either.
            continue

        # Candidate negative folders are the same for every pair of this
        # folder, so compute them once instead of once per sampled pair.
        neg_folders = [f for f in non_empty_folders if f != folder]
        if not neg_folders:
            raise ValueError(
                'Cannot sample negative pairs for {!r}: no other folder '
                'contains any image'.format(folder))

        # negative pairs
        neg_pairs = []
        for _ in range(len(pos_pairs)):
            img_path1 = np.random.choice(img_paths, 1)[0]
            neg_folder = np.random.choice(neg_folders, 1)[0]
            img_path2 = np.random.choice(folder_img_paths[neg_folder], 1)[0]
            neg_pairs.append((img_path1, img_path2))

        # combine positive and negative data
        data.extend([(p1, p2, 1) for (p1, p2) in pos_pairs])
        data.extend([(p1, p2, 0) for (p1, p2) in neg_pairs])

    random.shuffle(data)
    return np.asarray(data)


def gen_data():
    """Split every entity folder into train/test images and sample pairs.

    Each sub-folder of ``DATA_DIR`` is treated as one entity. Its ``*.jpg``
    files are shuffled, the first ``int(len * (1 - TEST_SIZE))`` go to the
    training split and the rest to the test split. Pairs are then sampled
    independently inside each split with ``sample``.

    Returns:
        Tuple ``(train_data, test_data)`` of the arrays returned by
        ``sample`` for the two splits.
    """
    train_img_paths = {}
    test_img_paths = {}
    # glob returns matches in arbitrary, filesystem-dependent order. Sorting
    # first makes the seeded shuffle below (and the sampling that follows)
    # reproducible across runs and machines.
    for folder in sorted(glob.glob(DATA_DIR + '/*/')):
        img_paths = sorted(glob.glob(os.path.join(folder, '*.jpg')))
        random.shuffle(img_paths)
        n_train = int(len(img_paths) * (1 - TEST_SIZE))
        train_img_paths[folder] = img_paths[:n_train]
        test_img_paths[folder] = img_paths[n_train:]

    train_data = sample(train_img_paths)
    test_data = sample(test_img_paths)
    print('Training size: {}'.format(len(train_data)))
    print('Test size: {}'.format(len(test_data)))
    return train_data, test_data

In [0]:
### IMAGE READING PARSING ###

def read_img(img_path, is_training=False):
    """Load one JPEG and return it resized and scaled.

    The file is decoded with 3 channels, resized to IMG_SIZE x IMG_SIZE and
    divided by 255. When ``is_training`` is true a random horizontal flip is
    applied as data augmentation.
    """
    raw_bytes = tf.read_file(img_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [IMG_SIZE, IMG_SIZE])
    img = resized / 255.0

    if not is_training:
        return img

    # Data augmentation comes here
    return tf.image.random_flip_left_right(img)


def parse_function(img_path1, img_path2, label, is_training=False):
    """Turn one (path1, path2, label) record into (image1, image2, [label]).

    Both images are loaded through ``read_img`` with the same
    ``is_training`` flag; the label is wrapped in a one-element list.
    """
    images = [read_img(path, is_training) for path in (img_path1, img_path2)]
    return images[0], images[1], [label]


def parse_function_train(img_path1, img_path2, label):
    """Map function for training records: parse with augmentation enabled."""
    return parse_function(img_path1, img_path2, label, True)


def parse_function_test(img_path1, img_path2, label):
    """Map function for test records: parse with augmentation disabled."""
    return parse_function(img_path1, img_path2, label, False)

In [0]:
### DATA SERVING ###

class DataGenerator(object):
    """Samples train/test pairs and serves both through one shared iterator.

    After construction, run ``train_init_op`` or ``test_init_op`` in a session
    to point the iterator at the corresponding dataset, then repeatedly run
    ``next`` to fetch ``(images1, images2, labels)`` batches.
    """

    def __init__(self, batch_size=1, num_threads=1,
                 train_shuffle=False, buffer_size=10000):
        self.batch_size = batch_size
        self.num_threads = num_threads
        self.buffer_size = buffer_size

        # Data sampling and splitting.
        self.train_data, self.test_data = gen_data()

        # The training dataset defines the structure of the shared iterator.
        self.train_set = self._build_data_set(
            self.train_data, parse_function_train, shuffle=train_shuffle)
        self.iterator = tf.data.Iterator.from_structure(
            self.train_set.output_types, self.train_set.output_shapes)
        self.next = self.iterator.get_next()

        # For training.
        self.train_init_op = self.iterator.make_initializer(self.train_set)
        self.num_train_batches = int(np.ceil(len(self.train_data) / batch_size))

        # For testing (never shuffled).
        self.test_set = self._build_data_set(self.test_data, parse_function_test)
        self.test_init_op = self.iterator.make_initializer(self.test_set)
        self.num_test_batches = int(np.ceil(len(self.test_data) / batch_size))

    def _build_data_set(self, data, map_fn, shuffle=False):
        """Build a batched, prefetching dataset from an array of pair records.

        Column 0 and 1 of ``data`` are used as image paths and column 2 as the
        integer label. Images are loaded from disk and processed batch by
        batch. Since our dataset is not that big, it would be faster to load
        all the images into RAM once and read from there. I leave it for you
        guys to explore :)
        """
        columns = (
            tf.convert_to_tensor(data[:, 0], dtype=tf.string),
            tf.convert_to_tensor(data[:, 1], dtype=tf.string),
            tf.convert_to_tensor(data[:, 2], dtype=tf.int32),
        )
        dataset = tf.data.Dataset.from_tensor_slices(columns)
        if shuffle:
            dataset = dataset.shuffle(buffer_size=self.buffer_size)
        return (dataset
                .map(map_fn, num_parallel_calls=self.num_threads)
                .batch(self.batch_size)
                .prefetch(self.num_threads))

## Model Definition

In [0]:
class MLP(object):
    """Pair classifier: a shared MLP encoder followed by a logistic layer.

    Both input images go through the same encoder (variables are reused), the
    absolute difference of the two encodings is fed to a single-unit dense
    layer, and ``prob`` is the sigmoid of its output. With ``training=True``
    the loss and the Adam training op are built as well.
    """

    def __init__(self, training=False):
        img_shape = [None, IMG_SIZE, IMG_SIZE, 3]
        self.x1 = tf.placeholder(tf.float32, img_shape)
        self.x2 = tf.placeholder(tf.float32, img_shape)
        self.y = tf.placeholder(tf.float32, [None, 1])

        # Encode both images with the same (reused) encoder and compare them.
        encoded1 = self._encoder(self.x1)
        encoded2 = self._encoder(self.x2)
        abs_diff = tf.abs(encoded1 - encoded2)

        with tf.variable_scope('classifier'):
            self.logits = tf.layers.dense(abs_diff, 1, name='logits')
            self.prob = tf.nn.sigmoid(self.logits, name='prob')

        if training:
            self.loss, self.train_op = self._loss_fn()

    def _encoder(self, input, name='encoder'):
        """Flatten ``input`` and apply two 300-unit ReLU dense layers."""
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            hidden = tf.layers.flatten(input)
            for _ in range(2):
                hidden = tf.layers.dense(hidden, units=300,
                                         activation=tf.nn.relu)
            return hidden

    def _loss_fn(self):
        """Return ``(loss, train_op)`` for the current graph.

        The loss is the mean sigmoid cross-entropy plus ``LAMBDA_REG`` times
        the summed L2 loss of all trainable variables whose name does not
        contain 'bias'. It is minimized with Adam at ``LEARNING_RATE``.
        """
        variables = tf.trainable_variables()

        per_example = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=self.y, logits=self.logits)
        data_loss = tf.reduce_mean(per_example)
        weight_penalties = [tf.nn.l2_loss(v) for v in variables
                            if 'bias' not in v.name]
        loss = data_loss + LAMBDA_REG * tf.add_n(weight_penalties)

        global_step = tf.Variable(0, trainable=False)
        optimizer = tf.train.AdamOptimizer(
            LEARNING_RATE, beta1=0.9, beta2=0.99, epsilon=1e-8)
        train_op = optimizer.minimize(loss, global_step, var_list=variables)

        return loss, train_op

## Training

In [0]:
# Hyper-parameters
# Batch size used by the training, validation and inference pipelines.
BATCH_SIZE = 256
# Number of passes over the training pairs; also selects which checkpoint
# the inference section restores.
NUM_EPOCHS = 1
# Learning rate passed to the Adam optimizer in MLP._loss_fn.
LEARNING_RATE = 0.001
# Multiplier of the L2 weight penalty in MLP._loss_fn (0.0 disables it).
LAMBDA_REG = 0.0

# Maximum number of checkpoints the Saver keeps (max_to_keep).
NUM_CHECKPOINTS = 5
# Parallel map calls and prefetch depth of the tf.data pipelines.
NUM_THREADS = 4

CHECKPOINT_DIR = "/content/drive/My Drive/IS712/checkpoints"
# NOTE: every execution of this cell wipes existing checkpoints so that each
# training run starts from an empty directory.
if tf.gfile.Exists(CHECKPOINT_DIR):
    tf.gfile.DeleteRecursively(CHECKPOINT_DIR)
tf.gfile.MakeDirs(CHECKPOINT_DIR)

In [0]:
# Sample the train/test pairs and build both input pipelines; only the
# training pipeline is shuffled.
generator = DataGenerator(batch_size=BATCH_SIZE, num_threads=NUM_THREADS, 
                          train_shuffle=True, buffer_size=10000)

In [0]:
# Just a useful function for parameter counting 
def count_parameters(trained_vars):
    """Print a table with the size of each variable and the overall total.

    Args:
        trained_vars: iterable of variables exposing ``name`` and
            ``get_shape()``; every dimension must expose ``value``.
    """
    heavy_rule = '=' * 100
    light_rule = '-' * 100
    row_template = '{:70} {:20} params'

    grand_total = 0
    print(heavy_rule)
    for var in trained_vars:
        # The size of a variable is the product of its dimensions.
        size = 1
        for dim in var.get_shape():
            size = size * dim.value
        grand_total = grand_total + size
        print(row_template.format(var.name, size))
        print(light_rule)
    print(heavy_rule)
    print("Total trainable parameters: %d" % grand_total)
    print(heavy_rule)


# Build the training graph (model, loss and train op), then list the size of
# every trainable variable it created.
model = MLP(training=True)
count_parameters(tf.trainable_variables())

In [0]:
# Train for NUM_EPOCHS epochs. After each epoch a checkpoint is saved and the
# model is evaluated on the held-out split produced by gen_data.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(max_to_keep=NUM_CHECKPOINTS)

    for epoch in range(1, NUM_EPOCHS + 1):
        print("\n{} Epoch: {}/{}".format(datetime.now(), epoch, NUM_EPOCHS))

        # Training
        sum_loss = 0.
        sess.run(generator.train_init_op)
        loop = tnrange(generator.num_train_batches, desc='Training')
        for step in loop:
            # Fetch a batch from the input pipeline, then feed it to the model.
            batch_img1, batch_img2, batch_label = sess.run(generator.next)
            _, loss = sess.run([model.train_op, model.loss],
                               feed_dict={model.x1: batch_img1,
                                          model.x2: batch_img2,
                                          model.y: batch_label})
            sum_loss += loss
            loop.set_postfix(loss=(sum_loss / (step + 1)))
        # Report the mean loss per batch, matching the progress-bar value
        # (the raw sum would grow with the number of batches).
        print('Training loss: {:.6f}'.format(
            sum_loss / max(generator.num_train_batches, 1)))

        saver.save(sess,
                   os.path.join(CHECKPOINT_DIR, 'model_e{}.ckpt'.format(epoch)))

        # Testing
        pds = []
        gts = []
        sum_loss = 0.
        sess.run(generator.test_init_op)
        loop = tnrange(generator.num_test_batches, desc='Testing')
        for step in loop:
            batch_img1, batch_img2, batch_label = sess.run(generator.next)
            prob, loss = sess.run([model.prob, model.loss],
                                  feed_dict={model.x1: batch_img1,
                                             model.x2: batch_img2,
                                             model.y: batch_label})
            sum_loss += loss
            loop.set_postfix(loss=(sum_loss / (step + 1)))
            # Rounding the probability gives the predicted 0/1 label.
            pds.extend(np.round(prob).ravel().tolist())
            gts.extend(batch_label.ravel().tolist())
        pds = np.asarray(pds)
        gts = np.asarray(gts)
        print('Test loss: {:.6f}'.format(
            sum_loss / max(generator.num_test_batches, 1)))
        # max(..., 1) avoids a division by zero when the test split is empty.
        print('Test acc: {:.6f}'.format(
            np.equal(pds, gts).sum() / max(len(gts), 1)))

## Inference

In [0]:
# NOTE: this rebinds the DATA_DIR used by gen_data; re-running the data
# preparation cells after this one would read from the public test folder.
DATA_DIR = "./data/public_test"
# Checkpoint written after the last training epoch.
CHECKPOINT = "/content/drive/My Drive/IS712/checkpoints/model_e{}.ckpt".format(NUM_EPOCHS)

In [0]:
def read_test_data(input_file, data_dir=None):
    """Read a tab-separated pair file and return the full image paths.

    Each non-blank line holds two image file names separated by a tab. Both
    names are resolved against ``<data_dir>/images``.

    Args:
        input_file: path of the pair file to read.
        data_dir: root folder of the test set; defaults to the module-level
            ``DATA_DIR`` when omitted.

    Returns:
        String array of shape ``(num_pairs, 2)``. The shape is ``(0, 2)`` for
        a file without pairs, so column indexing always works.

    Raises:
        ValueError: if a non-blank line does not contain exactly two
            tab-separated fields.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    image_dir = os.path.join(data_dir, 'images')

    test_data = []
    with open(input_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no pair and would break the unpacking below.
                continue
            img1, img2 = line.split('\t')
            test_data.append((os.path.join(image_dir, img1),
                              os.path.join(image_dir, img2)))
    # reshape keeps the result two-dimensional even when no pair was read.
    return np.asarray(test_data, dtype=str).reshape(-1, 2)


def data_generator(input_file):
    """Build an inference input pipeline for one pair file.

    Returns:
        Tuple ``(init_op, next_batch, num_batches)``: the op that initializes
        the iterator, the tensors yielding ``(images1, images2)`` batches of
        up to BATCH_SIZE pairs, and the number of batches in the file.
    """
    pairs = read_test_data(input_file)

    def _load_pair(path_a, path_b):
        # No augmentation at inference time.
        return (read_img(path_a, is_training=False),
                read_img(path_b, is_training=False))

    path_columns = (tf.convert_to_tensor(pairs[:, 0], dtype=tf.string),
                    tf.convert_to_tensor(pairs[:, 1], dtype=tf.string))
    dataset = (tf.data.Dataset.from_tensor_slices(path_columns)
               .map(_load_pair, num_parallel_calls=NUM_THREADS)
               .batch(BATCH_SIZE)
               .prefetch(NUM_THREADS))

    iterator = tf.data.Iterator.from_structure(dataset.output_types,
                                               dataset.output_shapes)
    init_op = iterator.make_initializer(dataset)
    next_batch = iterator.get_next()
    num_batches = int(np.ceil(len(pairs) / BATCH_SIZE))

    return init_op, next_batch, num_batches

In [0]:
# Predict every pair file of the public test set with the restored model and
# write one prediction file per pair file into ./submission.
os.makedirs('./submission', exist_ok=True)

# Start from an empty graph so that only the inference model exists in it.
tf.reset_default_graph()
model = MLP(training=False)

with tf.Session() as sess:
    tf.train.Saver().restore(sess, CHECKPOINT)

    # Sorted so the files are processed in a stable order.
    for input_file in sorted(glob.glob(os.path.join(DATA_DIR, 'pairs/*.txt'))):
        basename = os.path.basename(input_file)
        # `next_batch` rather than `next`: binding `next` at notebook scope
        # would hide the built-in for every cell executed afterwards.
        init_op, next_batch, num_batches = data_generator(input_file)
        pds = []
        sess.run(init_op)
        for _ in tnrange(num_batches, desc=basename):
            batch_img1, batch_img2 = sess.run(next_batch)
            prob = sess.run(model.prob, feed_dict={model.x1: batch_img1,
                                                   model.x2: batch_img2})
            # Rounding the probability gives the predicted 0/1 label.
            pds.extend(np.round(prob).ravel().tolist())

        # write prediction to submission files, one label per line
        with open('./submission/{}'.format(basename), 'w') as f:
            f.write('\n'.join(str(int(p)) for p in pds))

In [0]:
# Archive the prediction files to Google Drive. `&&` ensures zip only runs
# when the directory change succeeded, so it never archives the wrong folder.
!cd submission && zip /content/drive/My\ Drive/IS712/submission.zip *