In [0]:
!mkdir utils

In [0]:
import os
import numpy as np
from tensorflow.contrib import keras
import re
L = keras.layers
K = keras.backend
import tensorflow as tf
import threading
import json
import google.colab as colab
import cv2
import zipfile
import pickle
import collections
import random


from utils.file_utils import *
from utils.image_utils import *
from utils.generator_utils import *
from utils.tqdm_utils import *
from utils.keras_utils import *


import warnings
warnings.filterwarnings('ignore')

### Mount Google Drive

In [0]:
def mount_google_drive():
    '''
    # Functionality
        Mount Google Drive inside the colab runtime (always forcing a remount) so that
        files kept on the drive can be read and written like local files.
    # Arguments
        Nothing
    # Returns
        drive_root: path of the first non-hidden entry found under the mount point
    '''
    mount_point = "/content/gdrive"
    colab.drive.mount(mount_point, force_remount=True)
    # skip dot-entries; the first remaining entry is taken as the drive root
    visible_entries = [entry for entry in os.listdir(mount_point) if entry[0] != '.']
    return mount_point + "/" + visible_entries[0]

In [41]:
ROOT_DIR = mount_google_drive()
# Keep the trailing slashes: other cells build file paths by plain string
# concatenation (e.g. DATASET_DIR + "train_img_embeds.pickle").
CHECKPOINT_ROOT = ROOT_DIR + "/captioning/checkpoints/"
DATASET_DIR = ROOT_DIR + "/Dataset/"

# exist_ok=True makes the cell re-runnable without a separate (racy) existence check
for directory in (DATASET_DIR, CHECKPOINT_ROOT):
    os.makedirs(directory, exist_ok=True)

def get_checkpoint_path(epoch=None):
    """
    Return the absolute checkpoint path prefix under CHECKPOINT_ROOT.

    With `epoch=None` the prefix is ".../weights"; otherwise it is
    ".../weights_<epoch>".
    """
    file_stem = "weights" if epoch is None else "weights_{}".format(epoch)
    return os.path.abspath(CHECKPOINT_ROOT + file_stem)
      
# example of checkpoint dir
print(get_checkpoint_path(4))

Mounted at /content/gdrive
/content/gdrive/My Drive/captioning/checkpoints/weights_4


## Load embeddings

In [0]:
def save_pickle(obj, fn):
    '''
    # Functionality
        Serialize an object to disk with pickle (highest available protocol)
    # Arguments
        obj: the object to store
        fn: path of the pickle file to write (overwritten if it exists)
    # Returns
        Nothing. The object is written to `fn`.
    '''
    with open(fn, "wb") as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)


def read_pickle(fn):
    '''
    # Functionality
        Load a data object back from a pickle file
    # Arguments
        fn: the pickle file name
    # Returns
        obj: the object stored in the file
    '''
    # SECURITY: unpickling can execute arbitrary code - only load files you created or trust.
    with open(fn, "rb") as f:
        return pickle.load(f)

In [7]:
# Load the precomputed image embeddings and the matching image file-name lists from DATASET_DIR.
# NOTE(review): these pickle files are not produced anywhere in the visible cells -
# presumably an earlier embedding-extraction step wrote them; confirm they exist before running.
train_img_embeds = read_pickle(DATASET_DIR + "train_img_embeds.pickle")
train_img_fns = read_pickle(DATASET_DIR + "train_img_fns.pickle")
val_img_embeds = read_pickle(DATASET_DIR + "val_img_embeds.pickle")
val_img_fns = read_pickle(DATASET_DIR + "val_img_fns.pickle")
# check shapes: one embedding row per file name is expected
print(train_img_embeds.shape, len(train_img_fns))
print(val_img_embeds.shape, len(val_img_fns))

(8278, 2048) 8278
(4050, 2048) 4050


## Extract Labels (Captions)

In [0]:
# extract captions from zip
def get_captions_for_fns(fns, zip_fn, zip_json_path):
    '''
    # Functionality
        Read a captions JSON file stored inside a zip archive and collect the
        captions belonging to each requested image file name.
    # Arguments
        fns: image file names, in the order the captions should be returned
        zip_fn: path of the zip archive
        zip_json_path: path of the JSON annotation file inside the archive; it must
            provide "images" (entries with "id" and "file_name") and "annotations"
            (entries with "image_id" and "caption")
    # Returns
        list with one entry per name in `fns`: the list of caption strings for that image
    # Raises
        KeyError if a name in `fns` has no caption in the annotation file
    '''
    # context manager closes the archive handle even if reading or decoding fails
    with zipfile.ZipFile(zip_fn) as zf:
        j = json.loads(zf.read(zip_json_path).decode("utf8"))
    id_to_fn = {img["id"]: img["file_name"] for img in j["images"]}
    # qualified name: `collections` is imported by this notebook, a bare `defaultdict` is not
    fn_to_caps = collections.defaultdict(list)
    for cap in j['annotations']:
        fn_to_caps[id_to_fn[cap['image_id']]].append(cap['caption'])
    # plain dict so that an image without captions raises KeyError instead of silently yielding []
    fn_to_caps = dict(fn_to_caps)
    return [fn_to_caps[fn] for fn in fns]
    
# Both caption files are read from the same archive; the results are aligned with
# train_img_fns / val_img_fns (one list of captions per image file name).
train_captions = get_captions_for_fns(train_img_fns, DATASET_DIR + "captions_train-val2014.zip", 
                                      "annotations/captions_train2014.json")

val_captions = get_captions_for_fns(val_img_fns, DATASET_DIR + "captions_train-val2014.zip", 
                                      "annotations/captions_val2014.json")

# check shape
assert len(train_img_fns) == len(train_captions)
assert len(val_img_fns) == len(val_captions)

In [9]:
train_captions[1]

['a man with a beard and a brown and white dog on a leash',
 'A man and his dog on a hike - both wearing backpacks',
 'A man and a dog standing on a dirt path in the woods.',
 'a man carrying a back pack walking a dog carrying a back pack',
 'A man with a backpack holding a dog on a leash.']

## Let's go to NLP Part now!

### General Pipeline for captions of each image:
- The original captions are lists of sentences of various lengths.
- Change every word to lower case.
- Tokenize the sentences and build the vocabulary (keeping only words that reach a given frequency threshold).
- Map each token to a number.
- Each sentence then becomes a list of numbers, so the captions of an image become a list of lists of numbers.
- Pad and truncate the sequences of numbers to the maximum length.

In [0]:
# special tokens
PAD = "#PAD#"  # filler used to pad captions to a common length within a batch
UNK = "#UNK#"  # stands in for unknown (out-of-vocabulary) words
START = "#START#"  # added at the start of every indexed caption
END = "#END#"  # added at the end of every indexed caption

def split_sentence(sentence):
    """
    Lowercase `sentence` and split it into word tokens.

    Any run of non-word characters (whitespace, punctuation, ...) acts as a separator
    and is discarded, together with the empty strings that `re.split` produces when
    the sentence starts or ends with a separator.

    Returns a list of non-empty lowercase tokens (empty list for an empty sentence).
    """
    # raw string: '\W' inside a plain string literal is an invalid escape sequence
    return [token for token in re.split(r'\W+', sentence.lower()) if token]

  
def generate_vocabulary(train_captions, freq_threshold=3):
    """
    Build the {token: index} vocabulary from the training captions.

    A token (as produced by `split_sentence`) is kept when it occurs at least
    `freq_threshold` times over all captions. PAD (for batch padding), UNK (unknown,
    out of vocabulary), START (start of sentence) and END (end of sentence) are
    always added. Indices follow the sorted token order and run from 0 to N - 1,
    where N is the number of entries in the resulting dictionary.

    # Arguments
        train_captions: list with one entry per image, each a list of caption strings
        freq_threshold: minimum number of occurrences for a token to be kept
    # Returns
        dict mapping every kept token and every special token to its index
    """
    # count tokens caption by caption instead of first concatenating them into one big list
    token_counts = collections.Counter()
    for captions in train_captions:
        for sentence in captions:
            token_counts.update(split_sentence(sentence))

    frequent_tokens = {token for token, count in token_counts.items()
                       if count >= freq_threshold}
    # set union keeps every token unique, so the enumerated indices are contiguous
    all_tokens = frequent_tokens | {PAD, UNK, START, END}

    return {token: index for index, token in enumerate(sorted(all_tokens))}

In [11]:
# prepare vocabulary
vocab = generate_vocabulary(train_captions)
# reverse lookup: index -> token
vocab_inverse = {idx: w for w, idx in vocab.items()}
print(len(vocab))

4162


In [0]:
def caption_tokens_to_indices(captions, vocab):
    """
    Convert caption strings into lists of vocabulary indices.

    `captions` holds, for every image, a list of caption strings:
    [
        ["image1 caption1", "image1 caption2", ...],
        ["image2 caption1", "image2 caption2", ...],
        ...
    ]
    Every caption is tokenized with `split_sentence`; each token is replaced by its
    index in `vocab`, or by the index of UNK when the token is not in the vocabulary.
    The index of START is put in front of every caption and the index of END after it.

    The result mirrors the nesting of the input:
    [
        [
            [vocab[START], ..., vocab[END]],
            [vocab[START], ..., vocab[END]],
            ...
        ],
        ...
    ]
    """
    def encode(sentence):
        # START, then one index per token (UNK for unseen words), then END
        indices = [vocab[START]]
        indices.extend(vocab[word] if word in vocab else vocab[UNK]
                       for word in split_sentence(sentence))
        indices.append(vocab[END])
        return indices

    return [[encode(sentence) for sentence in image_captions]
            for image_captions in captions]

In [0]:
# we will use this during training
def batch_captions_to_matrix(batch_captions, pad_idx, max_len=None):
    """
    Pack vocabulary-indexed captions into one padded integer matrix.

    `batch_captions` is an array of arrays:
    [
        [vocab[START], ..., vocab[END]],
        [vocab[START], ..., vocab[END]],
        ...
    ]
    The result is an integer np.array of shape (len(batch_captions), columns),
        where "columns" is max(map(len, batch_captions)) when max_len is None
        and "columns" = min(max_len, max(map(len, batch_captions))) otherwise.
    Shorter captions are padded on the right with pad_idx, longer ones are truncated.
    An empty batch yields an array of shape (0, 0).
    `max_len=0` is honoured as a real limit (zero columns); only None means "no limit".

    Input example: [[1, 2, 3], [4, 5]]
    Output example: np.array([[1, 2, 3], [4, 5, pad_idx]]) if max_len=None
    Output example: np.array([[1, 2], [4, 5]]) if max_len=2
    Output example: np.array([[1, 2, 3], [4, 5, pad_idx]]) if max_len=100
    """
    if len(batch_captions) == 0:
        # max() over an empty batch would raise; return an empty matrix instead
        return np.empty((0, 0), dtype=int)

    columns = max(map(len, batch_captions))
    if max_len is not None:
        columns = min(max_len, columns)

    # start from an all-padding matrix and overwrite the leading part of every row
    matrix = np.full((len(batch_captions), columns), pad_idx, dtype=int)
    for row, caption in enumerate(batch_captions):
        kept = caption[:columns]
        if len(kept) > 0:
            matrix[row, :len(kept)] = kept

    return matrix

In [14]:
# make sure you use correct argument in caption_tokens_to_indices
# sanity check: the result has one entry per input image
assert len(caption_tokens_to_indices(train_captions[:10], vocab)) == 10
assert len(caption_tokens_to_indices(train_captions[:5], vocab)) == 5
# show the indexed captions of the first three images
caption_tokens_to_indices(train_captions[:3], vocab)

[[[2, 16, 4075, 2685, 4111, 16, 452, 99, 4075, 1429, 0],
  [2, 16, 2631, 2382, 681, 528, 2396, 3778, 2382, 16, 4075, 2685, 0],
  [2, 16, 681, 528, 99, 16, 1393, 2896, 3747, 270, 1144, 0],
  [2, 16, 681, 993, 2396, 16, 2685, 4111, 16, 1393, 0],
  [2,
   16,
   2631,
   2382,
   681,
   1003,
   2396,
   16,
   2685,
   4111,
   16,
   2300,
   99,
   16,
   1393,
   0]],
 [[2, 16, 2108, 4111, 16, 279, 99, 16, 451, 99, 4075, 1057, 2396, 16, 1980, 0],
  [2, 16, 2108, 99, 1702, 1057, 2396, 16, 3, 405, 4047, 195, 0],
  [2, 16, 2108, 99, 16, 1057, 3452, 2396, 16, 1031, 2537, 1784, 3691, 4122, 0],
  [2, 16, 2108, 591, 16, 190, 2455, 4001, 16, 1057, 591, 16, 190, 2455, 0],
  [2, 16, 2108, 4111, 16, 194, 1712, 16, 1057, 2396, 16, 1980, 0]],
 [[2,
   16,
   189,
   1784,
   16,
   2907,
   3539,
   3188,
   154,
   2396,
   1702,
   2241,
   3042,
   667,
   0],
  [2, 16, 2040, 189, 3689, 1834, 3297, 2396, 3373, 0],
  [2,
   16,
   4151,
   421,
   1784,
   16,
   2907,
   99,
   4075,
   3541,


In [0]:
# replace tokens with indices
# (validation captions are encoded with the vocabulary built from the training captions)
train_captions_indexed = caption_tokens_to_indices(train_captions, vocab)
val_captions_indexed = caption_tokens_to_indices(val_captions, vocab)

## Training

In [0]:
# Model sizes
IMG_EMBED_SIZE = train_img_embeds.shape[1]  # width of one precomputed image embedding
IMG_EMBED_BOTTLENECK = 120  # units of the dense layer applied first to the image embedding
WORD_EMBED_SIZE = 100  # dimensionality of the word embedding
LSTM_UNITS = 300  # size of the LSTM state
LOGIT_BOTTLENECK = 120  # units of the dense layer between LSTM output and token logits
pad_idx = vocab[PAD]  # vocabulary index of the padding token

# Training schedule
batch_size = 64
n_epochs = 12
n_batches_per_epoch = 1000  # how many random training batches make up one epoch
n_validation_batches = 100  # how many batches are used for validation after each epoch

In [0]:
# remember to reset your graph if you want to start building it from scratch!
# NOTE(review): `reset_tf_session` is not defined in this notebook; it arrives through one of
# the `utils.*` star imports. Presumably it clears the default graph and returns a new
# session - confirm against the utils modules.
s = reset_tf_session()
# graph-level seed for TensorFlow random ops
tf.set_random_seed(46)

In [28]:
!nvidia-smi

Thu Jul 11 01:00:14 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    29W /  70W |    129MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [29]:
from IPython.display import Image
from IPython.core.display import HTML 
# Display a remote illustration inline. The hex-encoded tail of the URL decodes to
# https://github.com/hse-aml/intro-to-dl/blob/master/week6/images/flatten_help.jpg?raw=1
# NOTE(review): presumably it illustrates the flattening of hidden states done in the decoder - confirm.
Image(url= "https://camo.githubusercontent.com/040ce95dff6d10fd3ae4c47c4a3e01ab6abed220/68747470733a2f2f6769746875622e636f6d2f6873652d616d6c2f696e74726f2d746f2d646c2f626c6f622f6d61737465722f7765656b362f696d616765732f666c617474656e5f68656c702e6a70673f7261773d31")

In [0]:
class decoder:
    """
    Training-time graph of the caption decoder.

    The class body runs once, when the class statement is executed, and creates the
    placeholders, layers and loss as class attributes. The visible cells use the class
    purely as a namespace (`decoder.img_embeds`, `decoder.sentences`, `decoder.loss`).
    """
    # [batch_size, IMG_EMBED_SIZE] of CNN image features
    img_embeds = tf.placeholder('float32', [None, IMG_EMBED_SIZE])
    # [batch_size, time steps] of word ids
    sentences = tf.placeholder('int32', [None, None])
    
    # we use bottleneck here to reduce the number of parameters
    # image embedding -> bottleneck
    img_embed_to_bottleneck = L.Dense(IMG_EMBED_BOTTLENECK, 
                                      input_shape=(None, IMG_EMBED_SIZE), 
                                      activation='elu')
    # image embedding bottleneck -> lstm initial state
    img_embed_bottleneck_to_h0 = L.Dense(LSTM_UNITS,
                                         input_shape=(None, IMG_EMBED_BOTTLENECK),
                                         activation='elu')
    # word -> embedding
    word_embed = L.Embedding(len(vocab), WORD_EMBED_SIZE)
    # lstm cell (from tensorflow)
    lstm = tf.nn.rnn_cell.LSTMCell(LSTM_UNITS)
    
    # we use bottleneck here to reduce model complexity
    # lstm output -> logits bottleneck
    token_logits_bottleneck = L.Dense(LOGIT_BOTTLENECK, 
                                      input_shape=(None, LSTM_UNITS),
                                      activation="elu")
    # logits bottleneck -> logits for next token prediction
    # (no activation: raw logits over the whole vocabulary)
    token_logits = L.Dense(len(vocab),
                           input_shape=(None, LOGIT_BOTTLENECK))
    
    # initial lstm cell state of shape (None, LSTM_UNITS),
    # conditioned on the `img_embeds` placeholder.
    # the same projected image vector is used for both the cell state (c0) and the hidden state (h0).
    
    c0 = h0 = img_embed_bottleneck_to_h0(img_embed_to_bottleneck(img_embeds))

    # embed all tokens but the last one for the lstm input
    # (the last token is only ever a prediction target, never an input),
    # using the `sentences` placeholder as input.
    
    word_embeds = word_embed(sentences[:, :-1])
    
    # during training we use ground truth tokens `word_embeds` as context for next token prediction.
    # that means that we know all the inputs for our lstm and can get 
    # all the hidden states with one tensorflow operation (tf.nn.dynamic_rnn).
    # `hidden_states` has a shape of [batch_size, time steps, LSTM_UNITS].
    hidden_states, _ = tf.nn.dynamic_rnn(lstm, word_embeds,
                                         initial_state=tf.nn.rnn_cell.LSTMStateTuple(c0, h0))

    # now we need to calculate token logits for all the hidden states
    
    # first, we reshape `hidden_states` to [-1, LSTM_UNITS]
    # so that every (sample, time step) pair becomes one row
    
    flat_hidden_states = tf.reshape(hidden_states, [-1, LSTM_UNITS])

    # then, we calculate logits for next tokens using `token_logits_bottleneck` and `token_logits` layers
    
    flat_token_logits = token_logits(token_logits_bottleneck(flat_hidden_states))
    
    # then, we flatten the ground truth token ids: the targets are the sentences
    # shifted by one position (all tokens but the first), because at each time step
    # we predict the next token. compare with the input side `word_embeds`.
    
    flat_ground_truth = tf.reshape(sentences[:, 1:], [-1])

    # we need to know where we have real tokens (not padding) in `flat_ground_truth`,
    # we don't want to propagate the loss for padded output tokens,
    # `flat_loss_mask` is 1.0 for real tokens (not pad_idx) and 0.0 otherwise.
    
    flat_loss_mask = tf.cast(tf.not_equal(flat_ground_truth, pad_idx), 'float32')

    # compute cross-entropy between `flat_ground_truth` and `flat_token_logits` predicted by lstm
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=flat_ground_truth, 
        logits=flat_token_logits
    )

    # compute average `xent` over tokens with nonzero `flat_loss_mask`.
    # we don't want to account misclassification of PAD tokens, because that doesn't make sense,
    # we have PAD tokens for batching purposes only!
    # NOTE(review): the denominator is the mask sum, so a batch whose targets are all PAD
    # would divide by zero - not expected while every caption ends with END, but confirm.
    
    loss = tf.reduce_sum(xent*flat_loss_mask) / tf.reduce_sum(flat_loss_mask)

In [0]:
def generate_batch(images_embeddings, indexed_captions, batch_size, max_len=None):
    """
    Sample a random training/validation batch and return it as a feed dict.

    `images_embeddings` is a np.array of shape [number of images, IMG_EMBED_SIZE].
    `indexed_captions` holds the vocabulary indexed captions of every image:
    [
        [
            [vocab[START], vocab["image1"], vocab["caption1"], vocab[END]],
            [vocab[START], vocab["image1"], vocab["caption2"], vocab[END]],
            ...
        ],
        ...
    ]
    `batch_size` images are drawn at random (with replacement) and, for each of them,
    one of its captions is drawn at random. The chosen captions are padded/truncated
    with `batch_captions_to_matrix`, honouring `max_len`.

    Returns {decoder.img_embeds: ..., decoder.sentences: ...}.
    """
    def pick_random_caption(image_captions):
        # one uniformly chosen caption out of those available for the image
        return image_captions[np.random.randint(0, len(image_captions))]

    picked = np.random.randint(0, len(images_embeddings), batch_size)
    picked_embeddings = images_embeddings[picked]
    picked_captions = [pick_random_caption(indexed_captions[image_idx])
                       for image_idx in picked]
    caption_matrix = batch_captions_to_matrix(picked_captions, pad_idx, max_len)

    feed = {}
    feed[decoder.img_embeds] = picked_embeddings
    feed[decoder.sentences] = caption_matrix
    return feed

In [0]:
# define optimizer operation to minimize the loss
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_step = optimizer.minimize(decoder.loss)

# will be used to save/load network weights.
# you need to reset your default graph and define it in the same way to be able to load the saved weights!
saver = tf.train.Saver()

# initialize all variables
# NOTE(review): keep this after `optimizer.minimize(...)` - the optimizer presumably creates
# its own variables there, and they need initializing too; confirm if cells are reordered.
s.run(tf.global_variables_initializer())

In [42]:
# actual training loop
# Each epoch: n_batches_per_epoch optimisation steps on random training batches, then the
# loss is averaged over n_validation_batches random validation batches (no weight updates),
# and a checkpoint is written.
# NOTE(review): in the recorded run the validation loss is lowest around epochs 2-4 and rises
# afterwards while the training loss keeps falling, so the later checkpoints are likely overfit.
MAX_LEN = 20  # truncate long captions to speed up training

# seed the numpy RNG (used by generate_batch) and the stdlib RNG for repeatable batches
np.random.seed(46)
random.seed(46)

for epoch in range(n_epochs):
    
    train_loss = 0
    pbar = tqdm_notebook_failsafe(range(n_batches_per_epoch))
    counter = 0  # batches processed so far in this epoch, for the running mean in the progress bar
    for _ in pbar:
        # run one optimisation step; [0] keeps the loss value and drops the train_step result
        train_loss += s.run([decoder.loss, train_step], 
                            generate_batch(train_img_embeds, 
                                           train_captions_indexed, 
                                           batch_size, 
                                           MAX_LEN))[0]
        counter += 1
        pbar.set_description("Training loss: %f" % (train_loss / counter))
        
    train_loss /= n_batches_per_epoch
    
    val_loss = 0
    for _ in range(n_validation_batches):
        # only the loss is evaluated here, train_step is not run
        val_loss += s.run(decoder.loss, generate_batch(val_img_embeds,
                                                       val_captions_indexed, 
                                                       batch_size, 
                                                       MAX_LEN))
    val_loss /= n_validation_batches
    
    print('Epoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss, val_loss))

    # save weights after finishing epoch
    # NOTE(review): `saver` was created with default arguments; the remove_checkpoint warning in
    # the recorded output suggests old checkpoints get deleted - confirm how many are kept
    # if early-epoch weights are needed.
    saver.save(s, get_checkpoint_path(epoch))
    
print("Finished!")

**************************************************
Training loss: 3.098571
Epoch: 0, train loss: 3.0985707650184633, val loss: 3.067671067714691
**************************************************
Training loss: 2.758530
Epoch: 1, train loss: 2.758529740810394, val loss: 2.9773271584510805
**************************************************
Training loss: 2.550270
Epoch: 2, train loss: 2.5502695982456207, val loss: 2.8875528812408446
**************************************************
Training loss: 2.414244
Epoch: 3, train loss: 2.4142438690662384, val loss: 2.9116009187698366
**************************************************
Training loss: 2.289532
Epoch: 4, train loss: 2.289532083749771, val loss: 2.884495747089386
**************************************************
Training loss: 2.197430


W0711 01:11:52.454319 139654049408896 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:960: remove_checkpoint (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to delete files with this prefix.


Epoch: 5, train loss: 2.197430156946182, val loss: 2.944124858379364
**************************************************
Training loss: 2.101491
Epoch: 6, train loss: 2.1014910360574723, val loss: 2.963227038383484
**************************************************
Training loss: 2.019545
Epoch: 7, train loss: 2.0195446170568467, val loss: 3.040666756629944
**************************************************
Training loss: 1.940969
Epoch: 8, train loss: 1.9409690611362458, val loss: 3.071781952381134
**************************************************
Training loss: 1.864024
Epoch: 9, train loss: 1.8640236037969589, val loss: 3.104189293384552
**************************************************
Training loss: 1.790942
Epoch: 10, train loss: 1.7909419717788697, val loss: 3.1874444031715394
**************************************************
Training loss: 1.728954
Epoch: 11, train loss: 1.7289542013406753, val loss: 3.262008259296417
Finished!


## Inference