In [1]:
from PIL import Image
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Conv2D, MaxPool2D, AvgPool2D, Flatten, GRU, Reshape, Concatenate, TimeDistributed
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
import tensorflow as tf
import pandas as pd
import numpy as np
from functools import partial
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import os

## Image caption dataset

- [Dataset scraper-Modified](https://github.com/rupy/PascalSentenceDataset/blob/master/pascal_sentence_dataset.py)
- [Dataset](http://vision.cs.uiuc.edu/pascal-sentences/)

Install `pyquery`.
Create two folders named `dataset` and `sentence` in the project directory.

## Downloading the dataset

In [2]:
# Run the (modified) Pascal Sentence scraper script to fetch the images and captions.
# As the output below shows, files that are already on disk are skipped.
%run -i ./pascal_sentence_dataset.py

dataset/aeroplane\2008_000716.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_000716.jpg
dataset/aeroplane\2008_001227.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001227.jpg
dataset/aeroplane\2008_001380.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001380.jpg
dataset/aeroplane\2008_001448.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001448.jpg
dataset/aeroplane\2008_001468.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001468.jpg
dataset/aeroplane\2008_001801.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001801.jpg
dataset/aeroplane\2008_001971.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001971.jpg
dataset/aeroplane\2008_001985.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001985.jpg
dataset/aeroplane\2008_002358.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_002358.jpg
dataset/aeroplane\2008_002454.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_002454.jpg
dataset/aeroplane\20

Already downloaded, Skipping: dataset/car\2008_004862.jpg
dataset/car\2008_005234.jpg
Already downloaded, Skipping: dataset/car\2008_005234.jpg
dataset/car\2008_005641.jpg
Already downloaded, Skipping: dataset/car\2008_005641.jpg
dataset/car\2008_005747.jpg
Already downloaded, Skipping: dataset/car\2008_005747.jpg
dataset/car\2008_005960.jpg
Already downloaded, Skipping: dataset/car\2008_005960.jpg
dataset/car\2008_006037.jpg
Already downloaded, Skipping: dataset/car\2008_006037.jpg
dataset/car\2008_006087.jpg
Already downloaded, Skipping: dataset/car\2008_006087.jpg
dataset/car\2008_006220.jpg
Already downloaded, Skipping: dataset/car\2008_006220.jpg
dataset/car\2008_006336.jpg
Already downloaded, Skipping: dataset/car\2008_006336.jpg
dataset/car\2008_006438.jpg
Already downloaded, Skipping: dataset/car\2008_006438.jpg
dataset/car\2008_006649.jpg
Already downloaded, Skipping: dataset/car\2008_006649.jpg
dataset/car\2008_006762.jpg
Already downloaded, Skipping: dataset/car\2008_006762.

Already downloaded, Skipping: dataset/dog\2008_006356.jpg
dataset/dog\2008_006511.jpg
Already downloaded, Skipping: dataset/dog\2008_006511.jpg
dataset/dog\2008_006602.jpg
Already downloaded, Skipping: dataset/dog\2008_006602.jpg
dataset/dog\2008_007478.jpg
Already downloaded, Skipping: dataset/dog\2008_007478.jpg
dataset/dog\2008_007519.jpg
Already downloaded, Skipping: dataset/dog\2008_007519.jpg
dataset/dog\2008_007537.jpg
Already downloaded, Skipping: dataset/dog\2008_007537.jpg
dataset/dog\2008_007567.jpg
Already downloaded, Skipping: dataset/dog\2008_007567.jpg
dataset/dog\2008_007694.jpg
Already downloaded, Skipping: dataset/dog\2008_007694.jpg
dataset/dog\2008_007871.jpg
Already downloaded, Skipping: dataset/dog\2008_007871.jpg
dataset/horse\2008_000219.jpg
Already downloaded, Skipping: dataset/horse\2008_000219.jpg
dataset/horse\2008_000912.jpg
Already downloaded, Skipping: dataset/horse\2008_000912.jpg
dataset/horse\2008_001031.jpg
Already downloaded, Skipping: dataset/horse\

Already downloaded, Skipping: dataset/sofa\2008_008622.jpg
dataset/sofa\2008_008628.jpg
Already downloaded, Skipping: dataset/sofa\2008_008628.jpg
dataset/sofa\2008_008642.jpg
Already downloaded, Skipping: dataset/sofa\2008_008642.jpg
dataset/sofa\2008_008649.jpg
Already downloaded, Skipping: dataset/sofa\2008_008649.jpg
dataset/train\2008_000343.jpg
Already downloaded, Skipping: dataset/train\2008_000343.jpg
dataset/train\2008_000916.jpg
Already downloaded, Skipping: dataset/train\2008_000916.jpg
dataset/train\2008_001164.jpg
Already downloaded, Skipping: dataset/train\2008_001164.jpg
dataset/train\2008_001625.jpg
Already downloaded, Skipping: dataset/train\2008_001625.jpg
dataset/train\2008_001850.jpg
Already downloaded, Skipping: dataset/train\2008_001850.jpg
dataset/train\2008_001866.jpg
Already downloaded, Skipping: dataset/train\2008_001866.jpg
dataset/train\2008_001926.jpg
Already downloaded, Skipping: dataset/train\2008_001926.jpg
dataset/train\2008_002158.jpg
Already downloade

Already downloaded, Skipping: sentence/aeroplane\2008_000716.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001227.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001380.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001448.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001468.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001801.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001971.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001985.txt
Already downloaded, Skipping: sentence/aeroplane\2008_002358.txt
Already downloaded, Skipping: sentence/aeroplane\2008_002454.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003275.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003369.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003575.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003655.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003703.txt
Already downloaded, Skipp

Already downloaded, Skipping: sentence/bus\2008_003691.txt
Already downloaded, Skipping: sentence/bus\2008_003924.txt
Already downloaded, Skipping: sentence/bus\2008_004613.txt
Already downloaded, Skipping: sentence/bus\2008_004614.txt
Already downloaded, Skipping: sentence/bus\2008_004679.txt
Already downloaded, Skipping: sentence/bus\2008_004844.txt
Already downloaded, Skipping: sentence/bus\2008_004968.txt
Already downloaded, Skipping: sentence/bus\2008_005074.txt
Already downloaded, Skipping: sentence/bus\2008_005196.txt
Already downloaded, Skipping: sentence/bus\2008_005277.txt
Already downloaded, Skipping: sentence/bus\2008_005360.txt
Already downloaded, Skipping: sentence/bus\2008_005676.txt
Already downloaded, Skipping: sentence/bus\2008_005761.txt
Already downloaded, Skipping: sentence/bus\2008_005891.txt
Already downloaded, Skipping: sentence/bus\2008_005933.txt
Already downloaded, Skipping: sentence/bus\2008_005984.txt
Already downloaded, Skipping: sentence/bus\2008_006483.t

Already downloaded, Skipping: sentence/dog\2008_005890.txt
Already downloaded, Skipping: sentence/dog\2008_006130.txt
Already downloaded, Skipping: sentence/dog\2008_006356.txt
Already downloaded, Skipping: sentence/dog\2008_006511.txt
Already downloaded, Skipping: sentence/dog\2008_006602.txt
Already downloaded, Skipping: sentence/dog\2008_007478.txt
Already downloaded, Skipping: sentence/dog\2008_007519.txt
Already downloaded, Skipping: sentence/dog\2008_007537.txt
Already downloaded, Skipping: sentence/dog\2008_007567.txt
Already downloaded, Skipping: sentence/dog\2008_007694.txt
Already downloaded, Skipping: sentence/dog\2008_007871.txt
Already downloaded, Skipping: sentence/horse\2008_000219.txt
Already downloaded, Skipping: sentence/horse\2008_000912.txt
Already downloaded, Skipping: sentence/horse\2008_001031.txt
Already downloaded, Skipping: sentence/horse\2008_001235.txt
Already downloaded, Skipping: sentence/horse\2008_001682.txt
Already downloaded, Skipping: sentence/horse\2

Already downloaded, Skipping: sentence/tvmonitor\2008_002066.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_002328.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_002547.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_002817.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_003037.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_003466.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_003609.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_003995.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_004004.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_004008.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_004097.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_004301.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_004501.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_004506.txt
Already downloaded, Skipping: sentence/tvmonitor\2008_004550.txt
Already downloaded, Skipp

## Loading the data

In [3]:

""" Loading the names of the image files """
dir_content = os.walk(os.path.join('dataset'))
img_data = []
for d in dir_content:
    folder, _, files = d
    key = folder.split(os.path.sep)[-1]
    for f in files:
        if f.endswith('.jpg'):
            fpath = os.path.join(key, f.split('.')[0])
            img_data.append(fpath)
print(img_data[:5])

""" Loading the sentences of caption data """
caption_data = []
for d_path in img_data:
    
    caption_path = os.path.join('sentence',d_path)+'.txt'
    caption_list = []
    with open(caption_path, 'r') as f:
        for row in f:
            caption_list.append(row)
    caption_data.append(caption_list)
print(caption_data[:5])

""" Combine the data """
# Define a dataframe
data_df = pd.DataFrame({"images": img_data, "captions":caption_data})

# Define a column length that has caption length
data_df["length"] = data_df["captions"].str.len()

# Split train and test data
np.random.seed(100)
train_mask = np.random.choice([True, False], size=1000, p=[0.9,0.1])
train_df = data_df.loc[train_mask,:].reset_index(drop=True)
test_df = data_df.loc[~train_mask,:].reset_index(drop=True)
train_size = train_df.shape[0]
test_size = test_df.shape[0]
print('\nTrain size: {} Test size: {}'.format(train_size, test_size))

# Print the first part of data
data_df.head(n=5)


['aeroplane\\2008_000716', 'aeroplane\\2008_001227', 'aeroplane\\2008_001380', 'aeroplane\\2008_001448', 'aeroplane\\2008_001468']
[['One jet lands at an airport while another takes off next to it.\n', 'Two airplanes parked in an airport.\n', 'Two jets taxi past each other.\n', 'Two parked jet airplanes facing opposite directions.\n', 'two passenger planes on a grassy plain\n'], ['Two gentleman talking in front of propeller plane.\n', 'Two men are conversing next to a small airplane.\n', 'Two men talking in front of a plane\n', 'Two men talking in front of a small plane.\n', 'Two men talk while standing next to a small passenger plane at an airport.\n'], ['A D-ERFW-6 in flight.\n', 'An army green plane flying in the sky.\n', 'An old fighter plane flying with German military markings.\n', 'A small green and yellow plane in the sky.\n', 'A WWII fighter plane with its landing gear down.\n'], ['a larger plane in flying above a smaller plane\n', 'Black and white scene of two planes flying.\

Unnamed: 0,captions,images,length
0,[One jet lands at an airport while another tak...,aeroplane\2008_000716,5
1,[Two gentleman talking in front of propeller p...,aeroplane\2008_001227,5
2,"[A D-ERFW-6 in flight.\n, An army green plane ...",aeroplane\2008_001380,5
3,[a larger plane in flying above a smaller plan...,aeroplane\2008_001448,5
4,[A blue grounded fighter jet is parked on gras...,aeroplane\2008_001468,5


## Defining hyperparameters

In [4]:
vocab_size=1000  # vocabulary size: used for the tokenizer, the embedding layer and the softmax output
timesteps = 15  # every caption is padded/truncated to this many word ids
image_size = 168  # side length (in pixels) of the square image crop fed to the CNN

## Preprocessing text

In [None]:

def lower_and_tokenizer(sent):
    """Tokenize `sent` with NLTK and return its lower-cased tokens joined by single spaces."""
    tokens = word_tokenize(sent)
    return ' '.join(token.lower() for token in tokens)

def preprocess_caption(x, tok, timesteps):
    """
    Convert a group of captions into fixed-length rows of word ids.

    Args:
        x: Iterable of raw caption strings.
        tok: Fitted tokenizer exposing `texts_to_sequences`.
        timesteps: Length of every returned row.

    Returns:
        The output of `pad_sequences`: one row per caption, padded at the end
        (`padding='post'`) and limited to `timesteps` ids.
    """
    cleaned = [lower_and_tokenizer(sent) for sent in x]
    word_ids = tok.texts_to_sequences(cleaned)
    return pad_sequences(word_ids, padding='post', maxlen=timesteps)

# Fitting a tokenizer
# TODO: Define a tokenizer with vocab_size many words and out of vocabulary term 'UNK', assign this to variable tok
# The tokenizer is fitted on the training captions only; every caption of every
# image is lower-cased and tokenized first so that fitting and preprocessing agree.
tok.fit_on_texts([lower_and_tokenizer(cap) for cap_group in train_df["captions"].tolist() for cap in cap_group ])

# Creating preprocessed text column for train and test data
# NOTE: data_gen reads the preprocessed captions from a column named "captions_preproc",
# so the results of the two steps below have to be stored under that name.
# TODO: Preprocess text in the training dataset using the preprocess_caption function and pandas apply function
# TODO: Preprocess text in the testing dataset using the preprocess_caption function and pandas apply function

# Getting how many words on average for a caption
# Per image: mean token count over its captions; the printed figure is the mean of those means.
train_df["caption_length"] = train_df["captions"].apply(lambda x: np.mean([len(word_tokenize(xx)) for xx in x]))
print('On average each caption has {} words'.format(train_df["caption_length"].mean()))
# Shuffle the training rows reproducibly (the original index values are kept, not reset).
train_df = train_df.sample(frac=1, random_state=100)

# Print the head of train data
# TODO: Print head of the dataset

On average each caption has 10.793504083147734 words


Unnamed: 0,captions,images,length,captions_preproc,caption_length
433,[a father is introducing his daughter to a cow...,cow\2008_007729,5,"[[2, 1, 11, 1, 82, 568, 17, 2, 63, 5, 2, 560, ...",13.0
787,[a dark living room with a white couch and a f...,sofa\2008_004632,5,"[[2, 219, 69, 21, 7, 2, 9, 50, 6, 2, 573, 0, 0...",11.8
686,[A group of people stand by a body of water lo...,pottedplant\2008_003144,5,"[[2, 80, 8, 35, 150, 53, 2, 344, 8, 47, 40, 14...",12.2
24,[An Air Canada airplane is ascending against a...,aeroplane\2008_006619,5,"[[16, 177, 495, 109, 11, 1, 275, 2, 25, 139, 0...",9.6
481,[A country dinner setting with a bounty of foo...,diningtable\2008_006192,5,"[[304, 265, 863, 7, 2, 1, 8, 229, 1, 145, 1, 7...",14.4
259,"[A blue party bus\n, A blue party bus is parke...",bus\2008_008252,5,"[[2, 25, 416, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",8.8
44,[An airplane is flying over a tree in the blue...,aeroplane\2008_008471,5,"[[16, 109, 11, 111, 112, 2, 103, 4, 3, 25, 139...",9.2
594,"[An orange Moped\n, An orange scooter is parke...",motorbike\2008_002926,5,"[[16, 136, 414, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8.8
606,[A close up of the front end of a purple and w...,motorbike\2008_006517,5,"[[2, 60, 30, 8, 3, 22, 369, 8, 2, 261, 6, 9, 4...",12.8
454,[A kitchen with pine doors and a table and cha...,diningtable\2008_001155,5,"[[2, 117, 7, 1, 619, 6, 2, 19, 6, 107, 0, 0, 0...",9.2


## Defining data generators

In [None]:
def data_gen(data_df, flip_lr=False):
    """
    Generate one (filename, image, caption, label) example per row of `data_df`.

    The generator yields a single example at a time; batching is left to the
    consumer.

    Args:
        data_df: DataFrame with an "images" column holding
            "<class folder><os.path.sep><file stem>" keys relative to the
            `dataset` folder, and a "captions_preproc" column holding the
            preprocessed captions of each image.
        flip_lr: If True, each image is mirrored left-right with probability 0.5.

    Yields:
        A 4-tuple `([image_filename], img_arr, caption, [label])` where
        `img_arr` is the cropped RGB image with its own mean subtracted,
        `caption` is one randomly chosen entry of the row's "captions_preproc"
        and `label` is the integer class id looked up from the image's folder.

    Raises:
        KeyError: If the class folder of an image is not one of the 20 known classes.

    Notes:
        Reads the module-level `image_size` (side length of the square crop) and
        draws all random numbers from the global NumPy random state.
    """
    # Class folder name -> integer label. Built once instead of once per row.
    label_map = {'aeroplane':0, "bicycle":1, "bird":2, "boat":3, "bottle":4,
                 "bus":5, "car":6, "cat": 7, "chair":8, "cow": 9, "diningtable": 10, "dog": 11,
                 "horse":12, "motorbike": 13, "person": 14, "pottedplant":15, "sheep": 16, 'sofa': 17,
                 "train":18, "tvmonitor":19}
    # The image is shrunk towards a short side of image_size + 50 pixels, which
    # leaves room for the random crop offset applied below.
    new_size = image_size + 50

    for _, row in data_df.iterrows():

        caption_list = row.loc["captions_preproc"]
        image_filename = os.path.join("dataset", row["images"]) + ".jpg"
        label = label_map[row["images"].split(os.path.sep)[0]]

        # Open the file in a context manager so the handle is released as soon
        # as the pixels have been copied into a NumPy array.
        with Image.open(image_filename) as img:
            width, height = img.width, img.height

            # Depending on the orientation of the image, resize so that the
            # shorter side becomes new_size while the aspect ratio is preserved.
            # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
            if width < height:
                new_height = new_size * height / width
                img.thumbnail((new_size, new_height), Image.LANCZOS)
            else:
                new_width = new_size * width / height
                img.thumbnail((new_width, new_size), Image.LANCZOS)

            # Randomly jittered crop of image_size x image_size pixels.
            # NOTE(review): the offset is computed from new_size rather than from
            # the crop size and negative offsets are clamped to 0, so along a side
            # that equals new_size the window starts in [0, 24] instead of being
            # centred -- confirm this is intended before changing it.
            width, height = img.width, img.height
            rand_width, rand_height = np.random.randint(-25,25), np.random.randint(-25,25)
            left = max(((width - new_size)/2) + rand_width, 0)
            right = left + image_size
            top = max(((height - new_size)/2) + rand_height, 0)
            bottom = top + image_size

            # Force three channels so grayscale/CMYK/RGBA files still produce
            # an (image_size, image_size, 3) array.
            pixels = np.array(img.crop((left, top, right, bottom)).convert("RGB"))

        # Zero-centre the image using its own mean.
        img_arr = pixels - np.mean(pixels)

        if flip_lr and np.random.choice([True, False]):
            img_arr = np.flip(img_arr, 1)

        # Pick one of the image's captions at random.
        rand_idx = np.random.randint(len(caption_list))
        caption = caption_list[rand_idx]

        yield [image_filename], img_arr, caption, [label]

# Testing out the generator
# TODO: Get a generator from data_gen function and assign it to gen
# Pull a single example from the generator and print a summary of it: the image
# array's shape, file name and mean/max/min, the caption's word ids and the class label.
for fname, img, cap, label in gen:
    print("Image: ",img.shape, fname, np.mean(img), np.max(img), np.min(img))
    print("Caption: ", cap)
    print("Label: ", label)
    break  # only the first example is inspected

Image:  (168, 168, 3) ['dataset\\cow\\2008_007729.jpg'] -4.038761430775475e-15 120.59067932728647 -134.40932067271353
Caption:  [  2   1  11   1  82 568  17   2  63   5   2 560 173   0   0]
Label:  [9]


## Using `tf.data.Dataset` to create datasets

In [None]:
# Start from a clean Keras session and an empty default graph.
K.clear_session()
tf.reset_default_graph()

""" tf.data API usage """
batch_size = 48
# Zero-argument callables that produce a fresh generator over the train / test frames.
# Only the training generator applies random left-right flips.
partial_train_gen = partial(data_gen, data_df=train_df, flip_lr=True)
partial_test_gen = partial(data_gen, data_df=test_df)

""" Defining tf.data.Dataset using generator """
# Hint: data_gen yields (filename, image, caption, label); the test dataset defined in the
# inference cell of this notebook shows matching output types and output shapes.
# TODO: Define a tf.data.Dataset from generator using partial_train_gen, correct output types and correct output shapes,
# and assign the dataset to train_dataset
# TODO: Apply shuffle transformation to dataset with buffer size 5xbatch_size
# TODO: Apply batch transformation with drop_remainder
# TODO: Apply repeat transformation


""" Testing the tf.data.Dataset """
# TODO: get an initializable iterator for the train_dataset
# TODO: use the iterator's get_next function to define a tf.op

# NOTE: the `with` block below has no statements until the two TODOs are implemented.
with tf.Session() as sess:
    # TODO: Initialize the iterator by executing iter.initializer
    # TODO: Print shapes of the data returned by the function
    

(48, 1) (48, 168, 168, 3) (48, 1)


## Creating the CNN

The CNN needs to be trained on an image classification task, while the sequential model is trained on caption generation using the image embeddings produced by the CNN. In other words, the full model cannot be trained end-to-end.

In [None]:
""" Defining the layers of CNN """

# Hint: the summary printed below this cell lists the expected layer names
# (cnn_image_input, cnn_conv1, cnn_pool2, ...) and output shapes.
# TODO: Define an input layer with correct batch_shape, and assign it to image_input

# TODO: Define a Conv2D layer with 32 filters, 3 kernel size, same padding, relu activation,
# glorot_uniform initializer and assign it to conv1_out

# TODO: Define a MaxPool2D layer with (2,2) kernel size, (2,2) strides. Assign the output to pool2_out

# TODO: Define a Conv2D layer with 64 filters and others similar as first conv layer. Assign the output to conv3_out

# TODO: Define a max pooling layer as same as previous max pooling layer. Assign the output to pool4_out

# TODO: Define a Conv2D layer with 64 filters and others similar as first conv layer. Assign the output to conv5_out

# TODO: Define a max pooling layer as same as previous max pooling layer. Assign the output to pool6_out

# TODO: Define a Conv2D layer with 128 filters and others similar as first conv layer. Assign the output to conv7_out

# TODO: Define a max pooling layer as same as previous max pooling layer. Assign the output to pool8_out

# TODO: Define a Conv2D layer with 128 filters and others similar as first conv layer. Assign the output to conv9_out

# TODO: Define an AvgPool2D layer with (7,7) kernel size, (7,7) strides. Assign the output to pool10_out

# TODO: Define a Flatten layer that takes in pool10_out and assign the result to flatten_out


# Size of the image embedding. The same value is reused later in the notebook as the
# word-embedding size and when reshaping the image vector for the caption model.
input_dim = 128

# cnn_fc1 produces the image embedding; cnn_fc2 classifies it into the 20 image classes.
image_fc = Dense(input_dim, activation='relu', kernel_initializer='glorot_uniform', name='cnn_fc1')(flatten_out) # <= (batch_size, 128)
image_pred = Dense(20, activation='softmax', kernel_initializer='glorot_uniform', name='cnn_fc2')(image_fc)

""" Defining the CNN models: One for getting predictions one for getting image embeddings"""
# TODO: Define a model which takes in image_input and produce the final prediction (cnn_model)
# TODO: Compile the model using rmsprop and categorical crossentropy

# TODO: Define a model which takes in image_input and produce image embedding (cnn_vec_model)

cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cnn_image_input (InputLayer) (48, 168, 168, 3)         0         
_________________________________________________________________
cnn_conv1 (Conv2D)           (48, 168, 168, 32)        896       
_________________________________________________________________
cnn_pool2 (MaxPooling2D)     (48, 84, 84, 32)          0         
_________________________________________________________________
cnn_conv3 (Conv2D)           (48, 84, 84, 64)          18496     
_________________________________________________________________
cnn_pool4 (MaxPooling2D)     (48, 42, 42, 64)          0         
_________________________________________________________________
cnn_conv5 (Conv2D)           (48, 42, 42, 64)          36928     
_________________________________________________________________
cnn_pool6 (MaxPooling2D)     (48, 21, 21, 64)          0         
__________

## Defining the GRU model

In [None]:
# NOTE: the model-loading cell later in this notebook looks layers up by the names
# 'word_input', 'embeddings', 'gru' and 'time_distributed', so the layers defined
# here need to carry exactly those names.

# Defining an input layer
# TODO: Define an input layer which has timesteps-1 sequence length and call that word_input

# Defining an embedding layer and an embedding output
# TODO: Define an embedding layer that can take in word_input as an input and call it emb_layer
# TODO: Use emb_layer and produce the output embed_out

# Defining an input layer for image embeddings
# TODO: Define an input layer that takes in a batch of image embeddings and assign it to image_fc_input

# Concat image embeddings and embedding outputs
# The image embedding is reshaped to a single time step of size input_dim and placed
# in front of the word embeddings along the time axis (axis=1).
inputs = Concatenate(axis=1, name='image_caption_concat')(
    [Reshape(target_shape=(1, input_dim))(image_fc_input), embed_out]
)

# Defining GRU layer and GRU output
# TODO: Define a GRU layer with 128 units, dropout 0.5 which returns outputs for all time steps (use glorot initialization)
#       assign this to variable gru
# TODO: Get the gru layer's output and assign it to gru_out

# Defining a TimeDistributed Dense layer and its output
# TODO: Define a TimeDistributed layer that is wrapped around a Dense softmax layer that will output a single word
#       assign this to a variable softmax
# TODO: Get output of softmax and assign it to word_pred

# Defining Model and compiling the model
# TODO: Define a model that takes in image_fc_input, word_input and output word_pred. Assign this to a variable called model
# TODO: Compile the model with rmsprop and categorical crossentropy

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image_fc (InputLayer)           (48, 128)            0                                            
__________________________________________________________________________________________________
word_input (InputLayer)         (48, 14)             0                                            
__________________________________________________________________________________________________
reshape (Reshape)               (48, 1, 128)         0           image_fc[0][0]                   
__________________________________________________________________________________________________
embeddings (Embedding)          (48, 14, 128)        128000      word_input[0][0]                 
__________________________________________________________________________________________________
image_capt

## Training the image caption generator

In [None]:
tr_iter_data = train_dataset.make_initializable_iterator()
tr_next_data = tr_iter_data.get_next()

# Defining session config: allow soft device placement and let GPU memory
# grow on demand.
config=tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:

    for ep in range(100):
        # Restart the iterator at the beginning of every epoch.
        sess.run(tr_iter_data.initializer)
        losses = []

        for bi in range(train_size//batch_size):

            # Getting the next batch of data
            try:
                b_filenames, b_img, b_cap, b_lbl = sess.run(tr_next_data)
                assert b_img.shape[0] == batch_size
            except (tf.errors.OutOfRangeError, AssertionError):
                # Only the two expected conditions restart the iterator: the dataset
                # ran out of data, or a batch came back with the wrong size.
                # Anything else (including KeyboardInterrupt) is allowed to propagate.
                sess.run(tr_iter_data.initializer)
                b_filenames, b_img, b_cap, b_lbl = sess.run(tr_next_data)

            # Defining inputs for the CNN training and training the CNN
            # TODO: Define inputs (inp_cnn) and outputs (out_cnn) for training CNN model.
            # TODO: Train the model using inp_cnn and out_cnn
            # TODO: Predict the image embeddings for the image batch (remember to set batch_size)

            # Defining inputs for the Sequential model and training the Sequential model
            # TODO: Define input list required to train the caption generator model as inp_full
            # TODO: Define the output for the caption generator model as out_full

            # TODO: Train the model using inp_full and out_full

            # Computing the model loss on the batch that was just used for training
            l = model.evaluate(inp_full, out_full, batch_size=batch_size, verbose=0)

            # Every 5th epoch, show predicted vs. true captions for the first batch.
            if bi==0 and (ep+1) % 5 == 0:

                # Getting some predictions from the model
                pred = model.predict(inp_full, batch_size=batch_size)
                word_inds = np.argmax(pred, axis=-1)
                true_word_inds = np.argmax(out_full, axis=-1)

                # One line per image: "<file>:\t<predicted words>:\t<true words>".
                # Word id 0 is padding and is left out.
                for fn, word_ind_vec, true_vec in zip(b_filenames, word_inds, true_word_inds):
                    predicted = ' '.join([tok.index_word[w] for w in word_ind_vec if w != 0])
                    actual = " ".join([tok.index_word[w] for w in true_vec if w!=0])
                    print(fn[0].decode('utf-8') + ":\t" + predicted + ":\t" + actual)

            losses.append(l)

        # Saving the model
        # NOTE(review): `ep==1` saves after the 2nd epoch (files are named with ep+1);
        # confirm whether a checkpoint after the 1st epoch (`ep==0`) was intended.
        if ep==1 or (ep+1)%25==0:
            model.save('caption_gen_{}.h5'.format(ep+1))
            cnn_vec_model.save('cnn_vec_{}.h5'.format(ep+1))

        # Printing the losses
        print("\nFinished epoch {}".format(ep+1))
        print("\tTrain loss: {}".format(np.mean(losses)))
        
        

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "



Finished epoch 1
	Train loss: 5.189673092630175

Finished epoch 2
	Train loss: 4.013754208882649

Finished epoch 3
	Train loss: 3.8662480778164334

Finished epoch 4
	Train loss: 3.684454745716519
dataset\boat\2008_006014.jpg:	a a a a a a:	a UNK pier and a boat in the water with several people on it
dataset\boat\2008_000437.jpg:	a a a a a a:	two boats docked on a UNK
dataset\bottle\2008_005752.jpg:	a a a a a a:	a bottle of water sitting on a stone in the UNK
dataset\bottle\2008_004487.jpg:	a a a a a a a:	a long table covered with a UNK being used as a desk
dataset\chair\2008_002674.jpg:	a a a a a a:	a woman in black rocking a new baby UNK in a blanket
dataset\train\2008_004214.jpg:	a a a a a:	yellow blue and orange train on train tracks
dataset\sheep\2008_008319.jpg:	a a a a a:	children looking a sheep
dataset\motorbike\2008_007746.jpg:	a a a a a a a:	brightly colored motorcycles are parked in a line
dataset\motorbike\2008_005213.jpg:	a a a a a a:	group of four dirt bikers riding down 


Finished epoch 10
	Train loss: 3.1597721576690674

Finished epoch 11
	Train loss: 3.1529812150531344

Finished epoch 12
	Train loss: 3.034616880946689

Finished epoch 13
	Train loss: 2.9697323640187583

Finished epoch 14
	Train loss: 2.8969971736272178
dataset\boat\2008_005321.jpg:	a UNK UNK UNK on a UNK:	a small UNK is in the shallow water
dataset\pottedplant\2008_006112.jpg:	a UNK UNK with a UNK:	the small tree has some flower UNK
dataset\horse\2008_008232.jpg:	a UNK UNK on a UNK:	two people ride horses down a park trail
dataset\bird\2008_004805.jpg:	a man UNK with a UNK on a UNK a UNK:	a small bird with yellow UNK and black stripes eats from a bird feeder
dataset\cat\2008_003622.jpg:	a UNK UNK on on a a UNK:	tan domestic kitten lying in bed in a cage
dataset\bird\2008_006667.jpg:	a UNK white UNK with on a:	green and white bird perched on tree branch
dataset\diningtable\2008_008363.jpg:	a man and on a:	a UNK decorated dining room
dataset\tvmonitor\2008_002817.jpg:	a UNK UNK on a UNK


Finished epoch 20
	Train loss: 2.7033318678538003

Finished epoch 21
	Train loss: 2.651468700832791


## Loading the saved model

Inferring from the model is not as straightforward as training. In this section we will use three separate models:
* `cnn_vec_model`: Given a batch of images, provides the image embeddings
* `embed_model`: Given a batch of words, outputs the embedding vectors
* `gru_model`: Given a batch of embeddings, outputs the predicted words

In [None]:

def _copy_trained_weights(infer_model, source_model, prefix='infer_'):
    """
    Copy weights into `infer_model` from the matching layers of `source_model`.

    A layer named `<prefix><name>` in `infer_model` receives the weights of the
    layer called `<name>` in `source_model`. Layers that own no weights (the
    input layers) are skipped, so they need no counterpart in `source_model`.
    """
    for layer in infer_model.layers:
        if not layer.weights:
            continue
        if layer.name.startswith(prefix):
            source_name = layer.name[len(prefix):]
        else:
            source_name = layer.name
        layer.set_weights(source_model.get_layer(source_name).get_weights())


K.clear_session()
tf.reset_default_graph()

config=tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True

sess =  tf.Session(config=config)
# Register the session with Keras. Without this the models below are loaded and
# run in a separate session that Keras creates itself, and `config` never
# applies to them.
K.set_session(sess)

# Loading the CNN model and GRU model
cnn_vec_model = load_model('cnn_vec_100.h5')
trained_model = load_model('caption_gen_100.h5')

# Rebuild the caption model's layers for step-by-step inference. Each rebuilt
# layer is named "infer_<original name>" so its trained weights can be looked up
# by name afterwards. The inputs change to a single time step, and the GRU is
# stateful so its state carries over between successive predict() calls.
for layer in trained_model.layers:

    if layer.name == 'word_input':
        test_word_input = Input(batch_shape=(batch_size, 1), name='infer_word_input')

    elif layer.name == 'embeddings':
        emb_layer = Embedding(vocab_size, input_dim, input_length=1, name='infer_embeddings')
        test_embed_out = emb_layer(test_word_input)

    elif layer.name == 'gru':
        test_embed_input = Input(batch_shape=(batch_size, 1,input_dim), name='infer_embed_input')
        gru = GRU(128, return_sequences=True, stateful=True, name='infer_gru')
        test_gru_out = gru(test_embed_input)

    elif layer.name == 'time_distributed':
        print(test_gru_out.shape)
        softmax = TimeDistributed(Dense(vocab_size, activation='softmax', name='infer_gru_fc'),name='infer_time_distributed')
        test_word_pred = softmax(test_gru_out)

# Defining the embedding model: word ids -> embedding vectors
embed_model = Model(inputs=test_word_input, outputs=test_embed_out)

# Defining the GRU model: one embedding per example -> word probabilities
gru_model = Model(inputs=test_embed_input, outputs=test_word_pred)

cnn_vec_model.summary()

# Setting weights of the layers in embed_model
_copy_trained_weights(embed_model, trained_model)

embed_model.summary()

# Setting weights of the layers in the gru_model
_copy_trained_weights(gru_model, trained_model)

gru_model.summary()

## Inferring predictions from the loaded model

In [None]:
# from_generator is a static factory: call it on the class rather than on an
# instance of the abstract Dataset class.
test_dataset = tf.data.Dataset.from_generator(
    partial_test_gen, output_types= (tf.string, tf.float32, tf.int32, tf.int32), 
    output_shapes=(tf.TensorShape([1]), tf.TensorShape([image_size, image_size, 3]), tf.TensorShape([timesteps]), tf.TensorShape([1]))
).batch(batch_size, drop_remainder=True)

ts_iter_data = test_dataset.make_initializable_iterator()
ts_next_data = ts_iter_data.get_next()

sess.run(ts_iter_data.initializer)

# Kept from the original cell; nothing is appended to it here.
losses = []

for bi in range(test_size//batch_size):

    # Get test data
    try:
        b_filenames, b_img, b_cap, _ = sess.run(ts_next_data)
        assert b_img.shape[0] == batch_size
    except (tf.errors.OutOfRangeError, AssertionError):
        # Restart the iterator only when the dataset is exhausted or a batch has
        # the wrong size; any other error is allowed to propagate.
        sess.run(ts_iter_data.initializer)
        b_filenames, b_img, b_cap, _ = sess.run(ts_next_data)

    # Computing word predictions iteratively.
    # The image embedding is the first GRU input, shaped (batch, 1, embedding).
    # batch_size is passed explicitly, as in the other predict() calls.
    latent_vecs = np.expand_dims(cnn_vec_model.predict(b_img, batch_size=batch_size),1) #+ np.random.normal(size=(batch_size,1,128))

    b_pred_words = []
    for ti in range(timesteps):
        # Greedy decoding: take the most probable word, embed it and feed it back.
        # The stateful GRU keeps its hidden state between these calls.
        word_probs = gru_model.predict(latent_vecs, batch_size=batch_size)[:,0,:]
        test_words = np.expand_dims(np.argmax(word_probs, axis=-1),-1)
        latent_vecs = embed_model.predict(test_words, batch_size=batch_size)
        b_pred_words.append(test_words)

    # Combining the iterative results into a single (batch, timesteps) array
    b_pred_words = np.concatenate(b_pred_words, axis=-1)
    # Clear the GRU state so the next batch starts from scratch.
    gru_model.reset_states()

    # Writing results to disk
    print('\n','='*50, '\n')

    b_pred_strings = []
    for fn, p_vec in zip(b_filenames, b_pred_words):
        # Word id 0 is padding and is left out of the predicted caption.
        pred_string = fn[0].decode('utf-8') + ":" + ' '.join(tok.index_word[p] for p in p_vec if p != 0)+'\n'
        b_pred_strings.append(pred_string)
        print(pred_string)
    with open('predictions_{}.txt'.format(bi),'w') as f:
        f.writelines(b_pred_strings)

sess.close()