In [1]:
from PIL import Image
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Conv2D, MaxPool2D, AvgPool2D, Flatten, GRU, Reshape, Concatenate, TimeDistributed
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
import tensorflow as tf
import pandas as pd
import numpy as np
from functools import partial
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import os

## Image caption dataset

- [Dataset scraper-Modified](https://github.com/rupy/PascalSentenceDataset/blob/master/pascal_sentence_dataset.py)
- [Dataset](http://vision.cs.uiuc.edu/pascal-sentences/)

install pyquery
create folder called `dataset` and `sentence` in the project dir

## Downloading the dataset

In [2]:
%run -i ./pascal_sentence_dataset.py

dataset/aeroplane\2008_000716.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_000716.jpg
dataset/aeroplane\2008_001227.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001227.jpg
dataset/aeroplane\2008_001380.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001380.jpg
dataset/aeroplane\2008_001448.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001448.jpg
dataset/aeroplane\2008_001468.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001468.jpg
dataset/aeroplane\2008_001801.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001801.jpg
dataset/aeroplane\2008_001971.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001971.jpg
dataset/aeroplane\2008_001985.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_001985.jpg
dataset/aeroplane\2008_002358.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_002358.jpg
dataset/aeroplane\2008_002454.jpg
Already downloaded, Skipping: dataset/aeroplane\2008_002454.jpg
dataset/aeroplane\20

dataset/chair\2008_008589.jpg
Already downloaded, Skipping: dataset/chair\2008_008589.jpg
dataset/chair\2008_008593.jpg
Already downloaded, Skipping: dataset/chair\2008_008593.jpg
dataset/cow\2008_000335.jpg
Already downloaded, Skipping: dataset/cow\2008_000335.jpg
dataset/cow\2008_000711.jpg
Already downloaded, Skipping: dataset/cow\2008_000711.jpg
dataset/cow\2008_000876.jpg
Already downloaded, Skipping: dataset/cow\2008_000876.jpg
dataset/cow\2008_000905.jpg
Already downloaded, Skipping: dataset/cow\2008_000905.jpg
dataset/cow\2008_000919.jpg
Already downloaded, Skipping: dataset/cow\2008_000919.jpg
dataset/cow\2008_000964.jpg
Already downloaded, Skipping: dataset/cow\2008_000964.jpg
dataset/cow\2008_001359.jpg
Already downloaded, Skipping: dataset/cow\2008_001359.jpg
dataset/cow\2008_002270.jpg
Already downloaded, Skipping: dataset/cow\2008_002270.jpg
dataset/cow\2008_002278.jpg
Already downloaded, Skipping: dataset/cow\2008_002278.jpg
dataset/cow\2008_002686.jpg
Already downloaded

dataset/sofa\2008_005926.jpg
Already downloaded, Skipping: dataset/sofa\2008_005926.jpg
dataset/sofa\2008_006038.jpg
Already downloaded, Skipping: dataset/sofa\2008_006038.jpg
dataset/sofa\2008_006436.jpg
Already downloaded, Skipping: dataset/sofa\2008_006436.jpg
dataset/sofa\2008_006616.jpg
Already downloaded, Skipping: dataset/sofa\2008_006616.jpg
dataset/sofa\2008_007021.jpg
Already downloaded, Skipping: dataset/sofa\2008_007021.jpg
dataset/sofa\2008_007043.jpg
Already downloaded, Skipping: dataset/sofa\2008_007043.jpg
dataset/sofa\2008_007050.jpg
Already downloaded, Skipping: dataset/sofa\2008_007050.jpg
dataset/sofa\2008_007169.jpg
Already downloaded, Skipping: dataset/sofa\2008_007169.jpg
dataset/sofa\2008_007837.jpg
Already downloaded, Skipping: dataset/sofa\2008_007837.jpg
dataset/sofa\2008_008103.jpg
Already downloaded, Skipping: dataset/sofa\2008_008103.jpg
dataset/sofa\2008_008106.jpg
Already downloaded, Skipping: dataset/sofa\2008_008106.jpg
dataset/sofa\2008_008162.jpg
Alr

Already downloaded, Skipping: sentence/aeroplane\2008_000716.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001227.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001380.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001448.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001468.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001801.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001971.txt
Already downloaded, Skipping: sentence/aeroplane\2008_001985.txt
Already downloaded, Skipping: sentence/aeroplane\2008_002358.txt
Already downloaded, Skipping: sentence/aeroplane\2008_002454.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003275.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003369.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003575.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003655.txt
Already downloaded, Skipping: sentence/aeroplane\2008_003703.txt
Already downloaded, Skipp

Already downloaded, Skipping: sentence/cat\2008_000116.txt
Already downloaded, Skipping: sentence/cat\2008_000182.txt
Already downloaded, Skipping: sentence/cat\2008_000227.txt
Already downloaded, Skipping: sentence/cat\2008_000345.txt
Already downloaded, Skipping: sentence/cat\2008_000670.txt
Already downloaded, Skipping: sentence/cat\2008_001290.txt
Already downloaded, Skipping: sentence/cat\2008_001335.txt
Already downloaded, Skipping: sentence/cat\2008_001592.txt
Already downloaded, Skipping: sentence/cat\2008_001836.txt
Already downloaded, Skipping: sentence/cat\2008_001885.txt
Already downloaded, Skipping: sentence/cat\2008_002067.txt
Already downloaded, Skipping: sentence/cat\2008_002201.txt
Already downloaded, Skipping: sentence/cat\2008_002294.txt
Already downloaded, Skipping: sentence/cat\2008_002329.txt
Already downloaded, Skipping: sentence/cat\2008_002410.txt
Already downloaded, Skipping: sentence/cat\2008_002749.txt
Already downloaded, Skipping: sentence/cat\2008_002845.t

Already downloaded, Skipping: sentence/person\2008_000422.txt
Already downloaded, Skipping: sentence/person\2008_000448.txt
Already downloaded, Skipping: sentence/person\2008_000547.txt
Already downloaded, Skipping: sentence/person\2008_000748.txt
Already downloaded, Skipping: sentence/person\2008_000806.txt
Already downloaded, Skipping: sentence/person\2008_000825.txt
Already downloaded, Skipping: sentence/person\2008_000834.txt
Already downloaded, Skipping: sentence/person\2008_000917.txt
Already downloaded, Skipping: sentence/person\2008_000987.txt
Already downloaded, Skipping: sentence/person\2008_001035.txt
Already downloaded, Skipping: sentence/person\2008_001074.txt
Already downloaded, Skipping: sentence/person\2008_001249.txt
Already downloaded, Skipping: sentence/person\2008_001301.txt
Already downloaded, Skipping: sentence/person\2008_001318.txt
Already downloaded, Skipping: sentence/person\2008_001349.txt
Already downloaded, Skipping: sentence/person\2008_001501.txt
Already 

1 => aeroplane/2008_000716.jpg
2 => aeroplane/2008_001227.jpg
3 => aeroplane/2008_001380.jpg
4 => aeroplane/2008_001448.jpg
5 => aeroplane/2008_001468.jpg
6 => aeroplane/2008_001801.jpg
7 => aeroplane/2008_001971.jpg
8 => aeroplane/2008_001985.jpg
9 => aeroplane/2008_002358.jpg
10 => aeroplane/2008_002454.jpg
11 => aeroplane/2008_003275.jpg
12 => aeroplane/2008_003369.jpg
13 => aeroplane/2008_003575.jpg
14 => aeroplane/2008_003655.jpg
15 => aeroplane/2008_003703.jpg
16 => aeroplane/2008_003788.jpg
17 => aeroplane/2008_003905.jpg
18 => aeroplane/2008_004165.jpg
19 => aeroplane/2008_004348.jpg
20 => aeroplane/2008_004532.jpg
21 => aeroplane/2008_005538.jpg
22 => aeroplane/2008_005905.jpg
23 => aeroplane/2008_005907.jpg
24 => aeroplane/2008_005916.jpg
25 => aeroplane/2008_006401.jpg
26 => aeroplane/2008_006548.jpg
27 => aeroplane/2008_006619.jpg
28 => aeroplane/2008_006621.jpg
29 => aeroplane/2008_006623.jpg
30 => aeroplane/2008_006700.jpg
31 => aeroplane/2008_006933.jpg
32 => aeroplane/2

## Loading the data

In [3]:

""" Loading the names of the image files """
dir_content = os.walk(os.path.join('dataset'))
img_data = []
for d in dir_content:
    folder, _, files = d
    key = folder.split(os.path.sep)[-1]
    for f in files:
        if f.endswith('.jpg'):
            fpath = os.path.join(key, f.split('.')[0])
            img_data.append(fpath)
print(img_data[:5])

""" Loading the sentences of caption data """
caption_data = []
for d_path in img_data:
    
    caption_path = os.path.join('sentence',d_path)+'.txt'
    caption_list = []
    with open(caption_path, 'r') as f:
        for row in f:
            caption_list.append(row)
    caption_data.append(caption_list)
print(caption_data[:5])

""" Combine the data """
# Define a dataframe
data_df = pd.DataFrame({"images": img_data, "captions":caption_data})

# Define a column length that has caption length
data_df["length"] = data_df["captions"].str.len()

# Split train and test data
np.random.seed(100)
train_mask = np.random.choice([True, False], size=1000, p=[0.9,0.1])
train_df = data_df.loc[train_mask,:].reset_index(drop=True)
test_df = data_df.loc[~train_mask,:].reset_index(drop=True)
train_size = train_df.shape[0]
test_size = test_df.shape[0]
print('\nTrain size: {} Test size: {}'.format(train_size, test_size))

# Print the first part of data
data_df.head(n=5)


['aeroplane\\2008_000716', 'aeroplane\\2008_001227', 'aeroplane\\2008_001380', 'aeroplane\\2008_001448', 'aeroplane\\2008_001468']
[['One jet lands at an airport while another takes off next to it.\n', 'Two airplanes parked in an airport.\n', 'Two jets taxi past each other.\n', 'Two parked jet airplanes facing opposite directions.\n', 'two passenger planes on a grassy plain\n'], ['Two gentleman talking in front of propeller plane.\n', 'Two men are conversing next to a small airplane.\n', 'Two men talking in front of a plane\n', 'Two men talking in front of a small plane.\n', 'Two men talk while standing next to a small passenger plane at an airport.\n'], ['A D-ERFW-6 in flight.\n', 'An army green plane flying in the sky.\n', 'An old fighter plane flying with German military markings.\n', 'A small green and yellow plane in the sky.\n', 'A WWII fighter plane with its landing gear down.\n'], ['a larger plane in flying above a smaller plane\n', 'Black and white scene of two planes flying.\

Unnamed: 0,captions,images,length
0,[One jet lands at an airport while another tak...,aeroplane\2008_000716,5
1,[Two gentleman talking in front of propeller p...,aeroplane\2008_001227,5
2,"[A D-ERFW-6 in flight.\n, An army green plane ...",aeroplane\2008_001380,5
3,[a larger plane in flying above a smaller plan...,aeroplane\2008_001448,5
4,[A blue grounded fighter jet is parked on gras...,aeroplane\2008_001468,5


## Defining hyperparameters

In [4]:
vocab_size=1000
timesteps = 15
image_size = 168

## Preprocessing text

In [5]:

def lower_and_tokenizer(sent):
    """ Lower and tokenize text """
    return ' '.join([w.lower() for w in word_tokenize(sent)])

def preprocess_caption(x, tok, timesteps):
    """ Turn a caption in to a sequence of word ids"""
    x_clean = []
    for sent in x:
        x_clean.append(lower_and_tokenizer(sent))
    seq = tok.texts_to_sequences(x_clean)
    return pad_sequences(seq, padding='post', maxlen=timesteps)

# Fitting a tokenizer
tok = Tokenizer(num_words=vocab_size, oov_token='UNK')
tok.fit_on_texts([lower_and_tokenizer(cap) for cap_group in train_df["captions"].tolist() for cap in cap_group ])

# Creating preprocessed text column for train and test data
train_df["captions_preproc"] = train_df["captions"].apply(lambda x: preprocess_caption(x, tok, timesteps))
test_df["captions_preproc"] = test_df["captions"].apply(lambda x: preprocess_caption(x, tok, timesteps))

# Getting how many words on average for a caption
train_df["caption_length"] = train_df["captions"].apply(lambda x: np.mean([len(word_tokenize(xx)) for xx in x]))
print('On average each caption has {} words'.format(train_df["caption_length"].mean()))
train_df = train_df.sample(frac=1, random_state=100)

# Print the head of train data
train_df.head(n=25)

On average each caption has 10.793504083147734 words


Unnamed: 0,captions,images,length,captions_preproc,caption_length
433,[a father is introducing his daughter to a cow...,cow\2008_007729,5,"[[2, 1, 11, 1, 82, 568, 17, 2, 63, 5, 2, 560, ...",13.0
787,[a dark living room with a white couch and a f...,sofa\2008_004632,5,"[[2, 219, 69, 21, 7, 2, 9, 50, 6, 2, 573, 0, 0...",11.8
686,[A group of people stand by a body of water lo...,pottedplant\2008_003144,5,"[[2, 80, 8, 35, 150, 53, 2, 344, 8, 47, 40, 14...",12.2
24,[An Air Canada airplane is ascending against a...,aeroplane\2008_006619,5,"[[16, 177, 495, 109, 11, 1, 275, 2, 25, 139, 0...",9.6
481,[A country dinner setting with a bounty of foo...,diningtable\2008_006192,5,"[[304, 265, 863, 7, 2, 1, 8, 229, 1, 145, 1, 7...",14.4
259,"[A blue party bus\n, A blue party bus is parke...",bus\2008_008252,5,"[[2, 25, 416, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",8.8
44,[An airplane is flying over a tree in the blue...,aeroplane\2008_008471,5,"[[16, 109, 11, 111, 112, 2, 103, 4, 3, 25, 139...",9.2
594,"[An orange Moped\n, An orange scooter is parke...",motorbike\2008_002926,5,"[[16, 136, 414, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8.8
606,[A close up of the front end of a purple and w...,motorbike\2008_006517,5,"[[2, 60, 30, 8, 3, 22, 369, 8, 2, 261, 6, 9, 4...",12.8
454,[A kitchen with pine doors and a table and cha...,diningtable\2008_001155,5,"[[2, 117, 7, 1, 619, 6, 2, 19, 6, 107, 0, 0, 0...",9.2


## Defining data generators

In [6]:
def data_gen(data_df, flip_lr=False):
    """ 
    Defining a data generator, that generates a batch of 
    image filenames, images, caption text, and labels
    """
    for ri, row in data_df.iterrows():
        
        caption_list = row.loc["captions_preproc"]
        
        """ Loading the image """
        image_filename = os.path.join("dataset", row["images"]) + ".jpg"
        new_size  = image_size + 50
        img = Image.open(image_filename)
        width, height = img.width, img.height
        
        """ Depending on the orientation of the image resize """
        if width < height:
            new_height = new_size * height / width 
            img.thumbnail((new_size, new_height), Image.ANTIALIAS)
        else:
            new_width = new_size * width/ height
            img.thumbnail((new_width, new_size), Image.ANTIALIAS)
        
        """ Center crop image """
        width, height = img.width, img.height
        
        rand_width, rand_height = np.random.randint(-25,25), np.random.randint(-25,25)
        left = max(((width - new_size)/2) + rand_width, 0)
        right = left + new_size - 50
        top = max(((height - new_size)/2) + rand_height, 0)
        bottom = top + new_size - 50

        img = img.crop((left, top, right, bottom))
        
        """ Creating a label """
        label_map = {'aeroplane':0, "bicycle":1, "bird":2, "boat":3, "bottle":4,
                    "bus":5, "car":6, "cat": 7, "chair":8, "cow": 9, "diningtable": 10, "dog": 11,
                   "horse":12, "motorbike": 13, "person": 14, "pottedplant":15, "sheep": 16, 'sofa': 17,
                    "train":18, "tvmonitor":19}
        label = label_map[row["images"].split(os.path.sep)[0]]
        
        """ Image as an array and get a random caption for that image """
        
        img_arr = np.array(img) - np.mean(np.array(img))
        
        if flip_lr and np.random.choice([True, False]):
            img_arr = np.flip(img_arr, 1)
        rand_idx = np.random.randint(len(caption_list))
        caption = caption_list[rand_idx]
        
        yield [image_filename], img_arr, caption, [label]

# Testing out the generator
gen = data_gen(train_df)
for fname, img, cap, label in gen:
    print("Image: ",img.shape, fname, np.mean(img), np.max(img), np.min(img))
    print("Caption: ", cap)
    print("Label: ", label)
    break

Image:  (168, 168, 3) ['dataset\\cow\\2008_007729.jpg'] -4.038761430775475e-15 120.59067932728647 -134.40932067271353
Caption:  [  2   1  11   1  82 568  17   2  63   5   2 560 173   0   0]
Label:  [9]


## Using `tf.data.Dataset` to create datasets

In [7]:
K.clear_session()
tf.reset_default_graph()

""" tf.data API usage """
batch_size = 48
partial_train_gen = partial(data_gen, data_df=train_df, flip_lr=True)
partial_test_gen = partial(data_gen, data_df=test_df)

""" Defining tf.data.Dataset using generator """
train_dataset = tf.data.Dataset().from_generator(
    partial_train_gen, output_types= (tf.string, tf.float32, tf.int32, tf.int32), 
    output_shapes=(tf.TensorShape([1]), 
                   tf.TensorShape([image_size, image_size, 3]), 
                   tf.TensorShape([timesteps]), tf.TensorShape([1]))
).shuffle(5*batch_size).batch(batch_size, drop_remainder=True).repeat()

""" Testing the tf.data.Dataset """
iter = train_dataset.make_initializable_iterator()
el = iter.get_next()

with tf.Session() as sess:
    sess.run(iter.initializer)
    print(sess.run(el)[0].shape, sess.run(el)[1].shape, sess.run(el)[3].shape)

(48, 1) (48, 168, 168, 3) (48, 1)


## Creating the CNN

The CNN needs to be trained for an image classification task and the Sequential model on the caption generation part. In other words, the model cannot be trained end-to-end.

In [8]:
""" Defining the layers of CNN """
image_input = Input(batch_shape=(batch_size, image_size,image_size,3), name='cnn_image_input') #<= 112
conv1_out = Conv2D(32, 3, padding='same', activation='relu', 
                   kernel_initializer='glorot_uniform',  name='cnn_conv1')(image_input) #<= 112
pool2_out = MaxPool2D((2,2), strides=(2,2),  name='cnn_pool2')(conv1_out) #<= 56
conv3_out = Conv2D(64, 3, padding='same', kernel_initializer='glorot_uniform', 
                   activation='relu', name='cnn_conv3')(pool2_out) #<= 56
pool4_out = MaxPool2D((2,2), strides=(2,2), name='cnn_pool4')(conv3_out) #<= 28
conv5_out = Conv2D(64, 3, padding='same', activation='relu', name='cnn_conv5')(pool4_out) #<= 28
pool6_out = MaxPool2D((2,2), strides=(2,2), name='cnn_pool6')(conv5_out) #<= 14
conv7_out = Conv2D(128, 3, padding='same', kernel_initializer='glorot_uniform', 
                   activation='relu', name='cnn_conv7')(pool6_out) #<= 14
pool8_out = MaxPool2D((2,2), strides=(2,2), name='cnn_pool8')(conv7_out) #<= 7
conv9_out = Conv2D(128, 3, padding='same', kernel_initializer='glorot_uniform', 
                   activation='relu', name='cnn_conv9')(pool8_out) #<= 7
pool10_out = AvgPool2D((7,7), strides=(7,7), name='cnn_pool10')(conv9_out)
flatten_out = Flatten(name='cnn_flatten')(pool10_out)

input_dim = 128

image_fc = Dense(input_dim, activation='relu', kernel_initializer='glorot_uniform', name='cnn_fc1')(flatten_out) # <= (batch_size, 128)
image_pred = Dense(20, activation='softmax', kernel_initializer='glorot_uniform', name='cnn_fc2')(image_fc)

""" Defining the CNN models: One for getting predictions one for getting image embeddings"""
cnn_model = Model(inputs=image_input, outputs=image_pred)
cnn_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

cnn_vec_model = Model(inputs=image_input, outputs=image_fc)

cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cnn_image_input (InputLayer) (48, 168, 168, 3)         0         
_________________________________________________________________
cnn_conv1 (Conv2D)           (48, 168, 168, 32)        896       
_________________________________________________________________
cnn_pool2 (MaxPooling2D)     (48, 84, 84, 32)          0         
_________________________________________________________________
cnn_conv3 (Conv2D)           (48, 84, 84, 64)          18496     
_________________________________________________________________
cnn_pool4 (MaxPooling2D)     (48, 42, 42, 64)          0         
_________________________________________________________________
cnn_conv5 (Conv2D)           (48, 42, 42, 64)          36928     
_________________________________________________________________
cnn_pool6 (MaxPooling2D)     (48, 21, 21, 64)          0         
__________

## Defining the LSTM model

In [9]:
# Defining an input layer
word_input = Input(batch_shape=(batch_size, timesteps-1,), name='word_input') # <= (batch_size, timesteps-1, 1)

# Defining an embedding layer and an embedding output
emb_layer = Embedding(vocab_size, input_dim, input_length=timesteps-1, name='embeddings')
embed_out = emb_layer(word_input) # <= (batch_size, timesteps, 128)

# Defining an input layer for image embeddings
image_fc_input = Input(batch_shape=(batch_size, input_dim), name='image_fc')

# Concat image embeddings and embedding outputs
inputs = Concatenate(axis=1, name='image_caption_concat')(
    [Reshape(target_shape=(1, input_dim))(image_fc_input), embed_out]
)

# Defining GRU layer and GRU output
gru = GRU(128, dropout=0.5, return_sequences=True, kernel_initializer='glorot_uniform', name='gru')
gru_out = gru(inputs)

# Defining a TimeDistributed Dense layer and its output
softmax = TimeDistributed(Dense(vocab_size, activation='softmax', name='gru_fc'),name='time_distributed')
word_pred = softmax(gru_out)

# Defining Model and compiling the model
model = Model(inputs=[image_fc_input, word_input], outputs=word_pred)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image_fc (InputLayer)           (48, 128)            0                                            
__________________________________________________________________________________________________
word_input (InputLayer)         (48, 14)             0                                            
__________________________________________________________________________________________________
reshape (Reshape)               (48, 1, 128)         0           image_fc[0][0]                   
__________________________________________________________________________________________________
embeddings (Embedding)          (48, 14, 128)        128000      word_input[0][0]                 
__________________________________________________________________________________________________
image_capt

## Training the image caption generator

In [10]:
tr_iter_data = train_dataset.make_initializable_iterator()
tr_next_data = tr_iter_data.get_next()

# Defining session config
config=tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:
    
    for ep in range(100):
        sess.run(tr_iter_data.initializer)
        losses = []
        
        for bi in range(train_size//batch_size): 
            
            """ Getting the next batch of data """
            try:
                # Get next batch of data
                b_filenames, b_img, b_cap, b_lbl = sess.run(tr_next_data)
                assert b_img.shape[0] == batch_size
            except:
                # If the Dataset runs out of data, re-initialize
                sess.run(tr_iter_data.initializer)
                b_filenames, b_img, b_cap, b_lbl = sess.run(tr_next_data)

            """ Defining inputs for the CNN training and training the CNN """
            inp_cnn, out_cnn = b_img, to_categorical(b_lbl, num_classes=20)
            cnn_model.train_on_batch(inp_cnn, out_cnn)
            latent_vecs = cnn_vec_model.predict(b_img, batch_size=batch_size)
            
            """ Defining inputs for the Sequential model and training the Sequential model """
            inp_full, out_full = [latent_vecs, b_cap[:,:-1]], to_categorical(b_cap, num_classes=vocab_size)
            model.train_on_batch(inp_full, out_full)
            
            """ Computing the model loss """
            l = model.evaluate(inp_full, out_full, batch_size=batch_size, verbose=0)
                        
            if bi==0 and (ep+1) % 10 == 0:

                """ Getting some predictions from the model """ 
                pred = model.predict(inp_full, batch_size=batch_size)
                word_inds = np.argmax(pred, axis=-1)
                true_word_inds = np.argmax(out_full, axis=-1)
                
                """ Generating the string to print based on model predictions """
                for fn, word_ind_vec, true_vec in zip(b_filenames, word_inds, true_word_inds):
                    string = fn[0].decode('utf-8')
                    string += ":\t" + ' '.join([tok.index_word[w] for w in word_ind_vec if w != 0])
                    string += ":\t" + " ".join([tok.index_word[w] for w in true_vec if w!=0])
                    print(string)
                    
            losses.append(l)
        
        """ Saving the model """
        if ep==1 or (ep+1)%25==0:
            model.save('caption_gen_{}.h5'.format(ep+1))
            cnn_vec_model.save('cnn_vec_{}.h5'.format(ep+1))
        
        """ Printing the losses """
        print("\nFinished epoch {}".format(ep+1))
        print("\tTrain loss: {}".format(np.mean(losses)))
        
        

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "



Finished epoch 1
	Train loss: 5.207840389675564

Finished epoch 2
	Train loss: 3.93342571788364

Finished epoch 3
	Train loss: 3.8227036661571927

Finished epoch 4
	Train loss: 3.6626402934392295

Finished epoch 5
	Train loss: 3.5685185856289334

Finished epoch 6
	Train loss: 3.3235139979256525

Finished epoch 7
	Train loss: 3.308411161104838

Finished epoch 8
	Train loss: 3.255168875058492

Finished epoch 9
	Train loss: 3.1307964060041638
dataset\sofa\2008_002903.jpg:	a UNK UNK in:	a living room
dataset\horse\2008_007319.jpg:	a UNK UNK UNK a UNK UNK a UNK:	two white horses one large the other smaller with UNK face look over a fence
dataset\bus\2008_007997.jpg:	a UNK UNK UNK UNK a UNK a:	a double decker bus viewed through a UNK of flag poles
dataset\chair\2008_002674.jpg:	a UNK UNK a UNK UNK UNK a UNK:	a woman in black rocking a new baby UNK in a blanket
dataset\diningtable\2008_008363.jpg:	a UNK UNK UNK a:	a UNK decorated dining room
dataset\bus\2008_005933.jpg:	a UNK UNK in a UNK a 


Finished epoch 20
	Train loss: 2.6982023451063366

Finished epoch 21
	Train loss: 2.6323389609654746

Finished epoch 22
	Train loss: 2.64070717493693

Finished epoch 23
	Train loss: 2.6861844460169473

Finished epoch 24
	Train loss: 2.6653649144702487

Finished epoch 25
	Train loss: 2.6201118628184

Finished epoch 26
	Train loss: 2.6341889434390597

Finished epoch 27
	Train loss: 2.6301302512486777

Finished epoch 28
	Train loss: 2.594937245051066

Finished epoch 29
	Train loss: 2.580762267112732
dataset\sheep\2008_008319.jpg:	a and a UNK:	children looking a sheep
dataset\dog\2008_007567.jpg:	a man and a UNK with on UNK a UNK:	a woman UNK a child that is in a UNK costume
dataset\car\2008_005960.jpg:	a UNK and UNK with on a camera a the background:	a UNK black car parked in the country with mountains in the distance
dataset\cat\2008_007039.jpg:	a man and cat is a UNK UNK a UNK a UNK UNK UNK:	a grey striped cat with white UNK and UNK atop a green patterned spread
dataset\person\2008_006


Finished epoch 40
	Train loss: 2.4086295631196766

Finished epoch 41
	Train loss: 2.4452763266033597

Finished epoch 42
	Train loss: 2.416387332810296

Finished epoch 43
	Train loss: 2.3938205507066517

Finished epoch 44
	Train loss: 2.3632485469182334

Finished epoch 45
	Train loss: 2.407769216431512

Finished epoch 46
	Train loss: 2.3526423374811807

Finished epoch 47
	Train loss: 2.33470626672109

Finished epoch 48
	Train loss: 2.3620620568593345

Finished epoch 49
	Train loss: 2.3321250279744468
dataset\pottedplant\2008_006112.jpg:	a UNK white with a UNK:	the small tree has some flower UNK
dataset\boat\2008_003951.jpg:	a people are on a UNK a UNK UNK the background:	two people stand on a shore with a UNK bridge in the background
dataset\motorbike\2008_006517.jpg:	a of a UNK of a UNK UNK UNK UNK UNK of:	shot from the top of a UNK motorcycle 's gas UNK UNK and front wheel
dataset\sheep\2008_008623.jpg:	a UNK is UNK on UNK white UNK white UNK the camera:	the sheep is UNK blue and pur


Finished epoch 60
	Train loss: 2.2846389479107327

Finished epoch 61
	Train loss: 2.241225759188334

Finished epoch 62
	Train loss: 2.248250060611301

Finished epoch 63
	Train loss: 2.2425759633382163

Finished epoch 64
	Train loss: 2.2659013271331787

Finished epoch 65
	Train loss: 2.2426288392808704

Finished epoch 66
	Train loss: 2.1785086923175387

Finished epoch 67
	Train loss: 2.164928436279297

Finished epoch 68
	Train loss: 2.199179106288486

Finished epoch 69
	Train loss: 2.1643281247880726
dataset\bottle\2008_003290.jpg:	a people women in a UNK of a camera a a UNK:	three young men in the UNK smile for the camera one holding a guitar
dataset\bird\2008_006924.jpg:	a women sitting sitting on a couch in head a the:	small bird is sitting on a person 's hand that has bird UNK in it
dataset\boat\2008_007953.jpg:	a small jet is on to a:	a tour boat docked next to flowers
dataset\diningtable\2008_007048.jpg:	a UNK UNK UNK room:	UNK 's UNK dining room
dataset\boat\2008_006925.jpg:	a U


Finished epoch 80
	Train loss: 2.07484726773368

Finished epoch 81
	Train loss: 2.1284261412090726

Finished epoch 82
	Train loss: 2.106028331650628

Finished epoch 83
	Train loss: 2.112757835123274

Finished epoch 84
	Train loss: 2.0958728194236755

Finished epoch 85
	Train loss: 2.082973407374488

Finished epoch 86
	Train loss: 2.092400007777744

Finished epoch 87
	Train loss: 2.0925099187427096

Finished epoch 88
	Train loss: 2.0860752794477673

Finished epoch 89
	Train loss: 2.0962478783395557
dataset\train\2008_006158.jpg:	a UNK the field:	train in a station
dataset\horse\2008_005642.jpg:	a black horse is UNK at the camera a the fence:	a white horse is looking at the camera from inside it 's pen
dataset\chair\2008_006841.jpg:	a a living with a UNK UNK and table and a a:	in a house with a UNK floor wooden table and chairs and glass patio door
dataset\dog\2008_004653.jpg:	a black and is on a UNK:	a black dog is in a forest
dataset\tvmonitor\2008_005625.jpg:	a man of people sitting 


Finished epoch 100
	Train loss: 2.0027016931109958

Finished epoch 101
	Train loss: 2.0672613779703775

Finished epoch 102
	Train loss: 2.007352292537689

Finished epoch 103
	Train loss: 1.9987101289961073

Finished epoch 104
	Train loss: 2.006197863154941

Finished epoch 105
	Train loss: 1.978498597939809

Finished epoch 106
	Train loss: 1.9788035882843866

Finished epoch 107
	Train loss: 1.9327590200636122

Finished epoch 108
	Train loss: 1.9967520766788058

Finished epoch 109
	Train loss: 1.9762057529555426
dataset\horse\2008_006096.jpg:	a in a on UNK to:	horse rides UNK the next UNK
dataset\aeroplane\2008_008373.jpg:	a passenger plane parked a UNK on:	a lufthansa plane has just taken off
dataset\bottle\2008_007737.jpg:	a sitting decker on:	girl double sitting beers
dataset\aeroplane\2008_005538.jpg:	a airplane airplane is a UNK on a UNK UNK UNK the UNK day:	an old airplane with yellow wings and a blue tail flying on a beautiful day
dataset\chair\2008_006691.jpg:	a UNK bus parked p


Finished epoch 120
	Train loss: 1.9116833872265286

Finished epoch 121
	Train loss: 1.9503306084209018

Finished epoch 122
	Train loss: 1.9358777867423163

Finished epoch 123
	Train loss: 1.9159788091977437

Finished epoch 124
	Train loss: 1.9558123879962497

Finished epoch 125
	Train loss: 1.9233237041367426

Finished epoch 126
	Train loss: 1.9329797824223836

Finished epoch 127
	Train loss: 1.902450402577718

Finished epoch 128
	Train loss: 1.9215673671828375

Finished epoch 129
	Train loss: 1.9086675975057814
dataset\tvmonitor\2008_002328.jpg:	a people UNK young women sitting on a living a UNK at the camera:	three dark haired young men sit in a UNK with one looking at his laptop
dataset\person\2008_001501.jpg:	a UNK UNK UNK the UNK a UNK:	UNK teenagers UNK into the air UNK for this UNK beach photo
dataset\tvmonitor\2008_002817.jpg:	a and with UNK UNK UNK UNK it desk chairs UNK of UNK UNK:	computer monitor and various UNK UNK on a desk and a UNK to the left
dataset\horse\2008_006096


Finished epoch 140
	Train loss: 1.8723714417881436

Finished epoch 141
	Train loss: 1.8768457902802362

Finished epoch 142
	Train loss: 1.8792097038692899

Finished epoch 143
	Train loss: 1.8359231352806091

Finished epoch 144
	Train loss: 1.8382729291915894

Finished epoch 145
	Train loss: 1.8827628095944722

Finished epoch 146
	Train loss: 1.8639415568775601

Finished epoch 147
	Train loss: 1.8455028269026015

Finished epoch 148
	Train loss: 1.8686481449339125

Finished epoch 149
	Train loss: 1.8791323237948947
dataset\bottle\2008_004487.jpg:	a and on a couch table:	laptop computer on a UNK covered table
dataset\sheep\2008_008109.jpg:	a black in a:	a sheep under UNK trees
dataset\tvmonitor\2008_007916.jpg:	a and chair a UNK:	computer lit by a lamp
dataset\car\2008_006087.jpg:	a black with on a horse a UNK of UNK a UNK:	a boy sits on a horse near a group of people and another horse
dataset\diningtable\2008_007402.jpg:	a man with a UNK shirt sits sitting a UNK a UNK table:	a woman wea


Finished epoch 160
	Train loss: 1.8445248537593417

Finished epoch 161
	Train loss: 1.7853726943333943

Finished epoch 162
	Train loss: 1.771051737997267

Finished epoch 163
	Train loss: 1.7948794960975647

Finished epoch 164
	Train loss: 1.7751034763124254

Finished epoch 165
	Train loss: 1.7826206882794697

Finished epoch 166
	Train loss: 1.8053379323747423

Finished epoch 167
	Train loss: 1.8099064297146268

Finished epoch 168
	Train loss: 1.8015383217069838

Finished epoch 169
	Train loss: 1.7981515659226313
dataset\pottedplant\2008_003144.jpg:	a small in a helmet child in a men UNK are at the camera:	a man with a small child and two other people looking at the water
dataset\horse\2008_008393.jpg:	a man up of a horse with a UNK UNK:	a close up of a horse wearing a blue halter
dataset\pottedplant\2008_007226.jpg:	a man of a UNK pot:	a UNK in a UNK pot
dataset\cow\2008_006290.jpg:	a cows are in a field field:	two cows standing in a large field
dataset\bus\2008_006635.jpg:	a man UNK 


Finished epoch 180
	Train loss: 1.758868071768019

Finished epoch 181
	Train loss: 1.7781336704889934

Finished epoch 182
	Train loss: 1.7318966719839308

Finished epoch 183
	Train loss: 1.7633169889450073

Finished epoch 184
	Train loss: 1.7733397218916152

Finished epoch 185
	Train loss: 1.7827845878071256

Finished epoch 186
	Train loss: 1.7073475586043463

Finished epoch 187
	Train loss: 1.7426876028378804

Finished epoch 188
	Train loss: 1.7771065500047472

Finished epoch 189
	Train loss: 1.7434345020188227
dataset\cat\2008_000227.jpg:	a man and is on a white chair:	a grey cat lying on a wooden table
dataset\motorbike\2008_008177.jpg:	a in with on UNK a:	parked motorcycles resting between rides
dataset\chair\2008_007691.jpg:	a orange UNK chair with:	an UNK wooden chair
dataset\bottle\2008_005560.jpg:	a and cat lying on a at:	white domestic cat sitting on window ledge looking outside
dataset\train\2008_006158.jpg:	a UNK a station:	train in a station
dataset\bicycle\2008_004592.jpg


Finished epoch 200
	Train loss: 1.7100438475608826

Finished epoch 201
	Train loss: 1.6624458697107103

Finished epoch 202
	Train loss: 1.7102025747299194

Finished epoch 203
	Train loss: 1.7173674636416965

Finished epoch 204
	Train loss: 1.679411358303494

Finished epoch 205
	Train loss: 1.711411111884647

Finished epoch 206
	Train loss: 1.690837926334805

Finished epoch 207
	Train loss: 1.701108243730333

Finished epoch 208
	Train loss: 1.6870535678333707

Finished epoch 209
	Train loss: 1.670275694794125
dataset\motorbike\2008_007344.jpg:	a UNK dog is sitting on to a UNK table front a UNK:	the brown dog is sitting next to a wall covered in graffiti and a motorcycle
dataset\bird\2008_002970.jpg:	a man bird and white bird is on a grassy:	a small red and grey bird perched in a tree
dataset\aeroplane\2008_005538.jpg:	a blue and white plane airplane the sky from the:	a yellow and white UNK in the sky viewed from below
dataset\bird\2008_005186.jpg:	a brown of a UNK UNK is a UNK:	a pictu


Finished epoch 220
	Train loss: 1.6640848517417908

Finished epoch 221
	Train loss: 1.6632948716481526

Finished epoch 222
	Train loss: 1.6624401410420735

Finished epoch 223
	Train loss: 1.6632674137751262

Finished epoch 224
	Train loss: 1.6763258907530043

Finished epoch 225
	Train loss: 1.6449776490529378

Finished epoch 226
	Train loss: 1.642408377594418

Finished epoch 227
	Train loss: 1.6522373027271695

Finished epoch 228
	Train loss: 1.6013350552982755

Finished epoch 229
	Train loss: 1.6350906822416518
dataset\car\2008_006827.jpg:	a UNK a dirt UNK a UNK dirt of a and and UNK UNK:	are crossing a rural highway and UNK the way of UNK cars and a truck
dataset\sofa\2008_004670.jpg:	a living room with:	a party room
dataset\motorbike\2008_002752.jpg:	a people are a men on a street day:	four people and two motorcycles on a sunny day
dataset\train\2008_003068.jpg:	a UNK UNK to UNK:	UNK locomotive speeding UNK the forest
dataset\aeroplane\2008_006621.jpg:	a is jet is UNK UNK open down


Finished epoch 240
	Train loss: 1.6383481489287481

Finished epoch 241
	Train loss: 1.6495753990279303

Finished epoch 242
	Train loss: 1.645359906885359

Finished epoch 243
	Train loss: 1.6106386449601915

Finished epoch 244
	Train loss: 1.603389581044515

Finished epoch 245
	Train loss: 1.5760747326744928

Finished epoch 246
	Train loss: 1.6070093115170796

Finished epoch 247
	Train loss: 1.611832042535146

Finished epoch 248
	Train loss: 1.6196213960647583

Finished epoch 249
	Train loss: 1.5914920038647122
dataset\tvmonitor\2008_007916.jpg:	a living UNK UNK a UNK a UNK UNK:	a UNK UNK from a lamp shows a desktop computer
dataset\dog\2008_004653.jpg:	a and haired dog sitting in front of a:	black long haired dog standing in middle of garden
dataset\person\2008_001501.jpg:	a man of people walking a beach the the air:	a row of children on a beach jumping in the air
dataset\bus\2008_005933.jpg:	a bus bus is UNK UNK to to be the afternoon:	the peaceful garden is an UNK place to enjoy the

## Loading the saved model

Infering from the model is not as straight-forward as training. In this we will define three separate models.
* `cnn_vec_model`: Given a batch of images, provides the image embeddings
* `embed_model`: Given a batch of words, outputs the embedding vectors
* `gru_model`: Given a batch of embeddings, ouput the predicted words

In [19]:

K.clear_session()
tf.reset_default_graph()

config=tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True

sess =  tf.Session(config=config)

# Loading the CNN model and GRU model
cnn_vec_model = load_model('cnn_vec_100.h5')
trained_model = load_model('caption_gen_100.h5')

for layer in trained_model.layers:    
    """ Restoring all the layers of the model. However the input layer needs to change to have only 1 timestep """
    
    if layer.name == 'word_input':
        test_word_input = Input(batch_shape=(batch_size, 1), name='infer_word_input')
    if layer.name=='embeddings':
        emb_layer = Embedding(vocab_size, input_dim, input_length=1, name='infer_embeddings')
        test_embed_out = emb_layer(test_word_input)
        
    if layer.name == 'gru':
        test_embed_input = Input(batch_shape=(batch_size, 1,input_dim), name='infer_embed_input')
        gru = GRU(128, return_sequences=True, stateful=True, name='infer_gru')
        test_gru_out = gru(test_embed_input)
    
    if layer.name == 'time_distributed':
        print(test_gru_out.shape)
        softmax = TimeDistributed(Dense(vocab_size, activation='softmax', name='infer_gru_fc'),name='infer_time_distributed')
        test_word_pred = softmax(test_gru_out)

""" Defining the embedding model """
embed_model = Model(inputs=test_word_input, outputs=test_embed_out)

""" Defining the GRU model """
gru_model = Model(inputs=test_embed_input, outputs=test_word_pred)

cnn_vec_model.summary()

# Setting weights of the layers in embed_model
for layer in embed_model.layers:
    mod_layer_name = layer.name[6:]
    if not layer.name == "infer_embed_input":
        layer.set_weights(trained_model.get_layer(mod_layer_name).get_weights())

embed_model.summary()

# Setting weights of the layers in the gru_model
for layer in gru_model.layers:
    mod_layer_name = layer.name[6:]
    if not layer.name == "infer_embed_input":
        layer.set_weights(trained_model.get_layer(mod_layer_name).get_weights())

gru_model.summary()

(48, 1, 128)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cnn_image_input (InputLayer) (48, 168, 168, 3)         0         
_________________________________________________________________
cnn_conv1 (Conv2D)           (48, 168, 168, 32)        896       
_________________________________________________________________
cnn_pool2 (MaxPooling2D)     (48, 84, 84, 32)          0         
_________________________________________________________________
cnn_conv3 (Conv2D)           (48, 84, 84, 64)          18496     
_________________________________________________________________
cnn_pool4 (MaxPooling2D)     (48, 42, 42, 64)          0         
_________________________________________________________________
cnn_conv5 (Conv2D)           (48, 42, 42, 64)          36928     
_________________________________________________________________
cnn_pool6 (MaxPooling2D)     (48, 21, 21, 64)          0       

## Inferring predictions from the loaded model

In [20]:
test_dataset = tf.data.Dataset().from_generator(
    partial_test_gen, output_types= (tf.string, tf.float32, tf.int32, tf.int32), 
    output_shapes=(tf.TensorShape([1]), tf.TensorShape([image_size, image_size, 3]), tf.TensorShape([timesteps]), tf.TensorShape([1]))
).batch(batch_size, drop_remainder=True)

ts_iter_data = test_dataset.make_initializable_iterator()
ts_next_data = ts_iter_data.get_next()

sess.run(ts_iter_data.initializer)

losses = []

for bi in range(test_size//batch_size):

    """ Get test data"""

    try:
        b_filenames, b_img, b_cap, _ = sess.run(ts_next_data)
        assert b_img.shape[0] == batch_size
    except:
        sess.run(ts_iter_data.initializer)
        b_filenames, b_img, b_cap, _ = sess.run(ts_next_data)

    """ Computing word predictions iteratively """
    latent_vecs = np.expand_dims(cnn_vec_model.predict(b_img, batch_size=batch_size),1) #+ np.random.normal(size=(batch_size,1,128))
    
    
    b_pred_words = []
    for ti in range(timesteps):
        word_probs = gru_model.predict(latent_vecs, batch_size=batch_size)[:,0,:]
        test_words = np.expand_dims(np.argmax(word_probs, axis=-1),-1)
        latent_vecs = embed_model.predict(test_words, batch_size=batch_size)
        b_pred_words.append(test_words)
    
    """ Combining the interative results to a single tensor """

    b_pred_words = np.concatenate(b_pred_words, axis=-1)
    gru_model.reset_states()

    """ Writing results to disk """

    print('\n','='*50, '\n')

    b_pred_strings = []
    for fn, p_vec in zip(b_filenames, b_pred_words):
        pred_string = fn[0].decode('utf-8') + ":" + ' '.join(tok.index_word[p] for p in p_vec if p != 0)+'\n'
        b_pred_strings.append(pred_string)
        print(pred_string)
    with open('predictions_{}.txt'.format(bi),'w') as f:
        f.writelines(b_pred_strings)

sess.close()



dataset\aeroplane\2008_003788.jpg:a blue and white plane flying in the sky

dataset\aeroplane\2008_005905.jpg:a small plane is parked on a UNK

dataset\aeroplane\2008_008044.jpg:a UNK UNK UNK UNK UNK

dataset\bicycle\2008_000090.jpg:a black and white cow with a large UNK in the background

dataset\bicycle\2008_004113.jpg:a man riding a bicycle

dataset\bicycle\2008_004363.jpg:a living room with a computer and a UNK UNK

dataset\bicycle\2008_005276.jpg:a man wearing a UNK UNK a UNK UNK

dataset\bicycle\2008_006064.jpg:a black and white photo of a UNK UNK UNK

dataset\bicycle\2008_007470.jpg:a black and white cat laying on a UNK

dataset\bird\2008_003087.jpg:a man riding a bicycle

dataset\bird\2008_003160.jpg:a UNK UNK UNK UNK UNK

dataset\bird\2008_003580.jpg:a UNK UNK UNK UNK UNK UNK

dataset\bird\2008_003997.jpg:a black and white cow in a field

dataset\bird\2008_007003.jpg:a close up of a black and white photo of a UNK UNK

dataset\bird\2008_007752.jpg:a man wearing a UNK UNK a UN