# Image Captioning Model

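This notebook covers the whole image-captioning pipeline: preparing the Flickr8k captions, encoding the images with VGG16, building and training the captioning model, and generating captions with greedy and beam search.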

# Imports

In [121]:
from collections import Counter
import os
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pickle
from pprint import pprint
from IPython.display import clear_output
import traceback


import keras
from keras import backend as K
from keras.applications.vgg16 import VGG16
from keras.callbacks import ModelCheckpoint, ProgbarLogger, TensorBoard, CSVLogger
from keras.layers import Dense, GlobalAveragePooling2D, LSTM, Embedding, TimeDistributed, \
                         RepeatVector, Merge, Activation, Flatten
from keras.models import Model, Sequential
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Viz
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG


rseed = 4444 # seed for every call that relies on randomness (e.g. the dataset shuffle)
EMBEDDING_DIM = 128 # output width of the dense layer that projects the encoded image features
GLOVE_DIM = 100 # width of the GloVe word vectors used by the embedding layer

%matplotlib inline

In [122]:
# Locations of the Flickr8k data and the GloVe vectors, relative to this notebook.
DATASET_PATH = '../Flickr8k_Dataset' # path to where the images are located
CAPTION_PATH = '../Flickr8k_Captions/Flickr8k.token.txt' # Captions
TRAIN_PATH = '../Flickr8k_Captions/Flickr_8k.trainImages.txt' # Ids of Images used for training
TEST_PATH = '../Flickr8k_Captions/Flickr_8k.testImages.txt' # Ids of images used for testing
GLOVE_DIR = '../glove.6B' # folder that holds the GloVe text files read by build_embedding_layer

# CaptionGenerator Class 

This class handles all of the work required to build and train the model. Two things are worth noting:
1. We add `<start>` and `<end>` tags to each caption and shrink the vocabulary by removing words that appear fewer than 5 times.
2. The model is assembled from three sequential sub-models: an image branch that takes the pre-computed VGG16 (CNN) features of each image, an LSTM language branch that reads the partial caption, and a final LSTM that combines the high-level features of both branches to predict the next word for a given image and partial caption.

In [113]:
class CaptionGenerator():
    """
    Prepares the caption data and builds the captioning model.

    On construction the class loads the pickled image encodings, reads the caption
    and train/test split files, tags and lower-cases the captions, drops rare words
    and builds the word <-> index lookups plus the sample statistics used by
    `data_generator`.
    """

    # words seen fewer than this many times over all captions are removed
    MIN_WORD_COUNT = 5

    # tokens that are always stripped from the captions, whatever their count
    ALWAYS_REMOVED = ('"', '\'', '\\(')

    def __init__(self):
        # instantiated -> variable_initializer
        # maintains the largest caption size and size of the vocabulary
        self.max_cap_len = None
        self.vocab_size = None

        # dictionaries that map an index to word and vice versa
        self.index_word = None
        self.word_index = None

        # tracks the number of training and test samples
        self.total_samples = None
        self.training_samples = None
        self.test_samples = None

        # stores the df used to keep track of the samples
        self.df = None

        # Pre-computed image encodings, looked up with `.loc[image_name]` in data_generator.
        # NOTE: unpickling can run arbitrary code -- only load a file you created yourself.
        with open("encoded_images.pkl", "rb") as f:
            self.encoded_images = pickle.load(f)

        # starts the process! good luck!
        self.variable_initializer()

    @staticmethod
    def _split_vocabulary(captions, min_count, always_removed):
        """
        Counts the whitespace separated words of `captions` and returns `(keep, kill)`:
        the set of words seen at least `min_count` times and the set of words to strip
        from the captions (the rare words plus everything in `always_removed`).
        """
        word_count = Counter()
        for caption in captions:
            word_count.update(caption.split())

        keep = set()
        kill = set(always_removed)
        for word, count in word_count.items():
            if count >= min_count:
                keep.add(word)
            else:
                kill.add(word)
        return keep, kill

    @staticmethod
    def _count_samples(captions):
        """
        Returns the number of (partial caption -> next word) samples in `captions`:
        every caption contributes one sample per word except its first one.
        """
        return sum(len(caption.split()) - 1 for caption in captions)

    def _index_vocabulary(self, words):
        """
        Fills `vocab_size`, `word_index` and `index_word` from the collection `words`.

        The words are sorted before being numbered. Numbering a set directly depends
        on string hashing, which changes between interpreter sessions, so weights
        saved in one session would not match the indices of the next one.
        """
        ordered = sorted(words)
        self.vocab_size = len(ordered)
        self.word_index = {word: i for i, word in enumerate(ordered)}  # word  -> index
        self.index_word = dict(enumerate(ordered))                     # index -> word

    def variable_initializer(self):
        """
        This prepares the caption data, and builds the information declared within the class init
        """
        # load the df and clean the pound signs at the end of the images
        self.df = pd.read_csv(CAPTION_PATH,
                              sep='\t',
                              header=None,
                              names=['image', 'caption'])
        # the pattern is a regular expression; say so explicitly because newer pandas
        # versions treat the pattern as a literal string by default
        self.df['image'] = self.df.image.str.replace(r'#\d$', '', regex=True)

        train_df = pd.read_csv(TRAIN_PATH,
                               header=None,
                               names=['image'])
        test_df = pd.read_csv(TEST_PATH,
                              header=None,
                              names=['image'])

        # add 'train' and 'test' labels
        # then merge the training and test set into a single df
        train_df = pd.merge(train_df, self.df, on='image')
        train_df['label'] = 'train'
        test_df = pd.merge(test_df, self.df, on='image')
        test_df['label'] = 'test'
        self.df = pd.concat([train_df, test_df])

        # Preprocess the text by:
        #  - adding in the start and end tags,
        #  - converting everything to lowercase
        #  - spelling out the pound sign
        self.df['caption'] = self.df.caption.apply(lambda cap: '<start> ' + cap + ' <end>')
        self.df['caption'] = self.df.caption.str.lower()
        self.df['caption'] = self.df.caption.str.replace('#', 'number', regex=False)

        # shuffle the dataset
        self.df = self.df.sample(frac=1, random_state=rseed)

        # This builds our vocabulary. We use this to reference what index means what word, and vice versa.
        # A token from ALWAYS_REMOVED that is frequent enough still gets a vocabulary
        # slot even though it is stripped from the captions.
        unique, kill = self._split_vocabulary(self.df.caption.tolist(),
                                              self.MIN_WORD_COUNT,
                                              self.ALWAYS_REMOVED)

        # remove all the words below the count threshold
        print('Removing all the words below the count threshold')
        self.df['caption'] = self.df.caption.apply(
            lambda x: " ".join([word for word in x.split() if word not in kill]))

        # be able to look up corresponding index to word and vice versa
        self._index_vocabulary(unique)

        # the captions as they look after the rare words were removed
        caps = self.df.caption.tolist()

        print('Building statistics for our dataset')
        # Determines what the largest caption is (in words)
        self.max_cap_len = max((len(caption.split()) for caption in caps), default=0)

        # This calculates the total, training and test samples (aka observations).
        # This data is used to size the training run, and we remove 1 per caption because
        # when building the dataset for training and testing, we always guess UP to the last word.
        self.total_samples = self._count_samples(caps)
        print("Total samples :", self.total_samples)

        self.training_samples = self._count_samples(self.df[self.df.label == 'train'].caption)
        print("Training samples:", self.training_samples)

        self.test_samples = self._count_samples(self.df[self.df.label == 'test'].caption)
        print("Test samples:", self.test_samples)

        print("Vocabulary size:", self.vocab_size)
        print("Maximum caption length:", self.max_cap_len)
        print("Variables initialization done!")

    def create_model(self, ret_model=False, include_base=False):
        """
        Builds the captioning network.

        ret_model    -- when truthy the model is returned before it is compiled
                        (enough for loading weights and predicting)
        include_base -- accepted for compatibility; it is not used by this method

        Returns the Keras model, compiled with categorical cross-entropy and adam
        unless `ret_model` is set.
        """
        # Handles the image encoded features (expects 4096 values per image)
        image_model = Sequential()
        image_model.add(Dense(EMBEDDING_DIM, input_dim = 4096, activation='relu'))
        image_model.add(RepeatVector(self.max_cap_len))

        # Initial Embedding of the Language
        lang_model = Sequential()
        lang_model.add(self.build_embedding_layer())
        lang_model.add(LSTM(256, return_sequences=True))
        lang_model.add(TimeDistributed(Dense(GLOVE_DIM)))

        # The final layer: both branches are concatenated and fed to an LSTM that
        # outputs a softmax over the vocabulary
        model = Sequential()
        model.add(Merge([image_model, lang_model], mode='concat'))
        model.add(LSTM(1000, return_sequences=False, dropout=0.2))
        model.add(Dense(self.vocab_size, activity_regularizer=keras.regularizers.l2()))
        model.add(Activation('softmax'))

        print("Model created!")

        if ret_model:
            return model

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def build_embedding_layer(self):
        """
        Returns a frozen Embedding layer whose rows are the GloVe vectors of the
        vocabulary words; words without a GloVe vector keep an all-zeros row.
        """
        print('Building embedding layer.')
        embeddings_index = {}
        # the file name follows GLOVE_DIM so the vectors always match the matrix width;
        # read it as UTF-8 instead of relying on the platform default encoding
        glove_file = os.path.join(GLOVE_DIR, 'glove.6B.%dd.txt' % GLOVE_DIM)
        with open(glove_file, encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        print('Found %s word vectors within glove.' % len(embeddings_index))

        embedding_matrix = np.zeros((self.vocab_size, GLOVE_DIM))
        for word, index in self.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[index] = embedding_vector

        embedding_layer = Embedding(self.vocab_size,
                                    GLOVE_DIM,
                                    weights=[embedding_matrix],
                                    input_length=self.max_cap_len,
                                    trainable=False)

        return embedding_layer

    def data_generator(self, data='train', batch_size = 32):
        """
        Endless generator of training batches.

        data       -- value of the `label` column to draw from ('train' or 'test')
        batch_size -- number of samples per yielded batch

        Every caption is unrolled into (image, partial caption) -> next word samples.
        Yields `[[images, partial_caps], next_words]` where `partial_caps` is padded
        to `max_cap_len` and `next_words` is one-hot over the vocabulary.
        Raises ValueError when no caption carries the requested label.
        """
        partial_caps = [] # our features, it holds a list of partial captions
        next_words = [] # our targets, holds the one-hot vector of the word following each partial caption
        images = [] # the image encoding that goes with each partial caption
        batch_count = 0 # maintains the number of batches we've built

        print("Generating data...")

        # get the training data or test data
        subset = self.df[self.df.label == data]
        imgs = subset.image.tolist()   # image names
        caps = subset.caption.tolist() # captions

        # without any caption the loop below would spin forever without yielding
        if not caps:
            raise ValueError("No captions are labelled {!r}".format(data))

        # We want this to continually run while our model is training!
        # This continuously returns batches
        while True:
            current_image = ''

            # cycle through the captions data, one full caption at a time
            for img_name, text in zip(imgs, caps):

                # make sure we're not looking up the same image constantly
                if current_image != img_name:
                    current_image = img_name
                    img_encoding = self.encoded_images.loc[current_image]

                # the caption as a list of word indices
                # NOTE(review): word indices start at 0 and pad_sequences is called with its
                # default pad value, so padding may be indistinguishable from word 0 -- confirm.
                token_ids = [self.word_index[word] for word in text.split()]

                # cycle through the caption up until the last word, as we'll
                # need that one for our prediction.
                for i in range(len(token_ids) - 1):

                    # the partial caption seen so far
                    partial_caps.append(token_ids[:i+1])

                    # the word that follows it, flipped to 1 at its respective index
                    next_word = np.zeros(self.vocab_size)
                    next_word[token_ids[i+1]] = 1

                    next_words.append(next_word)
                    images.append(img_encoding)

                    # Check if we hit the batch size, and return the features (X) and targets (y).
                    if len(partial_caps) >= batch_size:

                        batch_count += 1
                        if batch_count % 50 == 0:
                            with open("batch_watch.log", "a") as f:
                                f.write("Training on Batch #: {}\n".format(batch_count))

                        # prepare data for Neural Net by creating numpy arrays and pad the partial
                        # captions so that they're all as long as the largest caption.
                        yield [[np.asarray(images),
                                pad_sequences(partial_caps, maxlen=self.max_cap_len, padding='post')],
                               np.asarray(next_words)]

                        # reset the feature variables
                        partial_caps = []
                        next_words = []
                        images = []

    def load_image(self, path):
        """Loads the image at `path`, resized to 224x224, as a numpy array."""
        img = image.load_img(path, target_size=(224,224))
        x = image.img_to_array(img)
        return np.asarray(x)

    def get_word(self,index):
        """Returns the vocabulary word stored under `index`."""
        return self.index_word[index]

In [114]:
# Builds the generator; the constructor loads encoded_images.pkl and prepares the caption data.
caption_gen = CaptionGenerator()

Removing all the words below the count threshold
Building statistics for our dataset
Total samples : 438393
Training samples: 375515
Test samples: 62878
Vocabulary size: 2751
Maximum caption length: 39
Variables initialization done!


In [116]:
# use this to check on the vocabulary
# a = Counter()
# for i in caption_gen.df.iterrows():
#     a.update([word for word in i[1][1].split()])
# a
## caption_gen.df[caption_gen.df.caption.str.contains("")]

# Encode Images

Encode every image with VGG16 (or any other base model) before feeding it to the captioning model. Here the encoding is the output of `fc2`, the last fully connected layer before VGG16's classification layer.

In [20]:
# Full VGG16 network, truncated at `layer_name` so that predict() returns that
# layer's activations instead of the class scores.
encoding_model = VGG16()
layer_name = 'fc2'
intermediate_layer_model = Model(inputs=encoding_model.input,
                                 outputs=encoding_model.get_layer(layer_name).output)

In [None]:
# Renders the layer graph of the encoding model (with tensor shapes) as an SVG.
SVG(model_to_dot(encoding_model, show_shapes=True).create(prog='dot', format='svg'))

In [21]:
def encode_img(img_path, model=intermediate_layer_model):
    """
    Encodes one image of the dataset folder with `model`.

    img_path -- file name of the image, relative to DATASET_PATH
    model    -- encoder to use; defaults to the truncated VGG16 built above

    Returns the flattened prediction with a leading batch axis of size 1.
    """
    full_path = DATASET_PATH + '/' + img_path
    pixels = image.img_to_array(image.load_img(full_path, target_size=(224,224)))

    # the model expects a batch, so the single image gets a leading axis
    batch = np.asarray(pixels)[np.newaxis,:,:,:]
    features = model.predict(batch)
    return features.flatten()[np.newaxis,:]

In [None]:
# Runs every distinct image of the caption table through the truncated VGG16 and
# collects [image name, flattened encoding] pairs.
encoded_images = []
for index, img_name in enumerate(caption_gen.df.image.unique()):
    print('Encoding image:', index)
    img = caption_gen.load_image(DATASET_PATH + '/' + img_name)
    img = img[np.newaxis,:,:,:] # predict() expects a batch, so add a leading axis
    pred = intermediate_layer_model.predict(img).flatten()
    
    encoded_images.append([img_name, pred])

# Store the encodings in a frame indexed by image name and pickle it; this is the
# file CaptionGenerator loads in its constructor.
df = pd.DataFrame(encoded_images, columns=['image', 'encoding'])
df = df.sort_values('image')
df = df.set_index('image')
# df = df.encoding.apply(lambda x: x.flatten()) # this won't be needed
df.to_pickle('encoded_images.pkl')

# Training the model

This handles the training for the model. If you need to pick up where you left off, pass in a saved weights file and set the initial epoch so that training resumes from that point.

In [118]:
def train_model(cg, model, weight=None, batch_size=32, epochs=10, initial_epoch=0):
    """
    Trains `model` on the batches produced by `cg.data_generator`.

    cg            -- CaptionGenerator supplying the data and the sample counts
    model         -- compiled Keras model to train
    weight        -- optional path of a weights file to load before training
    batch_size    -- samples per batch
    epochs        -- epoch number at which training stops
    initial_epoch -- epoch number to start from (use together with `weight` to resume)

    Errors raised while training or saving are appended to error.log rather than
    propagated, and whatever state the model reached is still saved afterwards.
    Returns nothing.
    """
    if weight is not None:
        model.load_weights(weight)

    # location of where your model weights will be stored
    file_name = './weights_reduced_vocab/glove_weights.{epoch:02d}-{loss:.2f}.hdf5'

    # make sure the output folders exist before anything tries to write into them
    os.makedirs('./weights_reduced_vocab', exist_ok=True)
    os.makedirs('../Models', exist_ok=True)

    # Callbacks for tracking the model during training
    checkpoint = ModelCheckpoint(file_name, monitor='loss', verbose=1, save_best_only=False, mode='min')
    tboard = TensorBoard(log_dir='./logs_reduced_vocab', histogram_freq=2,
                         write_graph=True, write_images=False)
    csv_logger = CSVLogger('keras_reduced_vocab', separator=',', append=True)
    callbacks_list = [checkpoint, tboard, csv_logger]

    # Whole number of batches needed to see every sample once. Plain `/` produces a
    # float in Python 3, while the step counts are meant to be integers.
    train_steps = (cg.training_samples + batch_size - 1) // batch_size
    test_steps = (cg.test_samples + batch_size - 1) // batch_size

    # Pass in the data generator and train
    trained = True
    try:
        model.fit_generator(cg.data_generator(batch_size=batch_size, data='train'),
                            steps_per_epoch=train_steps,
                            validation_data=cg.data_generator(batch_size=batch_size, data='test'),
                            validation_steps=test_steps,
                            epochs=epochs,
                            verbose=2,
                            callbacks=callbacks_list,
                            initial_epoch=initial_epoch)
    except Exception:
        # this will catch and log any issues you have during training
        trained = False
        with open('error.log', 'a') as f:
            f.write(traceback.format_exc())
        print("Training failed, see error.log for the traceback.")

    # Save your work! (also after a failed run, so that partial progress is kept)
    try:
        model.save('../Models/WholeModel.h5', overwrite=True)
        model.save_weights('../Models/Weights.h5',overwrite=True)
    except Exception:
        # keep the traceback so the cause of the failed save can be found
        with open('error.log', 'a') as f:
            f.write("Error in saving model.\n")
            f.write(traceback.format_exc())
        print("Error in saving model.")

    if trained:
        print("Training complete...\n")

In [119]:
# Builds and compiles the captioning model (create_model compiles unless ret_model is set).
caption_model = caption_gen.create_model()

Building embedding layer.
Found 400000 word vectors within glove.




Model created!


In [None]:
# Renders the layer graph of the captioning model (with tensor shapes) as an SVG.
SVG(model_to_dot(caption_model, show_shapes=True).create(prog='dot', format='svg'))

In [None]:
# Prints the summary of the captioning model.
caption_model.summary()

In [None]:
# Resumes training from a stored checkpoint and runs up to epoch 50.
# NOTE(review): going by train_model's file name pattern the weights file is from
# epoch 18, yet initial_epoch is 20 -- confirm that this gap is intended.
train_model(caption_gen, caption_model,
            weight='./weights_reduced_vocab/glove_weights.18-7.81.hdf5',
            batch_size=64, epochs=50, initial_epoch=20)

INFO:tensorflow:Summary name lstm_8/kernel:0 is illegal; using lstm_8/kernel_0 instead.
INFO:tensorflow:Summary name lstm_8/recurrent_kernel:0 is illegal; using lstm_8/recurrent_kernel_0 instead.
INFO:tensorflow:Summary name lstm_8/bias:0 is illegal; using lstm_8/bias_0 instead.
INFO:tensorflow:Summary name dense_12/kernel:0 is illegal; using dense_12/kernel_0 instead.
INFO:tensorflow:Summary name dense_12/bias:0 is illegal; using dense_12/bias_0 instead.
Epoch 21/50Generating data...



# Testing the Model

This contains a list of functions used in testing the captions produced by the model.

In [None]:
def predict_caption(cg, model, image):
    """
    Greedy predictor: at every step it takes the single most probable next word.

    cg    -- caption generator providing `word_index`, `index_word` and `max_cap_len`
    model -- captioning model; `predict([image, partial_caption])` must return one
             probability vector over the vocabulary per input row
    image -- encoded image, in the form the model expects as its first input

    Returns the predicted words joined by spaces, without the leading <start> tag.
    Prediction stops as soon as <end> is produced (the tag is kept as the last word)
    or when the caption has reached `cg.max_cap_len` words.
    """
    start = cg.word_index['<start>']
    end = cg.word_index.get('<end>')
    captions = [start]

    # this will cycle through the sequence until we hit the maximum caption length.
    while len(captions) < cg.max_cap_len:
        # pad the sequence so that it fits the input
        partial_caption = pad_sequences([captions], maxlen=cg.max_cap_len, padding='post')

        # pass the image and caption into the model
        next_word_pred = model.predict([image, partial_caption])[0]

        # get the most probable word
        next_word = int(np.argmax(next_word_pred))
        captions.append(next_word)

        # anything predicted after the end tag is not part of the caption
        if next_word == end:
            break

    return " ".join(cg.index_word[word] for word in captions[1:])

In [125]:
def beam_search(cg, model, image, beam_size):
    """
    The idea behind beam search here is that we wish to maximize the probability of a given caption (and get more
    natural sounding ones!) by doing a bit deeper of search into the tree of captions. Opposed
    to doing something exhaustive, we keep only some (aka beam size) of the best candidates
    after every step and continue to build predictions from those.

    Check out this video for an idea of what beam search does:
    https://www.youtube.com/watch?v=UXW6Cs82UKo

    cg        -- caption generator providing `word_index` and `max_cap_len`
    model     -- captioning model; `predict([image, partial_caption])` must return one
                 probability vector over the vocabulary per input row
    image     -- encoded image, in the form the model expects as its first input
    beam_size -- number of candidate captions kept after every step (at least 1)

    Returns a list of at most `beam_size` entries `[word_indices, score]`, sorted from
    worst to best. The score is the sum of the log-probabilities of the chosen words,
    i.e. the log of the caption's probability, so it is never positive. Captions that
    produced <end> are kept as they are and not extended any further.
    Raises ValueError when `beam_size` is smaller than 1.
    """
    # with 0 the `[-beam_size:]` slices below would keep *every* candidate
    if beam_size < 1:
        raise ValueError("beam_size must be at least 1, got {!r}".format(beam_size))

    # this is setting the initial sequence to use the start tag, and the log-probability of the caption be 0
    start = cg.word_index['<start>']
    end = cg.word_index.get('<end>')
    captions = [[[start], 0.0]]

    # every step adds one word, so max_cap_len - 1 steps fill a caption completely
    for _ in range(cg.max_cap_len - 1):
        temp_captions = []
        extended = False

        for caption, score in captions:
            # a finished caption competes with its final score
            if caption[-1] == end:
                temp_captions.append([caption, score])
                continue
            extended = True

            # pad the sequence so that it fits the input
            partial_caption = pad_sequences([caption], maxlen=cg.max_cap_len, padding='post')
            next_words_pred = model.predict([image, partial_caption])[0]

            # sort, and get the indices of highest probable words (the values at the end of the list)
            next_words = np.argsort(next_words_pred)[-beam_size:]

            for word in next_words:
                # Probabilities of consecutive words multiply, so their logs add up.
                # The floor keeps log() finite for a probability of exactly 0.
                word_prob = max(float(next_words_pred[word]), 1e-12)
                temp_captions.append([caption + [int(word)], score + float(np.log(word_prob))])

        # every caption has ended, nothing is left to extend
        if not extended:
            break

        temp_captions.sort(key = lambda l:l[1])
        captions = temp_captions[-beam_size:]

    return captions

In [124]:
def get_best_caption(captions, cg=None):
    """
    Used to filter for the best caption given back from the beam search function and turn the caption
    into words

    captions -- non-empty list of `[word_indices, score]` entries; it is left untouched
    cg       -- object whose `index_word` translates indices into words; defaults to
                the notebook's global `caption_gen`

    Returns the words of the highest scoring entry joined by spaces (on ties the
    entry that comes last in `captions` wins).
    Raises ValueError when `captions` is empty.
    """
    if not captions:
        raise ValueError("captions is empty, there is no best caption to pick")
    if cg is None:
        cg = caption_gen

    # max() keeps the first maximum it meets; scanning in reverse therefore picks the
    # last of several equal scores without re-ordering the caller's list
    best_caption = max(reversed(captions), key=lambda l: l[1])[0]
    return " ".join([cg.index_word[index] for index in best_caption])

In [123]:
def process_caption(caption):
    """
    Removes the <start> and <end> tags from a single caption (Expects a string)

    A leading <start> tag is dropped, and the caption is cut at the first <end> tag,
    which also discards everything that follows it. A caption without these tags is
    returned with its words unchanged.

    Returns the remaining words joined by single spaces.
    """
    words = caption.split()

    # removes the <start> tag, but only if it is really there
    if words and words[0] == '<start>':
        words = words[1:]

    # cut at the first <end> tag; a caption without one is kept in full
    if '<end>' in words:
        words = words[:words.index('<end>')]

    return " ".join(words)

In [None]:
# Rebuilds the network for inference and loads trained weights into it.
# ret_model=True returns the model before it is compiled.
weight = 'weights_reduced_vocab/glove_weights.49-7.80.hdf5'
test_model = caption_gen.create_model(ret_model=True)
test_model.load_weights(weight)

In [150]:
# Sample image file names (relative to DATASET_PATH) for trying out the captioning.
test_image1 = '112178718_87270d9b4d.jpg'
test_image2 = '667626_18933d713e.jpg' # girl laying on water
test_image3 = '23445819_3a458716c1.jpg' # dogs playing on grass
test_image4 = '172097782_f0844ec317.jpg'
test_image5 = '242064301_a9d12f1754.jpg'
test_image6 = '289599470_cc665e2dfb.jpg'

In [None]:
# Captions the first sample image: encode it, beam-search with 7 beams, then keep
# the best scoring caption and strip its <start>/<end> tags.
pred = beam_search(caption_gen, test_model, encode_img(test_image1), beam_size=7)
caption = process_caption(get_best_caption(pred))
caption

In [None]:
# test_model_on_images(caption_gen, test_model, beam_size=3)

# Testing out Functions

In [None]:
def test_model_on_images(cg, model, beam_size = 3, image = None):
    """
    Captions one encoded image with beam search and returns the best caption as text.

    cg        -- caption generator providing `word_index`, `index_word` and `max_cap_len`
    model     -- captioning model used for the predictions
    beam_size -- number of candidate captions kept after every step
    image     -- encoded image, in the form the model expects as its first input

    Returns the words of the best scoring caption joined by spaces, without the
    first word (the <start> tag).
    Raises ValueError when no image is given.

    The search itself is delegated to `beam_search`.
    """
    # The earlier draft read an `image` that was never passed in, so it had nothing to caption.
    if image is None:
        raise ValueError("test_model_on_images needs an encoded image, pass it as `image=`")

    captions = beam_search(cg, model, image, beam_size)

    # every entry is [word_indices, score]; the best one has the highest score
    best_sequence = max(captions, key = lambda l:l[1])[0]

    return " ".join(cg.index_word[word] for word in best_sequence[1:])

In [None]:
def beam_prediction(cg, model, image, beam_size=3):
    """
    Beam-search predictor that returns the best caption as text.

    cg        -- caption generator providing `word_index`, `index_word` and `max_cap_len`
    model     -- captioning model used for the predictions
    image     -- encoded image, in the form the model expects as its first input
    beam_size -- number of candidate captions kept after every step

    Returns the words of the best scoring caption joined by spaces, without the
    first word (the <start> tag).

    The search is delegated to `beam_search`, which replaces the earlier hand-rolled loop.
    """
    captions = beam_search(cg, model, image, beam_size)

    # every entry is [word_indices, score]; take the sequence with the highest score
    best_sequence = max(captions, key=lambda x: x[1])[0]

    return " ".join(cg.index_word[word] for word in best_sequence[1:])

In [None]:
def get_all_captions(captions, cg=None):
    """
    Turns every caption given back from the beam search function into words.

    captions -- list of `[word_indices, score]` entries; it is left untouched
    cg       -- object whose `index_word` translates indices into words; defaults to
                the notebook's global `caption_gen`

    Returns a new list of `[text, score]` entries sorted by ascending score.
    """
    if cg is None:
        cg = caption_gen

    # sorted() works on a copy, so the caller's list keeps its order
    ranked = sorted(captions, key = lambda l:l[1])

    return [[" ".join([cg.index_word[index] for index in caption[0]]), caption[1]]
            for caption in ranked]