In [16]:
import string
from sklearn.model_selection import train_test_split
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.inception_v3 import preprocess_input
import numpy as np
import os
from os import listdir
from keras.layers import Input, Dense, LSTM, Dropout, Embedding, add
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import glob
from numpy import array
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [17]:
# Read the raw captions file into memory as one string (parsed in the next cell)
filename = "flicker8k-dataset/Flickr8k_text/Flickr8k.token.txt"
# The context manager closes the handle even if the read fails
with open(filename, 'r') as file:
    doc = file.read()

In [18]:
# Parse the captions file: every line is "<image file>#<n>\t<caption>".
# Captions are grouped per image id (file name up to the first '.').
descriptions = dict()
for line in doc.split('\n'):
    # Splitting the line by tab space
    tokens = line.split('\t')
    # Blank or malformed lines (e.g. the trailing newline of the file) carry no
    # caption; skip them so no empty '' image id ends up in the dictionary
    if len(tokens) < 2:
        continue
    # Storing image id and descriptions in different variables
    image_id, image_desc = tokens[0], tokens[1:]
    # Removing the extension of image type (and anything after it) from the image id
    image_id = image_id.split('.')[0]
    # Storing all the descriptions as one string
    image_desc = ' '.join(image_desc)
    descriptions.setdefault(image_id, []).append(image_desc)

In [19]:
# Cleaning the image captions in place: lower-case, strip punctuation,
# drop one-character tokens and tokens containing non-letters
table = str.maketrans('', '', string.punctuation)
for key, desc_list in descriptions.items():
    for i, desc in enumerate(desc_list):
        # Tokenize on whitespace, lower-case each token and remove its punctuation
        words = (word.lower().translate(table) for word in desc.split())
        # Keep only purely alphabetic tokens longer than one character
        # (this drops leftovers such as "s" and "a" as well as numbers)
        kept = [word for word in words if len(word) > 1 and word.isalpha()]
        # Storing the caption back as a single string
        desc_list[i] = ' '.join(kept)

In [20]:
# Collect every distinct word that appears in any cleaned caption
vocabulary = set()
for captions in descriptions.values():
    for caption in captions:
        vocabulary.update(caption.split())

# Report the size of the unfiltered vocabulary
print('Original Vocabulary Size: %d' % len(vocabulary))

Original Vocabulary Size: 8763


In [21]:
# Below path contains all the images
images = 'flicker8k-dataset/Flickr8k_Dataset/Flicker8k_Dataset/'
# Creating a list of all the image names in the directory.
# glob returns files in a filesystem-dependent order, so sort for reproducible runs.
img = sorted(glob.glob(images + '*.jpg'))

In [22]:
# Below file contains the names of images to be used in train data
train_images_file = 'flicker8k-dataset/Flickr8k_text/Flickr_8k.trainImages.txt'
# Reading the train image names in a set (the with-block closes the file handle)
with open(train_images_file, 'r') as split_file:
    train_images = set(split_file.read().strip().split('\n'))

# Creating a list of all the training images with their full path names.
# img contains full path names of all images; compare on the bare file name.
train_img = [path for path in img if os.path.basename(path) in train_images]

In [23]:
# Creating Development set same as the train set
dev_images_file = 'flicker8k-dataset/Flickr8k_text/Flickr_8k.devImages.txt'
# Reading the dev image names in a set (the with-block closes the file handle)
with open(dev_images_file, 'r') as split_file:
    dev_images = set(split_file.read().strip().split('\n'))

# Full path names of the images whose file name is listed in the dev split
dev_img = [path for path in img if os.path.basename(path) in dev_images]

In [24]:
# Creating Test set same as the train set
test_images_file = 'flicker8k-dataset/Flickr8k_text/Flickr_8k.testImages.txt'
# Reading the test image names in a set (the with-block closes the file handle)
with open(test_images_file, 'r') as split_file:
    test_images = set(split_file.read().strip().split('\n'))

# Full path names of the images whose file name is listed in the test split
test_img = [path for path in img if os.path.basename(path) in test_images]

In [25]:
# creating list to store image and corresponding captions
def caption_dataset(data):
    """Return ``[image_id, captions]`` pairs for the images named in ``data``.

    Reads the module-level ``descriptions`` dictionary and keeps the entries
    whose ``image_id + '.jpg'`` is contained in ``data``, in dictionary order.

    data: container of image file names (anything supporting ``in``).
    """
    return [
        [image_id, captions]
        for image_id, captions in descriptions.items()
        if image_id + '.jpg' in data
    ]

# Build the [image_id, captions] list for each split from its set of image file names
train_desc = caption_dataset(train_images) #list
dev_desc = caption_dataset(dev_images)
test_desc = caption_dataset(test_images)
# Show the first two training entries as a sanity check
print(train_desc[:2])

[['1000268201_693b08cb0e', ['child in pink dress is climbing up set of stairs in an entry way', 'girl going into wooden building', 'little girl climbing into wooden playhouse', 'little girl climbing the stairs to her playhouse', 'little girl in pink dress going into wooden cabin']], ['1001773457_577c3a7d70', ['black dog and spotted dog are fighting', 'black dog and tricolored dog playing with each other on the road', 'black dog and white dog with brown spots are staring at each other in the street', 'two dogs of different breeds looking at each other on the road', 'two dogs on pavement moving toward each other']]]


In [26]:
# Turn each [image_id, captions] list into a dictionary keyed by image id
# so captions can be looked up directly
train_description = {image_id: captions for image_id, captions in train_desc}
dev_description = {image_id: captions for image_id, captions in dev_desc}
test_description = {image_id: captions for image_id, captions in test_desc}

In [27]:
def get_vocab_size(description, word_count_threshold=10):
    """Return the words that occur often enough across all captions.

    Despite its name, this returns the vocabulary itself (a list of words),
    not its size; the size is only printed.

    description: dict mapping an image id to a list of caption strings.
    word_count_threshold: minimum number of occurrences a word needs in the
        whole corpus to be kept (default 10).

    Returns the kept words in order of first appearance.
    """
    from collections import Counter

    # split() rather than split(' '), so empty captions or repeated spaces
    # never contribute an empty-string "word" to the counts
    word_counts = Counter(
        word
        for captions in description.values()
        for caption in captions
        for word in caption.split()
    )

    vocab = [w for w, count in word_counts.items() if count >= word_count_threshold]

    print('preprocessed words %d ' % len(vocab))
    return vocab

In [28]:
# Adding 'start' and 'end' to all captions just to identify start and end of sentence.
# The dictionary is rebuilt from train_desc (the unwrapped captions) rather than
# updated in place, so re-running this cell does not wrap the captions twice.
train_description = {
    image_id: ['start ' + caption + ' end' for caption in captions]
    for image_id, captions in train_desc
}
print(train_description['1000268201_693b08cb0e'])

['start child in pink dress is climbing up set of stairs in an entry way end', 'start girl going into wooden building end', 'start little girl climbing into wooden playhouse end', 'start little girl climbing the stairs to her playhouse end', 'start little girl in pink dress going into wooden cabin end']


In [29]:
# Adding 'start' and 'end' to all captions just to identify start and end of sentence in dev data set.
# The dictionary is rebuilt from dev_desc (the unwrapped captions) rather than
# updated in place, so re-running this cell does not wrap the captions twice.
dev_description = {
    image_id: ['start ' + caption + ' end' for caption in captions]
    for image_id, captions in dev_desc
}
     

In [21]:
# Load InceptionV3 with its ImageNet weights
model = InceptionV3(weights='imagenet')
# Build an encoder that stops at the second-to-last layer,
# i.e. the network without its final output layer
model_new = Model(inputs=model.input, outputs=model.layers[-2].output)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
# Encode every training image with the truncated InceptionV3 model.
# Images are resized to 299x299 as expected by inception v3.
# The loop variable is deliberately not called `img`: that name holds the list of
# all image paths and would otherwise be overwritten with a single path string.
# NOTE(review): keys here are file names including '.jpg', while the caption
# dictionaries are keyed without the extension - confirm what data_generator receives.

train_features={}
for img_path in train_img:
    # load an image from file
    image = load_img(img_path, target_size=(299, 299))
    # convert the image pixels to a numpy array
    x = img_to_array(image)
    # Add one more dimension (batch axis)
    x = np.expand_dims(x, axis=0)
    # preprocess images using preprocess_input() from inception module
    x = preprocess_input(x)
    x = model_new.predict(x, verbose=0)
    # drop the batch axis so the feature is a flat vector
    feature = np.reshape(x, x.shape[1])
    train_features[os.path.basename(img_path)] = feature


print(len(train_features))


In [None]:
# Encode every dev image with the truncated InceptionV3 model (same steps as for train).
# The loop variable is deliberately not called `img`: that name holds the list of
# all image paths and would otherwise be overwritten with a single path string.
dev_features={}
for img_path in dev_img:
    # load an image from file
    image = load_img(img_path, target_size=(299, 299))
    # convert the image pixels to a numpy array
    x = img_to_array(image)
    # Add one more dimension (batch axis)
    x = np.expand_dims(x, axis=0)
    # preprocess images using preprocess_input() from inception module
    x = preprocess_input(x)
    x = model_new.predict(x, verbose=0)
    # drop the batch axis so the feature is a flat vector
    feature = np.reshape(x, x.shape[1])
    dev_features[os.path.basename(img_path)] = feature


print(len(dev_features),type(dev_features))

In [30]:
# Creating indexes for all words in vocabulary
def word_ix_fun(vocab):
    """Build word -> index and index -> word lookups for ``vocab``.

    Indices start at 1 and follow the iteration order of ``vocab``; index 0 is
    never assigned. Prints the number of indexed entries.

    Returns the pair ``(wordtoix, ixtoword)``.
    """
    wordtoix, ixtoword = {}, {}
    for position, token in enumerate(vocab, start=1):
        wordtoix[token] = position
        ixtoword[position] = token

    print(len(ixtoword))
    return wordtoix,ixtoword

In [31]:
# Converting dictionary to list
def dict_list(set1,set2):
    """Merge two dictionaries and return the result as ``[key, value]`` pairs.

    Entries of ``set2`` override entries of ``set1`` that share a key. Neither
    input is modified. Pairs follow the insertion order of the merged dict.
    """
    merged = dict(set1)
    merged.update(set2)
    return [[key, value] for key, value in merged.items()]

In [33]:
# Merge the train and dev caption dictionaries into one list of [image_id, captions] pairs.
# The equivalent merge for the image features is currently disabled:
#feature_list = dict_list(train_features,dev_features)
description_list = dict_list(train_description,dev_description)

In [39]:
# converting a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten ``{image_id: [captions]}`` into one list of caption strings.

    Captions keep the dictionary's key order and, within a key, list order.
    """
    return [caption for captions in descriptions.values() for caption in captions]

# calculating the length of the descriptions with the most words.
# The helper has its own name so that the integer `max_length` assigned below
# does not overwrite the function that computed it.
def max_caption_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``.

    descriptions: dict mapping an image id to a list of caption strings.
    Raises ValueError if there are no captions at all.
    """
    return max(len(caption.split()) for caption in to_lines(descriptions))

# determining the maximum sequence length over the train and dev captions
max_length = max(max_caption_length(train_description), max_caption_length(dev_description))
print('Max Description Length: %d' % max_length)
# Max Description Length: 34

Max Description Length: 34


In [40]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, wordtoix,max_length, num_photos_per_batch):
    """Endlessly yield training batches of (photo, caption prefix) -> next word.

    descriptions: dict mapping a key to a list of caption strings.
    photos: mapping from the same keys to a photo feature vector.
    wordtoix: dict mapping a word to its integer index; unknown words are skipped.
    max_length: length every caption prefix is padded to.
    num_photos_per_batch: number of photos whose samples make up one batch.

    Each yielded item is ``[[photo_array, prefix_array], target_array]``.
    The one-hot width comes from the module-level ``vocab_size``, which is not
    a parameter. The photo counter is not reset between passes over
    ``descriptions``, so a batch may span the end of one pass and the start
    of the next.
    """
    photo_rows, prefix_rows, target_rows = [], [], []
    photos_seen = 0

    # loop for ever over images
    while True:
        for image_key, captions in descriptions.items():
            photos_seen += 1
            # retrieve the photo feature
            photo = photos[image_key]
            for caption in captions:
                # encode the caption as word indices
                encoded = [wordtoix[token] for token in caption.split(' ') if token in wordtoix]
                # every prefix of the caption predicts the word that follows it
                for cut in range(1, len(encoded)):
                    # pad the input prefix to the fixed length
                    prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
                    # one-hot encode the next word
                    target = to_categorical([encoded[cut]], num_classes=vocab_size)[0]
                    photo_rows.append(photo)
                    prefix_rows.append(prefix)
                    target_rows.append(target)
            # emit the batch once enough photos were collected, then start over
            if photos_seen == num_photos_per_batch:
                yield [[array(photo_rows), array(prefix_rows)], array(target_rows)]
                photo_rows, prefix_rows, target_rows = [], [], []
                photos_seen = 0

In [23]:
# Load Glove vectors: each line of the file is a word followed by its coefficients
glove_dir = 'Glove/'
embeddings_index = {} # word -> float32 coefficient vector
vocab_size=10000
# The with-block closes the file even if a line fails to parse
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [24]:
#Get Glove embeddings dense vector for our vocabulary
def data_weight(wordtoix, embedding_dim=200, vocab_size=10000):
    """Build the embedding matrix for the vocabulary from the loaded GloVe vectors.

    Reads the module-level ``embeddings_index`` (word -> vector).

    wordtoix: dict mapping a word to its row index in the matrix.
    embedding_dim: length of each embedding vector (default 200).
    vocab_size: number of rows of the matrix (default 10000); every index in
        ``wordtoix`` must be smaller than this, otherwise indexing raises
        IndexError.

    Returns a ``(vocab_size, embedding_dim)`` array. Prints the embedding
    dimension.
    """
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in wordtoix.items():
        embedding_vector = embeddings_index.get(word)
        # Words not found in the embedding index keep their all-zero row
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print(embedding_matrix.shape[1])
    return embedding_matrix

In [26]:
# Captioning network: an image branch and a caption-prefix branch are projected
# to 256 units each, summed, and decoded into a softmax over the vocabulary.

# image feature extractor model
# takes one 2048-value vector per image
# (presumably the InceptionV3 feature computed earlier - TODO confirm)
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# partial caption sequence model
# takes max_length word indices per sample; mask_zero=True treats index 0 as padding
# NOTE(review): the Embedding layer is created without pretrained weights here -
# presumably the GloVe matrix from data_weight() is assigned later; confirm
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 200, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# decoder (feed forward) model
# element-wise sum of the two 256-unit branches, then one score per vocabulary entry
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# merge the two input models
# note: this rebinds `model`, which earlier referred to the InceptionV3 network
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [27]:
# Print the layer-by-layer summary of the captioning model defined above
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 34, 200)      2000000     input_4[0][0]                    
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 2048)         0           input_3[0][0]                    
____________________________________________________________________________________________