In [1]:
import string
from sklearn.model_selection import train_test_split
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.inception_v3 import preprocess_input
import numpy as np
import os
from os import listdir
from keras.layers import Input, Dense, LSTM, Dropout, Embedding, add
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import glob
from numpy import array
import matplotlib.pyplot as plt
import random
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Read the whole captions file into memory as one string (parsed in the next cell)
filename = "flicker8k-dataset/Flickr8k_text/Flickr8k.token.txt"
# The context manager closes the file handle as soon as the text has been read
with open(filename, 'r') as file:
    doc = file.read()

In [3]:
# Parse the captions text line by line. Each usable line is tab-separated:
# an image identifier first, followed by the caption text.
# Result: dict mapping image id (text before the first '.') -> list of captions.
descriptions = dict()
for line in doc.split('\n'):
    # Splitting the line by tab space
    tokens = line.split('\t')
    # Skip blank or tab-less lines (e.g. the empty string left after a trailing
    # newline); otherwise they would register a bogus '' image id with an empty caption
    if len(tokens) < 2:
        continue
    # Storing image id and descriptions in different variables
    image_id, image_desc = tokens[0], tokens[1:]
    # Removing the extension of image type from the image id
    image_id = image_id.split('.')[0]
    # Storing all the descriptions as one string
    image_desc = ' '.join(image_desc)
    # Create the caption list on first sight of this image id, then append
    descriptions.setdefault(image_id, []).append(image_desc)

In [4]:
# Normalise every caption in place: lower-case it, strip punctuation, and drop
# tokens that are a single character long or contain anything but letters.
table = str.maketrans('', '', string.punctuation)
for captions in descriptions.values():
    for idx, caption in enumerate(captions):
        # Lower-case each whitespace-separated token and delete punctuation characters
        normalised = (token.lower().translate(table) for token in caption.split())
        # Keep tokens longer than one character (drops leftovers such as "s" and "a")
        # that consist solely of alphabetic characters (drops tokens with digits)
        kept = [word for word in normalised if len(word) > 1 and word.isalpha()]
        # Store the cleaned caption back as a single space-separated string
        captions[idx] = ' '.join(kept)

In [5]:
# Gather every distinct word that appears in any cleaned caption
vocabulary = set()
for captions in descriptions.values():
    for caption in captions:
        vocabulary.update(caption.split())

# Report how many unique words were found
print('Original Vocabulary Size: %d' % len(vocabulary))

Original Vocabulary Size: 8763


In [6]:
# Directory that holds all the images (note the trailing separator)
images = 'flicker8k-dataset/Flickr8k_Dataset/Flicker8k_Dataset/'
# Paths of every .jpg file found directly inside that directory
img = glob.glob('{}*.jpg'.format(images))

In [7]:
# Below file contains the names of images to be used in train data (one per line)
train_images_file = 'flicker8k-dataset/Flickr8k_text/Flickr_8k.trainImages.txt'
# Reading the train image names in a set; the context manager closes the file
with open(train_images_file, 'r') as names_file:
    train_images = set(names_file.read().strip().split('\n'))

# Creating a list of all the training images with their full path names.
# img contains full path names of all images; os.path.basename extracts the
# file name without relying on the exact length of the directory prefix.
train_img = [path for path in img if os.path.basename(path) in train_images]

In [8]:
# File listing the names of images that belong to the development set (one per line)
dev_images_file = 'flicker8k-dataset/Flickr8k_text/Flickr_8k.devImages.txt'
# Reading the dev image names in a set; the context manager closes the file
with open(dev_images_file, 'r') as names_file:
    dev_images = set(names_file.read().strip().split('\n'))

# Full paths of the images in the development set.
# os.path.basename extracts the file name without relying on the exact
# length of the directory prefix.
dev_img = [path for path in img if os.path.basename(path) in dev_images]

In [9]:
# File listing the names of images that belong to the test set (one per line)
test_images_file = 'flicker8k-dataset/Flickr8k_text/Flickr_8k.testImages.txt'
# Reading the test image names in a set; the context manager closes the file
with open(test_images_file, 'r') as names_file:
    test_images = set(names_file.read().strip().split('\n'))

# Full paths of the images in the test set.
# os.path.basename extracts the file name without relying on the exact
# length of the directory prefix.
test_img = [path for path in img if os.path.basename(path) in test_images]

In [10]:
# Pair each image of a split with its list of captions
def caption_dataset(data):
    """Select the caption entries whose image file name is listed in ``data``.

    Reads the module-level ``descriptions`` dict (image id -> list of captions).

    Args:
        data: Collection of image file names; membership is tested with
            ``image_id + '.jpg'``.

    Returns:
        A list of ``[image_id, captions]`` two-element lists, in the iteration
        order of ``descriptions``.
    """
    return [
        [image_id, captions]
        for image_id, captions in descriptions.items()
        if image_id + '.jpg' in data
    ]

# Build the [image_id, captions] lists for the train, dev and test splits
train_desc, dev_desc, test_desc = [
    caption_dataset(split_names)
    for split_names in (train_images, dev_images, test_images)
]
# Show the first two training entries as a sanity check
print(train_desc[:2])

[['1000268201_693b08cb0e', ['child in pink dress is climbing up set of stairs in an entry way', 'girl going into wooden building', 'little girl climbing into wooden playhouse', 'little girl climbing the stairs to her playhouse', 'little girl in pink dress going into wooden cabin']], ['1001773457_577c3a7d70', ['black dog and spotted dog are fighting', 'black dog and tricolored dog playing with each other on the road', 'black dog and white dog with brown spots are staring at each other in the street', 'two dogs of different breeds looking at each other on the road', 'two dogs on pavement moving toward each other']]]


In [11]:
# Turn each split's list of [image_id, captions] pairs into a dict keyed by
# image id, so captions can be looked up directly
train_description = {image_id: captions for image_id, captions in train_desc}
dev_description = {image_id: captions for image_id, captions in dev_desc}
test_description = {image_id: captions for image_id, captions in test_desc}

In [12]:
def get_vocab_size(description, word_count_threshold=10):
    """Return the words that occur often enough across the given captions.

    Despite its name, this function returns the list of retained words rather
    than a number; call ``len()`` on the result to get the vocabulary size.
    The size is also printed as a side effect.

    Args:
        description: Mapping whose values are iterables of caption strings.
        word_count_threshold: Minimum number of occurrences a word needs in
            order to be kept. Defaults to 10.

    Returns:
        List of words whose total count is at least ``word_count_threshold``,
        ordered by first appearance in the captions.
    """
    word_counts = {}
    for captions in description.values():
        for caption in captions:
            # split() without an argument yields no tokens for an empty caption
            # and collapses repeated whitespace, so the empty string is never
            # counted as a word (split(' ') would count it).
            for word in caption.split():
                word_counts[word] = word_counts.get(word, 0) + 1

    # Consider only words which occur at least `word_count_threshold` times
    vocab = [w for w, count in word_counts.items() if count >= word_count_threshold]

    print('preprocessed words %d ' % len(vocab))
    return vocab