In [2]:
import numpy as np
import cv2
import os
import pickle
import numpy as np

from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

import nltk
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

# small library for seeing the progress of loops.
from tqdm.notebook import tqdm
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
#To get text from the file
def load_doc(fname):
    file = open(fname, 'r')
    txt = file.read()
    file.close()
    return txt

In [4]:
def img_cap_into_dic(fname):
    file = load_doc(fname)
    captions = file.split('\n')
    descriptions ={}
    for caption in captions[:-1]:
        img, cap = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [cap]
        else:
            descriptions[img[:-2]].append(cap)
    return descriptions

In [5]:
def text_preprocessing(captions):
    ps = PorterStemmer()
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):
            review = re.sub('[^a-zA-Z]', ' ', img_caption)
            review = review.lower()
            review = review.split()
            review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
            review = ' '.join(review)
            captions[img][i]= img_caption
    return captions

In [6]:
def get_vocabulary(descriptions):
    vocab = set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

In [7]:
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()

In [8]:
dataset_text = "C:\Users\Vardhman Jain\Desktop\Machine Learning\Image Caption Generator\Flickr8k_text"
dataset_images = "C:\Users\Vardhman Jain\Desktop\Machine Learning\Image Caption Generator\Flickr8k_Dataset"

In [9]:
filename = dataset_text + "/" + "Flickr8k.token.txt"
#loading the file that contains all data
#mapping them into descriptions dictionary img to 5 captions
descriptions = img_cap_into_dic(filename)
print("Length of descriptions =" ,len(descriptions))

#cleaning the descriptions
clean_descriptions = text_preprocessing(descriptions)

#building vocabulary
vocabulary = get_vocabulary(clean_descriptions)
print("Length of vocabulary =", len(vocabulary))

#saving each description to file
save_descriptions(clean_descriptions, "descriptions.txt")

Length of descriptions = 8092
Length of vocabulary = 9630


In [10]:
def extract_features(directory):
    model = Xception( include_top=False, pooling='avg' )
    features = {}
    for img in tqdm(os.listdir(directory)):
        fname = directory + "//" + img
        image = cv2.imread(fname)
        image = cv2.resize(image,(299,299))
        image = np.expand_dims(image, axis=0)
        #This extra dimension represents the batch size, making it compatible with the input shape expected by the model
        image = image/127.5
        image = image - 1.0

        feature = model.predict(image)
        features[img] = feature
    return features

In [None]:
features = extract_features(dataset_images)
pickle.dump(features, open("features.p","wb"))

In [11]:
features = pickle.load(open("features.p","rb"))

In [12]:
#load the data
def load_photos(fname):
    file = load_doc(fname)
    photos = file.split("\n")[:-1]
    return photos

In [13]:
def load_clean_descriptions(fname, photos):
    #loading clean_descriptions
    file = load_doc(fname)
    descriptions = {}
    for line in file.split("\n"):

        words = line.split()
        if len(words)<1 :
            continue

        img, img_caption = words[0], words[1:]

        if img in photos:
            if img not in descriptions:
                descriptions[img] = []
            desc = '<s> ' + " ".join(img_caption) + ' <e>'
            descriptions[img].append(desc)

    return descriptions

In [14]:
def load_features(photos):
    #loading all features
    all_features = pickle.load(open("features.p","rb"))
    #selecting only needed features
    features = {k:all_features[k] for k in photos}
    return features

In [15]:
fname = dataset_text + "/" + "Flickr_8k.trainImages.txt"

train_imgs = load_photos(fname)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

In [16]:
train_descriptions

{'1000268201_693b08cb0e.jpg': ['<s> A child in a pink dress is climbing up a set of stairs in an entry way . <e>',
  '<s> A girl going into a wooden building . <e>',
  '<s> A little girl climbing into a wooden playhouse . <e>',
  '<s> A little girl climbing the stairs to her playhouse . <e>',
  '<s> A little girl in a pink dress going into a wooden cabin . <e>'],
 '1001773457_577c3a7d70.jpg': ['<s> A black dog and a spotted dog are fighting <e>',
  '<s> A black dog and a tri-colored dog playing with each other on the road . <e>',
  '<s> A black dog and a white dog with brown spots are staring at each other in the street . <e>',
  '<s> Two dogs of different breeds looking at each other on the road . <e>',
  '<s> Two dogs on pavement moving toward each other . <e>'],
 '1002674143_1b742ab4b8.jpg': ['<s> A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl . <e>',
  '<s> A little girl is sitting in front of a large painted rainbow . <e>',
  '<s> A smal

In [17]:
#converting dictionary to clean list of descriptions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [18]:
def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [19]:
# give each word a index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
pickle.dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7377

In [20]:
#calculate maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(descriptions)
max_length

38

In [21]:
def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, description_list in descriptions.items():
            #retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield ((input_image, input_sequence), output_word)

def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [22]:
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))
a.shape, b.shape, c.shape

((58, 2048), (58, 38), (58, 7377))

In [23]:
# define the captioning model
def define_model(vocab_size, max_length):

    # features from the CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Merging both models
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # summarize model
    print(model.summary())

    return model

In [24]:
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_descriptions)
# making a directory models to save our models
os.mkdir("models")
for i in range(epochs):
    train_generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit(train_generator, epochs=1, steps_per_epoch= steps, verbose=1)
    model.save("models/model_" + str(i) + ".keras")

Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 7377
Description Length:  38
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 38)]                 0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None, 2048)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 38, 256)              1888512   ['input_2[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 2048)                 0         ['input_1[0][0]'] 