In [1]:
!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip
!unzip -qq Flickr8k_Dataset.zip
!unzip -qq Flickr8k_text.zip
!rm Flickr8k_Dataset.zip Flickr8k_text.zip

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Attention
from tensorflow.keras.layers import Add, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import Xception
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np
from pickle import load
import os
import nltk
import string
from PIL import Image
# Set the seed for reproducibility
tf.random.set_seed(42)
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Loading a text file into memory
def load_doc(filename):
    # Opening the file as read only
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

# get all imgs with their captions
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions = {}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

# Data cleaning - lowercasing, removing punctuation, and words containing numbers
def cleaning_text(captions):
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):

            img_caption.replace("-", " ")
            desc = img_caption.split()

            # Converts to lowercase
            desc = [word.lower() for word in desc]
            # Remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            # Remove hanging 's' and 'a'
            desc = [word for word in desc if (len(word) > 1)]
            # Remove tokens with numbers in them
            desc = [word for word in desc if (word.isalpha())]
            # Convert back to string

            img_caption = ' '.join(desc)
            captions[img][i] = img_caption
    return captions

# Building vocabulary
def text_vocabulary(descriptions):
    # Build vocabulary of all unique words
    vocab = set()

    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]

    return vocab

# All descriptions in one file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc)
    data = "\n".join(lines)
    file = open(filename, "w")
    file.write(data)
    file.close()

In [11]:
# Set these paths according to your project folder in your system
dataset_text = "/content"
dataset_images = "/content/Flicker8k_Dataset"

# We prepare our text data
filename = dataset_text + "/" + "Flickr8k.token.txt"
# Loading the file that contains all data
# Mapping them into descriptions dictionary img to 5 captions
descriptions = all_img_captions(filename)
print("Length of descriptions =", len(descriptions))

# Cleaning the descriptions
clean_descriptions = cleaning_text(descriptions)

# Building vocabulary
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))

# Saving each description to a file
save_descriptions(clean_descriptions, "descriptions.txt")

Length of descriptions = 8092
Length of vocabulary =  8763


In [12]:
# def extract_features(directory):
#     model = Xception(include_top=False, pooling='avg')
#     features = {}
#     for img in tqdm(os.listdir(directory)):
#         filename = directory + "/" + img
#         image = Image.open(filename)
#         image = image.resize((299, 299))
#         image = np.expand_dims(image, axis=0)
#         image = image / 127.5
#         image = image - 1.0

#         feature = model.predict(image)
#         features[img] = feature
#     return features

# # 2048 feature vector
# features = extract_features(dataset_images)
# dump(features, open("features.p", "wb"))

In [13]:
# Load the data
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos

def load_clean_descriptions(filename, photos):
    # Loading clean_descriptions
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):

        words = line.split()
        if len(words) < 1:
            continue

        image, image_caption = words[0], words[1:]

        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions[image].append(desc)

    return descriptions

def load_features(photos):
    # Loading all features
    all_features = load(open("features.p", "rb"))
    # Selecting only needed features
    features = {k: all_features[k] for k in photos}
    return features


In [15]:

filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"
# train = loading_data(filename)
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

In [16]:
# Converting dictionary to a clean list of descriptions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

# Creating a tokenizer class
# This will vectorize the text corpus
# Each integer will represent a token in the dictionary

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

# Give each word an index and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
#dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
vocab_size


7577

In [17]:
# Calculate the maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(descriptions)
max_length


32

In [18]:


# Max_lengths is 32, vocab_size is 7577

# Create input-output sequence pairs from the image description

# Data generator, used by model.fit_generator()
def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, description_list in descriptions.items():
            # Retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield [[input_image, input_sequence], output_word]

def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    # Get the maximum sequence length
    max_seq_length = max_length

    # Walk through each description for the image
    for desc in desc_list:
        # Encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]

        # Split one sequence into multiple X, y pairs
        for i in range(1, len(seq)):
            # Split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]

            # Pad input sequence if it's shorter than max_seq_length
            in_seq = pad_sequences([in_seq], maxlen=max_seq_length)[0]

            # Encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            # Store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)



In [19]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Concatenate

def define_model_with_attention(vocab_size, max_length):
    # Image feature extractor
    input_image_features = tf.keras.layers.Input(shape=(2048,))
    image_features = Dense(256, activation='relu')(input_image_features)

    # Sequence model
    input_sequence = tf.keras.layers.Input(shape=(max_length,))
    sequence_embedding = Embedding(vocab_size, 256, mask_zero=True)(input_sequence)
    sequence_lstm = LSTM(256, return_sequences=True)(sequence_embedding)

    # Attention mechanism
    attention = tf.keras.layers.Attention()([sequence_lstm, image_features])
    context = Concatenate(axis=-1)([sequence_lstm, attention])

    # Decoder
    decoder_lstm1 = LSTM(256)(context)
    output = Dense(vocab_size, activation='softmax')(decoder_lstm1)

    # Combine the inputs and outputs into a complete model
    model = tf.keras.models.Model(inputs=[input_image_features, input_sequence], outputs=output)

    return model

# Define your vocabulary size and maximum sequence length
vocab_size = 7577
max_length = 32

# Create the model
model = define_model_with_attention(vocab_size, max_length)

# Compile the model and specify your loss and optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Print the model summary
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 32)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 32, 256)              1939712   ['input_2[0][0]']             
                                                                                                  
 input_1 (InputLayer)        [(None, 2048)]               0         []                            
                                                                                                  
 lstm (LSTM)                 (None, 32, 256)              525312    ['embedding[0][0]']           
                                                                                              

In [20]:
# Train the model
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

epochs = 30
steps = len(train_descriptions)



Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 7577
Description Length:  32


In [21]:
os.mkdir("models")

In [None]:
# Making a directory to save our models

for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")


  model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)




  saving_api.save_model(


