In [None]:
import os
import random
import time
import re
import string
import numpy as np
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import EfficientNetV2B0
from tensorflow.keras.layers import TextVectorization

Since the data augmentation and dropout are applied to the model, the result will be non-deterministic. Set the random seed to a number to get the same result every time you run the notebook.

In [None]:
# Seed Python's `random`, NumPy and TensorFlow with the same value so that
# repeated runs of the notebook draw the same random numbers.
seed = 10
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

## Download the dataset

The Flickr8K dataset is used in this notebook. It comprises over 8,000 images, each paired with five different captions.

In [None]:
!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip 
!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip 
!unzip -qq Flickr8k_Dataset.zip -d data
!unzip -qq Flickr8k_text.zip -d data
!rm Flickr8k_Dataset.zip Flickr8k_text.zip

In [None]:
# Path to the images
# NOTE(review): "Flicker8k" (with an "e") presumably matches the directory name
# inside Flickr8k_Dataset.zip — confirm before correcting the spelling.
IMAGES_PATH = "data/Flicker8k_Dataset"

# Desired image dimensions; every image is resized to this size before it is
# fed to the CNN. plot_attention() reshapes attention weights to a hard-coded
# 10x10 grid, so changing this value requires revisiting that function.
IMAGE_SIZE = (300, 300)

# Number of captions per image; the training loop runs one train step for
# each of these captions on every batch.
NUM_CAPTIONS = 5

# Vocabulary size (max_tokens of the TextVectorization layer and number of
# rows of the GloVe embedding matrix)
VOCAB_SIZE = 8000

# Fixed length allowed for any sequence (captions are padded/truncated to it)
SEQ_LENGTH = 40

# Dimension for the image embeddings and token embeddings.
# Must equal the dimensionality of the GloVe vectors that are copied into the
# embedding matrix (the notebook loads glove.6B.100d.txt).
EMBED_DIM = 100

# LSTM units (also used as the size of the attention projections)
UNITS = 512

# Other training parameters
BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE

## Preparing the dataset

In [None]:
def load_captions_data(filename):
    """Load data from text file and map each image path to its captions.

    Every usable line is expected to hold an image name with a
    `#(caption_number)` suffix, a tab character, and the caption text.
    Lines without a tab (for example a trailing blank line) are skipped.

    Args:
        filename (string): Path to the text file containing image-caption data.

    Returns:
        captions_mapping (dict): Maps `IMAGES_PATH/<image name>` to the list of
        its captions, each wrapped as "<start> ... <end>". Only image names
        ending in "jpg" are kept.
    """
    captions_mapping = {}

    # Explicit encoding so the result does not depend on the platform default.
    with open(filename, encoding="utf-8") as caption_file:
        for line in caption_file:
            line = line.rstrip("\n")

            # Blank or malformed lines would break the unpacking below
            if "\t" not in line:
                continue

            # Split on the first tab only, so a tab inside the caption survives
            img_name, caption = line.split("\t", 1)

            # Each image is repeated once per caption; drop the
            # `#(caption_number)` suffix to obtain the shared image name.
            img_name = img_name.split("#")[0]
            img_name = os.path.join(IMAGES_PATH, img_name.strip())

            if img_name.endswith("jpg"):
                # We will add a start and an end token to each caption
                caption = "<start> " + caption.strip() + " <end>"
                captions_mapping.setdefault(img_name, []).append(caption)

    return captions_mapping


def train_val_split(caption_data, train_size=0.8, shuffle=True):
    """Partition the caption dictionary into a training and a validation part.

    Args:
        caption_data (dict): Maps each image name to its list of captions
        train_size (float): Fraction of the images assigned to the training part
        shuffle (bool): If True, the image names are shuffled in place with
            `np.random.shuffle` before the cut is made

    Returns:
        Tuple `(training_data, validation_data)` of two dicts with disjoint keys
    """

    def subset(names):
        # Rebuild a dict restricted to the given image names, in that order
        return {name: caption_data[name] for name in names}

    image_names = list(caption_data)
    if shuffle:
        np.random.shuffle(image_names)

    # Index of the first validation image
    cut = int(len(caption_data) * train_size)

    return subset(image_names[:cut]), subset(image_names[cut:])


# Load the dataset
captions_mapping = load_captions_data("data/Flickr8k.token.txt")

# Split the dataset into training and validation sets
train_data, valid_data = train_val_split(captions_mapping)
print("Number of training samples: ", len(train_data))
print("Number of validation samples: ", len(valid_data))

## Vectorizing the text data

In [None]:
# Punctuation characters to strip from the captions. "<" and ">" are kept so
# that the "<start>" and "<end>" tokens survive standardization.
strip_chars = string.punctuation
strip_chars = strip_chars.replace("<", "")
strip_chars = strip_chars.replace(">", "")

# Turn characters to lower case and remove the punctuation from the text
def custom_standardization(input_string):
    """Lower-case `input_string` and delete every character in `strip_chars`."""
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

# Turn each word into an integer index
vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)

# Build a vocab from the training data
text_data = list(train_data.values())
# NOTE(review): flattening the list of caption lists through np.array/reshape
# presumably relies on every image having the same number of captions — confirm.
text_data = np.array(text_data)
text_data = np.reshape(text_data, (-1,))
vectorization.adapt(text_data)

In [None]:
len(vectorization.get_vocabulary())

# Build tf.data.Dataset pipeline

In [None]:
def decode_and_resize(img_path):
    """Load a JPEG file and return it as a float32 tensor resized to IMAGE_SIZE.

    Args:
        img_path (string): Path to the image file

    Returns:
        A 3D tensor (height, width, 3 channels) holding the pixel values
    """
    raw_bytes = tf.io.read_file(img_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
    # NOTE(review): tf.image.resize presumably already yields float32, in which
    # case this conversion does not rescale and pixels stay in [0, 255] —
    # confirm against the TensorFlow docs before relying on a [0, 1] range.
    return tf.image.convert_image_dtype(resized, tf.float32)


def format_dataset(img_path, caption):
    """Turn one (image path, caption) sample into model-ready tensors.

    Args:
        img_path (string): Path to the image file
        caption (string): The caption text(s) belonging to the image

    Returns:
        Tuple of the decoded, resized image and the vectorized caption
    """
    image = decode_and_resize(img_path)
    tokens = vectorization(caption)
    return image, tokens


def make_dataset(img_paths, captions):
    """Create a shuffled, batched tf.data pipeline of (image, caption) pairs.

    Args:
        img_paths (list): Paths to the image files
        captions (list): The captions belonging to each image path

    Returns:
        A `tf.data.Dataset` yielding batches of BATCH_SIZE formatted samples
    """
    return (
        tf.data.Dataset.from_tensor_slices((img_paths, captions))
        # Buffer as large as the dataset, i.e. a full shuffle
        .shuffle(len(img_paths))
        .map(format_dataset, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )


# Pass the list of images and the list of corresponding captions
train_dataset = make_dataset(list(train_data.keys()), list(train_data.values()))

In [None]:
# train_dataset is an iterator containing multiple batches
# Each batch is an tuple of images and captions.  
# Get 1 random batch and check the dimension of each element
for (images, captions) in train_dataset.take(1):
  print(images.shape)
  print(captions.shape)

# Glove word embedding

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glove
!rm glove.6B.zip

In [None]:
path_to_glove_file = "glove/glove.6B.100d.txt"
# Maps each GloVe word to its coefficient vector
embeddings_index = {}

# Open with an explicit encoding: the platform default encoding may not be
# UTF-8, which would fail or mis-decode non-ASCII tokens in the GloVe file.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line holds the word followed by its space-separated coefficients
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print(f"Found {len(embeddings_index)} word vectors.")

In [None]:
vocab = vectorization.get_vocabulary()
# Maps a token index to its word; also used to decode predictions later on
word_index_lookup = dict(zip(range(len(vocab)), vocab))
# Rows of words without a GloVe vector stay all-zero
embedding_matrix = np.zeros((VOCAB_SIZE, EMBED_DIM))

# Create an embedding matrix based on learned semantic features for all the words in the vocab
for i, word in word_index_lookup.items():
    # Ignore indices that do not fit into the matrix
    if i >= VOCAB_SIZE:
        continue
    # The lookup and the None check belong together: checking outside the
    # bounds test would reuse the vector of a previous word.
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Build model

**Load pre-trained model**


In [None]:
def get_cnn_model():
    """Build the frozen EfficientNetV2B0 feature extractor.

    Returns:
        A Keras model mapping an image of size IMAGE_SIZE to a sequence of
        feature vectors (the spatial grid flattened into one axis).
    """
    backbone = EfficientNetV2B0(input_shape=(*IMAGE_SIZE, 3), include_top=False, weights="imagenet")
    # Freeze the feature extractor so that the weights are not updated during training
    backbone.trainable = False

    feature_map = backbone.output  # (10, 10, 1280) according to the original note
    channels = feature_map.shape[-1]
    # Collapse the two spatial axes into one: (100, 1280) according to the original note
    flattened = layers.Reshape((-1, channels))(feature_map)
    return keras.models.Model(backbone.input, flattened)


# Data augmentation for image data
image_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.2),
        layers.RandomContrast(0.3)
    ]
)

cnn_model = get_cnn_model()

To see the effect of data augmentation, the random seed defined at the beginning of the notebook has to be removed!

In [None]:
plt.figure(figsize=(10, 10))
for images, _ in train_dataset.take(1):
    for i in range(9):
        augmented_images = image_augmentation(images)
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(augmented_images[0].numpy().astype("uint8"))
        plt.axis("off")

**Attention mechanism & Positional Embedding**

In [None]:
class BahdanauAttention(keras.Model):
    """Additive attention over image features, conditioned on a hidden state."""

    def __init__(self, units):
        super().__init__()
        # Projection of the image features
        self.W1 = layers.Dense(units)
        # Projection of the decoder hidden state
        self.W2 = layers.Dense(units)
        # Reduces each projected location to a single score
        self.V = layers.Dense(1)

    def call(self, features, hidden):
        """Return the attended context vector and the attention weights."""
        # Insert an axis at position 1 so the hidden state broadcasts against `features`
        query = tf.expand_dims(hidden, 1)

        # Alignment model followed by the alignment score
        projected = self.W1(features) + self.W2(query)
        score = self.V(tf.nn.tanh(projected))

        # Normalize the scores over axis 1 and form the weighted sum of the features
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

class PositionalEmbedding(layers.Layer):
    """Sum of a frozen token embedding and a learned position embedding.

    Args:
        sequence_length: Number of distinct positions (rows of the position table)
        input_dim: Vocabulary size of the token embedding
        output_dim: Dimension of both embeddings
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        # Embedding layer for the token indices.
        # Initialized from the module-level `embedding_matrix` (not from a
        # constructor argument) and kept frozen via trainable=False.
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim,
            output_dim=output_dim,
            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
            trainable=False,
            mask_zero=True)
        # Embedding layer for the token positions
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        
    def call(self, inputs, position):
        """Embed the token ids in `inputs` and add the embedding of `position`."""
        embedded_tokens = self.token_embeddings(inputs)      
        embedded_positions = self.position_embeddings(position)
        # Add both embedding vectors together
        return embedded_tokens + embedded_positions

    # Generate a mask so padding 0s in the inputs can be ignored.
    # NOTE(review): the original comment states that the framework calls this
    # method automatically and propagates the mask to the next layer — confirm
    # that the mask is still consumed after the tf.concat in the decoder.
    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

**Encoder-Decoder Architecture**

In [None]:
class ImageEncoder(keras.Model):
    """Augment, preprocess and encode images into a sequence of feature vectors.

    Args:
        cnn_model: Feature extractor applied to the preprocessed images
        embed_dim: Output size of the final dense projection
        image_augmentation: Layer/model applied to the raw input images
    """

    def __init__(self, cnn_model, embed_dim, image_augmentation, **kwargs):
        super().__init__(**kwargs)
        self.cnn_model = cnn_model
        self.embed_dim = embed_dim
        self.image_augmentation = image_augmentation
        self.preprocess_input = keras.applications.efficientnet_v2.preprocess_input
        self.dropout = layers.Dropout(0.5)
        self.fc = layers.Dense(embed_dim, activation="relu")

    def call(self, inputs):
        """Return the projected image features for a batch of images."""
        # NOTE(review): no `training` flag is forwarded explicitly, so whether
        # augmentation/dropout are active depends on Keras defaults — confirm.
        x = self.image_augmentation(inputs)
        x = self.preprocess_input(x)
        # Use the extractor owned by this instance rather than the module-level
        # `cnn_model`, so the encoder keeps working if that global is deleted
        # or rebound.
        x = self.cnn_model(x)
        x = self.dropout(x)
        img_feature = self.fc(x)
        return img_feature

class CaptionDecoder(keras.Model):
    """Single-step caption decoder: attention + embedding + LSTM + softmax.

    Args:
        sequence_length: Number of positions of the positional embedding
        embedding_dim: Dimension of the token/position embeddings
        units: Number of LSTM units (also the attention size)
        vocab_size: Size of the output distribution
        embedding_matrix: Accepted but not used in this constructor; the
            PositionalEmbedding layer is built without it.
    """

    def __init__(self, sequence_length, embedding_dim, units, vocab_size, embedding_matrix):
        super().__init__()
        self.units = units
        self.positional_embeding = PositionalEmbedding(sequence_length, vocab_size, embedding_dim)  
        self.lstm = layers.LSTM(self.units,
                                return_sequences=True,
                                return_state=True,
                                recurrent_initializer='glorot_uniform')
        self.dropout = layers.Dropout(0.25)
        self.layer_normalize = layers.LayerNormalization()
        self.fc = layers.Dense(vocab_size)
        self.activator = layers.Activation("softmax")
        self.attention = BahdanauAttention(self.units)

    def call(self, inputs, features, hidden_state, position):
        """Run one decoding step.

        Returns:
            Tuple of (word probabilities, new hidden state, attention weights).
        """
        # defining attention as a separate model
        context_vector, attention_weights = self.attention(features, hidden_state)
        # word turned into a vector with position information
        x = self.positional_embeding(inputs, position)
        # word vector and context vector are concatenated
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # passing the concatenated vector to the LSTM.
        # The LSTM is called without an initial state: the incoming
        # `hidden_state` only feeds the attention above.
        output, hidden_state, _ = self.lstm(x)        
  
        x = self.dropout(output)
        x = self.fc(x)
        # Merge the leading axes so that each row holds one vocabulary-sized vector
        x = tf.reshape(x, (-1, x.shape[2]))
        x = self.layer_normalize(x)
    
        # output a probability distribution over words in vocab
        output = self.activator(x)

        return output, hidden_state, attention_weights
    
    def reset_state(self, batch_size):
        """Return an all-zero hidden state of shape (batch_size, units)."""
        return tf.zeros((batch_size, self.units))

encoder = ImageEncoder(cnn_model, EMBED_DIM, image_augmentation)
decoder = CaptionDecoder(SEQ_LENGTH, EMBED_DIM, UNITS, VOCAB_SIZE, embedding_matrix)

In [None]:
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
acc_fn = tf.keras.metrics.SparseCategoricalAccuracy()

## Load the pretrained weights into the model

The model has already been trained and has obtained a good result. You just need to load the trained weights into the model and run the evaluation to see the result.

In [None]:
!unzip -qq weights_efficientnetv2b0_finetuned.zip

In [None]:
encoder.load_weights("weights/encoder_weights_efficientnetv2b0")
decoder.load_weights("weights/decoder_weights_efficientnetv2b0")

In [None]:
#del cnn_model
#del encoder
#del decoder

# Training Loop

If the pre-trained weights are already loaded into the model, this training process and fine-tuning can be skipped.
The entire training process can be summarized as follows.
- 3 epochs with the frozen CNN-model (learning_rate = 1e-3)
- 30 epochs with the unfrozen CNN-model and frozen Decoder (learning_rate = 1e-5)
- 5 epochs with the unfrozen CNN-model and Decoder (learning_rate = 1e-5)

In [None]:
# NOTE(review): this function reads the module-level `encoder`, `decoder`,
# `optimizer`, `loss_fn`, `acc_fn` and `vectorization`. Because it is wrapped in
# tf.function, the optimizer and the list of trainable variables are presumably
# fixed when the function is first traced — re-run this cell after changing
# the optimizer or any `trainable` flag (confirm against the tf.function guide).
@tf.function
def train_step(image, target):
    """Run one teacher-forced optimization step on a batch.

    Args:
        image: Batch of images passed to the encoder
        target: Batch of vectorized captions; indexed as target[:, position]

    Returns:
        Tuple (loss, total_loss, acc, total_acc): the summed loss over all
        decoded positions, that sum divided by target.shape[1], the summed
        metric results, and that sum divided by target.shape[1].
    """
    loss = 0
    acc = 0

    # initializing the hidden state for each batch
    # so that the decoder doesn't continue with the data in the previous batch
    hidden_state = decoder.reset_state(batch_size=target.shape[0])

    # a start token is considered as the first input for the decoder in a batch
    dec_input = tf.expand_dims(vectorization(["<start>"] * target.shape[0])[:,0], 1)

    with tf.GradientTape() as tape:
        # Features are first extracted by the encoder
        features = encoder(image)

        # Positions start at 1: the token at `position` is the prediction target,
        # the token before it is the decoder input
        for position in range(1, target.shape[1]):
            # Passing the word, features, hidden state, position through the decoder
            predictions, hidden_state, attention_weights = decoder(dec_input, features, hidden_state, position)

            # Compute the loss and accuracy.
            # The loss is computed on every position; no padding mask is applied here.
            loss += loss_fn(target[:, position], predictions)
            # `acc_fn` is not reset inside this function, so result() reflects
            # everything the metric has seen since it was created or last reset.
            acc_fn.update_state(target[:, position], predictions)
            acc += acc_fn.result() 
            
            # Using teacher forcing
            # The model is fed the correct word at each time step, not the predicted word
            dec_input = tf.expand_dims(target[:, position], 1)

    # The loop above runs target.shape[1] - 1 times, while the sums are divided
    # by target.shape[1]
    total_loss = (loss / int(target.shape[1]))

    total_acc = (acc / int(target.shape[1]))
    
    # Get the weights of the model
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables

    # Calculate the gradients of the (summed) loss with respect to the weights
    gradients = tape.gradient(loss, trainable_variables)

    # Apply the gradients with the optimizer
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss , acc, total_acc

In [None]:
loss_plot = []
acc_plot = []

In [None]:
EPOCHS = 3

for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0.0
    total_acc = 0.0
    # Number of train steps executed in this epoch (one per batch AND caption)
    num_steps = 0

    for batch, (image, target) in enumerate(train_dataset):
        # Train model with all 5 captions; target[:, i, :] selects the i-th
        # caption of every image in the batch
        for i in range(NUM_CAPTIONS):
            _, t_loss, _, t_acc = train_step(image, target[:, i, :])
            # Convert to Python floats so the history lists hold plain numbers
            total_loss += float(t_loss)
            total_acc += float(t_acc)
            # Count every train step; counting only batches would inflate the
            # epoch averages by a factor of NUM_CAPTIONS
            num_steps += 1

        # Print the average loss and acc every 100 batches.
        # `t_loss`/`t_acc` are already normalized by train_step; dividing by
        # target.shape[1] here would divide by the number of captions, because
        # `target` still carries the caption axis in this scope.
        if batch % 100 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {float(t_loss):.4f} Acc {float(t_acc):.4f}')

    # Guard against an empty dataset
    current_loss = total_loss / max(num_steps, 1)
    current_acc = total_acc / max(num_steps, 1)

    # Storing the epoch end loss and accuracy values to plot later
    loss_plot.append(current_loss)
    acc_plot.append(current_acc)

    print ('Epoch: {} - Loss: {:.6f} - Acc: {:.6f} - Time taken: {:.1f} sec'.format(
        epoch + 1,
        current_loss,
        current_acc,
        time.time() - start))
        

In [None]:
plt.clf()
plt.plot(range(1, len(loss_plot)+1) ,loss_plot, label="Training loss")
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss Plot')
plt.legend()
plt.savefig('loss_plot.png')
plt.show()

In [None]:
# Plot the training accuracy recorded at the end of every epoch
plt.clf()
# The x-axis is derived from acc_plot itself, so the two sequences passed to
# plt.plot always have the same length
plt.plot(range(1, len(acc_plot)+1), acc_plot, label="Training accuracy")
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.title('Acc Plot')
plt.legend()
plt.savefig('acc_plot.png')
plt.show()

## Fine-tuning the pretrained EfficientNetV2B0


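After training the model for the first 3 epochs, fine-tune EfficientNetV2B0 for 30 epochs. Run the cell below, then re-run the cell that defines `train_step` (it is wrapped in `tf.function`, so it has to be traced again to pick up the new optimizer and the updated set of trainable variables), and finally run the third cell of the "Training Loop" section with the EPOCHS variable set to 30.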

In [None]:
# Fine-tuning a portion of layers of the pretrained CNN: unfreeze the last 10
# layers, but leave BatchNormalization layers frozen
for layer in encoder.cnn_model.layers[-10:]:
  if not isinstance(layer, layers.BatchNormalization):
    layer.trainable = True

# Decoder will not be updated during this phase
decoder.trainable = False
# Fresh optimizer with a small learning rate for fine-tuning.
# NOTE(review): `train_step` is a tf.function that was traced with the previous
# optimizer — presumably its cell must be re-run for this one to take effect.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [None]:
# Check if the decoder is frozen
assert len(decoder.trainable_variables) == 0, "decoder should be frozen!"

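Next, the decoder is unfrozen and the whole model is trained for 5 more epochs. Run the cell below, re-run the cell that defines `train_step` (so that `tf.function` traces it again with the decoder's variables included), and then run the third cell of the "Training Loop" section with the EPOCHS variable set to 5.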




In [None]:
decoder.trainable = True

# Save model & weights

In [None]:
encoder.save_weights("weights/encoder_weights_efficientnetv2b0")
decoder.save_weights("weights/decoder_weights_efficientnetv2b0")

In [None]:
!zip -r weights.zip weights/

# Evaluation

In [None]:
def generate_caption(img):
    """Greedily decode a caption for a single image.

    Args:
        img: One image tensor (without batch axis), e.g. from decode_and_resize

    Returns:
        decoded_caption (string): The generated caption without the
            "<start>"/"<end>" tokens
        attention_plot (np.ndarray): One row of attention weights per executed
            decoding step (including the step that produced "<end>")
    """
    # Extract feature from an image
    feature = encoder(tf.expand_dims(img, axis=0))

    # Prepare an array to store attention weights of each predicted word
    attention_features_shape = feature.shape[1]
    attention_plot = np.zeros((SEQ_LENGTH, attention_features_shape))

    # Init the hidden state of the decoder
    hidden = decoder.reset_state(batch_size=1)

    # Prepare the start token as input for the decoder
    dec_input = tf.expand_dims([vectorization(["<start>"])[0][0]], 1)
    decoded_caption = "<start>"

    # Number of decoding steps executed so far (= filled rows of attention_plot)
    num_steps = 0

    # Positions start at 1, exactly as in train_step: during training the
    # "<start>" token is always paired with position 1, and position 0 is
    # never fed to the decoder.
    for position in range(1, SEQ_LENGTH):
        # Passing the word, features, hidden state, position through the decoder
        predictions, hidden, attention_weights = decoder(dec_input,
                                                         feature,
                                                         hidden, position)
        # Store the attention weights for later plot
        attention_plot[num_steps] = tf.reshape(attention_weights, (-1,)).numpy()
        num_steps += 1

        # Choose the word with the highest probability as predicted word
        predicted_id = np.argmax(predictions)
        # Turn the predicted word index to word as string
        predicted_word = word_index_lookup[predicted_id]

        # Concat each predicted word to form a caption
        decoded_caption += " " + predicted_word

        dec_input = tf.expand_dims([predicted_id], 0)

        # The model finishes the caption
        if predicted_word == "<end>":
            break

    # Clean the generated caption by removing the start and end token
    decoded_caption = decoded_caption.replace("<start>", "")
    decoded_caption = decoded_caption.replace("<end>", "").strip()

    # The generated caption does not always reach the maximum length.
    # Keep only the rows that were actually filled; slicing by the number of
    # characters of the caption would almost never drop anything.
    attention_plot = attention_plot[:num_steps, :]
    return decoded_caption, attention_plot

In [None]:
def prepare_references(captions):
    """Build the tokenized BLEU references from the captions of one image.

    The captions are normalized the same way the model's vocabulary is
    (lower-cased, punctuation except "<" and ">" removed). Generated captions
    consist of lower-case, punctuation-free vocabulary words, so references
    that keep their capital letters or punctuation could never match them.

    Args:
        captions: a list of caption strings -> [capA1,..., capA5], optionally
            wrapped in "<start>" / "<end>" tokens
    Returns:
        references: a list of references where each reference is a list of tokens
        -> [refA1,...,refA5] with refAi = ['token1', 'token2',..]
    """
    # Delete every punctuation character except "<" and ">"
    removable = "".join(ch for ch in string.punctuation if ch not in "<>")
    strip_table = str.maketrans("", "", removable)

    references = []
    for caption in captions:
        # Drop the start and end token
        caption = caption.replace("<start>", "").replace("<end>", "")
        # Normalize case and punctuation
        caption = caption.lower().translate(strip_table)
        # Split the caption into a list of tokens
        references.append(caption.split())
    return references

Running this cell below could take about 10 minutes!

In [None]:
# Get all the validation image path
valid_images = list(valid_data.keys())

def evaluate_model(valid_images, captions_mapping):
    """Calculate BLEU-1 to BLEU-4 Score for a list of captions as a text corpus.

    A caption is generated for every image and compared against all reference
    captions of that image. The four scores (scaled to 0-100) are printed;
    nothing is returned.

    Args:
        valid_images (list): list of image paths
        captions_mapping (dict): dictionary maps an image path to a list of captions
    """
    references_list = []
    predictions = []
    for image_path in valid_images:
        # Prepare the references from the valid captions
        captions = captions_mapping[image_path]
        references_list.append(prepare_references(captions))

        # The model generates a caption from a given image
        image = decode_and_resize(image_path)
        prediction, _ = generate_caption(image)
        predictions.append(prediction.split())

    # Calculate BLEU score. Each weight tuple is uniform over the n-gram orders
    # in use and sums to 1; (0.3, 0.3, 0.3, 0) would sum to 0.9 only.
    bleu1 = corpus_bleu(references_list, predictions, weights=(1.0, 0, 0, 0)) * 100
    bleu2 = corpus_bleu(references_list, predictions, weights=(0.5, 0.5, 0, 0)) * 100
    bleu3 = corpus_bleu(references_list, predictions, weights=(1 / 3, 1 / 3, 1 / 3, 0)) * 100
    bleu4 = corpus_bleu(references_list, predictions, weights=(0.25, 0.25, 0.25, 0.25)) * 100

    print('BLEU-1: %f' % bleu1)
    print('BLEU-2: %f' % bleu2)
    print('BLEU-3: %f' % bleu3)
    print('BLEU-4: %f' % bleu4)

evaluate_model(valid_images, captions_mapping)

In [None]:
def plot_attention(image_path, result, attention_plot):
    """Show the image once per caption word, overlaid with that word's attention.

    Args:
        image_path (string): Path to the JPEG image
        result (string): Generated caption; split on whitespace into words
        attention_plot: Indexable by word position; each entry is reshaped to a
            10x10 map with np.resize
    """
    raw = tf.io.read_file(image_path)
    pixels = tf.image.decode_jpeg(raw, channels=3)
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)
    image = np.array(pixels)

    words = result.split()
    # Side length of the square subplot grid (at least 2x2)
    grid_size = max(int(np.ceil(len(words)/2)), 2)

    fig = plt.figure(figsize=(12, 12))

    for idx, word in enumerate(words):
        heatmap = np.resize(attention_plot[idx], (10, 10))
        ax = fig.add_subplot(grid_size, grid_size, idx+1)
        ax.set_title(word)
        shown = ax.imshow(image)
        # Stretch the coarse attention map over the full image extent
        ax.imshow(heatmap, cmap='gray', alpha=0.6, extent=shown.get_extent())

    plt.tight_layout()
    plt.show()

def show_img(img_path):
    """Display the JPEG at `img_path`, resized to IMAGE_SIZE, without axes."""
    encoded = tf.io.read_file(img_path)
    decoded = tf.image.decode_jpeg(encoded, channels=3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
    # Clamp to the valid byte range before casting for display
    pixels = resized.numpy().clip(0, 255).astype(np.uint8)
    plt.axis("off")
    plt.imshow(pixels)
    plt.show()

Run this cell multiple times to see results from the different images.

In [None]:
valid_images = list(valid_data.keys())
# Select a random image from the validation dataset
image_path = np.random.choice(valid_images)

# Read the image from the disk
sample_img = decode_and_resize(image_path)
result, attention_plot = generate_caption(sample_img)

plot_attention(image_path, result, attention_plot)

show_img(image_path)
print('Predicted caption: ' + result)
print('True captions:')
for true_caption in captions_mapping[image_path]:
  print(true_caption)

### Data analysis

In [None]:
def calculate_bleu1(image_path, captions_mapping):
    """Generate a caption for one image and score it with sentence-level BLEU-1.

    Args:
        image_path (string): Path to the image
        captions_mapping (dict): dictionary maps an image path to a list of captions
    Return:
        Tuple `(bleu1, prediction)`: the BLEU-1 score scaled to 0-100 and the
        generated caption as a string
    """
    # References come from the ground-truth captions of the image
    references = prepare_references(captions_mapping[image_path])

    # The model generates a caption from the given image
    prediction, _ = generate_caption(decode_and_resize(image_path))

    # Unigram-only weights give BLEU-1
    bleu1 = sentence_bleu(references, prediction.split(), weights=(1.0, 0, 0, 0)) * 100

    return bleu1, prediction

Running this cell below could take about 10 minutes!

In [None]:
# One dict per quality group: image path -> [BLEU-1 score, predicted caption]
exellent = {}
high = {}
medium = {}
low = {}
very_low = {}

# BLEU-1 scores of all validation images, in iteration order
overall = []

# Exclusive lower bounds of the groups, checked from best to worst
thresholds = [(80, exellent), (60, high), (40, medium), (20, low)]

for img_path in valid_images:
    bleu1, prediction = calculate_bleu1(img_path, captions_mapping)
    overall.append(bleu1)

    # File the prediction under the first group whose lower bound it exceeds
    for lower_bound, group in thresholds:
        if bleu1 > lower_bound:
            group[img_path] = [bleu1, prediction]
            break
    else:
        if bleu1 <= 20:
            very_low[img_path] = [bleu1, prediction]

print("Number of exellent captions: ", len(exellent))
print("Number of high quality captions: ", len(high))
print("Number of medium quality captions: ", len(medium))
print("Number of low quality captions: ", len(low))
print("Number of very low captions: ", len(very_low))


In [None]:
# Histogram of the per-image BLEU-1 scores collected in `overall`
plt.clf()
# The histogram carries a label so that plt.legend() below has an entry to show
plt.hist(overall, 150, label='BLEU-1')
plt.xlabel('BLEU-1 Scores')
plt.ylabel('Number of descriptions')
plt.title('Data distribution of the BLEU-1 score in the valid set.')
plt.legend()
plt.savefig('bleu1_distribution.png')
plt.show()

In [None]:
# Select the quality of the descriptions
# and the number of descriptions to display.
QUALITY = exellent
N = 5

for i, image_path in enumerate(QUALITY):
    # Stop after exactly N images (indices 0 .. N-1)
    if i >= N:
        break
    bleu1, caption = QUALITY[image_path]
    show_img(image_path)
    print("Path: ", image_path)
    print("Caption: ", caption)
    print("BLEU-1: ", '{0:.3g}'.format(bleu1))

In [None]:
# Select a random image from the internet
image_url = 'https://tensorflow.org/images/surf.jpg'
image_path = tf.keras.utils.get_file('surf.jpg', origin=image_url)

# Decode the image & generate a caption
sample_img = decode_and_resize(image_path)
result, attention_plot = generate_caption(sample_img)

plot_attention(image_path, result, attention_plot)

show_img(image_path)
print('Predicted caption: ' + result)

# References
The implementation is partly based on these tutorials:

https://www.tensorflow.org/tutorials/text/image_captioning#create_a_tfdata_dataset_for_training

https://keras.io/examples/vision/image_captioning/