# Data Preprocessing
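This notebook preprocesses the Flickr8k dataset: it loads the captions, fits a tokenizer on them to build the vocabulary (the word-to-integer mapping), resizes and normalizes the images, and saves the results, both as a whole and split into training, validation, and test sets.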

In [1]:
# Import required libraries
import os
import pickle

import numpy as np
import pandas as pd
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

2025-05-03 15:39:54.019006: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load Captions
Load the captions from the `Flickr8k.token.txt` file, normalize their whitespace, and group them by image filename.

In [2]:
# Load captions
captions_file = '../data/Flickr8k.token.txt'
def load_captions(file_path):
    """Read a caption file into a dict keyed by image filename.

    Each non-blank line is expected to start with an identifier of the form
    ``<image>#<n>`` followed by whitespace and the caption text. The ``#<n>``
    suffix is dropped so that all captions of one image share a key, and runs
    of whitespace inside a caption collapse to single spaces.

    Args:
        file_path: Path to the caption file.

    Returns:
        dict mapping image filename -> list of caption strings, in file order.
        A line that holds only an identifier contributes an empty caption.
    """
    captions = {}
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing newline at end of file): there is
                # no identifier to index, so skip it instead of raising.
                continue
            image_id = tokens[0].split('#')[0]
            captions.setdefault(image_id, []).append(' '.join(tokens[1:]))
    return captions
# Parse the caption file once; `captions` maps image filename -> list of caption strings.
captions = load_captions(captions_file)
# NOTE(review): the recorded run reports 8092 keys here but only 8091 processed
# images later in the notebook, so at least one caption key appears to have no
# matching image file -- confirm against the dataset.
print(f'Loaded {len(captions)} images with captions.')

Loaded 8092 images with captions.


## Tokenize Captions
Tokenize the captions and build a vocabulary.

In [3]:
# Fit a tokenizer on every caption
def tokenize_captions(captions):
    """Fit a Keras ``Tokenizer`` on all captions and report the vocabulary size.

    Args:
        captions: dict mapping an image key to a list of caption strings.

    Returns:
        (tokenizer, vocab_size): the fitted tokenizer and
        ``len(tokenizer.word_index) + 1``.
    """
    # Flatten the per-image caption lists into one corpus, preserving order.
    corpus = []
    for per_image in captions.values():
        corpus.extend(per_image)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)

    # One more than the number of distinct words in the fitted word index.
    vocab_size = 1 + len(tokenizer.word_index)
    print(f'Vocabulary size: {vocab_size}')
    return tokenizer, vocab_size
# Fit the tokenizer once on the full caption set; the fitted tokenizer is
# pickled later in this notebook.
tokenizer, vocab_size = tokenize_captions(captions)

Vocabulary size: 8494


## Resize and Normalize Images
Resize images to 224x224 and normalize pixel values with VGG16's `preprocess_input`.

In [7]:
# Resize and normalize images
def preprocess_images(image_dir, target_size=(224, 224),
                      extensions=('.bmp', '.gif', '.jpeg', '.jpg', '.png')):
    """Load every image file in a directory as a preprocessed array.

    Each image is loaded at ``target_size``, converted to an array and passed
    through the VGG16 ``preprocess_input`` function. No model is run here: the
    returned values are preprocessed pixel arrays, all held in memory at once.

    Args:
        image_dir: Directory containing the image files.
        target_size: (height, width) passed to ``load_img``.
        extensions: File-name suffixes (matched case-insensitively) that count
            as images. Anything else in the directory is ignored.

    Returns:
        dict mapping file name -> preprocessed image array, built in sorted
        file-name order.
    """
    allowed = tuple(ext.lower() for ext in extensions)
    image_features = {}
    # os.listdir() returns entries in arbitrary order; sort so the dict is
    # built in the same order on every run and machine.
    for image_name in sorted(os.listdir(image_dir)):
        image_path = os.path.join(image_dir, image_name)
        # Skip sub-directories and stray non-image files (.DS_Store, Thumbs.db,
        # notes, ...) that would otherwise make load_img raise.
        if not os.path.isfile(image_path):
            continue
        if not image_name.lower().endswith(allowed):
            continue
        image = load_img(image_path, target_size=target_size)
        image = img_to_array(image)
        image = preprocess_input(image)
        image_features[image_name] = image
    return image_features
# Directory holding the raw image files; the dict keys below are its file names.
image_dir = '../data/images/'
# Every preprocessed array is kept in memory at once in `image_features`.
image_features = preprocess_images(image_dir)
print(f'Processed {len(image_features)} images.')

Processed 8091 images.


## Save Processed Data
Save the captions (to `tokenized_captions.pkl`), the fitted tokenizer, and the preprocessed image features to the `data/processed/` directory.

In [8]:
# Persist the preprocessing artifacts
def save_pickle(obj, path):
    """Pickle ``obj`` to ``path``, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

processed_dir = '../data/processed/'
os.makedirs(processed_dir, exist_ok=True)

# Save captions
# NOTE(review): despite the file name, `captions` holds the caption strings
# grouped by image, not integer sequences produced by the tokenizer. The name
# is kept because other notebooks may load this file -- confirm before renaming.
tokenized_captions_path = os.path.join(processed_dir, 'tokenized_captions.pkl')
save_pickle(captions, tokenized_captions_path)
print(f'Tokenized captions saved to {tokenized_captions_path}')

# Save tokenizer
tokenizer_path = os.path.join(processed_dir, 'tokenizer.pkl')
save_pickle(tokenizer, tokenizer_path)
print(f'Tokenizer saved to {tokenizer_path}')

# Save image features (the full dict of preprocessed image arrays)
image_features_path = os.path.join(processed_dir, 'image_features.pkl')
save_pickle(image_features, image_features_path)
print(f'Image features saved to {image_features_path}')

Tokenized captions saved to ../data/processed/tokenized_captions.pkl
Tokenizer saved to ../data/processed/tokenizer.pkl
Image features saved to ../data/processed/image_features.pkl


## Split Dataset
Split the dataset into training, validation, and testing sets based on the provided text files.

In [9]:
# Load image IDs for each split
def load_image_ids(file_path):
    """Read a split file (one image ID per line) into a set.

    Surrounding whitespace is stripped from every line and blank lines are
    ignored, so an empty trailing line does not add an empty-string ID.

    Args:
        file_path: Path to the text file listing the image IDs.

    Returns:
        set of non-empty image ID strings.
    """
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return {image_id for image_id in stripped if image_id}

# One ID list per split; the "dev" list is used as the validation set here.
train_ids = load_image_ids('../data/Flickr_8k.trainImages.txt')
val_ids = load_image_ids('../data/Flickr_8k.devImages.txt')
test_ids = load_image_ids('../data/Flickr_8k.testImages.txt')

# Report the split sizes as a quick sanity check.
print(f'Training set: {len(train_ids)} images')
print(f'Validation set: {len(val_ids)} images')
print(f'Testing set: {len(test_ids)} images')

# Partition captions and image features by split membership
def split_data(captions, image_features, train_ids, val_ids, test_ids):
    """Select the train / validation / test subsets of both mappings.

    Args:
        captions: dict keyed by image ID.
        image_features: dict keyed by image ID.
        train_ids, val_ids, test_ids: collections of image IDs, one per split.

    Returns:
        Six dicts as a tuple: (train_captions, val_captions, test_captions,
        train_features, val_features, test_features). IDs absent from a
        mapping are simply missing from the corresponding result.
    """
    def pick(mapping, wanted):
        # Keep only the entries whose key belongs to the given ID collection.
        return {key: value for key, value in mapping.items() if key in wanted}

    id_groups = (train_ids, val_ids, test_ids)
    caption_parts = [pick(captions, ids) for ids in id_groups]
    feature_parts = [pick(image_features, ids) for ids in id_groups]
    return tuple(caption_parts + feature_parts)

(train_captions, val_captions, test_captions,
 train_features, val_features, test_features) = split_data(
    captions, image_features, train_ids, val_ids, test_ids
)

# Write each split to its own pickle file inside `processed_dir`:
# the three caption dicts first, then the three image-feature dicts.
split_outputs = [
    ('train_captions.pkl', train_captions),
    ('val_captions.pkl', val_captions),
    ('test_captions.pkl', test_captions),
    ('train_features.pkl', train_features),
    ('val_features.pkl', val_features),
    ('test_features.pkl', test_features),
]
for split_filename, split_payload in split_outputs:
    with open(os.path.join(processed_dir, split_filename), 'wb') as f:
        pickle.dump(split_payload, f)

print('Dataset split and saved successfully.')

Training set: 6000 images
Validation set: 1000 images
Testing set: 1000 images
Dataset split and saved successfully.


## Verify Preprocessed Data
Check if the tokenized captions, tokenizer, and image features are saved correctly.

In [10]:
# Reload each saved artifact to confirm it can be read back.
# These pickles were written earlier in this notebook; pickle.load should only
# ever be used on trusted files like these.

# Captions
tokenized_captions_path = '../data/processed/tokenized_captions.pkl'
if not os.path.exists(tokenized_captions_path):
    print('Tokenized captions file not found.')
else:
    with open(tokenized_captions_path, 'rb') as f:
        tokenized_captions = pickle.load(f)
    print(f'Tokenized captions loaded successfully. Total images: {len(tokenized_captions)}')

# Tokenizer (rebinds `tokenizer` to the reloaded copy)
tokenizer_path = '../data/processed/tokenizer.pkl'
if not os.path.exists(tokenizer_path):
    print('Tokenizer file not found.')
else:
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)
    print('Tokenizer loaded successfully.')

# Image features (rebinds `image_features` to the reloaded copy)
image_features_path = '../data/processed/image_features.pkl'
if not os.path.exists(image_features_path):
    print('Image features file not found.')
else:
    with open(image_features_path, 'rb') as f:
        image_features = pickle.load(f)
    print(f'Image features loaded successfully. Total images: {len(image_features)}')

Tokenized captions loaded successfully. Total images: 8092
Tokenizer loaded successfully.
Image features loaded successfully. Total images: 8091
