# Data Preprocessing
This notebook preprocesses the Flickr8k dataset by tokenizing captions, building a vocabulary, mapping words to integers, and resizing and normalizing images.

In [1]:
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings
# suppress keras warnings


In [2]:
# Import required libraries
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array



## Load Captions
Load and clean the captions from the Flickr8k.token.txt file.

In [3]:
def clean_caption(caption):
    # Convert to lowercase
    caption = caption.lower()
    # Remove special characters and extra spaces
    caption = ' '.join(word for word in caption.split() 
                      if word.isalnum() or word in ['.', ','])
    # Add start and end tokens
    caption = 'startseq ' + caption + ' endseq'
    return caption

def load_captions(file_path):
    captions = {}
    with open(file_path, 'r') as f:
        for line in f:
            try:
                tokens = line.strip().split()
                image_id, caption = tokens[0], ' '.join(tokens[1:])
                image_id = image_id.split('#')[0]
                if image_id not in captions:
                    captions[image_id] = []
                # Clean and add start/end tokens
                cleaned_caption = clean_caption(caption)
                captions[image_id].append(cleaned_caption)
            except Exception as e:
                print(f'Error processing caption: {line.strip()}')
                print(f'Error: {str(e)}')
                continue

    return captions

captions_file = '../data/Flickr8k.token.txt'
captions = load_captions(captions_file)
print(f'Loaded {len(captions)} images with captions.')

# Sample output to verify caption formatting
sample_image_id = list(captions.keys())[0]
print('\nSample captions for image', sample_image_id)
for caption in captions[sample_image_id]:
    print(caption)

Loaded 8092 images with captions.

Sample captions for image 1000268201_693b08cb0e.jpg
startseq a child in a pink dress is climbing up a set of stairs in an entry way . endseq
startseq a girl going into a wooden building . endseq
startseq a little girl climbing into a wooden playhouse . endseq
startseq a little girl climbing the stairs to her playhouse . endseq
startseq a little girl in a pink dress going into a wooden cabin . endseq


## Tokenize Captions
Tokenize the captions and build a vocabulary.

In [4]:
# Configure tokenizer
def create_tokenizer(captions):
    tokenizer = Tokenizer(oov_token='<unk>')
    all_captions = [caption for caption_list in captions.values() for caption in caption_list]
    tokenizer.fit_on_texts(all_captions)
    
    # Add special tokens if not present
    special_tokens = ['startseq', 'endseq', '<unk>', '<pad>']
    for token in special_tokens:
        if token not in tokenizer.word_index:
            # Add to word index with next available index
            next_index = len(tokenizer.word_index) + 1
            tokenizer.word_index[token] = next_index
            tokenizer.index_word[next_index] = token

    vocab_size = len(tokenizer.word_index) + 1
    print(f'Vocabulary size: {vocab_size}')
    
    # Find max sequence length
    max_length = max(len(caption.split()) for caption in all_captions)
    print(f'Maximum sequence length: {max_length}')
    
    return tokenizer, vocab_size, max_length

tokenizer, vocab_size, max_length = create_tokenizer(captions)

# Test tokenization
sample_caption = list(captions.values())[0][0]
print('\nSample caption:', sample_caption)
sequence = tokenizer.texts_to_sequences([sample_caption])[0]
print('Tokenized sequence:', sequence)
padded_sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
print('Padded sequence:', padded_sequence[0])

Vocabulary size: 8424
Maximum sequence length: 39

Sample caption: startseq a child in a pink dress is climbing up a set of stairs in an entry way . endseq
Tokenized sequence: [3, 2, 43, 5, 2, 91, 170, 8, 120, 54, 2, 395, 13, 392, 5, 29, 5107, 692, 4]
Padded sequence: [   3    2   43    5    2   91  170    8  120   54    2  395   13  392
    5   29 5107  692    4    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


## Resize and Normalize Images
Resize images to 224x224 and normalize pixel values.

In [5]:
# Resize and normalize images
def preprocess_images(image_dir, target_size=(224, 224)):
    image_features = {}
    failed_images = []

    print('Processing images...')
    total_images = len(os.listdir(image_dir))

    for i, image_name in enumerate(os.listdir(image_dir)):
        if i % 100 == 0:
            print(f'Processed {i}/{total_images} images')

        try:
            image_path = os.path.join(image_dir, image_name)
            # Load and preprocess for VGG16
            image = load_img(image_path, target_size=target_size)
            image = img_to_array(image)
            # Expand dimensions for batch processing
            image = np.expand_dims(image, axis=0)
            # Preprocess specifically for VGG16
            image = preprocess_input(image)
            # Remove the batch dimension
            image = image[0]
            image_features[image_name] = image
        except Exception as e:
            print(f'Error processing {image_name}: {str(e)}')
            failed_images.append(image_name)
            continue

    print(f'\nProcessed {len(image_features)} images successfully.')
    if failed_images:
        print(f'Failed to process {len(failed_images)} images:')
        for img in failed_images:
            print(f'- {img}')

    return image_features

image_dir = '../data/images/'
image_features = preprocess_images(image_dir)
print(f'Final number of processed images: {len(image_features)}')

Processing images...
Processed 0/8091 images
Processed 100/8091 images
Processed 100/8091 images
Processed 200/8091 images
Processed 200/8091 images
Processed 300/8091 images
Processed 300/8091 images
Processed 400/8091 images
Processed 400/8091 images
Processed 500/8091 images
Processed 500/8091 images
Processed 600/8091 images
Processed 600/8091 images
Processed 700/8091 images
Processed 700/8091 images
Processed 800/8091 images
Processed 800/8091 images
Processed 900/8091 images
Processed 900/8091 images
Processed 1000/8091 images
Processed 1000/8091 images
Processed 1100/8091 images
Processed 1100/8091 images
Processed 1200/8091 images
Processed 1200/8091 images
Processed 1300/8091 images
Processed 1300/8091 images
Processed 1400/8091 images
Processed 1400/8091 images
Processed 1500/8091 images
Processed 1500/8091 images
Processed 1600/8091 images
Processed 1600/8091 images
Processed 1700/8091 images
Processed 1700/8091 images
Processed 1800/8091 images
Processed 1800/8091 images
P

## Save Processed Data
Save the tokenized captions and preprocessed image features into the `data/processed/` directory.

In [6]:
# Save tokenized captions
import pickle
processed_dir = '../data/processed/'
os.makedirs(processed_dir, exist_ok=True)
tokenized_captions_path = os.path.join(processed_dir, 'tokenized_captions.pkl')
with open(tokenized_captions_path, 'wb') as f:
    pickle.dump(captions, f)
print(f'Tokenized captions saved to {tokenized_captions_path}')

# Save tokenizer
tokenizer_path = os.path.join(processed_dir, 'tokenizer.pkl')
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)
print(f'Tokenizer saved to {tokenizer_path}')

# Save image features
image_features_path = os.path.join(processed_dir, 'image_features.pkl')
with open(image_features_path, 'wb') as f:
    pickle.dump(image_features, f)
print(f'Image features saved to {image_features_path}')

Tokenized captions saved to ../data/processed/tokenized_captions.pkl
Tokenizer saved to ../data/processed/tokenizer.pkl
Image features saved to ../data/processed/image_features.pkl
Image features saved to ../data/processed/image_features.pkl


## Split Dataset
Split the dataset into training, validation, and testing sets based on the provided text files.

In [7]:
# Load image IDs for each split
def load_image_ids(file_path):
    with open(file_path, 'r') as f:
        return set(line.strip() for line in f)

train_ids = load_image_ids('../data/Flickr_8k.trainImages.txt')
val_ids = load_image_ids('../data/Flickr_8k.devImages.txt')
test_ids = load_image_ids('../data/Flickr_8k.testImages.txt')

print(f'Training set: {len(train_ids)} images')
print(f'Validation set: {len(val_ids)} images')
print(f'Testing set: {len(test_ids)} images')

# Split captions and image features
def split_data(captions, image_features, train_ids, val_ids, test_ids):
    train_captions = {k: v for k, v in captions.items() if k in train_ids}
    val_captions = {k: v for k, v in captions.items() if k in val_ids}
    test_captions = {k: v for k, v in captions.items() if k in test_ids}

    train_features = {k: v for k, v in image_features.items() if k in train_ids}
    val_features = {k: v for k, v in image_features.items() if k in val_ids}
    test_features = {k: v for k, v in image_features.items() if k in test_ids}

    return train_captions, val_captions, test_captions, train_features, val_features, test_features

train_captions, val_captions, test_captions, train_features, val_features, test_features = split_data(captions, image_features, train_ids, val_ids, test_ids)

# Save splits
with open(os.path.join(processed_dir, 'train_captions.pkl'), 'wb') as f:
    pickle.dump(train_captions, f)
with open(os.path.join(processed_dir, 'val_captions.pkl'), 'wb') as f:
    pickle.dump(val_captions, f)
with open(os.path.join(processed_dir, 'test_captions.pkl'), 'wb') as f:
    pickle.dump(test_captions, f)

with open(os.path.join(processed_dir, 'train_features.pkl'), 'wb') as f:
    pickle.dump(train_features, f)
with open(os.path.join(processed_dir, 'val_features.pkl'), 'wb') as f:
    pickle.dump(val_features, f)
with open(os.path.join(processed_dir, 'test_features.pkl'), 'wb') as f:
    pickle.dump(test_features, f)

print('Dataset split and saved successfully.')

Training set: 6000 images
Validation set: 1000 images
Testing set: 1000 images
Dataset split and saved successfully.
Dataset split and saved successfully.


## Verify Preprocessed Data
Check if the tokenized captions, tokenizer, and image features are saved correctly.

In [8]:
# Verify tokenized captions
tokenized_captions_path = '../data/processed/tokenized_captions.pkl'
if os.path.exists(tokenized_captions_path):
    with open(tokenized_captions_path, 'rb') as f:
        tokenized_captions = pickle.load(f)
    print(f'Tokenized captions loaded successfully. Total images: {len(tokenized_captions)}')
else:
    print('Tokenized captions file not found.')

# Verify tokenizer
tokenizer_path = '../data/processed/tokenizer.pkl'
if os.path.exists(tokenizer_path):
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)
    print('Tokenizer loaded successfully.')
else:
    print('Tokenizer file not found.')

# Verify image features
image_features_path = '../data/processed/image_features.pkl'
if os.path.exists(image_features_path):
    with open(image_features_path, 'rb') as f:
        image_features = pickle.load(f)
    print(f'Image features loaded successfully. Total images: {len(image_features)}')
else:
    print('Image features file not found.')

Tokenized captions loaded successfully. Total images: 8092
Tokenizer loaded successfully.
Image features loaded successfully. Total images: 8091
Image features loaded successfully. Total images: 8091
