<a href="https://colab.research.google.com/github/vimesh630/Automated-Image-Caption-Generator/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Required Libraries

In [1]:
# Install any missing libraries
!pip install nltk opencv-python pillow matplotlib

# Import libraries
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pickle
import string
from nltk.tokenize import word_tokenize
from PIL import Image

# Download NLTK data if needed
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Mount Google Drive

In [2]:
# Mount Google Drive if needed to save files or intermediate results
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Set Up Paths & Directories

In [3]:
# [2] SETUP PATHS & DIRECTORIES

# Adjust these paths to match your dataset structure in Google Drive
base_dir = '/content/drive/MyDrive/Individual projects/Automated-Caption-Generator'
image_dir = f'{base_dir}/Images'
caption_file = f'{base_dir}/captions.txt'

# Create an output directory to store preprocessed data
output_dir = f'{base_dir}/preprocessed'
!mkdir -p "{output_dir}"

print("Base directory:", base_dir)
print("Image directory:", image_dir)
print("Caption file:", caption_file)
print("Output directory:", output_dir)

Base directory: /content/drive/MyDrive/Individual projects/Automated-Caption-Generator
Image directory: /content/drive/MyDrive/Individual projects/Automated-Caption-Generator/Images
Caption file: /content/drive/MyDrive/Individual projects/Automated-Caption-Generator/captions.txt
Output directory: /content/drive/MyDrive/Individual projects/Automated-Caption-Generator/preprocessed


Preprocessing Functions

In [4]:
# [4] PREPROCESSING FUNCTIONS

# 4.1 Caption Preprocessing
def preprocess_caption(caption):
    """
    Clean and tokenize a caption.
    """
    caption = caption.lower().strip()  # Lowercase and remove extra whitespace
    caption = caption.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(caption)  # Tokenize into words
    return tokens

# 4.2 Load Captions
def load_captions(caption_file_path):
    """
    Load captions into a dictionary mapping image filenames to lists of tokenized captions.
    """
    captions_dict = {}
    with open(caption_file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if len(line) < 1:
                continue
            # Expected format: "1000268201_693b08cb0e.jpg#0\tA child in a pink dress..."
            image_caption = line.split('\t')
            if len(image_caption) != 2:
                continue
            image_info, caption = image_caption
            # Extract image filename (remove the '#number' part)
            image_filename = image_info.split('#')[0]

            # Preprocess the caption
            tokens = preprocess_caption(caption)

            if image_filename not in captions_dict:
                captions_dict[image_filename] = []
            captions_dict[image_filename].append(tokens)
    return captions_dict

# 4.3 Load Image Paths
def load_image_paths(image_directory):
    """
    Load all image paths from the given directory.
    """
    return [os.path.join(image_directory, img) for img in os.listdir(image_directory) if img.endswith('.jpg')]

# 4.4 Preprocess Images
def preprocess_image(image_path, target_size=(224, 224)):
    """
    Preprocess an image: read, resize, and normalize pixel values.
    """
    image = cv2.imread(image_path)
    image = cv2.resize(image, target_size)
    image = image / 255.0  # Normalize pixel values to [0, 1]
    return image

Caption Preprocessing

In [5]:
# [5] LOAD AND PREPROCESS CAPTIONS

print("Loading and preprocessing captions...")

captions_dict = load_captions(caption_file)
print(f"Total images with captions: {len(captions_dict)}")

# Save the preprocessed captions dictionary to a pickle file
captions_output_path = os.path.join(output_dir, 'captions_dict.pkl')
with open(captions_output_path, 'wb') as f:
    pickle.dump(captions_dict, f)

print(f"Preprocessed captions saved to: {captions_output_path}")

Loading and preprocessing captions...
Total images with captions: 0
Preprocessed captions saved to: /content/drive/MyDrive/Individual projects/Automated-Caption-Generator/preprocessed/captions_dict.pkl


Image Preprocessing

In [6]:
# [6] LOAD AND PREPROCESS IMAGES

print("Loading image paths...")
image_paths = load_image_paths(image_dir)
print(f"Total images found: {len(image_paths)}")

preprocessed_images = {}
print("Preprocessing images...")

for img_path in image_paths:
    img_filename = os.path.basename(img_path)
    preprocessed_images[img_filename] = preprocess_image(img_path)

# Save the preprocessed images dictionary
images_output_path = os.path.join(output_dir, 'preprocessed_images.pkl')
with open(images_output_path, 'wb') as f:
    pickle.dump(preprocessed_images, f)

print(f"Preprocessed images saved to: {images_output_path}")

Loading image paths...
Total images found: 8092
Preprocessing images...
Preprocessed images saved to: /content/drive/MyDrive/Individual projects/Automated-Caption-Generator/preprocessed/preprocessed_images.pkl


Visualize Sample Image and Caption

In [7]:
# [7] VISUALIZATION

def display_sample_image(image_array, caption_tokens):
    """
    Display a preprocessed image with its caption tokens.
    """
    plt.imshow(image_array)
    plt.title(" ".join(caption_tokens))
    plt.axis('off')
    plt.show()

# Pick a sample image that has a caption
sample_filename = list(captions_dict.keys())[0]
sample_image = preprocessed_images[sample_filename]
sample_caption_tokens = captions_dict[sample_filename][0]  # First caption

display_sample_image(sample_image, sample_caption_tokens)

IndexError: list index out of range