In [4]:
# Installing required libraries for COCO dataset processing
# Dataset source: https://www.kaggle.com/datasets/mnassrib/ms-coco?resource=download
!pip install pycocotools matplotlib Pillow

# Importing essential libraries
from pycocotools.coco import COCO  # For accessing COCO dataset annotations
import random  # For random image sampling
import pandas as pd  # For data storage and CSV export

# Dataset locations (image directory and captions annotation file)
IMG_DIR = '/content/drive/MyDrive/MSCOCO/train2014/train2014'
CAPTION_ANN_PATH = '/content/drive/MyDrive/MSCOCO/annotations_trainval2014/annotations/captions_train2014.json'

# Load the captions annotation file through the COCO API
print("Initializing COCO API for captions...")
coco = COCO(CAPTION_ANN_PATH)

# Draw 200 image IDs at random, without replacement (unseeded, so every run
# picks a different subset)
print("Sampling 200 random image IDs...")
all_img_ids = coco.getImgIds()
sampled_ids = random.sample(all_img_ids, k=200)

# Defining text cleaning utility function
def clean_caption(text):
    """Return *text* with all whitespace normalized to single spaces.

    Every run of whitespace (spaces, tabs, ``\\n``, ``\\r``, ``\\r\\n``, ...)
    is collapsed into one space, and leading/trailing whitespace is removed.
    This keeps each caption on a single line, so it occupies exactly one
    clean CSV cell.

    Args:
        text: Raw caption string.

    Returns:
        The caption as a single-line string with single-space separators.
    """
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so a CRLF pair no longer turns into a double space.
    return ' '.join(text.split())

# Fetch the caption annotations for all sampled images in a single call
print("Loading caption annotations in batch...")
batch_annotations = coco.loadAnns(coco.getAnnIds(imgIds=sampled_ids))

# Map each image ID to one cleaned caption: the first annotation seen wins
print("Building caption lookup structure...")
caption_dict = {}
for annotation in batch_annotations:
    image_key = annotation['image_id']
    if image_key in caption_dict:
        # A caption is already stored for this image; skip the extras
        continue
    caption_dict[image_key] = clean_caption(annotation['caption'])

# Build one {image_id, caption} row per sampled image, in sampling order;
# images without a stored caption get a placeholder string instead
print("Compiling final dataset...")
dataset = [
    {
        'image_id': img_id,
        'caption': caption_dict.get(img_id, 'No caption available'),
    }
    for img_id in sampled_ids
]

# Turn the collected rows into a DataFrame and write it out as UTF-8 CSV
# (no index column) in the current working directory
print("Saving results to CSV...")
result_df = pd.DataFrame(data=dataset)
result_df.to_csv(path_or_buf='mscoco_captions.csv', encoding='utf-8', index=False)

print("Operation completed successfully! Saved captions for 200 images.")

Initializing COCO API for captions...
loading annotations into memory...
Done (t=1.75s)
creating index...
index created!
Sampling 200 random image IDs...
Loading caption annotations in batch...
Building caption lookup structure...
Compiling final dataset...
Saving results to CSV...
Operation completed successfully! Saved captions for 200 images.
