In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import string
import json

In [2]:
# Root folder of the downloaded Flickr30k dataset. The default is the original
# author's local kagglehub cache; set the FLICKR30K_DIR environment variable to
# run the notebook against a copy stored elsewhere without editing this cell.
path = os.environ.get(
    "FLICKR30K_DIR",
    "C:/Users/Yakina/.cache/kagglehub/datasets/hsankesara/flickr-image-dataset/versions/1/flickr30k_images/",
)
# Later cells append file names directly to `path`, so make sure it ends with a
# path separator even when the override was given without one.
if not path.endswith(("/", "\\")):
    path += "/"

In [3]:
# Locations derived from the dataset root `path`. os.path.join is used instead of
# string concatenation so the result is correct whether or not `path` ends with
# a separator (for a `path` that already ends with '/', the value is unchanged).
images_dir = os.path.join(path, 'flickr30k_images/')
captions_file_path = os.path.join(path, 'results.csv')
# Directory (relative to the working directory) where the notebook writes its outputs.
output_dir = "processed_data"

In [4]:
def clean_caption(caption):
    """Normalize a raw caption to lowercase ASCII letters separated by single spaces.

    Args:
        caption: The caption value. Non-string values are converted with
            ``str()``; ``None`` and float NaN (how pandas represents a missing
            cell) are treated as an empty caption.

    Returns:
        str: The caption lowercased, with every character other than ASCII
        letters and whitespace removed, whitespace runs collapsed to a single
        space, and leading/trailing whitespace stripped. ``''`` when the
        input is missing.
    """
    # Without this guard str(nan) / str(None) would inject the literal words
    # "nan" / "none" into the caption text. `x != x` is true only for NaN.
    if caption is None or (isinstance(caption, float) and caption != caption):
        return ''

    caption = str(caption).lower()
    # Drops punctuation, digits and non-ASCII characters in one pass. Every
    # character in string.punctuation is matched by this pattern, so a separate
    # str.translate() punctuation step is not needed.
    caption = re.sub(r'[^a-zA-Z\s]', '', caption)
    caption = re.sub(r'\s+', ' ', caption)
    return caption.strip()

In [5]:
# The separator is a regular expression: a '|' with any surrounding whitespace.
# Regex separators are only supported by pandas' Python parser; naming the
# engine explicitly avoids the ParserWarning pandas emits when it has to fall
# back from the C engine on its own.
df = pd.read_csv(captions_file_path, sep=r'\s*\|\s*', engine='python')
# Normalize every caption in place (lowercase, letters and single spaces only).
df['comment'] = df['comment'].apply(clean_caption)

  df = pd.read_csv(captions_file_path, sep = r'\s*\|\s*')


In [6]:
# Number of caption rows loaded
df.shape[0]

158915

In [7]:
# Preview the first five cleaned caption rows
df.head(5)

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,two young guys with shaggy hair look at their ...
1,1000092795.jpg,1,two young white males are outside near many bu...
2,1000092795.jpg,2,two men in green shirts are standing in a yard
3,1000092795.jpg,3,a man in a blue shirt standing in a garden
4,1000092795.jpg,4,two friends enjoy time spent together


In [8]:
# Keep only the captions whose image file is present in images_dir
existing_images, missing_images = [], []

for image_name in df['image_name'].unique():
    on_disk = os.path.exists(os.path.join(images_dir, image_name))
    bucket = existing_images if on_disk else missing_images
    bucket.append(image_name)

# Only report and filter when at least one referenced file is absent
if len(missing_images) > 0:
    print(f"Warning: {len(missing_images)} images not found in directory")
    df = df[df['image_name'].isin(existing_images)]

print(f"{len(df)} captions for {len(existing_images)} images")

158915 captions for 31783 images


In [9]:
# Distinct image file names, in order of first appearance; the splits are made
# over these so that all captions of one image land in the same split.
unique_images = pd.unique(df['image_name'])

In [10]:
# Image-level split: 80% train, 10% validation, 10% test.
# Step 1 holds out 20% of the images; step 2 divides that hold-out in half.
split_seed = 42
train_images, temp_images = train_test_split(
    unique_images,
    test_size=0.2,
    random_state=split_seed,
)
val_images, test_images = train_test_split(
    temp_images,
    test_size=0.5,
    random_state=split_seed,
)

In [11]:
# Select the caption rows that belong to each image split
image_names = df['image_name']
train_df = df.loc[image_names.isin(train_images)]
val_df = df.loc[image_names.isin(val_images)]
test_df = df.loc[image_names.isin(test_images)]

# Report caption and image counts per split
print(f"Train set: {len(train_df)} captions for {len(train_images)} images")
print(f"Validation set: {len(val_df)} captions for {len(val_images)} images")
print(f"Test set: {len(test_df)} captions for {len(test_images)} images")

Train set: 127130 captions for 25426 images
Validation set: 15890 captions for 3178 images
Test set: 15895 captions for 3179 images


In [12]:
# Create the output directory (and any missing parents); exist_ok=True makes
# re-running this cell a no-op when the directory is already there.
os.makedirs(output_dir, exist_ok=True)

In [13]:
# Write each split to its own CSV in output_dir, without the index column
split_files = (
    ('train_captions.csv', train_df),
    ('val_captions.csv', val_df),
    ('test_captions.csv', test_df),
)
for csv_name, split_frame in split_files:
    split_frame.to_csv(os.path.join(output_dir, csv_name), index=False)

In [14]:
# Words per caption (captions are whitespace-tokenized)
caption_word_counts = df['comment'].str.split().str.len()
# Distinct whitespace-separated tokens across all captions
vocabulary = set(' '.join(df['comment']).split())

# Summary figures for the dataset and its three splits
stats = {
    'total_images': len(existing_images),
    'total_captions': len(df),
    'train_images': len(train_images),
    'val_images': len(val_images),
    'test_images': len(test_images),
    'train_captions': len(train_df),
    'val_captions': len(val_df),
    'test_captions': len(test_df),
    'avg_caption_length': caption_word_counts.mean(),
    'vocab_size_estimate': len(vocabulary),
}

In [15]:
# Show every statistic; floats are rounded to two decimals, other values as-is
print("\nDataset Statistics:")
for key in stats:
    value = stats[key]
    if not isinstance(value, float):
        print(f"{key}: {value}")
        continue
    print(f"{key}: {value:.2f}")


Dataset Statistics:
total_images: 31783
total_captions: 158915
train_images: 25426
val_images: 3178
test_images: 3179
train_captions: 127130
val_captions: 15890
test_captions: 15895
avg_caption_length: 12.25
vocab_size_estimate: 19770


In [16]:
# Persist the statistics next to the split CSVs as indented JSON
stats_path = os.path.join(output_dir, 'dataset_stats.json')
stats_json = json.dumps(stats, indent=2)
with open(stats_path, 'w') as f:
    f.write(stats_json)