# 01 Data Exploration & Preprocessing

This notebook explores the Flickr8k dataset: it loads the captions file, groups the captions by image ID, displays a sample image, and saves the image-ID → captions mapping as JSON for later use.

In [None]:
import os
import matplotlib.pyplot as plt
from PIL import Image
import json

### Load Captions File

In [None]:
# Read the raw caption lines from the captions file.
# The first line of the file is a header row and is not a caption.
captions_path = "../data/Flickr8k_text/captions.txt"
# Explicit encoding so the result does not depend on the platform default.
with open(captions_path, 'r', encoding='utf-8') as f:
    next(f, None)  # skip header (no error if the file is empty)
    # Keep only non-blank lines so stray empty lines are not counted as captions.
    lines = [raw_line for raw_line in f if raw_line.strip()]

print("Total captions:", len(lines))

### Create Mapping: Image → Captions

In [None]:
# Build a dictionary: image ID (file name up to the first '.') -> list of captions.
# Each input line is treated as "<image file>,<caption>".
# NOTE(review): if the captions file quotes captions that contain commas
# (CSV style), the quotes are kept verbatim here — confirm against the data
# and switch to the csv module if they need to be stripped.
mapping = {}
for line in lines:
    record = line.strip()
    if not record:
        continue  # blank line: nothing to record
    # Split on the FIRST comma only. Captions may themselves contain commas;
    # splitting on every comma and re-joining with spaces would drop them.
    image_file, separator, caption = record.partition(',')
    if not separator:
        continue  # no comma on the line, so there is no caption field
    image_id = image_file.split('.')[0]
    mapping.setdefault(image_id, []).append(caption)

# Show the first parsed entry; guard against an empty mapping so the cell
# reports the problem instead of failing with an IndexError.
if mapping:
    first_image_id = next(iter(mapping))
    print("Sample Image ID:", first_image_id)
    print("Captions:", mapping[first_image_id])
else:
    print("No captions were parsed; check the captions file.")

### Visualize a Sample Image

In [None]:
# Display one sample image from the dataset folder.
sample_img = "../data/Flickr8k_Dataset/Images/1000268201_693b08cb0e.jpg"
# Image.open is lazy and keeps the file open; the context manager releases
# the file handle as soon as the pixels have been handed to matplotlib.
with Image.open(sample_img) as img:
    fig, ax = plt.subplots()
    ax.imshow(img)
ax.set_title("Example Image")
ax.axis('off')
plt.show()

### Save Mapping for Later Use

In [None]:
# Persist the image-ID -> captions dictionary as JSON so it can be reloaded later.
mapping_json_path = "../data/Flickr8k_text/captions_mapping.json"
serialized_mapping = json.dumps(mapping)
with open(mapping_json_path, "w") as out_file:
    out_file.write(serialized_mapping)
print("Mapping saved to captions_mapping.json")