In [9]:
import pandas as pd
from datasets import Dataset, load_from_disk
import torchvision.transforms as transforms
import json
from PIL import Image
import os
import re

In [10]:
with open("data/coco/annotations/captions_train2017.json",'r') as fh:
    raw_train_data = json.load(fh)
with open("data/coco/annotations/captions_val2017.json",'r') as fh:
    raw_val_data = json.load(fh)

In [11]:
train_df = pd.DataFrame(raw_train_data["annotations"])
train_df.drop_duplicates(subset=['image_id'], inplace=True)
train_df = train_df[:25000]
val_df = pd.DataFrame(raw_val_data["annotations"])
val_df.drop_duplicates(subset=['image_id'], inplace=True)

In [12]:
def map_image_id_to_image(image_id, split='train'):
    image_folder = f'data/coco/{split}2017/'
    # format image id with leading zeros
    filename = ""
    for i in range(12-len(str(image_id))):
        filename += '0'
    filename += str(image_id)
    formatted_image_id = re.sub(r'^(\d+)$', lambda x: x.group(1).zfill(6), filename)
    image_filename = f"{formatted_image_id}.JPG"  # Assuming the file extension is '.jpeg'
    return os.path.join(image_folder, image_filename)

# Apply the function to create the new column
train_df['img_path'] = train_df['image_id'].apply(map_image_id_to_image)
train_df['is_file'] = train_df['img_path'].apply(lambda x : os.path.isfile(x))
train_df = train_df[train_df['is_file'] == True]
val_df['img_path'] = val_df['image_id'].apply(map_image_id_to_image)
val_df['is_file'] = val_df['img_path'].apply(lambda x : os.path.isfile(x))
val_df = val_df[val_df['is_file'] == True]

In [13]:
transform = transforms.Compose(
        [transforms.ToTensor(),
        transforms.Resize((128, 128)),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

def preprocess_img(data_entry, split='train'):
    raw  = Image.open(data_entry['img_path']).convert("RGB")
    processed = transform(raw)
    return {'raw_image': raw, 'img': processed}

In [14]:
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

val_dataset = val_dataset.map(lambda x: preprocess_img(x, split='val'))
train_dataset = train_dataset.map(lambda x: preprocess_img(x, split='train'))


Map: 100%|██████████| 25000/25000 [1:05:59<00:00,  6.31 examples/s] 


In [15]:
train_dataset.save_to_disk("processed_train")
val_dataset.save_to_disk("processed_val")


Saving the dataset (35/35 shards): 100%|██████████| 25000/25000 [01:25<00:00, 291.00 examples/s]
Saving the dataset (1/1 shards): : 0 examples [00:00, ? examples/s]
