In [52]:
import pandas as pd
from datasets import Dataset, load_from_disk
import torchvision.transforms as transforms
import json
from PIL import Image
import os
import re

In [53]:
# Load the COCO 2017 caption annotation files for the train and val splits.
# The encoding is given explicitly because JSON text is UTF-8, whereas open()
# would otherwise fall back to the platform's default encoding.
with open("data/coco/annotations/captions_train2017.json", 'r', encoding="utf-8") as fh:
    raw_train_data = json.load(fh)
with open("data/coco/annotations/captions_val2017.json", 'r', encoding="utf-8") as fh:
    raw_val_data = json.load(fh)

In [54]:
# Number of distinct training images kept for this subset.
MAX_TRAIN_IMAGES = 25000

# Keep one caption row per image (the first row listed for each image_id),
# then take the first MAX_TRAIN_IMAGES rows of the training split.
# `.copy()` gives train_df its own data, so adding columns to it later does
# not raise pandas' SettingWithCopyWarning on a sliced frame.
train_df = (
    pd.DataFrame(raw_train_data["annotations"])
    .drop_duplicates(subset=['image_id'])
    .iloc[:MAX_TRAIN_IMAGES]
    .copy()
)

# The validation split is de-duplicated the same way but not truncated.
val_df = pd.DataFrame(raw_val_data["annotations"]).drop_duplicates(subset=['image_id'])

In [55]:
def map_image_id_to_image(image_id, split='train'):
    """Build the expected on-disk path of a COCO 2017 image.

    Args:
        image_id: Image identifier. It is converted with ``str()`` and
            left-padded with zeros to 12 characters; values that are already
            12 characters or longer are kept as they are.
        split: Dataset split name, used to pick the ``data/coco/{split}2017/``
            folder.

    Returns:
        The path ``data/coco/{split}2017/<padded id>.JPG`` as a string. The
        function does not check that the file exists.
    """
    image_folder = f'data/coco/{split}2017/'
    # Left-pad with zeros to the 12-character width used for the file names.
    padded_image_id = str(image_id).rjust(12, '0')
    # NOTE(review): the extension is upper-case `.JPG`. On a case-sensitive
    # filesystem this only resolves if the files really use that spelling;
    # confirm against the contents of the image folders.
    image_filename = f"{padded_image_id}.JPG"
    return os.path.join(image_folder, image_filename)

def _add_existing_image_paths(captions_df, split):
    """Add image-path columns and keep only rows whose image file exists.

    Args:
        captions_df: DataFrame with an ``image_id`` column.
        split: Split name forwarded to ``map_image_id_to_image``.

    Returns:
        A new DataFrame with two extra columns, ``img_path`` (the path built
        from ``image_id``) and ``is_file`` (whether that path is an existing
        file), restricted to the rows where ``is_file`` is true. The input
        frame is not modified.
    """
    # Work on a copy so the caller's frame (possibly a slice of another
    # frame) is never written to.
    with_paths = captions_df.copy()
    with_paths['img_path'] = with_paths['image_id'].apply(
        lambda image_id: map_image_id_to_image(image_id, split=split)
    )
    with_paths['is_file'] = with_paths['img_path'].apply(os.path.isfile)
    # astype(bool) guarantees a boolean row mask even when the frame is empty.
    return with_paths[with_paths['is_file'].astype(bool)]


# Resolve image paths for both splits and drop annotations without an image.
train_df = _add_existing_image_paths(train_df, split='train')
val_df = _add_existing_image_paths(val_df, split='val')

In [56]:
# transform = transforms.Compose(
#         [transforms.ToTensor(),
#         transforms.Resize((128, 128)),
#         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

def preprocess_img(data_entry):
    """Load the image referenced by a dataset row as an RGB image.

    Args:
        data_entry: Mapping with an ``'img_path'`` key holding the path of
            the image file to open.

    Returns:
        A dict with a single key, ``'raw_image'``, holding the RGB-converted
        image.
    """
    # The context manager closes the source file once decoding is done;
    # convert() returns a separate image object, so the result stays usable.
    with Image.open(data_entry['img_path']) as source_img:
        raw = source_img.convert("RGB")
    # The tensor `transform` pipeline is currently disabled, so only the
    # unprocessed RGB image is returned (no 'img' entry).
    return {'raw_image': raw}

In [57]:
# Wrap the filtered validation frame as a dataset; preserve_index=False is
# passed so the pandas index is not carried over.
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Build the training dataset, run preprocess_img over every row, and write
# the result to the "processed_train" directory.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False).map(preprocess_img)
train_dataset.save_to_disk("processed_train")

In [58]:
# Run preprocess_img over every validation row, then write the result to the
# "processed_val" directory.
val_dataset = val_dataset.map(function=preprocess_img)
val_dataset.save_to_disk(dataset_path="processed_val")

Map: 100%|██████████| 5000/5000 [13:21<00:00,  6.24 examples/s]  
Saving the dataset (7/7 shards): 100%|██████████| 5000/5000 [00:06<00:00, 804.25 examples/s]
