In [8]:
import os
from PIL import Image as PILImage
from tqdm import tqdm
from IPython.display import display
from datasets import Dataset, Features, Image, Value, ClassLabel

In [9]:
# Input folder (images plus same-named .txt caption files) and output folder
# for the saved dataset. The defaults are the original machine-specific
# locations; set the environment variables below to run the notebook on
# another machine without editing this cell.
raw_data_path = os.environ.get('CUB_CAP_RAW_DATA_PATH', 'D:/Adams/dataset/CUB_200_2011_CAP')
save_dir = os.environ.get('CUB_CAP_SAVE_DIR', 'D:/Adams/dataset/parquets/')

In [10]:
def raw_data_gen(raw_data_path):
    """Yield one example per captioned image found directly in ``raw_data_path``.

    A file counts as an image when its name ends in ``.png`` or ``.jpg``. It is
    yielded only if a ``.txt`` file with the same stem exists in the same
    directory; images without such a file are skipped silently.

    Args:
        raw_data_path: Directory containing the images and their caption files.

    Yields:
        dict with three keys:
            ``'image'``: the object returned by ``PILImage.open`` for the file.
            ``'text'``: the caption file's content with every newline removed.
            ``'label'``: the file stem with its last two ``_``-separated fields
                dropped and the remaining fields joined by single spaces.
    """
    # os.listdir gives no ordering guarantee; sorting makes the row order of
    # the resulting dataset reproducible across runs and file systems.
    for img_file in tqdm(sorted(os.listdir(raw_data_path))):
        if not img_file.endswith((".png", ".jpg")):
            continue

        # Strip only the trailing extension. Replacing ".png"/".jpg" anywhere
        # in the name would corrupt stems that happen to contain that text.
        action_id = os.path.splitext(img_file)[0]
        metadata_path = os.path.join(raw_data_path, action_id + ".txt")
        if not os.path.exists(metadata_path):
            continue

        label = ' '.join(action_id.split("_")[:-2])
        img = PILImage.open(os.path.join(raw_data_path, img_file))
        # Explicit encoding so the caption text does not depend on the
        # platform's default codec.
        # NOTE(review): assumes the caption files are UTF-8 — confirm.
        with open(metadata_path, 'r', encoding='utf-8') as f:
            caption = f.read().replace('\n', '')
        yield {'image': img, 'text': caption, 'label': label}

In [11]:
# Column schema for the generated dataset: `image` uses the `Image` feature,
# `text` and `label` are plain strings at this stage.
features = Features({
    'image': Image(),
    'text': Value('string'),
    'label': Value('string'),
})
# Pass the generator function and its argument explicitly through
# `gen_kwargs` rather than wrapping the call in a lambda that closes over a
# notebook global; the input directory is then a visible argument of the build.
ds = Dataset.from_generator(
    raw_data_gen,
    features=features,
    gen_kwargs={'raw_data_path': raw_data_path},
)

100%|██████████| 23574/23574 [00:01<00:00, 13113.40it/s]xamples/s]
Generating train split: 11787 examples [00:01, 6440.36 examples/s]


In [12]:
# `class_encode_column` returns a new Dataset and leaves `ds` untouched, so the
# result must be bound back to `ds`; otherwise the save/upload cells below
# would still export `label` as a plain string column.
# The isinstance guard keeps this cell re-runnable once `label` is already a
# ClassLabel (re-encoding an encoded column is not supported).
if not isinstance(ds.features["label"], ClassLabel):
    ds = ds.class_encode_column("label")
ds

Casting to class labels: 100%|██████████| 11787/11787 [00:00<00:00, 339616.14 examples/s]


Dataset({
    features: ['image', 'text', 'label'],
    num_rows: 11787
})

In [13]:
# Write the dataset to the local output directory configured in `save_dir`.
ds.save_to_disk(dataset_path=save_dir)

Saving the dataset (3/3 shards): 100%|██████████| 11787/11787 [00:01<00:00, 5961.53 examples/s]


In [14]:
# Upload the dataset to the Hugging Face Hub under the repository id below.
hub_repo_id = "weiywang/CUB_200_2011_CAP"
ds.push_to_hub(hub_repo_id)

Map: 100%|██████████| 3929/3929 [00:00<00:00, 6947.69 examples/s]s]
Creating parquet from Arrow format: 100%|██████████| 40/40 [00:00<00:00, 56.83ba/s]
Map: 100%|██████████| 3929/3929 [00:00<00:00, 6298.47 examples/s]93.18s/it]
Creating parquet from Arrow format: 100%|██████████| 40/40 [00:00<00:00, 60.59ba/s]
Map: 100%|██████████| 3929/3929 [00:01<00:00, 3321.50 examples/s]93.32s/it]
Creating parquet from Arrow format: 100%|██████████| 40/40 [00:01<00:00, 38.91ba/s]
Uploading the dataset shards: 100%|██████████| 3/3 [04:39<00:00, 93.18s/it]
README.md: 100%|██████████| 375/375 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
