In [1]:
import torch
import numpy as np
from tqdm import tqdm

from pathlib import Path
from PIL import Image, ImageDraw
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import ViTImageProcessor, TrainingArguments, Trainer, YolosForObjectDetection
from datasets import load_dataset, Dataset, DatasetDict

In [2]:
# Select the GPU only when CUDA is actually usable. `is_available` has to be
# *called*: the bare function object is always truthy, which would pick "cuda"
# even on a CPU-only machine.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The dataset folder is resolved relative to the current working directory.
ds_location = Path.cwd() / "DatasetOriginal"
csv_location = ds_location / "metadata.csv"

# The generic CSV builder exposes the file as a DatasetDict with one "train" split.
orig_dataset = load_dataset("csv", data_files=str(csv_location))

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
# Show the split layout, then the column names of the very first row.
raw_train_split = orig_dataset["train"]
print(orig_dataset)
print(raw_train_split[0].keys())

DatasetDict({
    train: Dataset({
        features: ['file_name', 'image_id', 'width', 'height', 'objects'],
        num_rows: 12101
    })
})
dict_keys(['file_name', 'image_id', 'width', 'height', 'objects'])


In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train_test_split = orig_dataset["train"].train_test_split(
    test_size=0.3,
    seed=5,
)
wildlife_train_ds, wildlife_test_ds = (
    train_test_split["train"],
    train_test_split["test"],
)

In [5]:
# Sanity check: one line of banner, then the summary of each split.
print("checking...", wildlife_train_ds, wildlife_test_ds, sep="\n")

checking...
Dataset({
    features: ['file_name', 'image_id', 'width', 'height', 'objects'],
    num_rows: 8470
})
Dataset({
    features: ['file_name', 'image_id', 'width', 'height', 'objects'],
    num_rows: 3631
})


In [6]:
print("checking...")

# The CSV loader does not parse the "objects" column: the cell comes back as a
# plain str (confirmed by the type printed below), so it is sliced by hand.
sample_row = wildlife_train_ds[10]
test_sample1 = sample_row.get("objects")
match_str = "\'category\'"
print(f"type of the above -> {type(test_sample1)}")

# +13 skips the 10-character quoted key plus three more characters
# (presumably the `: '` separator - confirm against the CSV), and the -2 drops
# the last two characters of the string that follow the category text.
start_idx = test_sample1.find(match_str) + 13
category1 = test_sample1[start_idx:-2]
print(f"category of the above -> {category1}")

checking...
type of the above -> <class 'str'>
category of the above -> raccoon


In [None]:
def _extract_category(objects_repr):
    """Return the first category name found in a stringified ``objects`` cell.

    The value is located by searching for the ``'category'`` key and reading
    the quoted string that follows it, instead of relying on fixed character
    offsets. Both single- and double-quoted values are accepted.

    Args:
        objects_repr: The raw ``objects`` cell of a metadata row, as a str.

    Returns:
        The text between the quotes that follow the ``'category'`` key.

    Raises:
        ValueError: If ``objects_repr`` is not a str, contains no
            ``'category'`` key, or the key is not followed by a complete
            quoted string.
    """
    if not isinstance(objects_repr, str):
        raise ValueError(f"objects field must be a str, got {type(objects_repr)}")

    key = "'category'"
    key_at = objects_repr.find(key)
    if key_at == -1:
        raise ValueError(f"no 'category' key in objects field: {objects_repr!r}")

    # Drop the key itself, then the colon and any padding in front of the value.
    value = objects_repr[key_at + len(key):].lstrip(" :")
    quote = value[:1]
    if quote not in ("'", '"'):
        raise ValueError(f"category value is not a quoted string: {objects_repr!r}")

    closing_at = value.find(quote, 1)
    if closing_at == -1:
        raise ValueError(f"unterminated category value: {objects_repr!r}")
    return value[1:closing_at]


def load_image(example):
    """Build the output row for one metadata example.

    Opens the image file named by ``example["file_name"]`` (resolved against
    the module-level ``ds_location``) and parses the category out of the
    stringified ``example["objects"]`` cell.

    Args:
        example: A row with the keys ``file_name``, ``image_id``, ``width``,
            ``height`` and ``objects``.

    Returns:
        A dict with ``file_name``, ``image_id``, ``width`` and ``height``
        copied from ``example``, plus ``image`` (the opened PIL image) and
        ``labels`` (the parsed category string).

    Raises:
        ValueError: If no category can be parsed from ``example["objects"]``.
    """
    image = Image.open(ds_location / example["file_name"])

    return {
        "file_name": example["file_name"],
        "image_id": example["image_id"],
        "width": example["width"],
        "height": example["height"],
        "image": image,
        "labels": _extract_category(example["objects"]),
    }

In [8]:
# Run load_image over every training row; the printed summary lists the
# columns the mapped dataset ends up with.
updated_train_ds = wildlife_train_ds.map(function=load_image)
print(updated_train_ds)

Map:   0%|          | 0/8470 [00:00<?, ? examples/s]

Dataset({
    features: ['file_name', 'image_id', 'width', 'height', 'objects', 'image', 'labels'],
    num_rows: 8470
})


In [9]:
# Same per-row transformation as the training split, applied to the test rows.
updated_test_ds = wildlife_test_ds.map(function=load_image)
print(updated_test_ds)

Map:   0%|          | 0/3631 [00:00<?, ? examples/s]

Dataset({
    features: ['file_name', 'image_id', 'width', 'height', 'objects', 'image', 'labels'],
    num_rows: 3631
})


In [10]:
# Unbind the un-mapped splits; only the updated_* datasets are used from here on.
del wildlife_train_ds, wildlife_test_ds

In [11]:
# Drop the raw "objects" string column from the training split.
# The membership check makes this cell safe to re-run: without it, a second
# execution asks remove_columns for a column that is already gone and fails.
if "objects" in updated_train_ds.column_names:
    updated_train_ds = updated_train_ds.remove_columns(["objects"])
print(updated_train_ds)
print(updated_train_ds[0].keys())

Dataset({
    features: ['file_name', 'image_id', 'width', 'height', 'image', 'labels'],
    num_rows: 8470
})
dict_keys(['file_name', 'image_id', 'width', 'height', 'image', 'labels'])


In [12]:
# Drop the raw "objects" string column from the test split.
# The membership check makes this cell safe to re-run: without it, a second
# execution asks remove_columns for a column that is already gone and fails.
if "objects" in updated_test_ds.column_names:
    updated_test_ds = updated_test_ds.remove_columns(["objects"])
print(updated_test_ds)
print(updated_test_ds[0].keys())

Dataset({
    features: ['file_name', 'image_id', 'width', 'height', 'image', 'labels'],
    num_rows: 3631
})
dict_keys(['file_name', 'image_id', 'width', 'height', 'image', 'labels'])


In [13]:
# Bundle both processed splits under their split names.
updated_dataset = DatasetDict(train=updated_train_ds, test=updated_test_ds)

In [14]:
# Summary of the whole DatasetDict, followed by the train split on its own.
print(updated_dataset, updated_dataset["train"], sep="\n")

DatasetDict({
    train: Dataset({
        features: ['file_name', 'image_id', 'width', 'height', 'image', 'labels'],
        num_rows: 8470
    })
    test: Dataset({
        features: ['file_name', 'image_id', 'width', 'height', 'image', 'labels'],
        num_rows: 3631
    })
})
Dataset({
    features: ['file_name', 'image_id', 'width', 'height', 'image', 'labels'],
    num_rows: 8470
})


In [15]:
# Column names as seen on an actual row of the train split.
first_train_row = updated_dataset["train"][0]
print(first_train_row.keys())

dict_keys(['file_name', 'image_id', 'width', 'height', 'image', 'labels'])


In [16]:
# Upload both splits to the Hub dataset repository.
# NOTE(review): the upload log shows exactly 6 train and 2 test shards, so the
# explicit num_shards appears to win over max_shard_size - confirm whether
# max_shard_size is still needed here.
shards_per_split = {"train": 6, "test": 2}
updated_dataset.push_to_hub(
    "SeaSponge/wildme10_classify",
    max_shard_size="1GB",
    num_shards=shards_per_split,
)

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]

Map:   0%|          | 0/1412 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1412 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1412 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1412 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1411 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1411 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/1816 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1815 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SeaSponge/wildme10_classify/commit/d0b6e31c8f95c8267b7622bd1d725a3282e8a54f', commit_message='Upload dataset', commit_description='', oid='d0b6e31c8f95c8267b7622bd1d725a3282e8a54f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SeaSponge/wildme10_classify', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SeaSponge/wildme10_classify'), pr_revision=None, pr_num=None)