In [None]:
from Cocordiais import CocordiaisUtils as cocordiais

import datasets
import json
import os
import pandas as pd

In [None]:
# Root folder holding every dataset's image folder and annotation file.
DATA_PATH = "./data"


def _dataset_entry(name, license_id, date, train_pct):
  """Build the config record for one source dataset.

  Images are expected under "<DATA_PATH>/<name>-source" and annotations in
  "<DATA_PATH>/<name>.json".

  Args:
    name: Dataset identifier; also used to derive both paths.
    license_id: Integer license identifier (presumably an index into a
      license table defined elsewhere -- TODO confirm).
    date: Timestamp string stored with the dataset.
    train_pct: Fraction of the dataset's sorted image list that goes to the
      "train" split; the remainder goes to "test".

  Returns:
    A dict with keys name, license_id, date, train_pct, source_path and
    json_path.
  """
  return {
    "name": name,
    "license_id": license_id,
    "date": date,
    "train_pct": train_pct,
    "source_path": os.path.join(DATA_PATH, "%s-source" % name),
    "json_path": os.path.join(DATA_PATH, "%s.json" % name),
  }


# One record per source dataset, in the order their images receive ids.
DATA_INFO = [
  _dataset_entry("cordiais", 1, "2023-06-10 00:00:00", 0.5),
  _dataset_entry("metfaces", 1, "2020-06-10 00:00:00", 1.0),
  _dataset_entry("hermitage", 0, "2021-03-03 00:00:00", 1.0),
]

# Flat list of every image slug (file name without ".jpg") across all
# datasets, in DATA_INFO order; a slug's position here becomes its image id.
ALL_IMGS = []

for ds in DATA_INFO:
  # Drop only the trailing ".jpg". str.replace would also delete any ".jpg"
  # found in the middle of a file name (e.g. "a.jpg.crop.jpg"), producing a
  # slug that no longer maps back to a file on disk.
  file_list = sorted(
    f[:-len(".jpg")] for f in os.listdir(ds["source_path"]) if f.endswith(".jpg")
  )

  # The first train_pct of the sorted slugs is the train split, the rest test.
  train_length = int(ds["train_pct"] * len(file_list))
  ds["source_list"] = {
    "train": file_list[:train_length],
    "test": file_list[train_length:],
  }
  ALL_IMGS += file_list

# Slug -> integer image id (index of the slug in ALL_IMGS).
IMG2ID = {img: img_id for img_id, img in enumerate(ALL_IMGS)}

### Create HF Dataset

In [None]:
# slug2obj[split][slug] -> one image record carrying the list of its face
# annotations. object_count is a running bbox id shared by all datasets and
# both splits.
slug2obj = {}
object_count = 0

for split in ["train", "test"]:
  slug2obj[split] = {}

for ds in DATA_INFO:
  # Parse the annotation file up front so the handle is closed before the
  # (potentially long) processing loop starts.
  with open(ds["json_path"]) as json_file_read:
    data_json = json.load(json_file_read)

  # A set gives O(1) membership checks; testing against the list for every
  # annotation made this loop quadratic in the dataset size.
  train_slugs = set(ds["source_list"]["train"])

  for object_info in data_json:
    object_slug = object_info["source_image"]
    # Anything not listed as a train image is treated as test.
    object_split = "train" if object_slug in train_slugs else "test"
    split_records = slug2obj[object_split]

    # Create the image record the first time this image is seen.
    if object_slug not in split_records:
      split_records[object_slug] = {
        "image_id": IMG2ID[object_slug],
        "image": os.path.join(ds["source_path"], "%s.jpg" % object_slug),
        "image_filename": "%s.jpg" % object_slug,
        "width": object_info["source_image_w"],
        "height": object_info["source_image_h"],
        "objects": []
      }

    face_rect = object_info["face_rect_xywh"]
    split_records[object_slug]["objects"].append({
      "bbox_id": object_count,
      # Elements 2 and 3 are width and height, going by the "xywh" key name.
      "area": face_rect[2] * face_rect[3],
      "bbox": face_rect,
      "category": object_info["gender"],
      "super_category": cocordiais.LABEL2SUPERLABEL[object_info["gender"]],
      "is_crowd": False
    })
    object_count += 1

In [None]:
def _rows_to_columns(rows):
  """Pivot a list of row dicts into a dict of column lists (via pandas)."""
  return pd.DataFrame(rows).to_dict("list")


# ds_dict[split] is the column-oriented form of that split's image records;
# each record's "objects" list is pivoted to column form as well.
ds_dict = {}

for split_name in ("train", "test"):
  # The JSON round-trip produces an independent deep copy, so the pivot
  # below does not modify the records stored in slug2obj.
  split_records = json.loads(json.dumps(list(slug2obj[split_name].values())))

  for record in split_records:
    record["objects"] = _rows_to_columns(record["objects"])

  ds_dict[split_name] = _rows_to_columns(split_records)

In [None]:
# Turn each split's column dict into a datasets.Dataset with the project's
# feature schema and dataset info, then bundle the splits in a DatasetDict.
split_datasets = {}

for split_name, split_columns in ds_dict.items():
  split_datasets[split_name] = datasets.Dataset.from_dict(
    split_columns,
    features=cocordiais.COCORDIAIS_FEATURES,
    info=cocordiais.get_dataset_info(),  # requested anew for every split
    split=split_name,
  )

hf_dataset = datasets.DatasetDict(split_datasets)

In [None]:
# Hub repository that receives the dataset.
HF_DATASET = "thiagohersan/cordiais-faces"

# Upload every split; the repository is requested as private.
hf_dataset.push_to_hub(repo_id=HF_DATASET, private=True)

### Test Dataset

In [None]:
import torchvision.transforms as T

from datasets import load_dataset
from torch import ones_like
from transformers import DetrImageProcessor

from Cocordiais import CocordiaisDataset, CocordiaisUtils

# Allow this section to run on a fresh kernel: keep HF_DATASET if an earlier
# cell already defined it, otherwise fall back to the published repository.
if "HF_DATASET" not in globals():
  HF_DATASET = "thiagohersan/cordiais-faces"

In [None]:
# Pretrained checkpoint whose image processor prepares the samples.
DETR_MODEL = "facebook/detr-resnet-50"

# Size settings passed to the processor: both edge limits are 800.
detr_size = dict(shortest_edge=800, longest_edge=800)
detr_processor = DetrImageProcessor.from_pretrained(DETR_MODEL, size=detr_size)

# Load the published dataset, then hold out 20% of its train split for
# evaluation; the fixed seed keeps the shuffle reproducible.
hf_dataset = load_dataset(HF_DATASET)
hf_dataset_train = hf_dataset["train"].train_test_split(
  test_size=0.2,
  shuffle=True,
  seed=101010,
)


def _wrap_split(hf_split, train):
  """Wrap one Hugging Face split in a CocordiaisDataset using detr_processor."""
  return CocordiaisDataset(hf_split, img_processor=detr_processor, train=train)


dataset_train = _wrap_split(hf_dataset_train["train"], True)
dataset_eval = _wrap_split(hf_dataset_train["test"], False)
dataset_test = _wrap_split(hf_dataset["test"], False)

# Sizes in (train, evaluation, test) order, matching the message below.
lens = tuple(len(subset) for subset in (dataset_train, dataset_eval, dataset_test))

print("Number of examples:\n  Train: %s\n  Evaluation: %s\n  Test: %s" % lens)

In [None]:
# Visual sanity check: draw the annotated boxes of one training sample.
idx = 0
data = dataset_train.data[idx]

# Convert the sample's pixel values to a PIL image for plotting.
# Alternative: plot the source image, hf_dataset_train["train"][idx]["image"].
to_pil = T.ToPILImage()
img = to_pil(data["pixel_values"])

sample_labels = data["labels"]
class_labels = sample_labels["class_labels"]

boxes_info = dict(
  # These are annotations, not predictions, so every box gets a score of 1.
  scores=ones_like(class_labels),
  labels=class_labels,
  boxes=CocordiaisUtils.bboxes_xcycwh_to_xyxy(sample_labels),
)

CocordiaisUtils.plot_boxes(img, boxes_info)