In [None]:
from Cocordiais import CocordiaisUtils as cocordiais

import datasets
import json
import os
import pandas as pd

In [None]:
# Root folder that holds, for each dataset, an image folder and an annotation JSON file.
DATA_PATH = "./data"

# One entry per source dataset.
#   license_id / date : copied verbatim into every image record built from this dataset
#   train_pct         : fraction of the (sorted) image list assigned to the "train" split
#   source_path       : folder scanned for .jpg images
#   json_path         : annotation file for the dataset
DATA_INFO = [
  {
    "name": "cordiais",
    "license_id": 1,
    "date": "2023-06-10 00:00:00",
    "train_pct": 0.5,
    "source_path": os.path.join(DATA_PATH, "cordiais-source"),
    "json_path": os.path.join(DATA_PATH, "cordiais.json")
  },
  {
    "name": "metfaces",
    "license_id": 1,
    "date": "2020-06-10 00:00:00",
    "train_pct": 1.0,
    "source_path": os.path.join(DATA_PATH, "metfaces-source"),
    "json_path": os.path.join(DATA_PATH, "metfaces.json")
  }
]

# Image stems (file names without extension) of every dataset, in scan order.
ALL_IMGS = []

for ds in DATA_INFO:
  # Drop only the trailing ".jpg". str.replace would also delete a ".jpg" that
  # appears earlier in the name and yield a stem that matches no file on disk.
  file_list = sorted(
    f[:-len(".jpg")] for f in os.listdir(ds["source_path"]) if f.endswith(".jpg")
  )
  # The first train_pct of the sorted stems is "train", the remainder is "test".
  train_length = int(ds["train_pct"] * len(file_list))
  ds["source_list"] = {
    "train": file_list[:train_length],
    "test": file_list[train_length:]
  }
  ALL_IMGS += file_list

# Stable integer id per image stem. If the same stem occurs in more than one
# dataset, the position of its last occurrence in ALL_IMGS wins.
IMG2ID = {img: img_id for img_id, img in enumerate(ALL_IMGS)}

### Create HF Dataset

In [None]:
# Per split, maps an image slug to its record: image metadata plus the list of
# face annotations ("objects") found for that image.
slug2obj = {}
# Running counter used as a unique bbox_id across all datasets and splits.
object_count = 0

for split in ["train", "test"]:
  slug2obj[split] = {}

for ds in DATA_INFO:
  # Set membership keeps the per-annotation split test constant-time
  # instead of scanning the whole train list for every annotation.
  train_slugs = set(ds["source_list"]["train"])

  # Parse the annotation file, then release the handle before processing.
  with open(ds["json_path"]) as json_file_read:
    data_json = json.load(json_file_read)

  for object_info in data_json:
    object_slug = object_info["source_image"]
    # Anything not listed under "train" is treated as "test".
    object_split = "train" if object_slug in train_slugs else "test"
    split_records = slug2obj[object_split]

    # The first annotation seen for an image creates the image-level record.
    # IMG2ID[...] raises KeyError for a slug that has no entry in IMG2ID.
    if object_slug not in split_records:
      split_records[object_slug] = {
        "image_id": IMG2ID[object_slug],
        "image": os.path.join(ds["source_path"], "%s.jpg" % object_slug),
        "image_filename": "%s.jpg" % object_slug,
        "width": object_info["source_image_w"],
        "height": object_info["source_image_h"],
        "license_id": ds["license_id"],
        "date_captured": ds["date"],
        "objects": []
      }

    # Indices 2 and 3 of the xywh rectangle are multiplied to get the area.
    face_rect = object_info["face_rect_xywh"]
    split_records[object_slug]["objects"].append({
      "bbox_id": object_count,
      "area": face_rect[2] * face_rect[3],
      "bbox": face_rect,
      "category": object_info["gender"],
      "super_category": cocordiais.LABEL2SUPERLABEL[object_info["gender"]],
      "is_crowd": False
    })
    object_count += 1

In [None]:
# Column-oriented view of each split: {column name: list of values}.
ds_dict = {}

for split in ("train", "test"):
  # The JSON round-trip produces an independent copy of the records, so the
  # reshaping below does not modify slug2obj.
  serialized = json.dumps(list(slug2obj[split].values()))
  records = json.loads(serialized)

  # Turn each image's list of object dicts into a dict of per-field lists.
  for record in records:
    objects_frame = pd.DataFrame(record["objects"])
    record["objects"] = objects_frame.to_dict("list")

  # Same row-to-column pivot, this time over the image records themselves.
  split_frame = pd.DataFrame(records)
  ds_dict[split] = split_frame.to_dict("list")

In [None]:
# Schema for the published dataset: image-level fields plus an "objects"
# sequence holding one entry per annotated face.
CORDIAIS_FEATURES = datasets.Features({
  "image_id": datasets.Value("int64"),
  "image": datasets.Image(decode=True),
  "image_filename": datasets.Value("string"),
  "width": datasets.Value("int64"),
  "height": datasets.Value("int64"),
  "license_id": datasets.Value("int64"),
  "date_captured": datasets.Value("string"),
  "objects": datasets.Sequence(feature={
    "bbox_id": datasets.Value("int64"),
    "category": datasets.ClassLabel(names=list(cocordiais.LABEL2ID.keys())),
    "bbox": datasets.Sequence(feature=datasets.Value("int64"), length=4),
    # De-duplicate, then sort: iterating a bare set of strings gives an order
    # that can differ between interpreter runs, which would make the integer
    # id assigned to each super-category irreproducible.
    "super_category": datasets.ClassLabel(names=sorted(set(cocordiais.COCORDIAIS_SUPERLABELS))),
    "area": datasets.Value("int64"),
    "is_crowd": datasets.Value("bool")
  })
})

In [None]:
# Dataset card metadata, taken from the project's COCORDIAIS_DATASET_INFO structure.
_dataset_meta = cocordiais.COCORDIAIS_DATASET_INFO
_info_section = _dataset_meta["info"]

# NOTE(review): the license is looked up with key/index 1 — confirm this selects
# the same license that DATA_INFO refers to with license_id 1.
ds_info = datasets.DatasetInfo(
  description=_info_section["description"],
  homepage=_info_section["url"],
  version=_info_section["version"],
  license=_dataset_meta["licenses"][1]["name"],
  features=CORDIAIS_FEATURES
)

In [None]:
# Build one Dataset per split from the column dicts, all sharing the same
# feature schema and dataset info, then bundle them into a DatasetDict.
_split_datasets = {}

for split_name, split_columns in ds_dict.items():
  _split_datasets[split_name] = datasets.Dataset.from_dict(
    split_columns,
    features=CORDIAIS_FEATURES,
    info=ds_info,
    split=split_name
  )

hf_dataset = datasets.DatasetDict(_split_datasets)

In [None]:
# Upload every split to the Hub as a private dataset repository.
_hub_repo_id = "thiagohersan/cordiais-faces"
hf_dataset.push_to_hub(_hub_repo_id, private=True)

### Test dataset

In [None]:
import torchvision.transforms as T

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DetrImageProcessor

from Cocordiais import CocordiaisDataset
from PIL import Image, ImageDraw

In [None]:
# Hub identifiers for the dataset and for the pretrained DETR checkpoint.
HF_DATASET = "thiagohersan/cordiais-faces"
DETR_MODEL = "facebook/detr-resnet-50"

# Size setting handed to the DETR image processor.
detr_size = dict(shortest_edge=800, longest_edge=800)
detr_processor = DetrImageProcessor.from_pretrained(DETR_MODEL, size=detr_size)

# Reload from the Hub; this rebinds hf_dataset.
hf_dataset = load_dataset(HF_DATASET)

# Hold out 20% of the published "train" split for evaluation (fixed seed).
hf_dataset_train = hf_dataset["train"].train_test_split(
  test_size=0.2,
  shuffle=True,
  seed=101010
)

dataset_train = CocordiaisDataset(
  hf_dataset_train["train"],
  img_processor=detr_processor,
  train=True
)
dataset_eval = CocordiaisDataset(
  hf_dataset_train["test"],
  img_processor=detr_processor,
  train=False
)
dataset_test = CocordiaisDataset(
  hf_dataset["test"],
  img_processor=detr_processor,
  train=False
)

# Example counts in the order train, evaluation, test.
lens = tuple(len(subset) for subset in (dataset_train, dataset_eval, dataset_test))

print("Number of examples:\n  Train: %s\n  Evaluation: %s\n  Test: %s" % lens)

In [None]:
def _build_loader(dataset, batch_size, shuffle):
  """Wrap `dataset.data` in a DataLoader that collates with `dataset.collate_batch`."""
  return DataLoader(
    dataset.data,
    collate_fn=dataset.collate_batch,
    batch_size=batch_size,
    shuffle=shuffle
  )

# Training batches are larger and shuffled; evaluation and test keep dataset order.
dataloader_train = _build_loader(dataset_train, batch_size=12, shuffle=True)

dataloader_eval = _build_loader(dataset_eval, batch_size=4, shuffle=False)

dataloader_test = _build_loader(dataset_test, batch_size=4, shuffle=False)

In [None]:
# Sanity check: draw the annotations of one training example on its image.
# The same index is used for the processed example and for the source image.
SAMPLE_IDX = 200

d_train = dataset_train.data[SAMPLE_IDX]
# The image is taken from the underlying dataset; to inspect the processed
# input instead, convert d_train["pixel_values"] with T.ToPILImage().
# NOTE(review): this assumes dataset_train.data[i] and hf_dataset_train["train"][i]
# refer to the same example with no geometric change between them — confirm.
img_train = hf_dataset_train["train"][SAMPLE_IDX]["image"]
img_w, img_h = img_train.size

annotations = d_train["labels"]
draw = ImageDraw.Draw(img_train, "RGBA")

# Class names indexed by label id, read from the dataset's feature schema.
id2label = hf_dataset["train"].features["objects"].feature["category"].names

for i in range(len(annotations["boxes"])):
  label = id2label[annotations["class_labels"][i].item()]
  # Each box is read as (x_center, y_center, width, height) and scaled by the
  # image size below; plain floats are passed on to PIL.
  xc, yc, w, h = (float(v) for v in annotations["boxes"][i])
  left, top = (xc - w/2) * img_w, (yc - h/2) * img_h
  right, bottom = (xc + w/2) * img_w, (yc + h/2) * img_h
  draw.rectangle((left, top, right, bottom), outline="red", width=2)
  draw.text((left, top), label, fill="white")
img_train