In [None]:
import datasets
import datetime
import json
import os
import pandas as pd

from huggingface_hub import DatasetCard, DatasetCardData

In [None]:
# Root folder that holds the label map, the per-dataset annotation files and
# the source image folders referenced throughout the notebook.
DATA_PATH = "./data"

ID2LABELS_PATH = os.path.join(DATA_PATH, "id2label.json")

# JSON object keys are always strings, so they are cast back to integer ids here.
with open(ID2LABELS_PATH) as json_file_read:
    ID2LABEL = {int(k): v for k, v in json.load(json_file_read).items()}

# Reverse lookup: label name -> integer id.
LABEL2ID = {v: k for k, v in ID2LABEL.items()}

# Every label other than "N/A" is folded into the single "face" super label.
ID2SUPERLABEL = {k: v if v == "N/A" else "face" for k, v in ID2LABEL.items()}

# Enumerate the super labels in sorted order. Iterating a bare set of strings
# yields a different order from one interpreter run to the next (string hash
# randomization), which made the super ids irreproducible between runs.
SUPERLABEL2SUPERID = {sl: si for si, sl in enumerate(sorted(set(ID2SUPERLABEL.values())))}
ID2SUPERID = {k: SUPERLABEL2SUPERID[v] for k, v in ID2SUPERLABEL.items()}

# One entry per source dataset. For each dataset:
#   name         - short identifier, also the stem of its folder / JSON file names
#   license_id   - integer license reference stored on every image record
#   date         - timestamp string stored as each image's "date_captured"
#   include_pct  - fraction of the dataset's sorted image list to include
#   source_path  - folder containing the .jpg images
#   json_path    - annotation file for the dataset
DATA_INFO = [
  {
    "name": name,
    "license_id": 1,
    "date": date,
    "include_pct": include_pct,
    "source_path": os.path.join(DATA_PATH, "%s-source" % name),
    "json_path": os.path.join(DATA_PATH, "%s.json" % name)
  }
  for name, date, include_pct in (
    ("cordiais", "2023-06-10 00:00:00", 0.5),
    ("metfaces", "2020-06-10 00:00:00", 1.0),
  )
]

# Attach to every dataset the sorted list of image slugs (file names without
# the ".jpg" extension) to include, truncated to its "include_pct" fraction.
for ds in DATA_INFO:
  # Slice off only the trailing extension: str.replace would also delete a
  # ".jpg" that appears in the middle of a file name (e.g. "a.jpg.jpg"), and the
  # "<slug>.jpg" path rebuilt later would then point at a file that does not exist.
  file_list = sorted(f[:-len(".jpg")] for f in os.listdir(ds["source_path"]) if f.endswith(".jpg"))
  include_length = int(ds["include_pct"] * len(file_list))
  ds["source_list"] = file_list[:include_length]

# Flat list of every included slug across datasets; its position is the image id.
ALL_IMGS = [img for ds in DATA_INFO for img in ds["source_list"]]
IMG2ID = {img: img_id for img_id, img in enumerate(ALL_IMGS)}


### Create HF Dataset

In [None]:
# Group the per-face annotations of every dataset into one record per image.
# slug2obj maps an image slug to its record; object_count hands out a running
# bbox id that is unique across all datasets.
slug2obj = {}
object_count = 0

for ds in DATA_INFO:
  # Only the parsing needs the file handle; release it before processing.
  with open(ds["json_path"]) as json_file_read:
    data_json = json.load(json_file_read)

  # Set membership keeps the per-annotation check O(1) instead of scanning
  # the whole source list for every annotation.
  included_slugs = set(ds["source_list"])

  for object_info in data_json:
    object_slug = object_info["source_image"]

    # Skip annotations whose image was not selected for this dataset.
    if object_slug not in included_slugs:
      continue

    # First annotation seen for this image: create the image-level record.
    if object_slug not in slug2obj:
      slug2obj[object_slug] = {
        "image_id": IMG2ID[object_slug],
        "image": os.path.join(ds["source_path"], "%s.jpg" % object_slug),
        "image_filename": "%s.jpg" % object_slug,
        "width": object_info["source_image_w"],
        "height": object_info["source_image_h"],
        "license_id": ds["license_id"],
        "date_captured": ds["date"],
        "objects": []
      }

    # The key name suggests an [x, y, w, h] layout, so area is w * h.
    bbox = object_info["face_rect_xywh"]

    slug2obj[object_slug]["objects"].append({
      "bbox_id": object_count,
      "area": bbox[2] * bbox[3],
      "bbox": bbox,
      "category": ID2LABEL[object_info["gender"]],
      "super_category": ID2SUPERLABEL[object_info["gender"]],
      "is_crowd": False
    })
    object_count += 1

# The JSON round-trip yields an independent deep copy of the records.
records = json.loads(json.dumps(list(slug2obj.values())))


In [None]:
# Turn each image's list of per-object dicts into a dict of per-field lists
# (column orientation), updating the records in place.
for record in records:
  objects_table = pd.DataFrame(record["objects"])
  record["objects"] = objects_table.to_dict(orient="list")

# Pivot the records themselves the same way: one list per top-level field.
df_dict = pd.DataFrame(records).to_dict(orient="list")


In [None]:
# Schema of the Hugging Face dataset: image-level fields plus a sequence of
# per-object annotations.
CORDIAIS_FEATURES = datasets.Features({
  "image_id": datasets.Value("int64"),
  "image": datasets.Image(decode=True),
  "image_filename": datasets.Value("string"),
  "width": datasets.Value("int64"),
  "height": datasets.Value("int64"),
  "license_id": datasets.Value("int64"),
  "date_captured": datasets.Value("string"),
  "objects": datasets.Sequence(feature={
    "bbox_id": datasets.Value("int64"),
    # Order the class names by label id rather than by the key order of the
    # JSON file, so that the class index lines up with the label id
    # (assuming the ids are contiguous from 0 - TODO confirm against id2label.json).
    "category": datasets.ClassLabel(names=sorted(LABEL2ID, key=LABEL2ID.get)),
    "bbox": datasets.Sequence(feature=datasets.Value("int64"), length=4),
    # SUPERLABEL2SUPERID is built with enumerate, so its key order is its id order.
    "super_category": datasets.ClassLabel(names=list(SUPERLABEL2SUPERID.keys())),
    "area": datasets.Value("int64"),
    "is_crowd": datasets.Value("bool")
  })
})

# Dataset-level metadata: descriptive fields, the category list derived from
# the label maps, and the license / reference entries.
CORDIAIS_INFO = dict(
  version="1.0.0",
  description="Object Detection dataset to detect female-ish faces in paintings",
  year=2023,
  contributor="Thiago Hersan",
  url="https://huggingface.co/datasets/thiagohersan/cordiais-faces",
  # Timestamp of when this cell was executed, rendered as a string.
  date_created=str(datetime.datetime.now()),
  # One category entry per label id, tagged with its super label.
  categories=[
    dict(id=label_id, name=label_name, supercategory=ID2SUPERLABEL[label_id])
    for label_id, label_name in ID2LABEL.items()
  ],
  licenses=[
    dict(id=1, name="CC BY-NC 2.0", url="https://creativecommons.org/licenses/by-nc/2.0/")
  ],
  references=[
    dict(id=1, name="Training Generative Adversarial Networks with Limited Data", url="https://doi.org/10.48550/arXiv.2006.06676")
  ]
)

In [None]:
# Build the Hugging Face dataset from the column-oriented dict, using the
# declared feature schema.
hf_dataset = datasets.Dataset.from_dict(df_dict, features=CORDIAIS_FEATURES)

# Copy the headline metadata (description, version, first license name) onto
# the dataset's info object.
dataset_info = hf_dataset.info
dataset_info.description = CORDIAIS_INFO["description"]
dataset_info.version = CORDIAIS_INFO["version"]
dataset_info.license = CORDIAIS_INFO["licenses"][0]["name"]

In [None]:
# Upload the dataset to the Hugging Face Hub as a private repository.
# NOTE(review): presumably requires an authenticated Hub session - confirm before running.
hf_dataset.push_to_hub("thiagohersan/cordiais-faces", private=True)