In [None]:
from Cocordiais import CocordiaisUtils as cocordiais

from random import sample
from shutil import copy2

import datasets
import json
import os
import pandas as pd

In [None]:
# Root folder that holds the annotation JSON and every image directory.
DATA_PATH = "./data"

# Image directories under DATA_PATH; listed in the order they are probed for a file.
IMG_DIRS = ["baoat-source", "cordiais-source", "hermitage-source"]

# Annotation file; read below for its "images" and "annotations" lists.
COCO_JSON_PATH = os.path.join(DATA_PATH, "encord.json")

### Parse images from the JSON and record each image's file name and source directory

In [None]:
# Load the annotation file; its "images" entries are rewritten in place below.
with open(COCO_JSON_PATH) as json_file_read:
  coco_json = json.load(json_file_read)

# Every located image is mirrored into this single folder.
encord_dir = os.path.join(DATA_PATH, "encord-source")
# copy2 treats a non-existent destination as a *file name*, so the folder must exist first.
os.makedirs(encord_dir, exist_ok=True)

# image id -> image entry, and source directory -> file names found in it.
id2image = {}
source2images = {src: [] for src in IMG_DIRS}

for img in coco_json["images"]:
  # The title becomes the file name; the two original fields are dropped.
  img["file_name"] = img["image_title"]
  del img["coco_url"]
  del img["image_title"]

  # The first directory (in IMG_DIRS order) that contains the file is its source.
  for ds in IMG_DIRS:
    source_file = os.path.join(DATA_PATH, ds, img["file_name"])
    if os.path.isfile(source_file):
      img["source"] = ds
      # Copy only when the mirror folder does not already have the file.
      if not os.path.isfile(os.path.join(encord_dir, img["file_name"])):
        copy2(source_file, encord_dir)
      break
  else:
    # No directory holds this file: fail with a clear message rather than an
    # opaque KeyError on img["source"] a few lines further down.
    raise FileNotFoundError(
      "%s not found in any of: %s" % (img["file_name"], ", ".join(IMG_DIRS))
    )

  source2images[img["source"]].append(img["file_name"])
  id2image[img["id"]] = img

# The first half of the cordiais images (in JSON order) is held out as the test split.
cordiais_files = source2images["cordiais-source"]
test_size = int(0.5 * len(cordiais_files))
test_images = cordiais_files[:test_size]

### Alternative to the previous cell: use this if the images are already in encord-source/ (nothing is copied)

In [None]:
# Read the annotation file; its "images" entries are rewritten in place below.
with open(COCO_JSON_PATH) as json_file_read:
  coco_json = json.load(json_file_read)

cordiais_dir = os.path.join(DATA_PATH, "cordiais-source")
id2image = {}

for img in coco_json["images"]:
  # The title becomes the file name; the two original fields are dropped.
  img["file_name"] = img["image_title"]
  for stale_key in ("coco_url", "image_title"):
    del img[stale_key]
  id2image[img["id"]] = img

# File names that exist in cordiais-source/, kept in JSON order.
cordiais_files = [
  img["file_name"]
  for img in coco_json["images"]
  if os.path.isfile(os.path.join(cordiais_dir, img["file_name"]))
]

# The first half of those files is held out as the test split.
test_size = int(0.5 * len(cordiais_files))
test_images = cordiais_files[:test_size]

In [None]:
# split name -> {image file name -> image record with its list of objects}
slug2obj = {}

for split in ["train", "test"]:
  slug2obj[split] = {}

# A set makes the per-annotation split lookup O(1) instead of a list scan.
test_image_names = set(test_images)

for object_info in coco_json["annotations"]:
  img_info = id2image[object_info["image_id"]]
  file_name = img_info["file_name"]
  object_split = "test" if file_name in test_image_names else "train"
  split_records = slug2obj[object_split]

  # The first annotation seen for an image creates that image's record.
  if file_name not in split_records:
    split_records[file_name] = {
      "image_id": object_info["image_id"],
      "image": os.path.join(DATA_PATH, "encord-source", file_name),
      "image_filename": file_name,
      "width": img_info["width"],
      "height": img_info["height"],
      "objects": []
    }

  category_id = object_info["category_id"]
  split_records[file_name]["objects"].append({
    "bbox_id": object_info["id"],
    "area": object_info["area"],
    "bbox": object_info["bbox"],
    "category": cocordiais.ID2LABEL[category_id],
    "super_category": cocordiais.ID2SUPERLABEL[category_id],
    "is_crowd": object_info["iscrowd"]
  })

In [None]:
# Spot-check: display the record built for one image that is expected to be in the test split.
_spot_check_file = "alberto-da-veiga-guignard_lea-e-maura.jpg"
slug2obj["test"][_spot_check_file]

In [None]:
# Sizes of: train records, test records, known images, COCO annotations.
tuple(map(len, (slug2obj["train"], slug2obj["test"], id2image, coco_json["annotations"])))

### Add the MetFaces images and face annotations

In [None]:
DATA_PATH = "./data"

# One entry per extra dataset: where its images and annotations live and
# what fraction of its (sorted) files goes to the train split.
DATA_INFO = [
  {
    "name": "metfaces",
    "license_id": 1,
    "date": "2020-06-10 00:00:00",
    "train_pct": 1.0,
    "source_path": os.path.join(DATA_PATH, "metfaces-source"),
    "json_path": os.path.join(DATA_PATH, "metfaces.json")
  },
]

# Extension-less names of every image across all entries of DATA_INFO.
ALL_IMGS = []

for ds in DATA_INFO:
  # Slice off only the trailing ".jpg"; str.replace would also strip a ".jpg"
  # that happens to occur in the middle of a file name.
  file_list = sorted(f[:-len(".jpg")] for f in os.listdir(ds["source_path"]) if f.endswith(".jpg"))
  train_length = int(ds["train_pct"] * len(file_list))
  ds["source_list"] = {
    "train": file_list[:train_length],
    "test": file_list[train_length:]
  }
  ALL_IMGS += file_list

# New image ids continue after the images already collected in id2image.
# NOTE(review): this assumes the existing ids are all below len(id2image);
# otherwise the new ids could collide with them - confirm against the JSON.
img2id_met = {img: (idx + len(id2image)) for idx, img in enumerate(ALL_IMGS)}

In [None]:
# New bbox ids continue after the annotations already loaded from the COCO file.
# NOTE(review): assumes the existing annotation ids are all below this count - confirm.
object_count = len(coco_json["annotations"])

for ds in DATA_INFO:
  # Parse the annotation file and release the handle before processing.
  with open(ds["json_path"]) as json_file_read:
    data_json = json.load(json_file_read)

  # A set makes the per-object split lookup O(1) instead of a list scan.
  train_slugs = set(ds["source_list"]["train"])

  for object_info in data_json:
    object_slug = object_info["source_image"]
    object_split = "train" if object_slug in train_slugs else "test"
    split_records = slug2obj[object_split]

    # The first object seen for an image creates that image's record.
    if object_slug not in split_records:
      split_records[object_slug] = {
        "image_id": img2id_met[object_slug],
        "image": os.path.join(ds["source_path"], "%s.jpg" % object_slug),
        "image_filename": "%s.jpg" % object_slug,
        "width": object_info["source_image_w"],
        "height": object_info["source_image_h"],
        "objects": []
      }

    face_rect = object_info["face_rect_xywh"]
    split_records[object_slug]["objects"].append({
      "bbox_id": object_count,
      # Area is the product of the rectangle's third and fourth entries.
      "area": face_rect[2] * face_rect[3],
      "bbox": face_rect,
      "category": object_info["gender"],
      "super_category": cocordiais.LABEL2SUPERLABEL[object_info["gender"]],
      "is_crowd": False
    })
    object_count += 1

In [None]:
# Record counts per split after the extra dataset has been merged in.
tuple(len(slug2obj[name]) for name in ("train", "test"))

In [None]:
def _columnar(rows):
  """Turn a list of row dicts into a dict of column lists via pandas."""
  return pd.DataFrame(rows).to_dict("list")


# split name -> column-oriented dict of that split's image records.
ds_dict = {}

for split in ("train", "test"):
  # The JSON round-trip produces an independent copy, so slug2obj is left untouched.
  split_rows = json.loads(json.dumps(list(slug2obj[split].values())))
  for row in split_rows:
    # Each image's object list becomes a dict of per-field lists.
    row["objects"] = _columnar(row["objects"])
  ds_dict[split] = _columnar(split_rows)

In [None]:
# Build one Dataset per split from its column dict, then bundle them.
split_datasets = {}

for split_name, split_columns in ds_dict.items():
  split_datasets[split_name] = datasets.Dataset.from_dict(
    split_columns,
    features=cocordiais.COCORDIAIS_FEATURES,
    info=cocordiais.get_dataset_info(),
    split=split_name
  )

hf_dataset = datasets.DatasetDict(split_datasets)

In [None]:
# Hub repository that receives the dataset; it is uploaded as private.
HF_DATASET = "thiagohersan/cordiais-encord-faces-3"
hf_dataset.push_to_hub(repo_id=HF_DATASET, private=True)

### Test: load the dataset from the Hub and plot one example

In [None]:
import torchvision.transforms as T

from datasets import load_dataset
from torch import ones_like
from transformers import DetrImageProcessor

from Cocordiais import CocordiaisDataset, CocordiaisUtils

# Keep HF_DATASET if an earlier cell defined it; otherwise use the default repository.
if "HF_DATASET" not in globals():
  HF_DATASET = "thiagohersan/cordiais-encord-faces"

In [None]:
DETR_MODEL = "facebook/detr-resnet-50"

# Image processor for the DETR checkpoint, with both edge limits set to 800.
detr_size = dict(shortest_edge=800, longest_edge=800)
detr_processor = DetrImageProcessor.from_pretrained(DETR_MODEL, size=detr_size)

# Split 20% of the published train split off for evaluation (fixed seed).
hf_dataset = load_dataset(HF_DATASET)
hf_dataset_train = hf_dataset["train"].train_test_split(test_size=0.2, shuffle=True, seed=101010)

hf_data = dict(
  train=hf_dataset_train["train"],
  eval=hf_dataset_train["test"],
  test=hf_dataset["test"]
)

# Only the "train" wrapper is created with train=True.
dataset = {
  name: CocordiaisDataset(rows, img_processor=detr_processor, train=(name == "train"))
  for name, rows in hf_data.items()
}

lens = tuple(len(dataset[name]) for name in ("train", "eval", "test"))

print("Number of examples:\n  Train: %s\n  Evaluation: %s\n  Test: %s" % lens)

In [None]:
# Which example to inspect.
split = "eval"
idx = 0

# The raw image from the Hub dataset and the processed tensor turned back into a PIL image.
orig_image = hf_data[split][idx]["image"]
to_pil = T.ToPILImage()
detr_image = to_pil(dataset[split].data[idx]["pixel_values"])
labels = dataset[split].data[idx]["labels"]

class_labels = labels["class_labels"]

# Every box gets a score of one (same shape as the class labels).
# NOTE(review): presumably so the annotated boxes can reuse the prediction plotting helper - confirm.
boxes_info = dict(
  scores=ones_like(class_labels),
  labels=class_labels,
  boxes=CocordiaisUtils.bboxes_xcycwh_to_xyxy(labels)
)

CocordiaisUtils.plot_boxes(orig_image, boxes_info)
CocordiaisUtils.plot_boxes(detr_image, boxes_info)