In [None]:
from Cocordiais import CocordiaisUtils as cocordiais

from random import sample
from shutil import copy2

import datasets
import json
import os
import pandas as pd

In [None]:
# Root directory that holds the image folders and the COCO annotation file.
DATA_PATH = "./data"

# Sub-directories of DATA_PATH that may contain an image file; they are
# probed in this order when an image is being located.
IMG_DIRS = ["baoat-source", "cordiais-source", "hermitage-source"]

# COCO-format JSON with the "images" and "annotations" lists read below.
COCO_JSON_PATH = os.path.join(DATA_PATH, "encord-coco.json")

### Parse images from the COCO JSON and tag each with its file name and source directory

In [None]:
# Load the COCO export. The image records are rewritten in place below, so
# reloading from disk keeps this cell safe to re-run.
with open(COCO_JSON_PATH) as json_file_read:
  coco_json = json.load(json_file_read)

# Every located image is mirrored into this single flat directory.
encord_dir = os.path.join(DATA_PATH, "encord-source")
# Without this, copy2() treats a missing "encord-source" as a *file* name and
# every image would overwrite that one file instead of landing in a folder.
os.makedirs(encord_dir, exist_ok=True)

# id2image: COCO image id -> image record (with "file_name" and "source" set).
# source2images: source directory name -> file names found in it.
id2image = {}
source2images = {src: [] for src in IMG_DIRS}

for img in coco_json["images"]:
  # The image title becomes the file name; "coco_url" is not used afterwards
  # and is dropped if present.
  img["file_name"] = img.pop("image_title")
  img.pop("coco_url", None)

  # The first directory (in IMG_DIRS order) containing the file wins.
  for ds in IMG_DIRS:
    src_path = os.path.join(DATA_PATH, ds, img["file_name"])
    if os.path.isfile(src_path):
      img["source"] = ds
      # Copy only once so re-runs do not rewrite files that are already there.
      if not os.path.isfile(os.path.join(encord_dir, img["file_name"])):
        copy2(src_path, encord_dir)
      break
  else:
    # Fail with the offending file name instead of a bare KeyError on "source".
    raise FileNotFoundError(
      f"{img['file_name']!r} was not found in any of {IMG_DIRS} under {DATA_PATH!r}"
    )

  source2images[img["source"]].append(img["file_name"])
  id2image[img["id"]] = img

In [None]:
# Hold out the first half of the "cordiais-source" files, in the order they
# were collected, as the test split (rounded down for odd counts).
cordiais_files = source2images["cordiais-source"]
test_size = len(cordiais_files) // 2
test_images = cordiais_files[:test_size]

In [None]:
# slug2obj[split][file_name] -> one record per image, holding the image
# metadata plus the list of annotated objects on it. Only images that have at
# least one annotation get a record, because the loop is driven by annotations.
slug2obj = {split: {} for split in ["train", "test"]}

# Set membership is O(1); checking against the test_images list would rescan
# it for every annotation.
test_image_set = set(test_images)

for object_info in coco_json["annotations"]:
  img_info = id2image[object_info["image_id"]]
  file_name = img_info["file_name"]
  object_split = "test" if file_name in test_image_set else "train"
  split_records = slug2obj[object_split]

  # First annotation seen for this image: create its record.
  if file_name not in split_records:
    split_records[file_name] = {
      "image_id": object_info["image_id"],
      "image": os.path.join(DATA_PATH, img_info["source"], file_name),
      "image_filename": file_name,
      "width": img_info["width"],
      "height": img_info["height"],
      "objects": []
    }

  # Category ids are translated to label / super-label via the project lookups.
  split_records[file_name]["objects"].append({
    "bbox_id": object_info["id"],
    "area": object_info["area"],
    "bbox": object_info["bbox"],
    "category": cocordiais.ID2LABEL[object_info["category_id"]],
    "super_category": cocordiais.ID2SUPERLABEL[object_info["category_id"]],
    "is_crowd": object_info["iscrowd"]
  })

In [None]:
# Spot-check one record; this raises KeyError if the file is not in the test split.
sample_test_file = "alberto-da-veiga-guignard_lea-e-maura.jpg"
slug2obj["test"][sample_test_file]

In [None]:
def _to_columns(rows):
  """Convert a list of row dicts into a dict of column lists (via pandas)."""
  return pd.DataFrame(rows).to_dict("list")


# ds_dict[split] -> column-oriented data: one list per field, and inside
# "objects" one list per object attribute for each image.
ds_dict = {}

for split in ["train", "test"]:
  # The JSON round-trip makes a deep copy, so slug2obj itself is left untouched.
  rows = json.loads(json.dumps(list(slug2obj[split].values())))
  rows = [{**row, "objects": _to_columns(row["objects"])} for row in rows]
  ds_dict[split] = _to_columns(rows)

In [None]:
# Build one datasets.Dataset per split from the column-oriented dicts, using
# the project's feature schema and dataset info, then bundle them together.
split_datasets = {}
for split_name, columns in ds_dict.items():
  split_datasets[split_name] = datasets.Dataset.from_dict(
    columns,
    features=cocordiais.COCORDIAIS_FEATURES,
    info=cocordiais.get_dataset_info(),
    split=split_name,
  )

hf_dataset = datasets.DatasetDict(split_datasets)

In [None]:
# Hugging Face Hub repository id that the dataset is pushed to and later loaded from.
HF_DATASET = "thiagohersan/cordiais-encord-faces"

In [None]:
# Upload every split to the Hub as a private repository.
hf_dataset.push_to_hub(repo_id=HF_DATASET, private=True)

### Test: reload the dataset from the Hub and inspect a training sample

In [None]:
import torchvision.transforms as T

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DetrImageProcessor

from Cocordiais import CocordiaisDataset
from PIL import Image, ImageDraw

In [None]:
DETR_MODEL = "facebook/detr-resnet-50"

# Size settings handed to the DETR image processor.
detr_size = {"shortest_edge": 800, "longest_edge": 800}
detr_processor = DetrImageProcessor.from_pretrained(DETR_MODEL, size=detr_size)

# Reload from the Hub, then carve an evaluation subset (20%, fixed seed) out of
# the training split; the published "test" split is used as-is.
hf_dataset = load_dataset(HF_DATASET)
hf_dataset_train = hf_dataset["train"].train_test_split(test_size=0.2, shuffle=True, seed=101010)


def _wrap_split(hf_split, train):
  """Wrap a Hugging Face split in CocordiaisDataset with the shared processor."""
  return CocordiaisDataset(hf_split, img_processor=detr_processor, train=train)


dataset_train = _wrap_split(hf_dataset_train["train"], train=True)
dataset_eval = _wrap_split(hf_dataset_train["test"], train=False)
dataset_test = _wrap_split(hf_dataset["test"], train=False)

lens = tuple(len(ds) for ds in (dataset_train, dataset_eval, dataset_test))

print("Number of examples:\n  Train: %s\n  Evaluation: %s\n  Test: %s" % lens)

In [None]:
def _make_loader(dataset, batch_size, shuffle):
  """Create a DataLoader over dataset.data, batching with the dataset's own collate function."""
  return DataLoader(
    dataset.data,
    collate_fn=dataset.collate_batch,
    batch_size=batch_size,
    shuffle=shuffle
  )


# Only the training loader shuffles; evaluation and test keep a fixed order.
dataloader_train = _make_loader(dataset_train, batch_size=12, shuffle=True)
dataloader_eval = _make_loader(dataset_eval, batch_size=4, shuffle=False)
dataloader_test = _make_loader(dataset_test, batch_size=4, shuffle=False)

In [None]:
# Sanity check: draw the processed annotations of one training example on top
# of the corresponding image taken from the Hugging Face split.
idx = 0
d_train = dataset_train.data[idx]
img_train = hf_dataset_train["train"][idx]["image"]
img_w, img_h = img_train.size

annotations = d_train["labels"]
draw = ImageDraw.Draw(img_train, "RGBA")

# Class names are read from the dataset's feature schema, so they stay in sync
# with the published label set.
id2label = hf_dataset["train"].features["objects"].feature["category"].names

# NOTE(review): the boxes come from the processed training dataset while the
# image is the raw one; if CocordiaisDataset augments when train=True, the
# boxes may not line up — confirm.
for box, class_label in zip(annotations["boxes"], annotations["class_labels"]):
  label = id2label[class_label.item()]
  # Boxes are used as (center-x, center-y, width, height) fractions of the
  # image size; convert to plain floats and scale to pixel corner coordinates.
  xc, yc, w, h = (float(v) for v in box)
  x0, y0 = (xc - w / 2) * img_w, (yc - h / 2) * img_h
  x1, y1 = (xc + w / 2) * img_w, (yc + h / 2) * img_h
  draw.rectangle((x0, y0, x1, y1), outline="red", width=2)
  draw.text((x0, y0), label, fill="white")
img_train