### Create COCOrdiais dataset from data

In [5]:
import cocordiais_utils as cocordiais

import datetime
import json
import os
import random
import shutil
from PIL import Image as PImage

In [None]:
# Root of the raw inputs and root of the generated COCO-style dataset.
PATH_DATASET = "./data"
PATH_COCORDIAIS = "./cocordiais"

IMAGES_METFACES = os.path.join(PATH_DATASET, "metfaces-source")
IMAGES_CORDIAIS = os.path.join(PATH_DATASET, "cordiais-source")

# Image names without their ".jpg" extension, sorted so the seeded shuffle
# below is reproducible. Only the trailing extension is stripped: a plain
# str.replace(".jpg", "") would also remove ".jpg" from the middle of a name
# and produce a stem that no longer matches the file on disk.
FILES_METFACES = sorted([f[:-len(".jpg")] for f in os.listdir(IMAGES_METFACES) if f.endswith(".jpg")])
FILES_CORDIAIS = sorted([f[:-len(".jpg")] for f in os.listdir(IMAGES_CORDIAIS) if f.endswith(".jpg")])

# Fixed seed -> same shuffles (and therefore same splits) on every run.
# The order of the two sample() calls matters for reproducibility.
random.seed(101010)
shuffled_cordiais = random.sample(FILES_CORDIAIS, k=len(FILES_CORDIAIS))
shuffled_metfaces = random.sample(FILES_METFACES, k=len(FILES_METFACES))

# One entry per source dataset. "splits" maps a split name to the list of
# image stems that go into it.
DATA_INFO = [
  {
    # cordiais: first 10% -> validation, next 40% -> train, last 50% -> test.
    "name": "cordiais",
    "license_id": 1,
    "date": "2023-06-10 00:00:00",
    "splits": {
      "test": shuffled_cordiais[int(0.5 * len(FILES_CORDIAIS)):],
      "train": shuffled_cordiais[int(0.1 * len(FILES_CORDIAIS)) : int(0.5 * len(FILES_CORDIAIS))],
      "validation": shuffled_cordiais[:int(0.1 * len(FILES_CORDIAIS))]
    }
  },
  {
    # metfaces: first 20% -> validation, remaining 80% -> train, no test images.
    "name": "metfaces",
    "license_id": 1,
    "date": "2020-06-10 00:00:00",
    "splits": {
      "test": [],
      "train": shuffled_metfaces[int(0.2 * len(FILES_METFACES)):],
      "validation": shuffled_metfaces[:int(0.2 * len(FILES_METFACES))]
    }
  }
]


### Copy files to cocordiais directory

In [None]:
# Copy every image listed in DATA_INFO from its "<name>-source" directory
# into the directory of the split it was assigned to.
for dset in DATA_INFO:
  source_dir = os.path.join(PATH_DATASET, "%s-source" % dset["name"])
  for split, file_names in dset["splits"].items():
    target_dir = os.path.join(PATH_COCORDIAIS, split)
    os.makedirs(target_dir, exist_ok=True)
    for fn in file_names:
      shutil.copy2(os.path.join(source_dir, "%s.jpg" % fn), target_dir)

# The id -> label mapping lives next to the split directories.
shutil.copy2(os.path.join(PATH_DATASET, "id2label.json"), PATH_COCORDIAIS)


In [None]:
# TODO: this could be a class... maybe ?
# Per-split lookup tables, all keyed by split name:
#   files            -> set of image stems found in the split directory
#   img2id           -> image stem -> integer image id (index in sorted order)
#   id2img           -> image id -> COCO "images" entry (filled in later)
#   coco_annotations -> list of COCO "annotations" entries (filled in later)
files = {}
img2id = {}
id2img = {}
coco_annotations = {}

for s in ["test", "train", "validation"]:
  split_dir = os.path.join(PATH_COCORDIAIS, s)
  # Strip only the trailing ".jpg"; str.replace would also remove the
  # substring from the middle of a file name.
  split_file_list = sorted([f[:-len(".jpg")] for f in os.listdir(split_dir) if f.endswith(".jpg")])
  files[s] = set(split_file_list)
  img2id[s] = {f: i for i,f in enumerate(split_file_list)}
  id2img[s] = {}
  coco_annotations[s] = []


def get_split(fn):
  """Return the first split ("test", "train", "validation") whose file set
  contains fn, or None when no split contains it."""
  candidates = (name for name in ("test", "train", "validation") if fn in files[name])
  return next(candidates, None)


# Turn each record of "<dataset>.json" into a COCO image entry plus one COCO
# annotation, stored under the split that holds the record's image.
for dset in DATA_INFO:
  dset_name = dset["name"]
  dataset_path_in = os.path.join(PATH_DATASET, "%s.json" % dset_name)

  # Read everything up front so the file is closed before processing.
  with open(dataset_path_in) as json_file_in:
    data = json.load(json_file_in)

  for obj in data:
    my_split = get_split(obj["source_image"])
    if my_split is None:
      # Fail with a readable message instead of an opaque `KeyError: None`.
      raise KeyError(
        "image %r from %s is not present in any split directory"
        % (obj["source_image"], dataset_path_in)
      )
    my_img_id = img2id[my_split][obj["source_image"]]

    # Several records may point at the same image; the entry is simply
    # overwritten with identical image-level data.
    id2img[my_split][my_img_id] = {
      "id": my_img_id,
      "width": obj["source_image_w"],
      "height": obj["source_image_h"],
      "file_name": "%s.jpg" % obj["source_image"],
      "license": dset["license_id"],
      "date_captured": dset["date"],
      "coco_url": "",
      "flickr_url": ""
    }

    # COCO polygons are flat coordinate lists [x1, y1, x2, y2, ...], not
    # lists of [x, y] pairs. Corners of the face rectangle, in order:
    # (x0, y0), (x1, y0), (x1, y1), (x0, y1).
    x0, y0, x1, y1 = obj["face_rect_xyxy"][:4]
    poly = [x0, y0, x1, y0, x1, y1, x0, y1]

    ann_obj = {
      # Sequential id within the split.
      # NOTE(review): ids start at 0; pycocotools' COCOeval reportedly treats
      # a matched ground-truth id of 0 as "no match" - confirm before using
      # these files for evaluation.
      "id": len(coco_annotations[my_split]),
      "image_id": my_img_id,
      "category_id": cocordiais.LABEL2ID[obj["gender"]],
      "segmentation": [poly],
      "area": obj["face_rect_xywh"][2] * obj["face_rect_xywh"][3],
      "bbox": obj["face_rect_xywh"],
      "iscrowd": 0,
    }

    coco_annotations[my_split].append(ann_obj)

# Write one "cocordiais.json" per split, next to that split's images.
for s in ("test", "train", "validation"):
  # Start from the shared header returned by the helper module, then attach
  # this split's images and annotations.
  coco_split = cocordiais.get_cocordiais_info()
  coco_split["images"] = [*id2img[s].values()]
  coco_split["annotations"] = [*coco_annotations[s]]

  out_file = os.path.join(PATH_COCORDIAIS, s, "cocordiais.json")
  with open(out_file, "w") as json_file_out_write:
    json.dump(coco_split, json_file_out_write)


### Create COCOrdiais from hf dataset

In [None]:
import datasets
import datetime
import huggingface_hub
import json
import os
from PIL import Image as PImage

In [None]:
# Load the dataset from the Hugging Face hub, then carve a seeded 80/20
# train/test split out of its "train" split.
hf_dataset_ = datasets.load_dataset("thiagohersan/cordiais-faces")
hf_dataset = hf_dataset_["train"].train_test_split(
  test_size=0.2,
  shuffle=True,
  seed=1010,
)

In [None]:
COCORDIAIS_DATA_PATH = "./cocordiais-from-hf"

# For every split of the HF dataset: save the images to disk and write a
# COCO-style "cocordiais.json" describing them.
for split_name, objs in hf_dataset.items():
  split_data_path = os.path.join(COCORDIAIS_DATA_PATH, split_name)
  # exist_ok=True so the cell can be re-run; the isfile() check below
  # already skips images that were saved by a previous run.
  os.makedirs(split_data_path, exist_ok=True)

  json_path_out = os.path.join(split_data_path, "cocordiais.json")
  cocordiais_obj = cocordiais.get_cocordiais_info()

  for obj in objs:
    img_path = os.path.join(split_data_path, obj["image_filename"])

    # obj["image"] is presumably a PIL image (it exposes .save) - only write
    # it when the file is not on disk yet.
    if not os.path.isfile(img_path):
      obj["image"].save(img_path, "JPEG")

    cocordiais_obj["images"].append({
      "id": obj["image_id"],
      "width": obj["width"],
      "height": obj["height"],
      "file_name": obj["image_filename"],
      "license": obj["license_id"],
      "date_captured": obj["date_captured"],
      "coco_url": "",
      "flickr_url": ""
    })

    # obj["objects"] stores the annotations column-wise: parallel lists
    # indexed by annotation position.
    ann_objs = obj["objects"]
    for ann_idx, bbox_id in enumerate(ann_objs["bbox_id"]):
      cocordiais_obj["annotations"].append({
        "id": bbox_id,
        "image_id": obj["image_id"],
        "category_id": ann_objs["category"][ann_idx],
        "area": ann_objs["area"][ann_idx],
        "bbox": ann_objs["bbox"][ann_idx],
        "iscrowd": 1 if ann_objs["is_crowd"][ann_idx] else 0
      })

  with open(json_path_out, 'w') as json_file_out_write:
    json.dump(cocordiais_obj, json_file_out_write)
  

### Check jsons, images and annotations

In [None]:
import torchvision
import os
import random

from PIL import Image, ImageDraw
from transformers import DetrImageProcessor

In [None]:
class CocoDetection(torchvision.datasets.CocoDetection):
  """COCO detection dataset whose items are run through an image processor.

  The annotation file is expected at "<img_folder>/cocordiais.json".
  """

  def __init__(self, img_folder, processor, train=True):
    """Create the dataset.

    Args:
      img_folder: directory holding the images and "cocordiais.json".
      processor: callable invoked as
        processor(images=..., annotations=..., return_tensors="pt").
      train: accepted for interface compatibility; currently unused.
    """
    ann_file = os.path.join(img_folder, "cocordiais.json")
    super().__init__(img_folder, ann_file)
    self.processor = processor

  def __getitem__(self, idx):
    """Return (pixel_values, target) for the item at position idx."""
    # read in PIL image and target in COCO format
    # feel free to add data augmentation here before passing them to the next step
    img, target = super().__getitem__(idx)

    # preprocess image and target (converting target to DETR format, resizing + normalization of both image and target)
    image_id = self.ids[idx]
    target = {'image_id': image_id, 'annotations': target}
    encoding = self.processor(images=img, annotations=target, return_tensors="pt")
    # Remove only the leading batch dimension; an argument-less squeeze()
    # would also drop any other dimension that happens to have size 1.
    pixel_values = encoding["pixel_values"].squeeze(0)
    target = encoding["labels"][0] # remove batch dimension

    return pixel_values, target

In [None]:
# Pick which generated dataset to inspect: the one exported from the HF hub
# or the one built from the local source data.
# COCORDIAIS_DATA_PATH = "./cocordiais-from-hf"
COCORDIAIS_DATA_PATH = "./cocordiais"

processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

train_dataset = CocoDetection(img_folder=os.path.join(COCORDIAIS_DATA_PATH, 'train'), processor=processor)
test_dataset = CocoDetection(img_folder=os.path.join(COCORDIAIS_DATA_PATH, 'test'), processor=processor, train=False)

# Sanity check: both annotation files load and report their sizes.
print("Number of training examples:", len(train_dataset))
# This dataset is loaded from the 'test' directory, so label it as such.
print("Number of test examples:", len(test_dataset))

### Show images

In [None]:
# Pick one random training image and draw its annotated boxes and labels.
coco = train_dataset.coco

image_ids = coco.getImgIds()
image_id = random.sample(image_ids, k=1)[0]

image_info = coco.loadImgs(image_id)[0]
image_path = os.path.join(COCORDIAIS_DATA_PATH, 'train', image_info['file_name'])
image = Image.open(image_path)

annotations = coco.imgToAnns[image_id]
draw = ImageDraw.Draw(image, "RGBA")

# category id -> human-readable name, taken from the annotation file
cats = coco.cats
id2label = {}
for cat_id, cat in cats.items():
  id2label[cat_id] = cat['name']

for annotation in annotations:
  # bbox is stored as (x, y, width, height); rectangle() wants two corners
  x, y, w, h = annotation['bbox']
  top_left = (x, y)
  draw.rectangle((x, y, x + w, y + h), outline='red', width=2)
  draw.text(top_left, id2label[annotation['category_id']], fill='white')

# last expression of the cell: the notebook renders the image
image