### Fine-tune DETR to detect female-ish faces in paintings

In [None]:
! pip install --upgrade scipy transformers datasets huggingface_hub pytorch-lightning pycocotools

In [None]:
# Log in to the Hugging Face Hub from the shell; a later cell pushes the
# trained model and the image processor to a private Hub repository.
!huggingface-cli login

In [None]:
# Authenticate this notebook session with the Hugging Face Hub using the
# notebook login helper (needed for the push_to_hub calls later on).
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from datasets import load_dataset
from pytorch_lightning import LightningModule, Trainer
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torchvision.transforms import ToPILImage
from transformers import DetrForObjectDetection, DetrImageProcessor

from Cocordiais import CocordiaisDataset, CocordiaisUtils

### Load dataset from HF and turn to COCO format

In [None]:
# Training knobs. AUXILIARY_LOSS and CLASS_COST are passed to the DETR model
# when it is loaded, NUM_EPOCHS is passed to the Trainer, and all three are
# encoded in the published model name below.
NUM_EPOCHS = 100
CLASS_COST = 1
AUXILIARY_LOSS = False

# Base checkpoint and dataset on the Hugging Face Hub.
DETR_MODEL = "facebook/detr-resnet-50"
HF_DATASET = "thiagohersan/cordiais-faces"

# Name suffixes: only added when a setting differs from its default.
if AUXILIARY_LOSS:
  aux_string = "-aux"
else:
  aux_string = ""

if CLASS_COST > 1:
  cc_string = "-cc%s" % CLASS_COST
else:
  cc_string = ""

# Hub repository the fine-tuned model is pushed to.
HF_MODEL = f"thiagohersan/detr-cordiais-aug2-{NUM_EPOCHS}{aux_string}{cc_string}"

### Create DataLoaders

In [None]:
# Image processor loaded from the base checkpoint, with an 800px size setting
# for both the shortest and the longest edge.
detr_size = dict(shortest_edge=800, longest_edge=800)
detr_processor = DetrImageProcessor.from_pretrained(DETR_MODEL, size=detr_size)

# Take the Hub dataset's "train" split and divide it 80/20 into train/test.
# The fixed seed keeps the split identical between runs.
hf_dataset = load_dataset(HF_DATASET)["train"].train_test_split(
  test_size=0.2,
  shuffle=True,
  seed=101010,
)

# Wrap each split in the project dataset class; `train` tells it which mode
# to use (NOTE(review): exact effect of the flag lives in Cocordiais — confirm).
dataset_train = CocordiaisDataset(
  hf_dataset["train"],
  img_processor=detr_processor,
  train=True,
)
dataset_eval = CocordiaisDataset(
  hf_dataset["test"],
  img_processor=detr_processor,
  train=False,
)

example_counts = (len(dataset_train), len(dataset_eval))
print("Number of examples:\n  Train: %s\n  Evaluation: %s" % example_counts)

In [None]:
# Training loader: batches of 12, reshuffled on every pass over the data.
# Each dataset supplies its own collate function to assemble a batch.
dataloader_train = DataLoader(
  dataset_train.data,
  batch_size=12,
  shuffle=True,
  collate_fn=dataset_train.collate_batch,
)

# Evaluation loader: batches of 4 in a fixed order.
dataloader_eval = DataLoader(
  dataset_eval.data,
  batch_size=4,
  shuffle=False,
  collate_fn=dataset_eval.collate_batch,
)

In [None]:
# Sanity check: inspect one processed example and one collated batch.
first_example = dataset_train.data[0]
# NOTE(review): this unpacking relies on the example having exactly three
# values in a fixed order; presumably (pixel_values, pixel_mask, labels) —
# confirm against CocordiaisDataset.
pixel_values, _, target = first_example.values()
print(pixel_values.shape)
print(target)

# Pull a single batch from the training loader; `batch` is reused later for
# the model's output-shape check.
batch = next(iter(dataloader_train))
print(batch.keys())

# Turn the first image of the batch back into a PIL image to look at it.
to_pil = ToPILImage()
pimg = to_pil(batch["pixel_values"][0])
print(pimg.size)
pimg

### Train with PyTorch Lightning

In [None]:
class Detr(LightningModule):
  """Lightning wrapper that fine-tunes a DETR object detector.

  The underlying model is loaded from the module-level ``DETR_MODEL``
  checkpoint and configured with the module-level ``AUXILIARY_LOSS`` and
  ``CLASS_COST`` settings; the number of classes comes from
  ``CocordiaisUtils.ID2LABEL``.

  Args:
    dl_train: DataLoader returned by ``train_dataloader``.
    dl_eval: DataLoader returned by ``val_dataloader``.
    lr: Learning rate for every trainable parameter outside the backbone.
    lr_backbone: Learning rate for trainable backbone parameters.
    weight_decay: Weight decay handed to AdamW.
  """

  def __init__(self, dl_train, dl_eval, lr, lr_backbone, weight_decay):
    super().__init__()
    # Replace the COCO classification head with a custom head: the label
    # count and query count differ from the checkpoint, hence
    # ignore_mismatched_sizes=True.
    self.model = DetrForObjectDetection.from_pretrained(
      DETR_MODEL,
      revision="no_timm",
      num_labels=len(CocordiaisUtils.ID2LABEL),
      num_queries=16,
      ignore_mismatched_sizes=True,
      auxiliary_loss=AUXILIARY_LOSS,
      class_cost=CLASS_COST
    )

    self.lr = lr
    self.lr_backbone = lr_backbone
    self.weight_decay = weight_decay

    self.dataloader_train = dl_train
    self.dataloader_eval = dl_eval
    # Nominal batch sizes of the loaders; kept as attributes for callers.
    # Logging uses the actual size of each batch instead (see _log_losses).
    self.batch_size_train = dl_train.batch_size
    self.batch_size_eval = dl_eval.batch_size

  def forward(self, pixel_values, pixel_mask):
    """Run the detector without labels and return the raw model outputs."""
    return self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

  def common_step(self, batch, batch_idx):
    """Compute the loss for one batch.

    Args:
      batch: Mapping with "pixel_values", "pixel_mask" and "labels"; "labels"
        is a list of per-image dicts of tensors.
      batch_idx: Index of the batch (unused).

    Returns:
      Tuple ``(loss, loss_dict)`` taken from the model outputs.
    """
    # Move every label tensor to the module's device before the forward pass.
    labels = [
      {key: tensor.to(self.device) for key, tensor in target.items()}
      for target in batch["labels"]
    ]

    outputs = self.model(
      pixel_values=batch["pixel_values"],
      pixel_mask=batch["pixel_mask"],
      labels=labels,
    )

    return outputs.loss, outputs.loss_dict

  def _log_losses(self, prefix, batch, loss, loss_dict):
    """Log the total loss and each loss component under ``prefix``."""
    # Use the real number of examples in this batch: the last batch of an
    # epoch can be smaller than the loader's batch_size, and the logger uses
    # this value to weight epoch-level averages.
    batch_size = len(batch["labels"])
    self.log(prefix + "_loss", loss, batch_size=batch_size)
    for name, value in loss_dict.items():
      self.log(prefix + "_" + name, value.item(), batch_size=batch_size)

  def training_step(self, batch, batch_idx):
    """Compute, log and return the training loss for one batch."""
    loss, loss_dict = self.common_step(batch, batch_idx)
    self._log_losses("training", batch, loss, loss_dict)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute, log and return the validation loss for one batch."""
    loss, loss_dict = self.common_step(batch, batch_idx)
    self._log_losses("validation", batch, loss, loss_dict)
    return loss

  def configure_optimizers(self):
    """Build AdamW with a separate learning rate for the backbone.

    Trainable parameters whose name contains "backbone" use
    ``self.lr_backbone``; all other trainable parameters use ``self.lr``.
    """
    head_params = []
    backbone_params = []
    for name, param in self.named_parameters():
      if not param.requires_grad:
        continue
      if "backbone" in name:
        backbone_params.append(param)
      else:
        head_params.append(param)

    param_dicts = [
      {"params": head_params},
      {"params": backbone_params, "lr": self.lr_backbone},
    ]
    return AdamW(param_dicts, lr=self.lr, weight_decay=self.weight_decay)

  def train_dataloader(self):
    """Return the training DataLoader given at construction."""
    return self.dataloader_train

  def val_dataloader(self):
    """Return the evaluation DataLoader given at construction."""
    return self.dataloader_eval

In [None]:
import torch

model = Detr(dl_train=dataloader_train, dl_eval=dataloader_eval, lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4)

# check output shape [batch x queries x channels]
# This forward pass is only a shape check, so run it without gradient
# tracking to avoid holding an autograd graph for a whole batch in memory.
with torch.no_grad():
  outputs = model(pixel_values=batch['pixel_values'], pixel_mask=batch['pixel_mask'])
print(outputs.logits.shape)

In [None]:
# Remove logs left over from earlier runs so TensorBoard starts clean.
# NOTE(review): lightning_logs is presumably where the Trainer below writes
# its logs (Lightning's default directory) — confirm.
!rm -rf lightning_logs
# Load the TensorBoard notebook extension and open it on that log directory.
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

In [None]:
# Train for NUM_EPOCHS epochs with gradient clipping at 0.1 on whatever
# accelerator is available. No dataloaders are passed to fit(): the model
# provides them through train_dataloader() / val_dataloader().
trainer = Trainer(
  accelerator="auto",
  max_epochs=NUM_EPOCHS,
  gradient_clip_val=0.1,
)
trainer.fit(model)

In [None]:
# Publish the fine-tuned DETR weights and then the image processor to the
# same private Hub repository.
for artifact in (model.model, detr_processor):
  artifact.push_to_hub(HF_MODEL, private=True)