RetinaFace with ResNet18 Backbone – WIDER FACE Dataset (v1)

This notebook demonstrates downloading and preparing the WIDER FACE dataset,
building a lightweight RetinaFace architecture using a ResNet18 backbone,
and training the model for facial detection tasks.

References:
- WIDER FACE Dataset: http://shuoyang1213.me/WIDERFACE/
- Dataset scripts: https://huggingface.co/datasets/CUHK-CSE/wider_face
- Reference PyTorch face-detector implementation (FaceBoxes.PyTorch): https://github.com/zisianw/FaceBoxes.PyTorch


```
# =========================================================
# 1. Import libraries
# =========================================================
```

In [None]:
import os
import requests
import zipfile
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import random

```
# =========================================================
# 2. Download and extract WIDER FACE dataset
# =========================================================
```

In [None]:
# Download URLs for the WIDER FACE archives, keyed by a short name.
# "annot" is the archive named wider_face_split.zip; the dataset class below
# reads its annotation lists from a "wider_face_split" directory.
# NOTE(review): these URLs point at the `wider_face` dataset repo while the
# references at the top list `CUHK-CSE/wider_face` — confirm they still resolve.
_URLS = {
   "train": "https://huggingface.co/datasets/wider_face/resolve/main/data/WIDER_train.zip",
   "validation": "https://huggingface.co/datasets/wider_face/resolve/main/data/WIDER_val.zip",
   "test": "https://huggingface.co/datasets/wider_face/resolve/main/data/WIDER_test.zip",
   "annot": "https://huggingface.co/datasets/wider_face/resolve/main/data/wider_face_split.zip",
}

# Directory (relative to the notebook's working directory) that receives the
# downloaded archives and their extracted contents.
DATA_DIR = "wider_face_data"

def download_and_extract(url, dest_folder, timeout=60):
    """Download a zip archive into ``dest_folder``, extract it there, then delete it.

    Args:
        url: Address of the zip archive; the local file name is the last
            path component of the URL.
        dest_folder: Directory that receives both the temporary zip file and
            the extracted contents. Created if it does not exist.
        timeout: Seconds to wait for the server to connect / send data before
            giving up. This bounds each socket wait, not the total download.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    os.makedirs(dest_folder, exist_ok=True)

    # Get the filename from the URL
    filename = os.path.join(dest_folder, url.split("/")[-1])

    # Stream the download to disk. The context manager releases the pooled
    # connection even if writing fails part-way through.
    print(f"Downloading {filename}...")
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # Check for errors
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    file.write(chunk)
    print(f"Downloaded {filename}.")

    # Extract the file
    print(f"Extracting {filename}...")
    with zipfile.ZipFile(filename, "r") as zip_ref:
        zip_ref.extractall(dest_folder)
    print(f"Extracted {filename}.")

    # The archive is no longer needed once its contents are on disk.
    os.remove(filename)

# Create directory if it doesn't exist
os.makedirs(DATA_DIR, exist_ok=True)

# Download and extract each file.
# NOTE: every archive is fetched again each time this cell runs; nothing here
# checks whether the data is already present. The dict key (`name`) is not
# used by the loop body.
for name, url in _URLS.items():
   download_and_extract(url, DATA_DIR)

print("All files downloaded and extracted.")


Downloading wider_face_data/WIDER_train.zip...
Downloaded wider_face_data/WIDER_train.zip.
Extracting wider_face_data/WIDER_train.zip...
Extracted wider_face_data/WIDER_train.zip.
Downloading wider_face_data/WIDER_val.zip...
Downloaded wider_face_data/WIDER_val.zip.
Extracting wider_face_data/WIDER_val.zip...
Extracted wider_face_data/WIDER_val.zip.
Downloading wider_face_data/WIDER_test.zip...
Downloaded wider_face_data/WIDER_test.zip.
Extracting wider_face_data/WIDER_test.zip...
Extracted wider_face_data/WIDER_test.zip.
Downloading wider_face_data/wider_face_split.zip...
Downloaded wider_face_data/wider_face_split.zip.
Extracting wider_face_data/wider_face_split.zip...
Extracted wider_face_data/wider_face_split.zip.
All files downloaded and extracted.


In [None]:
def compute_classification_loss(predictions, targets):
    """Average a binary cross-entropy loss over all prediction scales.

    Args:
        predictions: Non-empty sequence of per-scale classification logits.
            The model's class head emits one tensor of shape
            ``[B, anchor_num, H, W]`` per pyramid level.
        targets: Per-image ground truth.
            NOTE(review): this argument is currently NOT read. Every anchor is
            trained against a target of 0, so minimising this loss teaches the
            network to predict "no face" everywhere. Positive-anchor assignment
            from ``targets`` still has to be implemented.

    Returns:
        Scalar tensor: mean over scales of the per-scale mean BCE-with-logits.
    """
    total_loss = 0
    num_scales = len(predictions)
    batch_size = predictions[0].size(0)

    for scale_preds in predictions:
        # Flatten to [B, anchor_num*H*W]. reshape (unlike view) also accepts
        # non-contiguous inputs such as channels_last conv outputs.
        scale_preds = scale_preds.reshape(batch_size, -1)

        # Placeholder target: all anchors are treated as background.
        # TODO: set entries that correspond to faces to 1 using `targets`.
        scale_targets = torch.zeros_like(scale_preds)

        # Compute binary cross-entropy loss
        loss = F.binary_cross_entropy_with_logits(scale_preds, scale_targets)

        total_loss += loss

    return total_loss / num_scales

def compute_bbox_loss(predictions, targets):
    """Average a smooth-L1 box-regression loss over all prediction scales.

    Args:
        predictions: Non-empty sequence of 4-D per-scale box regressions. The
            model's bbox head emits ``[B, anchor_num*4, H, W]`` per level.
        targets: Per-image ground truth.
            NOTE(review): this argument is currently NOT read. Every box is
            regressed towards zeros, for every anchor (not only positives).
            Target encoding and positive-anchor masking still have to be
            implemented.

    Returns:
        Scalar tensor: mean over scales of the per-scale mean smooth-L1 loss.
    """
    total_loss = 0
    num_scales = len(predictions)
    batch_size = predictions[0].size(0)

    for scale_preds in predictions:
        # [B, A*4, H, W] -> [B, H, W, A*4] -> [B, H*W*A, 4].
        # Moving channels last first makes each row hold 4 consecutive
        # channels of ONE spatial location; a plain view on the NCHW tensor
        # would instead group 4 neighbouring positions along W. reshape also
        # tolerates non-contiguous inputs, which view rejects.
        scale_preds = scale_preds.permute(0, 2, 3, 1).reshape(batch_size, -1, 4)

        # Placeholder target of the same shape.
        # TODO: fill rows belonging to matched anchors from `targets`.
        scale_targets = torch.zeros_like(scale_preds)

        # Compute smooth L1 loss
        loss = F.smooth_l1_loss(scale_preds, scale_targets)

        total_loss += loss

    return total_loss / num_scales

def compute_landmark_loss(predictions, targets):
    """Average a smooth-L1 landmark-regression loss over all prediction scales.

    Args:
        predictions: Non-empty sequence of 4-D per-scale landmark regressions.
            The model's landmark head emits ``[B, anchor_num*10, H, W]`` per
            level.
        targets: Per-image ground truth.
            NOTE(review): this argument is currently NOT read, and the dataset
            in this notebook only yields boxes (no landmark coordinates), so
            every landmark output is simply regressed towards zeros.

    Returns:
        Scalar tensor: mean over scales of the per-scale mean smooth-L1 loss.
    """
    total_loss = 0
    num_scales = len(predictions)
    batch_size = predictions[0].size(0)

    for scale_preds in predictions:
        # [B, A*10, H, W] -> [B, H, W, A*10] -> [B, H*W*A, 10].
        # Channels go last first so that each row holds 10 consecutive
        # channels of one spatial location rather than a slice along W;
        # reshape also tolerates non-contiguous inputs, which view rejects.
        scale_preds = scale_preds.permute(0, 2, 3, 1).reshape(batch_size, -1, 10)

        # Placeholder target of the same shape.
        # TODO: fill rows belonging to matched anchors once landmark
        # annotations are available.
        scale_targets = torch.zeros_like(scale_preds)

        # Compute smooth L1 loss
        loss = F.smooth_l1_loss(scale_preds, scale_targets)

        total_loss += loss

    return total_loss / num_scales

def compute_total_loss(classifications, bbox_regressions, ldm_regressions, targets,
                       lambda1=0.25, lambda2=0.1):
    """Combine the classification, box and landmark losses into one scalar.

    Args:
        classifications: Per-scale classification logits.
        bbox_regressions: Per-scale box regression outputs.
        ldm_regressions: Per-scale landmark regression outputs.
        targets: Ground truth, forwarded unchanged to the three component
            losses.
        lambda1: Weight applied to the box-regression loss.
        lambda2: Weight applied to the landmark-regression loss.

    Returns:
        ``cls_loss + lambda1 * bbox_loss + lambda2 * ldm_loss`` as a scalar
        tensor.
    """
    cls_loss = compute_classification_loss(classifications, targets)
    bbox_loss = compute_bbox_loss(bbox_regressions, targets)
    ldm_loss = compute_landmark_loss(ldm_regressions, targets)

    return cls_loss + lambda1 * bbox_loss + lambda2 * ldm_loss


```
# =========================================================
# 3. Define custom dataset class
# =========================================================
```

In [None]:
class WiderFaceDataset(Dataset):
    """WIDER FACE images with boxes rescaled to a fixed square frame.

    Each item is a dict with key ``"image"`` and, for annotated splits, key
    ``"boxes"``: a float32 tensor of shape ``[num_faces, 4]`` holding
    ``[xmin, ymin, width, height]`` rescaled from the original image size to a
    ``target_size`` x ``target_size`` frame. The test split has no
    annotations and yields only ``"image"``.

    Args:
        data_dir: Root folder containing ``WIDER_<split>/images`` and
            ``wider_face_split``.
        split: Split name used to build those paths (``"test"`` selects the
            unannotated file list).
        transform: Optional callable applied to the RGB PIL image. Boxes are
            rescaled independently of it, so it is expected to resize the
            image to ``target_size`` x ``target_size``.
        subset_fraction: If below 1.0, keep only this fraction of the images,
            chosen at random.
        target_size: Side length (pixels) of the frame boxes are rescaled to.
        seed: Optional seed for the subset sampling. ``None`` draws from the
            global ``random`` state.
    """

    def __init__(self, data_dir, split, transform=None, subset_fraction=1.0,
                 target_size=640, seed=None):
        self.data_dir = data_dir
        self.split = split
        self.transform = transform
        self.target_size = target_size

        self.image_dir = os.path.join(data_dir, f"WIDER_{split}/images")
        if split != "test":
            annot_file = os.path.join(data_dir, "wider_face_split", f"wider_face_{split}_bbx_gt.txt")
            self.image_paths, self.annotations = self._load_annotations(annot_file)
        else:
            annot_file = os.path.join(data_dir, "wider_face_split", "wider_face_test_filelist.txt")
            self.image_paths = self._load_test_images(annot_file)
            self.annotations = None

        # Sample a subset of data
        if subset_fraction < 1.0:
            subset_size = int(len(self.image_paths) * subset_fraction)
            # A dedicated generator makes the subset reproducible when a seed
            # is given, without touching the global random state.
            rng = random if seed is None else random.Random(seed)
            sampled_indices = rng.sample(range(len(self.image_paths)), subset_size)
            self.image_paths = [self.image_paths[i] for i in sampled_indices]
            if self.annotations is not None:
                self.annotations = [self.annotations[i] for i in sampled_indices]

    def _load_annotations(self, annot_file):
        """Parse a ``*_bbx_gt.txt`` file into image paths and box lists.

        The file is read as repeated records: a line containing ``.jpg``, a
        line with the box count, then that many box lines whose first four
        integers are ``xmin ymin width height``. Any other line (blank lines,
        or the filler line that follows a count of 0) is skipped.

        Raises:
            ValueError: If the file ends in the middle of a record or a box
                line has fewer than four fields.
        """
        image_paths = []
        annotations = []

        with open(annot_file, "r") as f:
            lines = iter(f)
            for raw_line in lines:
                line = raw_line.strip()
                if ".jpg" not in line:
                    continue

                count_line = next(lines, None)
                if count_line is None:
                    raise ValueError(f"Missing box count after '{line}' in {annot_file}")
                num_boxes = int(count_line.strip())

                boxes = []
                for _ in range(num_boxes):
                    box_line = next(lines, None)
                    fields = box_line.split() if box_line is not None else []
                    if len(fields) < 4:
                        raise ValueError(f"Malformed box entry for '{line}' in {annot_file}")
                    boxes.append([int(value) for value in fields[:4]])  # xmin, ymin, width, height

                image_paths.append(os.path.join(self.image_dir, line))
                annotations.append(boxes)

        return image_paths, annotations

    def _load_test_images(self, annot_file):
        """Load test image paths."""
        with open(annot_file, "r") as f:
            return [os.path.join(self.image_dir, line.strip()) for line in f if ".jpg" in line]

    def _scale_boxes(self, boxes, orig_w, orig_h):
        """Rescale pixel boxes from the original image to the target frame."""
        size = self.target_size
        scaled = [
            [x / orig_w * size, y / orig_h * size, w / orig_w * size, h / orig_h * size]
            for x, y, w, h in boxes
        ]
        # reshape keeps the [N, 4] layout even when the image has no boxes.
        return torch.tensor(scaled, dtype=torch.float32).reshape(-1, 4)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load image; the context manager closes the underlying file as soon
        # as the RGB copy has been made.
        img_path = self.image_paths[idx]
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")

        # Get original image size (needed to rescale the boxes)
        orig_w, orig_h = image.size

        # Apply transformations
        if self.transform:
            image = self.transform(image)

        if self.annotations is not None:
            boxes = self._scale_boxes(self.annotations[idx], orig_w, orig_h)
            return {"image": image, "boxes": boxes}

        return {"image": image}

```
# =========================================================
# 4. Data transformation and loading
# =========================================================
```

In [None]:
# Preprocessing applied to every image: resize to a fixed 640x640 input,
# convert to a float tensor, then normalise each channel.
# NOTE: WiderFaceDataset.__getitem__ rescales boxes to a 640-pixel frame on its
# own, so the Resize target here must stay 640x640 for boxes and pixels to agree.
# The mean/std values match the commonly used ImageNet statistics (the backbone
# defined in this notebook is not loaded from pretrained weights).
transform = transforms.Compose([
    transforms.Resize((640, 640)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [None]:
# Root folder of the extracted dataset. This repeats the value assigned in the
# download cell so the notebook can be resumed from here once the data exists.
DATA_DIR = "wider_face_data"

In [None]:
# Create datasets with 10% of the original data.
# NOTE: the subset is drawn with Python's `random` module and no seed is set
# in this notebook, so each run trains and validates on a different 10%.
train_dataset = WiderFaceDataset(data_dir=DATA_DIR, split="train", transform=transform, subset_fraction=0.1)
val_dataset = WiderFaceDataset(data_dir=DATA_DIR, split="val", transform=transform, subset_fraction=0.1)

In [None]:
# Sanity check: dataset sizes plus the tensor shape and boxes of one sample.
# Boxes are [xmin, ymin, width, height] in the resized 640x640 frame.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
sample = train_dataset[0]
print(f"Sample image shape: {sample['image'].shape}")
print(f"Sample bounding boxes: {sample['boxes']}")

Train dataset size: 1288
Validation dataset size: 322
Sample image shape: torch.Size([3, 640, 640])
Sample bounding boxes: tensor([[180.6250, 127.5016, 278.7500, 207.6574]])


```
# =========================================================
# 5. Model definition
# =========================================================
```


In [None]:
class BasicBlock(nn.Module):
    """Two-convolution residual block (3x3 -> BN -> ReLU -> 3x3 -> BN, plus skip).

    The skip path is the identity unless the stride or the channel count
    changes, in which case it is a strided 1x1 convolution followed by batch
    norm so both paths produce the same shape.
    """

    expansion = 1   # a bottleneck block would use 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion

        # Main path
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, out_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

        # Skip path: project only when the shapes would otherwise disagree.
        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ReLU(main_path(x) + shortcut(x))."""
        residual = self.shortcut(x)
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        return F.relu(hidden + residual)

class ResNet18_Lightweight(nn.Module):
    """ResNet-18 style feature extractor returning all four stage outputs.

    The stem is a single stride-1 3x3 convolution with no pooling, and stage 1
    also has stride 1, so the first returned map keeps the input resolution.
    Stages 2-4 each halve it. Channel widths are 64, 128, 256 and 512.
    """

    def __init__(self):
        super().__init__()
        # Running channel count consumed and updated by _make_layer.
        self.in_planes = 64

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # Four residual stages of two blocks each: (output planes, first stride).
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for stage_idx, (planes, first_stride) in enumerate(stage_specs, start=1):
            stage = self._make_layer(BasicBlock, planes, 2, stride=first_stride)
            setattr(self, f"layer{stage_idx}", stage)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``."""
        stage_blocks = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage_blocks.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage_blocks)

    def forward(self, x):
        """Return the outputs of stages 1-4, shallowest first."""
        stem = F.relu(self.bn1(self.conv1(x)))
        c1 = self.layer1(stem)
        c2 = self.layer2(c1)
        c3 = self.layer3(c2)
        c4 = self.layer4(c3)
        return c1, c2, c3, c4

# Feature Pyramid Network (FPN)
class FPN(nn.Module):
    """Top-down feature pyramid over four backbone maps (64/128/256/512 channels).

    Every level is projected to 256 channels with a 1x1 convolution, merged
    with the upsampled coarser level, and (except for the coarsest level)
    passed through a 3x3 smoothing convolution.
    """

    def __init__(self):
        super().__init__()
        # 1x1 projection of the coarsest backbone map.
        self.toplayer = nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0)

        # 3x3 smoothing convs applied to the merged maps.
        for idx in (1, 2, 3):
            setattr(self, f"smooth{idx}", nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1))

        # 1x1 lateral projections for c3, c2 and c1 respectively.
        for idx, lateral_in in enumerate((256, 128, 64), start=1):
            setattr(self, f"latlayer{idx}", nn.Conv2d(lateral_in, 256, kernel_size=1, stride=1, padding=0))

    def _upsample_add(self, x, y):
        """Bilinearly resize ``x`` to ``y``'s spatial size and add the two."""
        target_hw = tuple(y.shape[-2:])
        resized = F.interpolate(x, size=target_hw, mode='bilinear', align_corners=False)
        return resized + y

    def forward(self, c1, c2, c3, c4):
        """Return ``(p1, p2, p3, p4)``, finest first; ``p4`` is not smoothed."""
        # Top-down pathway on the un-smoothed maps.
        p4 = self.toplayer(c4)
        merged3 = self._upsample_add(p4, self.latlayer1(c3))
        merged2 = self._upsample_add(merged3, self.latlayer2(c2))
        merged1 = self._upsample_add(merged2, self.latlayer3(c1))

        # Smooth each merged map before handing it to the heads.
        return self.smooth3(merged1), self.smooth2(merged2), self.smooth1(merged3), p4

# Context Module
class SSH(nn.Module):
    """Context module that concatenates three branches built from 3x3 convs.

    * one 3x3 conv producing ``out_channels // 2`` channels,
    * two stacked 3x3 convs producing ``out_channels // 4`` channels,
    * three stacked 3x3 convs (sharing the first with the branch above)
      producing ``out_channels // 4`` channels.

    No normalisation or activation is applied between the convolutions.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        half = out_channels // 2
        quarter = out_channels // 4

        def conv3(cin, cout):
            # Every convolution in this module is a padded, stride-1 3x3.
            return nn.Conv2d(cin, cout, kernel_size=3, stride=1, padding=1)

        self.conv3X3 = conv3(in_channels, half)
        self.conv5X5_1 = conv3(in_channels, quarter)
        self.conv5X5_2 = conv3(quarter, quarter)
        self.conv7X7_2 = conv3(quarter, quarter)
        self.conv7x7_3 = conv3(quarter, quarter)

    def forward(self, x):
        """Return the channel-wise concatenation of the three branches."""
        shared = self.conv5X5_1(x)
        branches = [
            self.conv3X3(x),
            self.conv5X5_2(shared),
            self.conv7x7_3(self.conv7X7_2(shared)),
        ]
        return torch.cat(branches, dim=1)

# Lightweight RetinaFace Model
class LightweightRetinaFace(nn.Module):
    """Backbone + FPN + per-level SSH context modules + three 1x1 heads.

    ``forward`` returns three lists with one tensor per pyramid level
    (finest level first): box regressions, classification logits and
    landmark regressions.
    """

    def __init__(self):
        super().__init__()
        self.backbone = ResNet18_Lightweight()
        self.fpn = FPN()

        # One context module per pyramid level.
        self.ssh1 = SSH(256, 256)
        self.ssh2 = SSH(256, 256)
        self.ssh3 = SSH(256, 256)
        self.ssh4 = SSH(256, 256)

        self.ClassHead = self._make_class_head()
        self.BboxHead = self._make_bbox_head()
        self.LandmarkHead = self._make_landmark_head()

    @staticmethod
    def _build_heads(values_per_anchor, fpn_num, inchannels, anchor_num):
        """Create one 1x1 conv per level emitting ``anchor_num * values_per_anchor`` channels."""
        return nn.ModuleList(
            nn.Conv2d(inchannels, anchor_num * values_per_anchor, kernel_size=1, stride=1, padding=0)
            for _ in range(fpn_num)
        )

    def _make_class_head(self, fpn_num=4, inchannels=256, anchor_num=2):
        """Heads with one logit per anchor."""
        return self._build_heads(1, fpn_num, inchannels, anchor_num)

    def _make_bbox_head(self, fpn_num=4, inchannels=256, anchor_num=2):
        """Heads with four box values per anchor."""
        return self._build_heads(4, fpn_num, inchannels, anchor_num)

    def _make_landmark_head(self, fpn_num=4, inchannels=256, anchor_num=2):
        """Heads with ten landmark values per anchor."""
        return self._build_heads(10, fpn_num, inchannels, anchor_num)

    def forward(self, x):
        """Return ``(bbox_regressions, classifications, ldm_regressions)``."""
        pyramid = self.fpn(*self.backbone(x))
        context_modules = (self.ssh1, self.ssh2, self.ssh3, self.ssh4)
        features = [ssh(level) for ssh, level in zip(context_modules, pyramid)]

        bbox_regressions = [head(feat) for head, feat in zip(self.BboxHead, features)]
        classifications = [head(feat) for head, feat in zip(self.ClassHead, features)]
        ldm_regressions = [head(feat) for head, feat in zip(self.LandmarkHead, features)]

        return bbox_regressions, classifications, ldm_regressions

In [None]:
# Initialize the model on the GPU when one is available, otherwise on the CPU.
model = LightweightRetinaFace().to('cuda' if torch.cuda.is_available() else 'cpu')

# Define loss functions (you may need to implement custom loss functions for RetinaFace)
# NOTE(review): these three objects are not referenced by the training or
# validation functions in this notebook, which call compute_total_loss instead.
classification_loss = nn.BCEWithLogitsLoss()
bbox_loss = nn.SmoothL1Loss()
landmark_loss = nn.SmoothL1Loss()

# Initialize optimizer (Adam over all model parameters, fixed learning rate)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
def collate_fn(batch):
    """Collate dataset items into one image batch plus per-image targets.

    Images are stacked into a single tensor. Boxes stay in a list because the
    number of faces differs per image. Every box gets the label 1.0, since all
    annotations are faces.

    Args:
        batch: List of dicts with keys ``'image'`` and ``'boxes'``.

    Returns:
        ``{'images': Tensor[B, ...], 'targets': [{'boxes', 'labels'}, ...]}``
    """
    images = [item['image'] for item in batch]

    def to_target(item):
        # Copy the boxes so later edits cannot touch the dataset's tensor.
        boxes = item['boxes'].clone().detach()
        labels = torch.ones(len(item['boxes']), dtype=torch.float32)  # Assuming all are faces
        return {'boxes': boxes, 'labels': labels}

    targets = [to_target(item) for item in batch]
    return {'images': torch.stack(images), 'targets': targets}


```
# =========================================================
# 6. Training and validation
# =========================================================
```

In [None]:
# Batches of 4 images; the custom collate_fn is required because each image
# carries a different number of boxes. Only the training set is shuffled.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=collate_fn)



In [None]:
# Create a SummaryWriter for TensorBoard.
# NOTE: train_one_epoch and validate read this module-level `writer` directly,
# so this cell must run before either of them is called.
writer = SummaryWriter('runs/retinaface_experiment')

In [None]:
def train_one_epoch(model, dataloader, optimizer, device, epoch):
    """Run one training epoch and return the mean batch loss.

    Mixed precision (autocast + gradient scaling) is enabled only when
    ``device`` is a CUDA device; on any other device the step runs in full
    precision and the scaler is a pass-through.

    Args:
        model: Network returning ``(bbox_regressions, classifications,
            ldm_regressions)``.
        dataloader: Yields dicts with ``'images'`` and ``'targets'``.
        optimizer: Optimizer over ``model``'s parameters.
        device: ``torch.device`` (or device string) the batch is moved to.
        epoch: Zero-based epoch index, used for the progress bar and as the
            TensorBoard step.

    Returns:
        Mean of the per-batch losses. Also logged as ``Loss/train`` to the
        module-level ``writer``.
    """
    model.train()
    total_loss = 0

    # Follow the actual device instead of assuming CUDA is present.
    device_type = torch.device(device).type
    use_amp = device_type == 'cuda'
    scaler = torch.amp.GradScaler(device_type, enabled=use_amp)

    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1} Training")
    for batch in progress_bar:
        images = batch['images'].to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in batch['targets']]

        optimizer.zero_grad()

        with torch.amp.autocast(device_type, enabled=use_amp):
            bbox_regressions, classifications, ldm_regressions = model(images)
            loss = compute_total_loss(classifications, bbox_regressions, ldm_regressions, targets)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Read the scalar once; each .item() call forces a device sync.
        loss_value = loss.item()
        total_loss += loss_value
        progress_bar.set_postfix({'loss': loss_value})

    avg_loss = total_loss / len(dataloader)
    writer.add_scalar('Loss/train', avg_loss, epoch)
    return avg_loss

In [None]:
def validate(model, dataloader, device, epoch):
    """Evaluate the model on ``dataloader`` and return the mean batch loss.

    Runs in eval mode with gradients disabled. The mean loss is also logged as
    ``Loss/val`` to the module-level ``writer``, using ``epoch`` as the step.
    """
    model.eval()
    running_loss = 0
    bar = tqdm(dataloader, desc=f"Epoch {epoch+1} Validation")

    with torch.no_grad():
        for batch in bar:
            images = batch['images'].to(device)

            # Move every tensor of every per-image target dict to the device.
            targets = []
            for target in batch['targets']:
                targets.append({key: value.to(device) for key, value in target.items()})

            outputs = model(images)
            bbox_regressions, classifications, ldm_regressions = outputs
            loss = compute_total_loss(classifications, bbox_regressions, ldm_regressions, targets)

            running_loss += loss.item()
            bar.set_postfix({'loss': loss.item()})

    mean_loss = running_loss / len(dataloader)
    writer.add_scalar('Loss/val', mean_loss, epoch)
    return mean_loss

In [None]:
num_epochs = 50
# Same CUDA-if-available choice that was used when the model was created.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Per-epoch history; only completed epochs are appended, so these lists are
# shorter than num_epochs if the loop is interrupted.
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader, optimizer, device, epoch)
    val_loss = validate(model, val_loader, device, epoch)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print("-----------------------")

    # Save the model checkpoint (one new file per epoch, never overwritten)
    torch.save(model.state_dict(), f'retinaface_epoch_{epoch+1}.pth')

    # Log learning rate (constant here: no scheduler is attached to the optimizer)
    writer.add_scalar('Learning Rate', optimizer.param_groups[0]['lr'], epoch)

Epoch 1 Training: 100%|██████████| 322/322 [10:17<00:00,  1.92s/it, loss=0.000135]
Epoch 1 Validation: 100%|██████████| 81/81 [02:06<00:00,  1.56s/it, loss=0.000394]


Epoch 1/50
Train Loss: 0.0409
Validation Loss: 0.0052
-----------------------


Epoch 2 Training: 100%|██████████| 322/322 [10:11<00:00,  1.90s/it, loss=5.59e-5]
Epoch 2 Validation: 100%|██████████| 81/81 [02:05<00:00,  1.55s/it, loss=4.8e-5]


Epoch 2/50
Train Loss: 0.0001
Validation Loss: 0.0016
-----------------------


Epoch 3 Training: 100%|██████████| 322/322 [10:10<00:00,  1.90s/it, loss=4.51e-5]
Epoch 3 Validation: 100%|██████████| 81/81 [02:05<00:00,  1.55s/it, loss=8.79e-5]


Epoch 3/50
Train Loss: 0.0001
Validation Loss: 0.0016
-----------------------


Epoch 4 Training:  60%|██████    | 194/322 [06:08<04:03,  1.90s/it, loss=4.88e-5]


KeyboardInterrupt: 

```
# =========================================================
# 7. Logging and Plotting
# =========================================================
```

In [None]:
# Plot training and validation loss.
# The x axis is derived from the number of epochs that actually finished:
# the history lists only grow at the end of each epoch, so after an
# interrupted run they are shorter than num_epochs and plotting against
# range(1, num_epochs + 1) would fail with a length mismatch.
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.savefig('loss_plot.png')
plt.show()

# Close the TensorBoard writer
writer.close()