In [1]:
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.amp import autocast, GradScaler
import torchvision.transforms as T
from utils.import_data import WiderFaceDataset, TRANSFORM, TRAIN_ROOT, TRAIN_ANN_FILE, TEST_ROOT, VAL_ROOT, VAL_ANN_FILE
#from utils.anchors import AnchorMatcher, AnchorGenerator, box_nms, compute_loss_with_anchors
from utils.anchors import AnchorGeneratorSingle, AnchorGeneratorMultiScale, compute_loss_single_face, anchor_validate_iou

#custom collate function to seperate the boxes from the images
def custom_collate_fn(batch):
    images, targets = zip(*batch)  # unzip the batch
    return images, targets

# Create the dataset
dataset = WiderFaceDataset(
    root_dir=TRAIN_ROOT, 
    annotation_file=TRAIN_ANN_FILE, 
    img_size=224, 
    transform=TRANSFORM,
    single_face_only=True #if true, thirds the total images
    )

# Create a dataloader
dataloader = DataLoader(
    dataset, 
    batch_size=64, #64 runs quickest on desktop
    shuffle=True,
    num_workers=0,
    pin_memory=True,
    #use collate here because our data has the images but also the boxes and number of boxes
    collate_fn = custom_collate_fn
    )


#load validation pictures
val_dataset = WiderFaceDataset(
    root_dir=VAL_ROOT, 
    annotation_file=VAL_ANN_FILE, 
    img_size=224, 
    transform=TRANSFORM,
    single_face_only=True
    )

val_loader = DataLoader(
    val_dataset, 
    batch_size=32,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
    collate_fn = custom_collate_fn
    )

print(f"Total train images: {len(dataset)}")
print(f"Total validation images: {len(val_dataset)}")


Total train images: 4631
Total validation images: 1122


In [2]:
class FaceDetectionNet(nn.Module):
    def __init__(self, num_anchors=1):
        """
        num_anchors: the number of anchor sizes (3: [32, 64, 96])
        kernel_size is the size of the box we pass over each img to extract the features, exactly like tf (3,3,3)
        """
        super(FaceDetectionNet, self).__init__()

        #Backbone (feature extractor)
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),  # RGB input
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # Detection head
        # Predict bounding boxes + confidence
        # Output channels = num_anchors * 5 (x, y, w, h, conf)
        self.det_head = nn.Conv2d(128, num_anchors * 5, kernel_size=1)

    def forward(self, x):
        x = self.backbone(x)
        return self.det_head(x)

In [28]:
import numpy as np

face_sizes = []
for i in range(min(100, len(dataset))):  # Check first 100 images
    _, target = dataset[i]
    box = target['boxes'][0]
    width = box[2] - box[0]
    height = box[3] - box[1]
    avg_size = (width + height) / 2
    face_sizes.append(avg_size.item())

print(f"Face size stats:")
print(f"  Min: {min(face_sizes):.1f}")
print(f"  Max: {max(face_sizes):.1f}")
print(f"  Mean: {np.mean(face_sizes):.1f}")
print(f"  Median: {np.median(face_sizes):.1f}")
print(f"\nYour current anchor size: 32")
print(f"Recommended anchor size: {int(np.median(face_sizes))}")

Face size stats:
  Min: 3.8
  Max: 175.4
  Mean: 50.3
  Median: 42.4

Your current anchor size: 32
Recommended anchor size: 42


In [4]:
#Training Loop
start_epoch = 0
num_epochs = 100
loadLastCheckpoint = False
checkpoint_rate = 10
validationRate = 5
anchor_sizes = [30, 60]
warmup_epochs=10

net = FaceDetectionNet(num_anchors=2)

#moves all the info to the gpu (cuda) if it can, if not it keeps it on the cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}") 
net = net.to(device)

# Initialize anchor generator
anchor_gen = AnchorGeneratorMultiScale(anchor_sizes=anchor_sizes)
anchors = anchor_gen.generate(feature_h=28, feature_w=28, stride=8, device=device)

optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
scaler = GradScaler()

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    mode='min', 
    factor=0.5,
    patience=5,      # Wait longer before giving up
    min_lr=1e-7      # Set a floor
)

def get_lr_warmup(epoch, warmup_epochs=warmup_epochs, base_lr=0.0005):
    if epoch < warmup_epochs:
        return base_lr * (epoch + 1) / warmup_epochs
    return base_lr


if loadLastCheckpoint:
    # Check for existing checkpoints and load the latest one
    checkpoint_files = [f for f in os.listdir("checkpoints") if f.startswith("faceNet_checkpoint")]
    if checkpoint_files:
        # Sort by epoch number and get the latest
        latest_checkpoint = sorted(checkpoint_files, key=lambda x: int(x.split('checkpoint')[1].split('.')[0]))[-1]
        checkpoint_path = os.path.join("checkpoints", latest_checkpoint)
        
        checkpoint = torch.load(checkpoint_path, map_location=device)
        net.load_state_dict(checkpoint["model_state"])
        optimizer.load_state_dict(checkpoint["optimizer_state"])
        start_epoch = checkpoint["epoch"] + 1
        print(f"Loaded checkpoint from epoch {checkpoint['epoch'] + 1}")

timer_start = time.time()
for epoch in range(start_epoch, num_epochs):
    net.train() # sets the net to training mode
    epoch_loss = 0

    #with pytorch it wont let you pass the entire dataset to the net at once so you have to send it in batches (thats what the batch size is for)
    for images, targets in dataloader:
        images = torch.stack(images).to(device)    # shape: [batch_size, 3, 224, 224]
        #boxes = [t['boxes'].to(device) for t in targets]  # move each image's boxes to GPU

        optimizer.zero_grad()               # resets gradients
        with autocast('cuda'):  # autocast lets you use mixed precision for faster training on nvidia gpu, uses 16bit floating points instead of 32
            outputs = net(images)
            loss, avg_reg, avg_conf, avg_iou = compute_loss_single_face(outputs, targets, anchors, stride=8)
            
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0) # Gradient clipping to prevent exploding gradients

        # if epoch < warmup_epochs:
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = get_lr_warmup(epoch)

        scaler.step(optimizer)
        scaler.update()
        
        epoch_loss += loss.item()  # accumulate batch loss

    #timer to see how long each epoch takes
    timer_end = time.time()
    length = timer_end-timer_start
    mins, secs = divmod(length, 60)

    
    avg_epoch_loss = epoch_loss / len(dataloader)
    scheduler.step(avg_epoch_loss) #reduces the learning rate once the loss has stopped moving for 3 consecutive epochs

    if (epoch+1) % validationRate == 0:
        val_iou = anchor_validate_iou(net=net, val_loader=val_loader, device=device, anchors=anchors)
        torch.cuda.empty_cache() #free up the cache on the gpu after checking iou

        print(
        f"Epoch [{epoch+1}/{num_epochs}]"
        f" - Train Loss: {avg_epoch_loss:.4f}"
        f" - Avg IoU: {(avg_iou*100):.3f}%"
        f" - Reg: {avg_reg:.4f}"
        f" - Conf: {avg_conf:.4f}"
        f" -  Loss: {loss.item():.4f}"
        f" - Time taken: 0{int(mins)}:{secs:.2f}"
        f" - Val IoU: {(val_iou*100):.3f}%"
        )
    else:
        print(
            f"Epoch [{epoch+1}/{num_epochs}]"
            f" - Train Loss: {avg_epoch_loss:.4f}"
            f" - Avg IoU: {(avg_iou*100):.3f}%"
            f" - Reg: {avg_reg:.4f}"
            f" - Conf: {avg_conf:.4f}"
            f" -  Loss: {loss.item():.4f}"
            f" - Time taken: 0{int(mins)}:{secs:.2f}")
    

    timer_start = time.time()

    #saves a checkpoint
    if (epoch+1) % checkpoint_rate == 0:
        torch.save({
            "epoch": epoch,
            "model_state": net.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "loss": avg_epoch_loss
        }, f"checkpoints/faceNet_checkpoint{epoch+1}.pth")

Training on: cuda
Epoch [1/100] - Train Loss: 2.1742 - Avg IoU: 58.843% - Reg: 0.0730 - Conf: 0.7162 -  Loss: 1.8179 - Time taken: 01:29.25
Epoch [2/100] - Train Loss: 1.6556 - Avg IoU: 50.481% - Reg: 0.0576 - Conf: 0.5868 -  Loss: 1.5169 - Time taken: 01:29.82
Epoch [3/100] - Train Loss: 1.5202 - Avg IoU: 45.417% - Reg: 0.0323 - Conf: 0.5350 -  Loss: 1.3131 - Time taken: 01:31.15
Epoch [4/100] - Train Loss: 1.4550 - Avg IoU: 46.566% - Reg: 0.0452 - Conf: 0.5109 -  Loss: 1.3449 - Time taken: 01:35.90
Epoch [5/100] - Train Loss: 1.4234 - Avg IoU: 44.409% - Reg: 0.0329 - Conf: 0.4895 -  Loss: 1.2543 - Time taken: 01:36.10 - Val IoU: 5.327%
Epoch [6/100] - Train Loss: 1.4045 - Avg IoU: 54.129% - Reg: 0.0856 - Conf: 0.4845 -  Loss: 1.5532 - Time taken: 01:35.90
Epoch [7/100] - Train Loss: 1.3959 - Avg IoU: 48.830% - Reg: 0.0526 - Conf: 0.4880 -  Loss: 1.3781 - Time taken: 01:35.88
Epoch [8/100] - Train Loss: 1.3875 - Avg IoU: 53.296% - Reg: 0.0681 - Conf: 0.4915 -  Loss: 1.4953 - Time take

KeyboardInterrupt: 

In [None]:
# Re-create the model architecture
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loadedNet = FaceDetectionNet()
checkpoint_path = os.path.join("saved_checkpoints", "faceNet_checkpoint60.pth")
checkpoint = torch.load(checkpoint_path, map_location=device)
loadedNet.load_state_dict(checkpoint["model_state"])
loadedNet.to(device)

# VERY IMPORTANT for inference
loadedNet.eval()iou_avg = anchor_validate_iou(net=loadedNet, val_loader=val_loader, device=device)
print(f"{(iou_avg*100):.4f}%")

TypeError: anchor_validate_iou() missing 1 required positional argument: 'anchors'

In [None]:
from PIL import Image


#img = "43_Row_Boat_Canoe_43_37.jpg"
#img_path = "C:\\Code\\Neural Net Projects\\FaceNet\\Dataset\\WIDER_test\\images\\43--Row_Boat\\" + img

img = "0_Parade_Parade_0_1020.jpg"

img_path = "C:\\Code\\Neural Net Projects\\FaceNet\\Dataset\\WIDER_test\\images\\0--Parade\\" + img
def test_net(image):
    #load image w/ preprocessing
    
    orig_w, orig_h = image.size  # save original size

    image_tensor = TRANSFORM(image)   # same TRANSFORM as training
    image_tensor = image_tensor.unsqueeze(0).to(device)  # [1, 3, 224, 224]

    #forward pass w/ no gradient
    with torch.no_grad():
        outputs = loadedNet(image_tensor)

    #decode the predicitions into boxes and scores
    pred = outputs[0]  # [1, 5, 28, 28]

    # Flatten
    pred = pred.view(5, -1).permute(1, 0)  # [28*28, 5]

    pred_boxes = pred[:, :4] * 224.0   # undo normalization
    pred_scores = torch.sigmoid(pred[:, 4])

    #apply confidence threshhold and nms (non max suppresion)
    CONF_THRESH = 0.5
    NMS_THRESH = 0.4

    keep = pred_scores > CONF_THRESH
    pred_boxes = pred_boxes[keep]
    pred_scores = pred_scores[keep]

    if pred_boxes.shape[0] > 0:
        keep_idx = box_nms(pred_boxes, pred_scores, iou_threshold=NMS_THRESH)
        pred_boxes = pred_boxes[keep_idx]
        pred_scores = pred_scores[keep_idx]


    #scale boxes back into original image
    scale_x = orig_w / 224
    scale_y = orig_h / 224

    pred_boxes[:, [0, 2]] *= scale_x
    pred_boxes[:, [1, 3]] *= scale_y

    return pred_boxes, pred_scores

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

img = "0_Parade_Parade_0_1046.jpg"
img_path = "C:\\Code\\Neural Net Projects\\FaceNet\\Dataset\\WIDER_test\\images\\0--Parade\\" + img
image = Image.open(img_path).convert("RGB")
pred_boxes, pred_scores = test_net(image)

fig, ax = plt.subplots(1)
ax.imshow(image)

for box, score in zip(pred_boxes, pred_scores):
    x1, y1, x2, y2 = box.cpu()
    rect = patches.Rectangle(
        (x1, y1), x2 - x1, y2 - y1,
        linewidth=2, edgecolor='blue', facecolor='none'
    )
    ax.add_patch(rect)
    ax.text(x1, y1 - 5, f"Confidence: {score:.2f}", color='red')

plt.show()