In [1]:
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.amp import autocast, GradScaler
import torchvision.transforms as T
from utils.import_data import WiderFaceDataset, TRANSFORM, TRAIN_ROOT, TRAIN_ANN_FILE, TEST_ROOT, VAL_ROOT, VAL_ANN_FILE
#from utils.anchors import AnchorMatcher, AnchorGenerator, box_nms, compute_loss_with_anchors
from utils.anchors import AnchorGeneratorSingle, compute_loss_single_face, compute_iou

# Custom collate function: separates the images from their target dicts
# instead of letting the default collate try to stack them.
def custom_collate_fn(batch):
    """Transpose a batch of (image, target) samples into two parallel tuples.

    Args:
        batch: sequence of 2-element samples as yielded by the dataset.

    Returns:
        (images, targets): a tuple of all first elements and a tuple of all
        second elements, in batch order.

    Raises:
        ValueError: if the transposed batch does not unpack into exactly two
        groups (e.g. an empty batch).
    """
    columns = list(zip(*batch))  # transpose: list of samples -> list of fields
    images, targets = columns
    return (images, targets)

# Pinning host memory only helps (and only avoids a warning) when a CUDA
# device is actually present.
_pin_memory = torch.cuda.is_available()

# Training dataset. single_face_only=True restricts the set to single-face
# images (per the original note this keeps roughly a third of the images).
dataset = WiderFaceDataset(
    root_dir=TRAIN_ROOT,
    annotation_file=TRAIN_ANN_FILE,
    img_size=224,
    transform=TRANSFORM,
    single_face_only=True,
)

# Training dataloader
dataloader = DataLoader(
    dataset,
    batch_size=64,  # 64 runs quickest on desktop
    shuffle=True,
    num_workers=0,
    pin_memory=_pin_memory,
    # Custom collate: each sample carries an image plus a target dict with a
    # variable number of boxes, so the default stacking collate cannot be used.
    collate_fn=custom_collate_fn,
)

# Validation dataset (same preprocessing as training)
val_dataset = WiderFaceDataset(
    root_dir=VAL_ROOT,
    annotation_file=VAL_ANN_FILE,
    img_size=224,
    transform=TRANSFORM,
    single_face_only=True,
)

# Validation dataloader. No shuffling: the metric is an average over the whole
# set, so a fixed order costs nothing and keeps runs reproducible.
val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0,
    pin_memory=_pin_memory,
    collate_fn=custom_collate_fn,
)

print(f"Total train images: {len(dataset)}")
print(f"Total validation images: {len(val_dataset)}")


Total train images: 4631
Total validation images: 1122


In [3]:
# Mean intersection-over-union (IoU) between the top-scoring predicted box and
# the first ground-truth box of each image. Scale 0-1; higher is more accurate.
def anchor_validate_iou(net, val_loader, device, conf_thresh=0.05, img_size=224):
    """
    IoU validation for the single-face anchor-based detector.

    For every image the highest-confidence prediction is compared against the
    first ground-truth box via `compute_iou`.

    Args:
        net: detector whose output is [B, A*5, H', W'] (5 values per anchor:
            four box values followed by a confidence logit).
        val_loader: iterable yielding (images, targets); `images` is a sequence
            of equally sized image tensors, `targets` a sequence of dicts with
            a 'boxes' tensor.
        device: device the images are moved to before the forward pass.
        conf_thresh: minimum sigmoid confidence for a prediction to be used.
        img_size: factor the raw box outputs are multiplied by.

    Returns:
        Mean IoU as a float over the images that were scored. Images with no
        ground-truth box, or with no prediction above `conf_thresh`, are
        skipped and do NOT count towards the mean. Returns 0.0 if nothing was
        scored.

    Side effects:
        Runs the network in eval mode, then restores whichever train/eval mode
        the caller had set (previously it always forced training mode, which
        silently undid a caller's `net.eval()`).
    """
    was_training = net.training
    net.eval()
    iou_sum = 0.0
    count = 0

    try:
        with torch.no_grad():
            for images, targets in val_loader:
                # Stack images and move to device
                images = torch.stack(images).to(device)  # [B, 3, H, W]
                outputs = net(images)                    # [B, A*5, H', W']

                batch_size = outputs.shape[0]
                # Channels-last, then one row of 5 values per anchor per cell
                outputs = outputs.permute(0, 2, 3, 1).reshape(batch_size, -1, 5)

                for i in range(batch_size):
                    # Ground-truth (dataset) box; nothing to score without one
                    gt_boxes = targets[i]['boxes']
                    if gt_boxes.shape[0] == 0:
                        continue

                    pred = outputs[i]  # [num_predictions, 5]
                    if pred.shape[0] == 0:
                        continue

                    # The best prediction overall is also the best one above
                    # the threshold, so a single max replaces filter + argmax.
                    scores = torch.sigmoid(pred[:, 4])
                    best_score, best_idx = scores.max(dim=0)
                    # `not (a > b)` rather than `a <= b` so NaN scores are skipped
                    if not bool(best_score > conf_thresh):
                        continue

                    pred_box = pred[best_idx, :4] * img_size  # scale to image size
                    # Targets come straight from the loader; make sure both
                    # boxes live on the same device before comparing them.
                    gt_box = gt_boxes[0].to(pred_box.device)

                    # compute_iou is called with 2D inputs: [1, 4] each
                    iou = compute_iou(pred_box.unsqueeze(0), gt_box.unsqueeze(0))
                    iou_sum += iou.item()
                    count += 1
    finally:
        # Hand the model back in the mode the caller left it in
        net.train(was_training)

    return iou_sum / max(count, 1)


In [4]:
class FaceDetectionNet(nn.Module):
    """Small fully-convolutional face detector.

    The backbone is three identical stages (3x3 conv -> batch norm -> ReLU ->
    2x2 max-pool) with 32, 64 and 128 output channels, so the feature map is
    downsampled by a factor of 8. A 1x1 convolution then predicts, for every
    feature-map cell, 5 values per anchor (x, y, w, h, conf).
    """

    def __init__(self, num_anchors=3):
        """
        num_anchors: number of anchors predicted per feature-map cell; the
            detection head has num_anchors * 5 output channels.
        """
        super().__init__()

        # Backbone (feature extractor). Kept as one flat Sequential so the
        # layer indices, and therefore the state_dict keys, stay backbone.0..11.
        stages = []
        in_channels = 3  # RGB input
        for out_channels in (32, 64, 128):
            stages += [
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
            in_channels = out_channels
        self.backbone = nn.Sequential(*stages)

        # Detection head: bounding box + confidence for every anchor
        self.det_head = nn.Conv2d(in_channels, num_anchors * 5, kernel_size=1)

    def forward(self, x):
        """Return the raw detection map for a batch of images."""
        features = self.backbone(x)
        return self.det_head(features)

In [None]:
# Training loop
start_epoch = 0
num_epochs = 3
loadLastCheckpoint = False
checkpoint_rate = 10   # save a checkpoint every N epochs
validationRate = 1     # run validation every N epochs
checkpoint_dir = "checkpoints"
net = FaceDetectionNet()

# Use the GPU (cuda) when available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")
net = net.to(device)

optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)

# Mixed precision is only used on CUDA; on CPU both autocast and the scaler
# are disabled so the loop runs as plain fp32 without warnings.
use_amp = device.type == "cuda"
scaler = GradScaler(enabled=use_amp)

# Halves the learning rate once the training loss has stopped improving
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3
)

# Anchor grid: 28x28 cells at stride 8 (224 / 8 = 28, matching the three 2x
# max-pools in the backbone for 224x224 inputs)
anchor_gen = AnchorGeneratorSingle()
anchors = anchor_gen.generate(feature_h=28, feature_w=28, stride=8, device=device)

# Make sure the checkpoint directory exists up front, so neither the listing
# below nor the save after many epochs of training can fail on a missing folder.
os.makedirs(checkpoint_dir, exist_ok=True)

if loadLastCheckpoint:
    # Check for existing checkpoints and load the latest one
    checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.startswith("faceNet_checkpoint")]
    if checkpoint_files:
        # Sort by the epoch number embedded in the file name and take the latest
        latest_checkpoint = sorted(checkpoint_files, key=lambda x: int(x.split('checkpoint')[1].split('.')[0]))[-1]
        checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)

        checkpoint = torch.load(checkpoint_path, map_location=device)
        net.load_state_dict(checkpoint["model_state"])
        optimizer.load_state_dict(checkpoint["optimizer_state"])
        # NOTE: scheduler and grad-scaler state are not part of the checkpoint,
        # so they restart from their initial values on resume.
        start_epoch = checkpoint["epoch"] + 1
        print(f"Loaded checkpoint from epoch {checkpoint['epoch'] + 1}")

timer_start = time.time()
for epoch in range(start_epoch, num_epochs):
    net.train()  # sets the net to training mode
    epoch_loss = 0.0

    # The dataset is fed to the net in batches (see batch_size on the loader)
    for images, targets in dataloader:
        images = torch.stack(images).to(device)  # shape: [batch_size, 3, 224, 224]

        optimizer.zero_grad()  # resets gradients
        # autocast runs eligible ops in 16-bit precision for faster GPU training
        with autocast(device_type=device.type, enabled=use_amp):
            outputs = net(images)
            loss = compute_loss_single_face(outputs, targets, anchors, stride=8)

        # With AMP disabled these three calls reduce to backward() + step()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()  # accumulate batch loss

    # Time spent on this epoch's training pass (validation time is excluded)
    timer_end = time.time()
    mins, secs = divmod(timer_end - timer_start, 60)

    avg_epoch_loss = epoch_loss / max(len(dataloader), 1)  # guard against an empty loader
    scheduler.step(avg_epoch_loss)

    summary = (
        f"Epoch [{epoch+1}/{num_epochs}]"
        f" - Train Loss: {avg_epoch_loss:.4f}"
    )

    if (epoch+1) % validationRate == 0:
        avg_iou = anchor_validate_iou(net=net, val_loader=val_loader, device=device)
        if device.type == "cuda":
            torch.cuda.empty_cache()  # free cached GPU memory after validation
        summary += f" - Val IoU: {(avg_iou*100):.4f}%"

    # Zero-padded mm:ss.ss (the old format printed e.g. "01:3.53")
    summary += f" - Time taken: {int(mins):02d}:{secs:05.2f}"
    print(summary)

    timer_start = time.time()

    # Save a checkpoint
    if (epoch+1) % checkpoint_rate == 0:
        torch.save({
            "epoch": epoch,
            "model_state": net.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "loss": avg_epoch_loss
        }, os.path.join(checkpoint_dir, f"faceNet_checkpoint{epoch+1}.pth"))

Training on: cuda
Epoch [1/3] - Train Loss: 2.3623 - Val IoU: 0.0322% - Time taken: 01:3.53
Epoch [2/3] - Train Loss: 1.6050 - Val IoU: 0.0000% - Time taken: 01:6.51
Epoch [3/3] - Train Loss: 1.4351 - Val IoU: 0.0000% - Time taken: 01:7.46


In [None]:
# Re-create the model architecture and load the trained weights into it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loadedNet = FaceDetectionNet()
checkpoint_path = os.path.join("checkpoints", "faceNet_checkpoint30.pth")
checkpoint = torch.load(checkpoint_path, map_location=device)
loadedNet.load_state_dict(checkpoint["model_state"])
loadedNet.to(device)

# VERY IMPORTANT for inference: eval mode makes BatchNorm use its running
# statistics instead of per-batch statistics.
loadedNet.eval()
iou_avg = anchor_validate_iou(net=loadedNet, val_loader=val_loader, device=device)

# The validation helper may switch the model back to training mode on exit, so
# re-assert eval mode before the model is used for inference in later cells.
loadedNet.eval()

print(f"{(iou_avg*100):.4f}%")

0.1484


In [None]:
from PIL import Image


# Sample test image used when trying out the detector.
# Alternative sample (kept for quick switching):
#img = "43_Row_Boat_Canoe_43_37.jpg"
#img_path = "C:\\Code\\Neural Net Projects\\FaceNet\\Dataset\\WIDER_test\\images\\43--Row_Boat\\" + img

# NOTE(review): both names are reassigned in the plotting cell before any image
# is opened, so within the visible notebook these two values are never used.
img = "0_Parade_Parade_0_1020.jpg"

# NOTE(review): absolute, machine-specific Windows path; presumably this could
# be derived from the imported TEST_ROOT instead (confirm what TEST_ROOT points to).
img_path = "C:\\Code\\Neural Net Projects\\FaceNet\\Dataset\\WIDER_test\\images\\0--Parade\\" + img
def test_net(image):
    #load image w/ preprocessing
    
    orig_w, orig_h = image.size  # save original size

    image_tensor = TRANSFORM(image)   # same TRANSFORM as training
    image_tensor = image_tensor.unsqueeze(0).to(device)  # [1, 3, 224, 224]

    #forward pass w/ no gradient
    with torch.no_grad():
        outputs = loadedNet(image_tensor)

    #decode the predicitions into boxes and scores
    pred = outputs[0]  # [1, 5, 28, 28]

    # Flatten
    pred = pred.view(5, -1).permute(1, 0)  # [28*28, 5]

    pred_boxes = pred[:, :4] * 224.0   # undo normalization
    pred_scores = torch.sigmoid(pred[:, 4])

    #apply confidence threshhold and nms (non max suppresion)
    CONF_THRESH = 0.5
    NMS_THRESH = 0.4

    keep = pred_scores > CONF_THRESH
    pred_boxes = pred_boxes[keep]
    pred_scores = pred_scores[keep]

    if pred_boxes.shape[0] > 0:
        keep_idx = box_nms(pred_boxes, pred_scores, iou_threshold=NMS_THRESH)
        pred_boxes = pred_boxes[keep_idx]
        pred_scores = pred_scores[keep_idx]


    #scale boxes back into original image
    scale_x = orig_w / 224
    scale_y = orig_h / 224

    pred_boxes[:, [0, 2]] *= scale_x
    pred_boxes[:, [1, 3]] *= scale_y

    return pred_boxes, pred_scores

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Image to run the detector on
img = "0_Parade_Parade_0_1046.jpg"
img_path = "C:\\Code\\Neural Net Projects\\FaceNet\\Dataset\\WIDER_test\\images\\0--Parade\\" + img
image = Image.open(img_path).convert("RGB")
pred_boxes, pred_scores = test_net(image)

fig, ax = plt.subplots(1)
ax.imshow(image)
ax.set_title(f"Detections: {img}")

# Convert once to plain Python floats: a single device-to-host transfer, and
# matplotlib receives ordinary numbers instead of 0-dim tensors.
for box, score in zip(pred_boxes.tolist(), pred_scores.tolist()):
    x1, y1, x2, y2 = box
    rect = patches.Rectangle(
        (x1, y1), x2 - x1, y2 - y1,
        linewidth=2, edgecolor='blue', facecolor='none'
    )
    ax.add_patch(rect)
    # Label just above the top-left corner of the box
    ax.text(x1, y1 - 5, f"Confidence: {score:.2f}", color='red')

plt.show()