In [22]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.amp import autocast, GradScaler
# Importing the class to get the dataset
from utils.import_data import WiderFaceDataset, TRANSFORM, TRAIN_ROOT, TRAIN_ANN_FILE, TEST_ROOT
from utils.anchors import AnchorMatcher, AnchorGenerator, box_nms, compute_loss_with_anchors


# Create the dataset
dataset = WiderFaceDataset(
    root_dir=TRAIN_ROOT, 
    annotation_file=TRAIN_ANN_FILE, 
    img_size=224, 
    transform=TRANSFORM,
    single_face_only=True #if true, thirds the total images
    )

print(f"Total images: {len(dataset)}")

#custom collate function to seperate the boxes from the images
def custom_collate_fn(batch):
    images, targets = zip(*batch)  # unzip the batch
    return images, targets

# Create a dataloader
dataloader = DataLoader(
    dataset, 
    batch_size=64, #64 runs quicest on desktop
    shuffle=True,
    num_workers=0,
    pin_memory=True,
    #use collate here because our data has the images but also the boxes and number of boxes
    collate_fn = custom_collate_fn
    )



Total images: 4631


In [23]:
class FaceDetectionNet(nn.Module):
    def __init__(self, num_anchors=1):
        """
        num_anchors: number of boxes predicted per spatial cell (simplest: 1)
        """
        super(FaceDetectionNet, self).__init__()

        """
        kernel_size is the size of the box we pass over each img to extract the features, exactly like tf (3,3,3)
        """
        #Backbone (feature extractor)
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),  # RGB input
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # downsample by 2 -> 112x1112

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # downsample by 2 -> 56x56

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # downsample by 2 -> 28x28
        )

        # Detection head
        # Predict bounding boxes + confidence
        # Output channels = num_anchors * 5 (x, y, w, h, conf)
        self.det_head = nn.Conv2d(256, num_anchors * 5, kernel_size=1)

    def forward(self, x):
        """
        x: [batch_size, 3, H, W]
        Returns:
            out: [batch_size, num_anchors * 5, H/4, W/4] 
                 Each cell predicts (x, y, w, h, confidence)
        """
        features = self.backbone(x)
        out = self.det_head(features)  # [B, 5*num_anchors, H', W']

        B, C, H, W = out.shape
        out = out.view(B, -1, 5, H, W)  # [B, num_anchors, 5, H', W']
        return out

In [26]:
import time
net = FaceDetectionNet()

#moves all the info to the gpu (cuda) if it can, if not it keeps it on the cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}") 
net = net.to(device)


optimizer = torch.optim.Adam(net.parameters(), lr=0.0005)
scaler = GradScaler()

# Initialize anchor generator and matcher
anchor_gen = AnchorGenerator(scales=[1.0], aspect_ratios=[1.0])
matcher = AnchorMatcher(pos_iou_thresh=0.5, neg_iou_thresh=0.4)
# Generate anchors (assuming feature map is 28x28 after 2x2 pooling)
anchors = anchor_gen.generate_anchors(feature_h=28, feature_w=28, stride=8, img_size=224)

start_epoch = 0
num_epochs = 100
checkpoint_rate = 20

loadLastCheckpoint = True
if loadLastCheckpoint:
    # Check for existing checkpoints and load the latest one
    checkpoint_files = [f for f in os.listdir("checkpoints") if f.startswith("faceNet_checkpoint")]
    if checkpoint_files:
        # Sort by epoch number and get the latest
        latest_checkpoint = sorted(checkpoint_files, key=lambda x: int(x.split('checkpoint')[1].split('.')[0]))[-1]
        checkpoint_path = os.path.join("checkpoints", latest_checkpoint)
        
        checkpoint = torch.load(checkpoint_path, map_location=device)
        net.load_state_dict(checkpoint["model_state"])
        optimizer.load_state_dict(checkpoint["optimizer_state"])
        start_epoch = checkpoint["epoch"] + 1
        print(f"Loaded checkpoint from epoch {checkpoint['epoch'] + 1}")

timer_start = time.time()
for epoch in range(start_epoch, num_epochs):
    net.train() # sets the net to training mode
    epoch_loss = 0

    #with pytorch it wont let you pass the entire dataset to the net at once so you have to send it in batches (thats what the batch size is for)
    for images, targets in dataloader:
        # images is a tuple of tensors; stack into a single batch tensor
        images = torch.stack(images).to(device)    # shape: [batch_size, 3, 224, 224]
        boxes = [t['boxes'].to(device) for t in targets]  # move each image's boxes to GPU

        optimizer.zero_grad()               # resets gradients
        # outputs = net(images)               # forward pass
        # loss = compute_loss_with_anchors(outputs, boxes, anchors, matcher, device)
        # loss.backward()                     # backpropogation
        # optimizer.step()                    # update weights 

        with autocast('cuda'):  # Use FP16 for forward pass
            outputs = net(images)
            loss = compute_loss_with_anchors(outputs, boxes, anchors, matcher, device)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        epoch_loss += loss.item()  # accumulate batch loss

    #timer to see how long each epoch takes
    timer_end = time.time()
    length = timer_end-timer_start
    mins, secs = divmod(length, 60)

    #custom loss to mimic what it looks like in tf
    avg_epoch_loss = epoch_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Average Loss: {avg_epoch_loss:.4f} - Time taken: 0{int(mins)}:{secs:.2f}")
    timer_start = time.time()

    #saves a checkpoint
    if (epoch+1) % checkpoint_rate == 0:
        torch.save({
            "epoch": epoch,
            "model_state": net.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "loss": avg_epoch_loss
        }, f"checkpoints/faceNet_checkpoint{epoch+1}.pth")






Training on: cuda
Loaded checkpoint from epoch 45
Epoch [46/100] - Average Loss: 0.1537 - Time taken: 01:4.90
Epoch [47/100] - Average Loss: 0.1514 - Time taken: 01:4.74
Epoch [48/100] - Average Loss: 0.1495 - Time taken: 01:4.57
Epoch [49/100] - Average Loss: 0.1465 - Time taken: 01:4.89
Epoch [50/100] - Average Loss: 0.1458 - Time taken: 01:8.84
Epoch [51/100] - Average Loss: 0.1455 - Time taken: 01:9.08
Epoch [52/100] - Average Loss: 0.1458 - Time taken: 01:8.39
Epoch [53/100] - Average Loss: 0.1480 - Time taken: 01:8.22
Epoch [54/100] - Average Loss: 0.1428 - Time taken: 01:8.41
Epoch [55/100] - Average Loss: 0.1418 - Time taken: 01:8.18
Epoch [56/100] - Average Loss: 0.1405 - Time taken: 01:8.25
Epoch [57/100] - Average Loss: 0.1396 - Time taken: 01:8.26
Epoch [58/100] - Average Loss: 0.1369 - Time taken: 01:8.79
Epoch [59/100] - Average Loss: 0.1411 - Time taken: 01:8.22
Epoch [60/100] - Average Loss: 0.1383 - Time taken: 01:8.25
Epoch [61/100] - Average Loss: 0.1379 - Time taken

In [None]:
# Re-create the model architecture
net = FaceDetectionNet()
checkpoint_path = os.path.join("checkpoints", "faceNet_checkpoint100.pth")
checkpoint = torch.load(checkpoint_path, map_location=device)
net.load_state_dict(checkpoint["model_state"])
net.to(device)

# VERY IMPORTANT for inference
net.eval()

FaceDetectionNet(
  (backbone): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dila

In [21]:
from PIL import Image
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Load a test image
image_path = r"C:\Code\Neural Net Projects\FaceNet\Dataset\WIDER_test\images\0--Parade\0_Parade_Parade_0_1007.jpg"
original_image = Image.open(image_path).convert('RGB')

# Get original image dimensions for scaling boxes back
orig_width, orig_height = original_image.size

# Preprocess image for model (224x224)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
])

device = 'cuda'
image_tensor = transform(original_image).to(device)

# Run detection
boxes, scores = detect_faces(image_tensor, net, anchors, conf_threshold=0.5, nms_threshold=0.5)

print(f"Found {len(boxes)} faces")
if len(boxes) > 0:
    print(f"Boxes (in 224x224 space):\n{boxes}")
    print(f"Confidence Scores:\n{scores}")

# Scale boxes back to original image dimensions
if len(boxes) > 0:
    scale_x = orig_width / 224
    scale_y = orig_height / 224
    boxes_scaled = boxes.clone()
    boxes_scaled[:, [0, 2]] *= scale_x
    boxes_scaled[:, [1, 3]] *= scale_y
else:
    boxes_scaled = boxes

# Visualize on original image
fig, ax = plt.subplots(1, figsize=(12, 12))
ax.imshow(original_image)

for i, box in enumerate(boxes_scaled):
    x_min, y_min, x_max, y_max = box[:4].cpu().numpy()
    score = scores[i].cpu().item()
    
    # Draw rectangle
    width = x_max - x_min
    height = y_max - y_min
    rect = patches.Rectangle((x_min, y_min), width, height, linewidth=2, edgecolor='red', facecolor='none')
    ax.add_patch(rect)
    
    # Add confidence label
    ax.text(x_min, y_min - 10, f'Confidence: {score:.3f}', color='red', fontsize=11, 
            bbox=dict(facecolor='white', alpha=0.7))

ax.set_title(f"Face Detection Results - Found {len(boxes)} faces", fontsize=14)
ax.axis('off')
plt.tight_layout()
plt.show()

RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)

In [16]:
# Inference with NMS
def detect_faces(image, model, anchors, conf_threshold=0.5, nms_threshold=0.5):
    """
    Detect faces in image using trained model
    """
    model.eval()
    with torch.no_grad():
        outputs = model(image.unsqueeze(0))  # [1, 1, 5, H, W]
        
        pred_flat = outputs[0].view(5, -1).T  # [num_anchors, 5]
        pred_boxes = pred_flat[:, :4]
        pred_conf = torch.sigmoid(pred_flat[:, 4])
        
        # Filter by confidence
        keep_idx = pred_conf > conf_threshold
        boxes = anchors[keep_idx]
        scores = pred_conf[keep_idx]
        
        # Apply NMS
        keep = box_nms(boxes, scores, nms_threshold)
        
        return boxes[keep], scores[keep]

In [None]:
from PIL import Image
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Load image
image_path = r"C:\Code\Neural Net Projects\FaceNet\Dataset\WIDER_test\images\0--Parade\0_Parade_Parade_0_1007.jpg"  # Replace with your image
#image_path = TEST_ROOT + "0_Parade_Parade_0_1046.jpg"
image = Image.open(image_path).convert('RGB')

# Preprocess image (same as training)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
])
device = 'cpu'
image_tensor = transform(image).to(device)

# Run detection
boxes, scores = detect_faces(image_tensor, net, anchors, conf_threshold=0.5, nms_threshold=0.5)

print(f"Found {len(boxes)} faces")
if len(boxes) > 0:
    print(f"Boxes:\n{boxes}")
    print(f"Scores:\n{scores}")

# Visualize results
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.imshow(image)

for i, box in enumerate(boxes):
    x, y, w, h = box[:4].cpu().numpy()
    score = scores[i].cpu().item()
    
    rect = patches.Rectangle((x, y), w, h, linewidth=2, edgecolor='red', facecolor='none')
    ax.add_patch(rect)
    ax.text(x, y-5, f'{score:.2f}', color='red', fontsize=10)

ax.set_title(f"Detected {len(boxes)} faces")
ax.axis('off')
plt.tight_layout()
plt.show()