# LibreYOLO8 - Minimal Inference Notebook

This notebook is a self-contained implementation: it defines the model architecture and the pre-/post-processing helpers, then loads pretrained weights and runs inference on a sample image.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from pathlib import Path
from typing import Union, Tuple, Dict, List
import colorsys


## Model Architecture


In [None]:
class DFL(nn.Module):
    """Integral layer that turns per-side bin logits into scalar distances.

    The input carries ``c1`` logits for each of the 4 box sides. The forward
    pass applies a softmax over the bins and takes the expected bin index,
    implemented as a frozen 1x1 convolution whose weights are
    ``0, 1, ..., c1 - 1``.
    """

    def __init__(self, c1=16):
        super().__init__()
        self.c1 = c1
        # Fixed (non-trainable) projection: a weighted sum over the bin axis.
        projection = nn.Conv2d(c1, 1, 1, bias=False)
        projection.requires_grad_(False)
        bin_index = torch.arange(c1, dtype=torch.float).view(1, c1, 1, 1)
        projection.weight.data.copy_(bin_index)
        self.conv = projection

    def forward(self, x):
        """Map ``(b, 4 * c1, h, w)`` logits to ``(b, 4, h, w)`` expected values."""
        batch, _, height, width = x.shape
        # (b, 4, c1, h, w) -> (b, c1, 4, h, w) so the bins sit on dim 1.
        bins = x.view(batch, 4, self.c1, height, width).transpose(2, 1)
        probabilities = bins.softmax(dim=1)
        # Fold the 4 sides into the height axis so one conv handles them all.
        folded = probabilities.reshape(batch, self.c1, 4 * height, width)
        expected = self.conv(folded)
        return expected.view(batch, 4, height, width)


class Conv(nn.Module):
    """Convolution block: bias-free Conv2d, then BatchNorm2d, then SiLU.

    Args:
        c_out: number of output channels.
        c_in: number of input channels.
        k: kernel size.
        s: stride.
        p: padding.
    """

    def __init__(self, c_out, c_in, k, s, p):
        super().__init__()
        # The convolution has no bias term; BatchNorm supplies the shift.
        self.cnn = nn.Conv2d(c_in, c_out, k, s, p, bias=False)
        self.batchnorm = nn.BatchNorm2d(c_out)
        self.silu = nn.SiLU()

    def forward(self, x):
        features = self.cnn(x)
        normalized = self.batchnorm(features)
        return self.silu(normalized)


class Bottleneck(nn.Module):
    """Two stacked 3x3 ``Conv`` blocks with an optional identity shortcut.

    Args:
        c_out: channels produced by both convolutions.
        c_in: channels of the input tensor.
        res_connection: when true, the input is added to the block output
            (this requires ``c_in == c_out`` for the addition to work).
    """

    def __init__(self, c_out, c_in, res_connection):
        super().__init__()
        self.conv1 = Conv(c_out=c_out, c_in=c_in, k=3, s=1, p=1)
        self.conv2 = Conv(c_out=c_out, c_in=c_out, k=3, s=1, p=1)
        self.res_connection = res_connection

    def forward(self, x):
        transformed = self.conv2(self.conv1(x))
        if not self.res_connection:
            return transformed
        return x + transformed


class C2F(nn.Module):
    """C2f block: 1x1 conv, chained bottlenecks on half the channels, 1x1 fuse.

    The output of ``conv1`` is kept whole; its second half of channels is fed
    through the bottlenecks one after another. ``conv1``'s output and every
    bottleneck output are concatenated along the channel axis and fused by
    ``conv2``.

    Args:
        c_out: output channels (also the width of ``conv1``).
        c_in: input channels.
        res_connection: passed to each ``Bottleneck``.
        nb_bottlenecks: number of chained bottlenecks.
    """

    def __init__(self, c_out, c_in, res_connection, nb_bottlenecks):
        super().__init__()
        half = c_out // 2
        # conv2 sees c_out channels from conv1 plus c_out/2 per bottleneck.
        fused_channels = int((nb_bottlenecks + 2) * c_out / 2)
        self.conv1 = Conv(c_out=c_out, c_in=c_in, k=1, s=1, p=0)
        self.conv2 = Conv(c_out=c_out, c_in=fused_channels, k=1, s=1, p=0)
        blocks = []
        for _ in range(nb_bottlenecks):
            blocks.append(Bottleneck(c_out=half, c_in=half, res_connection=res_connection))
        self.bottlenecks = nn.ModuleList(blocks)

    def forward(self, x):
        stem = self.conv1(x)
        _, channels, _, _ = stem.shape
        branch = stem[:, channels // 2:, :, :]
        # Collect every stage and concatenate once at the end.
        stages = [stem]
        for block in self.bottlenecks:
            branch = block(branch)
            stages.append(branch)
        return self.conv2(torch.cat(stages, dim=1))


class SPPF(nn.Module):
    """Spatial Pyramid Pooling - Fast.

    Reduces the input to ``c_out // 2`` channels, applies the same 5x5
    stride-1 max-pool three times in sequence, and fuses the reduced tensor
    with the three pooled tensors through a final 1x1 ``Conv``.
    """

    def __init__(self, c_out, c_in):
        super().__init__()
        hidden = c_out // 2
        self.conv1 = Conv(c_out=hidden, c_in=c_in, k=1, s=1, p=0)
        # Padding 2 with kernel 5 and stride 1 keeps the spatial size unchanged.
        self.maxpool = nn.MaxPool2d(kernel_size=5, stride=1, padding=2)
        self.conv2 = Conv(c_out=c_out, c_in=4 * hidden, k=1, s=1, p=0)

    def forward(self, x):
        pyramid = [self.conv1(x)]
        for _ in range(3):
            # Each level pools the previous one.
            pyramid.append(self.maxpool(pyramid[-1]))
        return self.conv2(torch.cat(pyramid, dim=1))


In [None]:
class Backbone(nn.Module):
    """Feature extraction backbone.

    Five stride-2 ``Conv`` stages (``p1`` .. ``p5``); every stage from ``p2``
    onwards is followed by a ``C2F`` block, and the last one is followed by
    ``SPPF``. The forward pass returns the feature maps taken after the third,
    fourth and fifth downsampling stage.

    Args:
        config: model size key, one of ``'n'``, ``'s'``, ``'m'``, ``'l'``,
            ``'x'``. An unknown key raises ``KeyError``.
    """

    def __init__(self, config):
        super().__init__()
        # Per-size scaling factors: 'd' scales the number of bottlenecks,
        # 'w' scales channel widths, 'r' additionally scales the last stage.
        cfg = {
            'n': {'d': 0.33, 'w': 0.25, 'r': 2.0},
            's': {'d': 0.33, 'w': 0.50, 'r': 2.0},
            'm': {'d': 0.67, 'w': 0.75, 'r': 1.5},
            'l': {'d': 1.00, 'w': 1.00, 'r': 1.0},
            'x': {'d': 1.00, 'w': 1.25, 'r': 1.0}
        }
        d, w, r = cfg[config]['d'], cfg[config]['w'], cfg[config]['r']

        self.p1 = Conv(c_out=int(64*w), c_in=3, k=3, s=2, p=1)
        self.p2 = Conv(c_out=int(128*w), c_in=int(64*w), k=3, s=2, p=1)
        self.c2f1 = C2F(c_out=int(128*w), c_in=int(128*w), res_connection=True, nb_bottlenecks=max(1, int(round(3*d))))
        self.p3 = Conv(c_out=int(256*w), c_in=int(128*w), k=3, s=2, p=1)
        self.c2f2 = C2F(c_out=int(256*w), c_in=int(256*w), res_connection=True, nb_bottlenecks=max(1, int(round(6*d))))
        self.p4 = Conv(c_out=int(512*w), c_in=int(256*w), k=3, s=2, p=1)
        self.c2f3 = C2F(c_out=int(512*w), c_in=int(512*w), res_connection=True, nb_bottlenecks=max(1, int(round(6*d))))
        self.p5 = Conv(c_out=int(512*w*r), c_in=int(512*w), k=3, s=2, p=1)
        self.c2f4 = C2F(c_out=int(512*w*r), c_in=int(512*w*r), res_connection=True, nb_bottlenecks=max(1, int(round(3*d))))
        self.sppf = SPPF(c_out=int(512*w*r), c_in=int(512*w*r))

    def forward(self, x):
        """Return the ``(x8, x16, x32)`` feature maps for an image batch ``x``."""
        x = self.c2f1(self.p2(self.p1(x)))
        # No clone is needed for the returned maps: the following layers only
        # read their input and produce new tensors (nothing runs in place).
        x8 = self.c2f2(self.p3(x))
        x16 = self.c2f3(self.p4(x8))
        x32 = self.sppf(self.c2f4(self.p5(x16)))
        return x8, x16, x32


class Neck(nn.Module):
    """Feature pyramid neck with a top-down and a bottom-up pass.

    Top-down: the coarsest map is upsampled and merged into the middle map,
    which is then upsampled and merged into the finest map. Bottom-up: the
    merged finest map is downsampled back into the middle level, and that
    result is downsampled into the coarsest level.

    Args:
        config: model size key, one of ``'n'``, ``'s'``, ``'m'``, ``'l'``,
            ``'x'``. An unknown key raises ``KeyError``.
    """

    def __init__(self, config):
        super().__init__()
        cfg = {
            'n': {'d': 0.33, 'w': 0.25, 'r': 2.0},
            's': {'d': 0.33, 'w': 0.50, 'r': 2.0},
            'm': {'d': 0.67, 'w': 0.75, 'r': 1.5},
            'l': {'d': 1.00, 'w': 1.00, 'r': 1.0},
            'x': {'d': 1.00, 'w': 1.25, 'r': 1.0}
        }
        scale = cfg[config]
        d, w, r = scale['d'], scale['w'], scale['r']

        depth = max(1, int(round(3*d)))
        ch_fine, ch_mid, ch_coarse = int(256*w), int(512*w), int(512*w*r)
        ch_mid_plus_coarse = int(512*w*(1 + r))
        ch_fine_plus_mid = int(768*w)

        self.upsample1 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f21 = C2F(c_out=ch_mid, c_in=ch_mid_plus_coarse, res_connection=False, nb_bottlenecks=depth)
        self.upsample2 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f11 = C2F(c_out=ch_fine, c_in=ch_fine_plus_mid, res_connection=False, nb_bottlenecks=depth)
        self.conv1 = Conv(c_out=ch_fine, c_in=ch_fine, k=3, s=2, p=1)
        self.c2f12 = C2F(c_out=ch_mid, c_in=ch_fine_plus_mid, res_connection=False, nb_bottlenecks=depth)
        self.conv2 = Conv(c_out=ch_mid, c_in=ch_mid, k=3, s=2, p=1)
        self.c2f22 = C2F(c_out=ch_coarse, c_in=ch_mid_plus_coarse, res_connection=False, nb_bottlenecks=depth)

    def forward(self, x8, x16, x32):
        # Top-down pass.
        mid_top_down = self.c2f21(torch.cat((self.upsample1(x32), x16), dim=1))
        out8 = self.c2f11(torch.cat((self.upsample2(mid_top_down), x8), dim=1))
        # Bottom-up pass.
        out16 = self.c2f12(torch.cat((self.conv1(out8), mid_top_down), dim=1))
        out32 = self.c2f22(torch.cat((self.conv2(out16), x32), dim=1))
        return out8, out16, out32


class Head(nn.Module):
    """Decoupled detection head with separate box and class branches.

    Each branch is two 3x3 ``Conv`` blocks followed by a plain 1x1 ``Conv2d``.
    The box branch outputs ``4 * reg_max`` channels and the class branch
    outputs ``nb_classes`` channels.
    """

    def __init__(self, c_in, c_box, c_cls, reg_max, nb_classes):
        super().__init__()
        # Box branch stem (conv11 -> conv12) and class branch stem (conv21 -> conv22).
        self.conv11 = Conv(c_out=c_box, c_in=c_in, k=3, s=1, p=1)
        self.conv12 = Conv(c_out=c_box, c_in=c_box, k=3, s=1, p=1)
        self.conv21 = Conv(c_out=c_cls, c_in=c_in, k=3, s=1, p=1)
        self.conv22 = Conv(c_out=c_cls, c_in=c_cls, k=3, s=1, p=1)
        # Final 1x1 projections; these keep their bias and have no activation.
        self.cnn1 = nn.Conv2d(c_box, 4 * reg_max, kernel_size=1, stride=1, padding=0)
        self.cnn2 = nn.Conv2d(c_cls, nb_classes, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        """Return ``(box, cls)`` raw outputs for feature map ``x``."""
        box_features = self.conv12(self.conv11(x))
        box = self.cnn1(box_features)
        cls_features = self.conv22(self.conv21(x))
        cls = self.cnn2(cls_features)
        return box, cls


In [None]:
class LibreYOLO8Model(nn.Module):
    """Complete detector: backbone, neck, three detection heads and DFL.

    Args:
        config: model size key, one of ``'n'``, ``'s'``, ``'m'``, ``'l'``,
            ``'x'``. An unknown key raises ``KeyError``.
        reg_max: number of distribution bins per box side.
        nb_classes: number of object classes.
    """

    def __init__(self, config, reg_max=16, nb_classes=80):
        super().__init__()
        cfg = {
            'n': {'d': 0.33, 'w': 0.25, 'r': 2.0},
            's': {'d': 0.33, 'w': 0.50, 'r': 2.0},
            'm': {'d': 0.67, 'w': 0.75, 'r': 1.5},
            'l': {'d': 1.00, 'w': 1.00, 'r': 1.0},
            'x': {'d': 1.00, 'w': 1.25, 'r': 1.0}
        }
        scale = cfg[config]
        w, r = scale['w'], scale['r']

        self.backbone = Backbone(config=config)
        self.neck = Neck(config=config)

        # All three heads share the same branch widths, derived from the
        # channel count of the finest feature map.
        c_p3 = int(256 * w)
        shared = dict(
            c_box=max(64, c_p3 // 4),
            c_cls=max(c_p3, min(nb_classes, 100)),
            reg_max=reg_max,
            nb_classes=nb_classes,
        )
        self.head8 = Head(c_in=int(256*w), **shared)
        self.head16 = Head(c_in=int(512*w), **shared)
        self.head32 = Head(c_in=int(512*w*r), **shared)
        self.dfl = DFL(c1=reg_max)

    def forward(self, x):
        """Run the network on an image batch.

        Returns:
            A dict with keys ``'x8'``, ``'x16'`` and ``'x32'``. Each value is
            a dict holding ``'box'`` (the head's box output passed through
            ``self.dfl``) and ``'cls'`` (the head's raw class output).
        """
        features = self.neck(*self.backbone(x))
        heads = (self.head8, self.head16, self.head32)
        result = {}
        for level, head, feature in zip(('x8', 'x16', 'x32'), heads, features):
            box, cls = head(feature)
            result[level] = {'box': self.dfl(box), 'cls': cls}
        return result


## Utility Functions


In [None]:
# Human-readable class names (80 entries). The position in the list is the
# class id: draw_boxes and the printing cell look names up by integer index.
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
    'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
    'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]


def get_class_color(class_id: int) -> str:
    """Return a deterministic ``#rrggbb`` hex color for ``class_id``.

    The hue advances by 137.508 degrees per class id, while saturation and
    value cycle through a few fixed levels based on ``class_id % 3`` and
    ``class_id % 2``.
    """
    hue = (class_id * 137.508) % 360 / 360.0
    saturation = 0.7 + (class_id % 3) * 0.1
    value = 0.8 + (class_id % 2) * 0.15
    channels = colorsys.hsv_to_rgb(hue, saturation, value)
    # Scale each 0..1 channel to 0..255 (truncating) and format as two hex digits.
    return "#" + "".join(f"{int(channel * 255):02x}" for channel in channels)


def preprocess_image(image: Union[str, Path, Image.Image, np.ndarray], input_size: int = 640):
    """Load an image and convert it into a model input tensor.

    Args:
        image: a file path (``str`` or ``Path``), a PIL image, or a NumPy
            array accepted by ``Image.fromarray``.
        input_size: side length of the square the image is resized to. The
            aspect ratio is not preserved (plain resize, no padding).

    Returns:
        A tuple ``(img_tensor, original_img, original_size)``:
        ``img_tensor`` is a float32 tensor of shape
        ``(1, 3, input_size, input_size)`` with values in ``[0, 1]``;
        ``original_img`` is an RGB copy of the input at its original size;
        ``original_size`` is that image's PIL ``size``, i.e. ``(width, height)``.

    Raises:
        ValueError: if ``image`` is of an unsupported type.
    """
    if isinstance(image, (str, Path)):
        # convert() loads the pixel data and returns a new image, so the
        # underlying file can be closed as soon as the block exits.
        with Image.open(image) as opened:
            img = opened.convert('RGB')
    elif isinstance(image, Image.Image):
        img = image.convert('RGB')
    elif isinstance(image, np.ndarray):
        img = Image.fromarray(image).convert('RGB')
    else:
        raise ValueError(f"Unsupported image type: {type(image)}")

    original_size = img.size
    original_img = img.copy()
    img_resized = img.resize((input_size, input_size), Image.Resampling.BILINEAR)
    # HWC uint8 -> float32 in [0, 1] -> CHW with a leading batch dimension.
    img_array = np.array(img_resized, dtype=np.float32) / 255.0
    img_tensor = torch.from_numpy(img_array).permute(2, 0, 1).unsqueeze(0)

    return img_tensor, original_img, original_size


In [None]:
def make_anchors(feats: List[torch.Tensor], strides: List[int], grid_cell_offset: float = 0.5):
    """Build the grid-cell anchor points for a list of feature maps.

    Args:
        feats: 4-D feature maps; only their last two dimensions (height,
            width), dtype and device are used.
        strides: one stride per feature map.
        grid_cell_offset: offset added to every integer grid coordinate.

    Returns:
        ``(anchor_points, stride_tensor)``: ``anchor_points`` has shape
        ``(N, 2)`` holding ``(x, y)`` in grid units, listed row by row for
        each level in turn; ``stride_tensor`` has shape ``(N, 1)`` holding
        the stride of the level each point belongs to.
    """
    points_per_level = []
    strides_per_level = []

    for level, step in zip(feats, strides):
        _, _, height, width = level.shape
        options = {"dtype": level.dtype, "device": level.device}
        xs = torch.arange(width, **options) + grid_cell_offset
        ys = torch.arange(height, **options) + grid_cell_offset
        grid_y, grid_x = torch.meshgrid(ys, xs, indexing='ij')
        points_per_level.append(torch.stack((grid_x, grid_y), dim=-1).view(-1, 2))
        strides_per_level.append(torch.full((height * width, 1), step, **options))

    return torch.cat(points_per_level), torch.cat(strides_per_level)


def decode_boxes(box_preds: torch.Tensor, anchors: torch.Tensor, stride_tensor: torch.Tensor):
    """Convert per-anchor side distances into ``xyxy`` boxes.

    Only the first batch element of ``box_preds`` is decoded. Its four
    columns are read as distances from the anchor point to the left, top,
    right and bottom edges, in grid units; the result is multiplied by the
    per-anchor stride.

    Returns:
        Tensor of shape ``(N, 4)`` with columns ``x1, y1, x2, y2``.
    """
    distances = box_preds[0]
    scale = stride_tensor[:, 0:1]
    center_x, center_y = anchors[:, 0:1], anchors[:, 1:2]
    left, top, right, bottom = (distances[:, k:k + 1] for k in range(4))

    corners = [
        (center_x - left) * scale,
        (center_y - top) * scale,
        (center_x + right) * scale,
        (center_y + bottom) * scale,
    ]
    return torch.cat(corners, dim=1)


def nms(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float = 0.45):
    """Greedy non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other remaining box whose IoU with it is not below ``iou_threshold``.

    Args:
        boxes: ``(N, 4)`` tensor of ``x1, y1, x2, y2`` boxes.
        scores: ``(N,)`` tensor of scores.
        iou_threshold: boxes with IoU strictly below this value survive.

    Returns:
        Long tensor of kept indices into ``boxes``, ordered by decreasing
        score; empty when ``boxes`` is empty.
    """
    if len(boxes) == 0:
        return torch.tensor([], dtype=torch.long, device=boxes.device)

    left, top, right, bottom = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (right - left) * (bottom - top)

    candidates = scores.sort(0, descending=True).indices
    kept = []

    while len(candidates) > 0:
        best = candidates[0]
        kept.append(best.item())
        rest = candidates[1:]
        if len(rest) == 0:
            break

        # Intersection rectangle of the best box with every remaining box.
        overlap_w = torch.clamp(
            torch.min(right[best], right[rest]) - torch.max(left[best], left[rest]), min=0
        )
        overlap_h = torch.clamp(
            torch.min(bottom[best], bottom[rest]) - torch.max(top[best], top[rest]), min=0
        )
        inter_area = overlap_w * overlap_h
        # The small epsilon guards against division by zero for degenerate boxes.
        iou = inter_area / (areas[best] + areas[rest] - inter_area + 1e-7)

        candidates = rest[iou < iou_threshold]

    return torch.tensor(kept, dtype=torch.long, device=boxes.device)


In [None]:
def postprocess(output: Dict, conf_thres: float = 0.25, iou_thres: float = 0.45, 
                input_size: int = 640, original_size: Tuple[int, int] = None):
    """Turn raw model outputs into a list of detections.

    Only the first image of the batch is processed. Each anchor is assigned
    its best-scoring class; anchors above ``conf_thres`` are rescaled to the
    original image size (when given) and filtered by a single NMS pass that
    runs over all classes together.

    Args:
        output: dict with keys ``'x8'``, ``'x16'``, ``'x32'``, each holding
            ``'box'`` and ``'cls'`` tensors.
        conf_thres: minimum class probability (exclusive) to keep an anchor.
        iou_thres: IoU threshold passed to ``nms``.
        input_size: side length of the square model input.
        original_size: ``(width, height)`` to rescale boxes to, or ``None``
            to leave boxes in model-input coordinates.

    Returns:
        Dict with ``"boxes"`` (list of ``[x1, y1, x2, y2]``), ``"scores"``,
        ``"classes"`` and ``"num_detections"``.
    """
    levels = ('x8', 'x16', 'x32')
    box_layers = [output[level]['box'] for level in levels]
    cls_layers = [output[level]['cls'] for level in levels]

    anchors, stride_tensor = make_anchors(box_layers, [8, 16, 32])

    def _merge_levels(layers):
        # (b, c, h, w) per level -> (b, h*w, c), concatenated over all levels.
        return torch.cat([layer.flatten(2).permute(0, 2, 1) for layer in layers], dim=1)

    box_preds = _merge_levels(box_layers)
    cls_preds = _merge_levels(cls_layers)

    decoded = decode_boxes(box_preds, anchors, stride_tensor)
    class_probs = cls_preds[0].sigmoid()
    best_scores, best_classes = class_probs.max(dim=1)

    selected = best_scores > conf_thres
    if not selected.any():
        return {"boxes": [], "scores": [], "classes": [], "num_detections": 0}

    # Boolean-mask indexing yields new tensors, so the in-place scaling
    # below does not touch `decoded`.
    boxes = decoded[selected]
    confidences = best_scores[selected]
    labels = best_classes[selected]

    if original_size is not None:
        boxes[:, [0, 2]] *= original_size[0] / input_size
        boxes[:, [1, 3]] *= original_size[1] / input_size

    keep = nms(boxes, confidences, iou_thres)

    return {
        "boxes": boxes[keep].cpu().numpy().tolist(),
        "scores": confidences[keep].cpu().numpy().tolist(),
        "classes": labels[keep].cpu().numpy().tolist(),
        "num_detections": len(keep)
    }


def draw_boxes(img: Image.Image, boxes: List, scores: List, classes: List):
    """Draw labelled bounding boxes on a copy of ``img``.

    Args:
        img: image to annotate; it is not modified.
        boxes: iterable of ``[x1, y1, x2, y2]`` boxes in pixel coordinates.
        scores: confidence per box.
        classes: class id per box.

    Returns:
        A new image with one outlined rectangle and one filled label
        (``"<class name>: <score>"``) per box.
    """
    img_draw = img.copy()
    draw = ImageDraw.Draw(img_draw)

    try:
        font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 12)
    except (OSError, ImportError):
        # Font file missing/unreadable, or Pillow built without FreeType:
        # fall back to the built-in font. Anything else should propagate.
        font = ImageFont.load_default()

    for box, score, cls_id in zip(boxes, scores, classes):
        x1, y1, x2, y2 = box
        cls_id_int = int(cls_id)
        color = get_class_color(cls_id_int)

        draw.rectangle([x1, y1, x2, y2], outline=color, width=2)

        # Guard both ends so a negative id cannot index from the end of the list.
        if 0 <= cls_id_int < len(COCO_CLASSES):
            label = f"{COCO_CLASSES[cls_id_int]}: {score:.2f}"
        else:
            label = f"Class {cls_id_int}: {score:.2f}"
        bbox = draw.textbbox((0, 0), label, font=font)
        text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]

        # The label normally sits just above the box. If that would place it
        # above the top edge of the image, pin it to the top so it stays visible.
        label_top = y1 - text_height - 4
        label_bottom = y1
        if label_top < 0:
            label_top, label_bottom = 0, text_height + 4

        draw.rectangle([x1, label_top, x1 + text_width + 4, label_bottom], fill=color)
        draw.text((x1 + 2, label_top + 2), label, fill="white", font=font)

    return img_draw


## Load Model and Run Inference


In [None]:
# Configuration
# The relative paths below are resolved against the current working directory.
MODEL_SIZE = "n"  # Options: "n", "s", "m", "l", "x"
WEIGHTS_PATH = "../weights/libreyolo8n.pt"
IMAGE_PATH = "../media/test_image_1_creative_commons.jpg"

# Initialize model
model = LibreYOLO8Model(config=MODEL_SIZE, reg_max=16, nb_classes=80)

# Load weights
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only load checkpoint files you trust. If the file holds a plain
# state dict, weights_only=True should be sufficient - TODO confirm.
state_dict = torch.load(WEIGHTS_PATH, map_location='cpu', weights_only=False)
# strict=True: the checkpoint keys must match the model's parameters exactly.
model.load_state_dict(state_dict, strict=True)
# Inference mode: BatchNorm layers use their stored running statistics.
model.eval()

print(f"Model loaded successfully!")


In [None]:
# Preprocess image
# Yields the (1, 3, 640, 640) input tensor, an RGB copy of the original image,
# and the original (width, height) used later to rescale the boxes.
input_tensor, original_img, original_size = preprocess_image(IMAGE_PATH, input_size=640)

# Run inference
# Gradients are not needed for inference, so autograd tracking is disabled.
with torch.no_grad():
    output = model(input_tensor)

# Postprocess
# input_size must match the value given to preprocess_image above.
detections = postprocess(output, conf_thres=0.25, iou_thres=0.45, input_size=640, original_size=original_size)

# Print one line per detection: 1-based index, class name, confidence.
print(f"Found {detections['num_detections']} detections")
for i, (box, score, cls_id) in enumerate(zip(detections['boxes'], detections['scores'], detections['classes'])):
    print(f"  {i+1}. {COCO_CLASSES[int(cls_id)]}: {score:.2f}")


In [None]:
# Draw detections and display
# With no detections, the unannotated original image is shown instead.
if detections['num_detections'] > 0:
    result_img = draw_boxes(original_img, detections['boxes'], detections['scores'], detections['classes'])
else:
    result_img = original_img

# Display the result
# As the last expression of the cell, the image is rendered by the notebook.
result_img


In [None]:
# Optional: Save the result
# The path is relative to the current working directory; Pillow picks the
# output format from the file extension.
output_path = "../media/notebook_output.jpg"
result_img.save(output_path)
print(f"Result saved to: {output_path}")
