In [1]:
# === Note: Runs this jupyter notebook from the projects root, mimicking a root run.
import os
os.chdir(os.getcwd().rsplit("/", 1)[0])
print(os.getcwd())

import torch
# Show all values when printing torch tensors.
torch.set_printoptions(threshold=torch.inf) # shows all the values when printing tensors


/Users/tonyavis/Main/AI_public_projects/object_detection/yolo_v1_taco


In [2]:
from utils.load_config import load_config
config = load_config()
config

Namespace(DEVICE=device(type='mps'), NUM_WORKERS=2, PIN_MEMORY=True, EPOCHS=25, LEARNING_RATE=0.001, BATCH_SIZE=64, WEIGHT_DECAY=0, CON_TRAINING=False, LOAD_MODEL_FILE='yolo_v1_taco_D_2025-07-10_EPOCH_50_LOSS_4.4416_SIZE_448.pt', LAST_EPOCH=0, WHICH_DATASET='test-case-overfit-one-image', IMAGE_SIZE=448, C=18, B=2, S=7, IOU_THRESHOLD=0.6, MIN_THRESHOLD=0.5, NUM_NODES_PER_CELL=28, NUM_NODES_PER_IMG=1372)

In [3]:
from argparse import Namespace

In [None]:
# Extract bounding boxes
# NOTE: Do not implement this function before the loss_fn(). Use this function before plot_bboxes(), and non_max_suppression().
def extract_bboxes(t: torch.Tensor, config: Namespace):
    """
    Extract bounding boxes from a single image's (predicted or labeled) tensor, converting them into a flat (N, 9) representation, and finally sorting based on their probability score (pc) in descending order.

    Note:
        - N = S * S * B or total number of bboxes per image.
        - Nested 9 nodes looks like -> [i, j, b, class_idx, pc, x, y, w, h], sorted by pc descending for each image.
        - i,j is the bboxes cell location in the grid.
        - b is if the bbox is either bbox1 or bbox2 in its cell. Its value is either 0 or 1.
        - pc is the probability score that an object exists in that cell.
        - class_idx is a value between 0 and 17 that indicates which object the cell predicts to be present. It’s obtained by applying argmax() to the class object probability scores at indices 0 through 17. Note object probability scores here is not the same as pc.
        - The return tensor will have bboxes sorted by pc (descending).

    Args:
        t (tensor) : Shape (S, S, NUM_NODES_PER_CELL)

    Returns:
        (tensor) : shape (S * S * B , 9), Sorted by bboxes with the highest pc at the beginning. [[ i, j, b, class_idx, pc, x, y, w, h]] -> num nodes = 9.
    """
    S, B, C, DEVICE, NUM_NODES_PER_CELL = (
        config.S,
        config.B,
        config.C,
        config.DEVICE,
        config.NUM_NODES_PER_CELL,
    )

    # === 1: Create new tensors to store class probs, first bbox and second bbox from every cell across the batch.
    class_probs = t[..., :C]  # ( S, S, C)

    bbox_1 = t[..., C : C + 5]  # ( S, S, 5) #pc1, x1, y1, w1, h1
    bbox_2 = t[..., C + 5 : NUM_NODES_PER_CELL]  # ( S, S, 5)
    bboxes = torch.stack([bbox_1, bbox_2], dim=2)  # shape: ( S, S, 2, 5)

    # #   Get the highest predicted object from indexes 0-17. Store as index.
    class_idx = class_probs.argmax(dim=-1)  # shape: ( S, S)

    # === 2: Create grid cell indice mapping tensor for i, j, and b coords.
    # Note: (i,j) -> i = row_indices and j = col_indices.\

    row_indices, col_indices = torch.meshgrid(
        torch.arange(S, device=DEVICE), torch.arange(S, device=DEVICE), indexing="ij"
    )  # (S, S)
    
    # Create a grid cell to keep track of what bounding box in a cell has the high probability score.
    box_indices = (
        torch.arange(B, device=DEVICE).view(1, 1, B).expand(S, S, B)
    )  # (S, S, 2)
    print("box_indices:",box_indices, "\n\n")

    #       Reshape and expand row and col indices to include the number bounding boxes per cell.
    row_indices = row_indices.unsqueeze(-1).expand(
        -1, -1, B
    )  # from (S, S) -> (S, S, 2)

    col_indices = col_indices.unsqueeze(-1).expand(
        -1, -1, B
    )  # from (S, S) -> (S, S, 2)

    # === 3: Expand class_idx tensor to match (S, S, 2)
    cls_indices = class_idx.unsqueeze(-1).expand(
        -1, -1, B
    )  # (7, 7) -> (7, 7, 2)
    print(cls_indices.shape)









    # # === 4: Stack metadata
    # metadata = torch.stack(
    #     [row_indices, col_indices, box_indices], dim=-1
    # )  # (7, 7, 2, 3)

    # cls_indices = cls_indices.unsqueeze(-1)  # ( 7, 7, 2) -> ( 7, 7, 2, 1)

    # #       Concatenate: (S, S , 2, 9) 9 = [i, j, b, class_idx, pc, x, y, w, h]
    # full = torch.cat(
    #     [metadata.float(), cls_indices.float(), bboxes], dim=-1
    # )  # (7, 7, 2, 9)

    # #      Reshape full into flat form: ( S, S, 2, 9) -> ( N, 9) where N=S*S*2 or total num bboxes per image.
    # full = full.view( -1, 9)

    # # === 5: Sort by pc (column 4)
    # sorted_indices = full[:, 4].argsort(
    #     descending=True
    # )

    # return full.index_select(
    #     0, sorted_indices
    # )


**Create a tensor That mimics labeled data with easily identifiable values at locations like pc1, class_idx, x, y, w, h**

In [53]:
S, B, C = 1, 2, 18  # Grid size, num boxes, num classes
config.S = S

out = torch.zeros(S, S, C + B * 5)

out[:, :, 18] = torch.rand(S, S) * 5 + 15
for i in range(28): #number of nodes in a cell\
    out[:, :, i] = i

out, "Size:", out.shape

(tensor([[[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
           14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27.]]]),
 'Size:',
 torch.Size([1, 1, 28]))

In [57]:
out[:, :, 18:23] = 100
out

tensor([[[  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
           11.,  12.,  13.,  14.,  15.,  16.,  17., 100., 100., 100., 100.,
          100.,  23.,  24.,  25.,  26.,  27.]]])

In [58]:
extract_bboxes(out, config)

box_indices: tensor([[[0, 1]]], device='mps:0') 


torch.Size([1, 1, 2])


In [None]:
def extract_bboxes(t: torch.Tensor, cfg: Namespace):
    """
    Flatten a single YOLO-v1 grid (S×S×(C+5B)) into (N, 9) rows sorted by pc.

    Returns
    -------
    torch.Tensor  # (S*S*B, 9) with columns:
        0  i   row index in grid
        1  j   col index in grid
        2  b   0 or 1  (which of the two boxes in the cell)
        3  cls class-index ∈[0, C-1]
        4  pc  object-confidence
        5-8    x, y, w, h (relative coords)
    """
    S, B, C = cfg.S, cfg.B, cfg.C
    device = t.device                           # robust to mixed devices
    nodes = t.shape[-1]
    assert nodes == C + 5 * B, \
        f"Expected {C + 5 * B} nodes per cell, got {nodes}"

    # --- slice
    class_probs = t[..., :C]                    # (S,S,C)
    box_raw     = t[..., C:].reshape(S, S, B, 5)  # (S,S,B,5)

    # --- metadata tensors (int64 throughout)
    i_idx, j_idx = torch.meshgrid(
        torch.arange(S, device=device),
        torch.arange(S, device=device),
        indexing='ij'
    )
    i_idx = i_idx.unsqueeze(-1).expand(-1, -1, B)    # (S,S,B)
    j_idx = j_idx.unsqueeze(-1).expand(-1, -1, B)
    b_idx = torch.arange(B, device=device).view(1, 1, B).expand(S, S, B)

    cls_idx = class_probs.argmax(-1).unsqueeze(-1).expand(-1, -1, B)  # (S,S,B)

    # --- stack [i,j,b,cls,pc,x,y,w,h]; keep ints for first 4 cols
    header = torch.stack([i_idx, j_idx, b_idx, cls_idx], dim=-1)       # (S,S,B,4)
    full   = torch.cat([header, box_raw], dim=-1)                      # (S,S,B,9)

    full = full.reshape(-1, 9)                                         # (N,9)
    order = full[:, 4].argsort(descending=True)
    return full.index_select(0, order)

In [60]:
extract_bboxes(t=out, cfg=config)


tensor([[  0.,   0.,   0.,  17., 100., 100., 100., 100., 100.],
        [  0.,   0.,   1.,  17.,  23.,  24.,  25.,  26.,  27.]])