In [15]:
# inference_letterbox.py
import torch, cv2, numpy as np
from utils.util import non_max_suppression
import torchvision.transforms as T
import yaml


In [16]:
# ---------- params ----------
WEIGHT = "weights/best.pt"
IMG_PATH = "dataset/VietNam_street.png"   # change if needed
DATA_YAML = "utils/args.yaml"   # Path to dataset yaml
INPUT_SIZE = (640, 640)          # height, width used by model
CONF_THR = 0.25
IOU_THR = 0.45


In [17]:
with open(DATA_YAML, "r") as f:
    data_dict = yaml.safe_load(f)

names = data_dict["names"]   # dict {0:"person",1:"bicycle",...}


In [18]:
# ---------- utils ----------
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114)):
    # img: BGR numpy (H, W, C) as loaded by cv2
    h0, w0 = img.shape[:2]
    new_h, new_w = new_shape
    r = min(new_h / h0, new_w / w0)
    new_unpad_w = int(round(w0 * r))
    new_unpad_h = int(round(h0 * r))
    # resize
    img_resized = cv2.resize(img, (new_unpad_w, new_unpad_h), interpolation=cv2.INTER_LINEAR)
    # compute padding
    dw = new_w - new_unpad_w
    dh = new_h - new_unpad_h
    top = int(round(dh / 2 - 0.1))
    bottom = int(round(dh / 2 + 0.1))
    left = int(round(dw / 2 - 0.1))
    right = int(round(dw / 2 + 0.1))
    img_padded = cv2.copyMakeBorder(img_resized, top, bottom, left, right,
                                    cv2.BORDER_CONSTANT, value=color)
    return img_padded, r, (left, top)


In [19]:
# ---------- load model ----------
ckpt = torch.load(WEIGHT, map_location="cpu")
if 'model' in ckpt:
    model = ckpt['model']
else:
    raise RuntimeError("Checkpoint does not contain 'model' key.")
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# prefer float for numeric stability
if next(model.parameters()).dtype == torch.half:
    model = model.half()
else:
    model = model.float()


  ckpt = torch.load(WEIGHT, map_location="cpu")


In [None]:
# ---------- load + preprocess with letterbox ----------
# read with cv2 (BGR), convert to RGB only where needed
img_bgr = cv2.imread(IMG_PATH) #np.array (H, W, 3)
if img_bgr is None:
    raise FileNotFoundError(f"Image not found: {IMG_PATH}")
orig_h, orig_w = img_bgr.shape[:2] #np.array (H, W)

img_pad, gain, (pad_w, pad_h) = letterbox(img_bgr, new_shape=INPUT_SIZE) 
'''
Gọi hàm letterbox để resize ảnh về kích thước chuẩn của YOLO (INPUT_SIZE, ví dụ 640×640).

Letterbox = resize ảnh nhưng vẫn giữ tỉ lệ khung hình (aspect ratio) → phần thừa sẽ được padding màu đen.
Trả về:

img_pad: ảnh sau khi resize + pad.

gain: hệ số scale (ảnh gốc → ảnh mới).

(pad_w, pad_h): độ pad thêm ở 2 chiều.

Thông tin gain, pad_w, pad_h được dùng sau này để chuyển ngược bbox từ ảnh YOLO về ảnh gốc.
'''
# convert to RGB floating tensor [1,3,H,W] in 0..1
img_rgb = cv2.cvtColor(img_pad, cv2.COLOR_BGR2RGB) #Chuyển từ BGR → RGB (YOLO và PyTorch thường chuẩn hóa input thành RGB).
img_tensor = torch.from_numpy(img_rgb).permute(2, 0, 1).unsqueeze(0).float() / 255.0
'''
torch.from_numpy(img_rgb): chuyển ảnh NumPy → Tensor PyTorch.

.permute(2, 0, 1): đổi trục từ (H, W, C) → (C, H, W) (PyTorch format).

.unsqueeze(0): thêm batch dimension → [1, C, H, W].

.float() / 255.0: đổi từ uint8 (0–255) sang float32 (0–1) để mạng dễ học
'''
img_tensor = img_tensor.to(device)
if next(model.parameters()).dtype == torch.half:
    img_tensor = img_tensor.half()
'''
    Nếu mô hình đang ở dạng half precision (FP16) → convert input sang .half() để đồng bộ.

Dùng khi inference trên GPU để tăng tốc và tiết kiệm bộ nhớ.
'''


In [21]:
# ---------- forward ----------
with torch.no_grad():
    out = model(img_tensor)
    if isinstance(out, (list, tuple)):
        out = out[0]   # repo sometimes returns (pred, loss) or similar

# ensure out shape is [B, C, anchors] as util.non_max_suppression expects
print("raw output shape:", out.shape)

# ---------- NMS (repo util) ----------
# non_max_suppression expects outputs as-is (no permute)
dets = non_max_suppression(out, confidence_threshold=CONF_THR, iou_threshold=IOU_THR)[0]

if dets is None or len(dets) == 0:
    print("No detections found.")
else:
    # debug before scaling
    d = dets.detach().cpu().clone()
    print("Before scale: x_min, y_min, x_max, y_max ranges:",
          d[:, 0].min().item(), d[:, 1].min().item(), d[:, 2].max().item(), d[:, 3].max().item())
    print("Sample raw detection (first 5):")
    for i in range(min(5, d.shape[0])):
        print(i, d[i].numpy())

    # ---------- map boxes from padded input -> original image ----------
    # dets format: [x1, y1, x2, y2, conf, cls]
    dets = dets.detach().cpu()
    # remove padding
    dets[:, [0, 2]] -= pad_w
    dets[:, [1, 3]] -= pad_h
    # divide by gain (scale)
    dets[:, :4] /= gain
    # clip to image size
    dets[:, [0, 2]] = dets[:, [0, 2]].clamp(0, orig_w)
    dets[:, [1, 3]] = dets[:, [1, 3]].clamp(0, orig_h)

    # debug after scaling
    print("After scale (to original image): x_min,x_max,y_min,y_max ranges:",
          dets[:,0].min().item(), dets[:,2].max().item(), dets[:,1].min().item(), dets[:,3].max().item())

    # ---------- draw ----------
    img_out = img_bgr.copy()  # BGR
    
        # BGR colors
    CLASS_COLORS = {
        "person": (0, 0, 255),        # red
        "motorcycle": (0, 255, 255),  # yellow
        "car": (255, 255, 255),       # white
    }

    default_color = (0, 125, 0)  # green for all other classes
    
    for *xyxy, conf, cls in dets:
        x1, y1, x2, y2 = map(int, xyxy)
        cls_id = int(cls.item())
        cls_name = names.get(cls_id, str(cls_id))  # fallback to id if not found
        label = f"{cls_name} {float(conf):.2f}"
        
        cls_id = int(cls.item())
        cls_name = names.get(cls_id, str(cls_id))  # get class name from YAML

        border_color = CLASS_COLORS.get(cls_name, default_color)

        cv2.rectangle(img_out, (x1, y1), (x2, y2), border_color, 2)
        cv2.putText(img_out, f"{cls_name} {conf:.2f}", (x1, max(y1-6,0)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, border_color, 2)


    cv2.imwrite("inference_result.jpg", img_out)
    print("Saved inference_result.jpg")


raw output shape: torch.Size([1, 84, 8400])
Before scale: x_min, y_min, x_max, y_max ranges: 35.4375 153.25 640.5 529.5
Sample raw detection (first 5):
0 [365.75      313.        456.25      498.          0.8540039   0.       ]
1 [303.5        366.         541.         507.           0.85058594
   3.        ]
2 [175.5     311.25    317.75    410.25      0.84375   2.     ]
3 [323.5       301.5       345.5       364.5         0.7441406   0.       ]
4 [571.        284.5       640.        441.          0.7241211   0.       ]
After scale (to original image): x_min,x_max,y_min,y_max ranges: 79.734375 1440.0 106.3125 952.875
Saved inference_result.jpg
