Dataset building

In [1]:
from pathlib import Path
import cv2, os
from tqdm import tqdm
import shutil

# === 路径（按你的实际修改）===
ROOT = Path(r"D:\Courses\Csc2503\proj\archive\OUHANDS_train")
TRAIN_LIST = ROOT / r"data_split_for_intermediate_tests\training_files.txt"
VAL_LIST   = ROOT / r"data_split_for_intermediate_tests\validation_files.txt"
COLOUR_DIR = ROOT / r"train\hand_data\colour"
BBOX_DIR   = ROOT / r"train\hand_data\bounding_box"

# 输出的分类数据集（ImageFolder）
CLS_ROOT = Path(r"D:\Courses\Csc2503\proj\ouhands_cls")
(CLS_ROOT / "train").mkdir(parents=True, exist_ok=True)
(CLS_ROOT / "val").mkdir(parents=True, exist_ok=True)

CLASSES = ['A','B','C','D','E','F','H','I','J','K']
CLASS2ID = {c:i for i,c in enumerate(CLASSES)}
IMG_EXTS = {".png",".jpg",".jpeg",".bmp",".tif",".tiff"}

def read_list(fp: Path):
    return [ln.strip() for ln in fp.read_text(encoding="utf-8", errors="ignore").splitlines() if ln.strip()]

def find_img(colour_dir: Path, name: str):
    p = colour_dir / name
    if p.exists(): return p
    stem = Path(name).stem
    for e in IMG_EXTS:
        q = colour_dir / f"{stem}{e}"
        if q.exists(): return q
    return None

def read_boxes_ouhands(txt_path: Path):
    """
    OUHands bbox 行格式（首行是数量N）：
      x, w, h, y, [score]
    其中：
      xmin = x
      ymin = w
      xmax = x + h
      ymax = y + w
    返回 [(xmin,ymin,xmax,ymax), ...]
    """
    if not txt_path.exists():
        return []
    lines = [ln.strip() for ln in txt_path.read_text(encoding="utf-8", errors="ignore").splitlines() if ln.strip()]
    if len(lines) <= 1: return []
    try:
        n = int(float(lines[0].split()[0])); rows = lines[1:1+n]
    except Exception:
        rows = lines
    out = []
    for r in rows:
        parts = [p for p in r.replace("\t"," ").split() if p]
        if len(parts) < 4: continue
        x, w, h, y = map(float, parts[:4])
        xmin, ymin = x, w
        xmax, ymax = x + h, y + w
        if xmax > xmin and ymax > ymin:
            out.append((xmin, ymin, xmax, ymax))
    return out

def crop_and_save(img_path: Path, boxes, out_dir: Path, cls_letter: str, margin=0.15, make_square=True, size=224):
    im = cv2.imread(str(img_path))
    if im is None: return 0
    H, W = im.shape[:2]
    saved = 0
    # 若多手：按面积从大到小裁（也可全部保留）
    boxes = sorted(boxes, key=lambda b:(b[2]-b[0])*(b[3]-b[1]), reverse=True)
    for i,(x1,y1,x2,y2) in enumerate(boxes):
        # 加一点上下文边框
        bw = x2 - x1; bh = y2 - y1
        x1m = max(0, int(x1 - margin*bw))
        y1m = max(0, int(y1 - margin*bh))
        x2m = min(W-1, int(x2 + margin*bw))
        y2m = min(H-1, int(y2 + margin*bh))

        # 可选：裁成近似正方形（对 ResNet 更友好）
        if make_square:
            ww = x2m - x1m; hh = y2m - y1m
            if ww > hh:
                pad = (ww - hh)//2
                y1m = max(0, y1m - pad); y2m = min(H-1, y2m + pad)
            else:
                pad = (hh - ww)//2
                x1m = max(0, x1m - pad); x2m = min(W-1, x2m + pad)

        crop = im[y1m:y2m+1, x1m:x2m+1]
        if crop.size == 0: continue
        crop = cv2.resize(crop, (size,size), interpolation=cv2.INTER_AREA)
        # 目标类目录
        out_cls_dir = out_dir / cls_letter
        out_cls_dir.mkdir(parents=True, exist_ok=True)
        out_name = f"{img_path.stem}_{i}.jpg"
        cv2.imwrite(str(out_cls_dir/out_name), crop)
        saved += 1
    return saved

def build_split(split_name: str, list_file: Path):
    names = read_list(list_file)
    out_dir = CLS_ROOT / split_name
    kept = 0
    for name in tqdm(names, desc=f"Build {split_name}"):
        imgp = find_img(COLOUR_DIR, name)
        if imgp is None: continue
        cls_letter = imgp.name[0].upper()
        if cls_letter not in CLASS2ID: 
            continue
        boxes = read_boxes_ouhands(BBOX_DIR / f"{imgp.stem}.txt")
        if not boxes:
            # 没有框：也可选择整体缩放作为负例，或跳过
            continue
        kept += crop_and_save(imgp, boxes, out_dir, cls_letter, margin=0.15, make_square=True, size=224)
    print(f"{split_name}: saved crops = {kept}")

# 生成 train/val 的裁剪分类集
build_split("train", TRAIN_LIST)
build_split("val",   VAL_LIST)
print("ImageFolder root:", CLS_ROOT)


Build train: 100%|██████████| 1600/1600 [00:38<00:00, 41.06it/s]


train: saved crops = 1600


Build val: 100%|██████████| 400/400 [00:08<00:00, 44.58it/s]

val: saved crops = 400
ImageFolder root: D:\Courses\Csc2503\proj\ouhands_cls





Train

In [3]:
import torch, torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from collections import Counter

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 与 ImageNet 预训练一致的归一化
train_tf = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.9, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(0.1, 0.1, 0.1, 0.05),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],
                         std=[0.229,0.224,0.225]),
])
val_tf = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],
                         std=[0.229,0.224,0.225]),
])

train_ds = datasets.ImageFolder(root=str(CLS_ROOT/"train"), transform=train_tf)
val_ds   = datasets.ImageFolder(root=str(CLS_ROOT/"val"),   transform=val_tf)

# 类不平衡的话加权
cnt = Counter([y for _,y in train_ds.samples])
num_classes = len(train_ds.classes)
class_weights = torch.tensor([len(train_ds)/cnt[i] for i in range(num_classes)], dtype=torch.float32).to(device)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=0, pin_memory=False)
val_loader   = DataLoader(val_ds,   batch_size=128, shuffle=False, num_workers=0, pin_memory=False)

# ResNet18（可换 resnet50）
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)  # 使用类权重
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

def evaluate(model, loader):
    model.eval()
    total, correct, loss_sum = 0, 0, 0.0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits, y)
            loss_sum += loss.item() * x.size(0)
            pred = logits.argmax(1)
            correct += (pred == y).sum().item()
            total += x.size(0)
    return loss_sum/total, correct/total

best_acc = 0.0
for epoch in range(50):
    model.train()
    running = 0.0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad(set_to_none=True)
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        running += loss.item() * x.size(0)

    tr_loss = running / len(train_loader.dataset)
    va_loss, va_acc = evaluate(model, val_loader)
    scheduler.step()

    print(f"Epoch {epoch+1:02d}: train_loss={tr_loss:.4f}  val_loss={va_loss:.4f}  val_acc={va_acc*100:.2f}%")

    # 保存最好权重
    if va_acc > best_acc:
        best_acc = va_acc
        torch.save(model.state_dict(), "resnet18_ouhands_best.pt")
        print("✅ saved: resnet18_ouhands_best.pt")


Using device: cuda:0
Epoch 01: train_loss=0.4672  val_loss=0.4203  val_acc=86.25%
✅ saved: resnet18_ouhands_best.pt
Epoch 02: train_loss=0.0836  val_loss=0.2271  val_acc=93.25%
✅ saved: resnet18_ouhands_best.pt
Epoch 03: train_loss=0.0316  val_loss=0.3112  val_acc=87.50%
Epoch 04: train_loss=0.0326  val_loss=0.1836  val_acc=93.25%
Epoch 05: train_loss=0.0241  val_loss=0.1069  val_acc=97.25%
✅ saved: resnet18_ouhands_best.pt
Epoch 06: train_loss=0.0145  val_loss=0.1011  val_acc=96.50%
Epoch 07: train_loss=0.0070  val_loss=0.1042  val_acc=96.00%
Epoch 08: train_loss=0.0025  val_loss=0.1108  val_acc=95.75%
Epoch 09: train_loss=0.0020  val_loss=0.1013  val_acc=95.75%
Epoch 10: train_loss=0.0018  val_loss=0.1010  val_acc=95.75%
Epoch 11: train_loss=0.0018  val_loss=0.1098  val_acc=95.75%
Epoch 12: train_loss=0.0016  val_loss=0.1043  val_acc=95.50%
Epoch 13: train_loss=0.0016  val_loss=0.0947  val_acc=96.25%
Epoch 14: train_loss=0.0013  val_loss=0.0814  val_acc=96.25%
Epoch 15: train_loss=0.

Test

In [5]:
from pathlib import Path
import cv2
from tqdm import tqdm

# === 原始 OUHands 测试集路径 ===
TEST_COLOUR = Path(r"D:\Courses\Csc2503\proj\archive\OUHANDS_test\test\hand_data\colour")
TEST_BBOX   = Path(r"D:\Courses\Csc2503\proj\archive\OUHANDS_test\test\hand_data\bounding_box")

# === 输出的分类测试集（ImageFolder）===
CLS_TEST_ROOT = Path(r"D:\Courses\Csc2503\proj\ouhands_cls\test")
CLS_TEST_ROOT.mkdir(parents=True, exist_ok=True)

CLASSES = ['A','B','C','D','E','F','H','I','J','K']
IMG_EXTS = {".png",".jpg",".jpeg",".bmp",".tif",".tiff"}

def read_boxes_ouhands(txt_path: Path):
    if not txt_path.exists(): return []
    lines = [ln.strip() for ln in txt_path.read_text(encoding="utf-8", errors="ignore").splitlines() if ln.strip()]
    if len(lines) <= 1: return []
    try:
        n = int(float(lines[0].split()[0])); rows = lines[1:1+n]
    except Exception:
        rows = lines
    out = []
    for r in rows:
        ps = [p for p in r.replace("\t"," ").split() if p]
        if len(ps) < 4: continue
        x,w,h,y = map(float, ps[:4])
        xmin, ymin = x, w
        xmax, ymax = x + h, y + w
        if xmax > xmin and ymax > ymin:
            out.append((xmin, ymin, xmax, ymax))
    return out

def crop_and_save(img_path: Path, boxes, out_dir: Path, cls_letter: str, margin=0.15, size=224, square=True):
    im = cv2.imread(str(img_path))
    if im is None: return 0
    H,W = im.shape[:2]
    saved = 0
    # 多框都保留（也可改成只保留最大框）
    for i,(x1,y1,x2,y2) in enumerate(boxes):
        bw, bh = x2-x1, y2-y1
        x1m = max(0, int(x1 - margin*bw))
        y1m = max(0, int(y1 - margin*bh))
        x2m = min(W-1, int(x2 + margin*bw))
        y2m = min(H-1, int(y2 + margin*bh))
        if square:
            ww, hh = x2m-x1m, y2m-y1m
            if ww > hh:
                pad = (ww-hh)//2
                y1m = max(0, y1m-pad); y2m = min(H-1, y2m+pad)
            else:
                pad = (hh-ww)//2
                x1m = max(0, x1m-pad); x2m = min(W-1, x2m+pad)
        crop = im[y1m:y2m+1, x1m:x2m+1]
        if crop.size == 0: continue
        crop = cv2.resize(crop, (size,size), interpolation=cv2.INTER_AREA)
        (out_dir/cls_letter).mkdir(parents=True, exist_ok=True)
        cv2.imwrite(str((out_dir/cls_letter/f"{img_path.stem}_{i}.jpg")), crop)
        saved += 1
    return saved

count = 0
for p in tqdm(sorted(TEST_COLOUR.iterdir()), desc="Build cls test"):
    if p.suffix.lower() not in IMG_EXTS: continue
    cls_letter = p.name[0].upper()
    if cls_letter not in CLASSES: continue
    boxes = read_boxes_ouhands(TEST_BBOX / f"{p.stem}.txt")
    count += crop_and_save(p, boxes, CLS_TEST_ROOT, cls_letter)
print("Saved crops:", count)


Build cls test: 100%|██████████| 1000/1000 [00:10<00:00, 96.52it/s]

Saved crops: 1000





In [6]:
# 如果还没装：!pip -q install scikit-learn thop timm

import torch, torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from sklearn.metrics import accuracy_score, f1_score
from thop import profile

from pathlib import Path

# === 你的权重路径（你训练时保存的 .pt）===
RESNET_WEIGHTS = r"D:\Courses\Csc2503\proj\resnet18_ouhands_best.pt"  # 改成你的文件完整路径

# === 分类测试集（ImageFolder 结构）===
CLS_TEST_ROOT = Path(r"D:\Courses\Csc2503\proj\ouhands_cls\test")

# === 设备 ===
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# === Data ===
test_tf = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],
                         std=[0.229,0.224,0.225]),
])
test_ds = datasets.ImageFolder(root=str(CLS_TEST_ROOT), transform=test_tf)
test_loader = DataLoader(test_ds, batch_size=128, shuffle=False, num_workers=0, pin_memory=False)

# === Model ===
num_classes = len(test_ds.classes)  # 应为10
model = models.resnet18(weights=None)  # 结构一致
model.fc = nn.Linear(model.fc.in_features, num_classes)
sd = torch.load(RESNET_WEIGHTS, map_location="cpu")
# 兼容 DataParallel 或纯 state_dict
state_dict = sd.get("state_dict", sd)
# 去掉 DataParallel 的 'module.' 前缀
state_dict = {k.replace("module.", ""): v for k,v in state_dict.items()}
model.load_state_dict(state_dict, strict=False)
model = model.to(device)
model.eval()

# === 推理并计算 Top-1 与 Macro-F1 ===
all_pred, all_true = [], []
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device, non_blocking=True)
        logits = model(x)
        pred = logits.argmax(1).cpu().numpy().tolist()
        all_pred += pred
        all_true += y.numpy().tolist()

top1 = accuracy_score(all_true, all_pred)
macro_f1 = f1_score(all_true, all_pred, average="macro")

print(f"Top-1: {top1:.4f}")
print(f"Macro-F1: {macro_f1:.4f}")

# === Params (M) 与 FLOPs (G) ===
params_m = sum(p.numel() for p in model.parameters()) / 1e6
dummy = torch.randn(1, 3, 224, 224).to(device)
flops, params = profile(model, inputs=(dummy,), verbose=False)
flops_g = flops / 1e9

print(f"Params (M): {params_m:.3f}")
print(f"FLOPs (G): {flops_g:.2f}")

# === 一行表格输出 ===
print(f"| ResNet18 | CNN (raw image) | {top1:.4f} | {macro_f1:.4f} | {params_m:.2f} | {flops_g:.2f} |")


Using device: cuda:0


  sd = torch.load(RESNET_WEIGHTS, map_location="cpu")


Top-1: 0.9000
Macro-F1: 0.8992
Params (M): 11.182
FLOPs (G): 1.82
| ResNet18 | CNN (raw image) | 0.9000 | 0.8992 | 11.18 | 1.82 |
