
## 1) Environment Setup
Installs the minimal dependencies if they are missing. Training uses AMP, and all artifacts are saved under `/kaggle/working/`.


In [None]:
!pip install -qqq timm onnx onnxruntime transformers datasets huggingface_hub

In [None]:
from datetime import datetime, timedelta
import sys, subprocess
import os, json, math, time, glob, hashlib
from transformers import MobileViTForImageClassification
from pathlib import Path
import math, random, gc, cv2
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from dataclasses import dataclass
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import time
import timm
from timm.data import Mixup
from timm.loss import SoftTargetCrossEntropy
from collections import Counter
import onnx, onnxruntime as ort
# Root folder for everything this notebook writes.
BASE_DIR = '/kaggle/working/'
BASE = Path(BASE_DIR)

# Artifact layout beneath BASE.
FRAMES_DIR = BASE / 'frames'             # extracted video frames
MODELS_DIR = BASE / 'models'
WEIGHTS_DIR = MODELS_DIR / 'weights'     # checkpoints and the ONNX export
REPORTS_DIR = BASE / 'reports'
INDEX_JSON = BASE / 'video_index.json'   # per-split video index

# Create the output folders up front (WEIGHTS_DIR implies MODELS_DIR via parents=True).
for artifact_dir in (FRAMES_DIR, WEIGHTS_DIR, REPORTS_DIR):
    artifact_dir.mkdir(parents=True, exist_ok=True)

# Class name -> integer class id used as the training target.
LABELS = dict(Violence=0, NonViolence=1, PseudoViolence=2)

# Timestamp of this cell's execution, shifted to UTC+7.
print(datetime.utcnow() + timedelta(hours=7))


## 2) Configuration
- Input size: **256×256** (MobileViT-S default; use **224×224** when switching to ViT-Tiny)
- Frames per second extracted: **20**
- Epochs: **10**
- Mixed precision: **True**
- Temporal smoothing: **EMA** over per-frame probabilities
- Anti-overfitting: **weight decay**, **dropout**, **label smoothing**, **strong augmentations**, **early stopping**, **gradient clipping**


In [None]:
@dataclass
class Config:
    """Hyper-parameters and run-time switches shared by every cell of the notebook."""
    # Model choice: "apple/mobilevit-small" (Apple MobileViT-S) or
    # "vit_tiny_patch16_224.augreg_in21k_ft_in1k" (ViT-Tiny)
    MODEL_NAME: str = "apple/mobilevit-small"
    input_size: int = 256             # 256 for MobileViT-S; change to 224 when using ViT-Tiny
    # Extraction
    frames_per_second: int = 20       # keep at 20 FPS
    num_classes: int = 3
    # Train
    epochs: int = 10
    batch_size: int = 32
    lr: float = 1e-4
    weight_decay: float = 0.05
    label_smoothing: float = 0.1
    num_workers: int = 4
    amp: bool = True
    early_stopping_patience: int = 5
    grad_clip_norm: float = 1.0
    seed: int = 42
    # Inference
    # Weight of the newest frame in the EMA over per-frame probabilities
    # (read by ProbEMA in the real-time inference cell; it was missing before).
    ema_alpha: float = 0.3

CFG = Config()
print(CFG)
print(datetime.utcnow() + timedelta(hours=7))


## 3) Dataset Indexing & Label Mapping
Parses all three datasets and builds an index of videos with labels and splits. AIRTLab maps **both** `violent/` and `non-violent/` to `PseudoViolence` by request.


In [None]:
random.seed(CFG.seed)

# Label mapping
LABELS = {
    'Violence': 0,
    'NonViolence': 1,
    'PseudoViolence': 2,
}

# Paths (Kaggle inputs)
PATH_HOCKEY = '/kaggle/input/hockey-fight-vidoes/data'
PATH_RWF = '/kaggle/input/rwf2000/RWF-2000'
PATH_AIRTLAB = '/kaggle/input/airtlab/violence-detection-dataset'

# ==== SPLIT RATIOS ====
TRAIN_RATIO = 0.85
VAL_RATIO   = 0.13
TEST_RATIO  = 0.02   # informational: the test split receives whatever remains after train/val


def _list_videos(folder, extensions):
    """Return the name-sorted files in `folder` whose lower-cased name ends with one of `extensions`.

    A missing folder yields an empty list and a warning, so one absent dataset
    does not abort indexing but is still visible in the cell output.
    """
    if not os.path.isdir(folder):
        print(f"[WARN] Dataset folder not found, skipping: {folder}")
        return []
    return [fn for fn in sorted(os.listdir(folder)) if fn.lower().endswith(extensions)]


def _shuffle_split(files, seed):
    """Shuffle `files` in place with `seed`, then cut it into (train, val, test) lists.

    Train and val sizes are the truncated TRAIN_RATIO / VAL_RATIO shares; the
    test part is the remainder, so the three parts always cover every file.
    """
    random.seed(seed)
    random.shuffle(files)
    cut_train = int(len(files) * TRAIN_RATIO)
    cut_val = cut_train + int(len(files) * VAL_RATIO)
    return files[:cut_train], files[cut_train:cut_val], files[cut_val:]


# ==== INITIALISE INDEX ====
index = {"train": [], "val": [], "test": []}

# ==== 1) RWF-2000: use the dataset's own train/val split ====
for split in ['train', 'val']:
    for cls in ['Fight', 'NonFight']:
        folder = os.path.join(PATH_RWF, split, cls)
        label = 'Violence' if cls == 'Fight' else 'NonViolence'
        index[split] += [
            {'path': os.path.join(folder, fn), 'label': label, 'source': 'RWF-2000'}
            for fn in _list_videos(folder, ('.avi',))
        ]

# ==== 2) Hockey: create an 85/13/2 split ====
h_files = []
for fn in _list_videos(PATH_HOCKEY, ('.avi',)):
    name = fn.lower()
    # The label is read from the file-name prefix ("fi..." = fight, "no..." = no fight).
    # A prefix test avoids mislabelling names that merely contain "fi" or "no" somewhere.
    # NOTE(review): assumes the dataset's fi*/no* naming scheme -- confirm against the input folder.
    if name.startswith('fi'):
        lbl = 'Violence'
    elif name.startswith('no'):
        lbl = 'NonViolence'
    else:
        continue
    h_files.append({'path': os.path.join(PATH_HOCKEY, fn), 'label': lbl, 'source': 'Hockey'})

h_train, h_val, h_test = _shuffle_split(h_files, CFG.seed)
n, n_train, n_val, n_test = len(h_files), len(h_train), len(h_val), len(h_test)

index['train'] += h_train
index['val']   += h_val
index['test']  += h_test

print(f"[Hockey] total={n} -> train={n_train}, val={n_val}, test={n_test}")

# ==== 3) AIRTLab: map violent & non-violent => PseudoViolence and split 85/13/2 ====
a_files = []
for sub in ['non-violent', 'violent']:
    for cam in ['cam1', 'cam2']:
        camdir = os.path.join(PATH_AIRTLAB, sub, cam)
        a_files += [
            {'path': os.path.join(camdir, fn), 'label': 'PseudoViolence', 'source': 'AIRTLab'}
            for fn in _list_videos(camdir, ('.mp4', '.avi', '.mov'))
        ]

# Same configurable seed as the Hockey split (previously a hard-coded 42).
a_train, a_val, a_test = _shuffle_split(a_files, CFG.seed)
na, na_train, na_val, na_test = len(a_files), len(a_train), len(a_val), len(a_test)

index['train'] += a_train
index['val']   += a_val
index['test']  += a_test

print(f"[AIRTLab] total={na} -> train={na_train}, val={na_val}, test={na_test}")

# ==== 4) Final statistics & save ====
print({
    "train": len(index["train"]),
    "val":   len(index["val"]),
    "test":  len(index["test"]),
})

with open(INDEX_JSON, 'w') as f:
    json.dump(index, f, indent=2)

print(f"Saved index to {INDEX_JSON}")
print(datetime.utcnow() + timedelta(hours=7))


## 4) Frame Extraction (20 FPS)
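For each video, we extract **20 frames per second**. Robust to varying FPS: selects evenly-spaced frames using timestamps. Each kept frame is letterboxed to the model input size and saved together with a horizontally flipped copy.
Frames are saved to `BASE_DIR/frames/<split>/<label>/<video_id>/frame_XXXXXX.jpg` (flipped copy: `frame_XXXXXX_flip.jpg`), where `<video_id>` is a 12-character MD5 hash of the video path.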


In [None]:
def vid_hash(path):
    """Return the first 12 hex characters of the MD5 digest of `path` (a str).

    The value is used as the per-video folder name when frames are extracted.
    """
    digest = hashlib.md5(path.encode()).hexdigest()
    return digest[:12]

def letterbox_resize_bgr(frame_bgr, target_w, target_h, pad_color=(0,0,0)):
    """Resize a frame to fit inside target_w x target_h, keeping aspect ratio, and pad the rest.

    Args:
        frame_bgr: image array whose first two dimensions are (height, width).
        target_w, target_h: size of the returned canvas in pixels.
        pad_color: constant border colour passed to cv2.copyMakeBorder.

    Returns:
        The resized frame centred on a target_h x target_w canvas.
    """
    h, w = frame_bgr.shape[:2]
    scale = min(target_w / w, target_h / h)
    # Round instead of truncating so float error cannot shave a pixel off a side that
    # should fill the canvas, and clamp to [1, target] so extreme aspect ratios never
    # ask cv2.resize for a zero-sized image.
    nw = min(target_w, max(1, int(round(w * scale))))
    nh = min(target_h, max(1, int(round(h * scale))))
    # INTER_AREA when shrinking, INTER_LINEAR when enlarging.
    interp = cv2.INTER_AREA if scale < 1.0 else cv2.INTER_LINEAR
    resized = cv2.resize(frame_bgr, (nw, nh), interpolation=interp)
    # Split the leftover space evenly; any odd pixel goes to the bottom/right border.
    pad_x, pad_y = target_w - nw, target_h - nh
    left, top = pad_x // 2, pad_y // 2
    return cv2.copyMakeBorder(resized, top, pad_y - top, left, pad_x - left,
                              cv2.BORDER_CONSTANT, value=pad_color)

def extract_resized(video_path, out_dir, target_size, jpeg_quality, overwrite, target_fps):
    """Sample a video at `target_fps`, letterbox each sampled frame and save it plus a mirrored copy.

    Frames are chosen by timestamp (frame index / source FPS): a frame is kept whenever
    its timestamp reaches the next sampling instant.

    Args:
        video_path: path of the video to read.
        out_dir: folder receiving `frame_NNNNNN.jpg` and `frame_NNNNNN_flip.jpg`.
        target_size: (width, height) of the saved frames.
        jpeg_quality: JPEG quality passed to cv2.imwrite.
        overwrite: when False, files that already exist are left untouched.
        target_fps: number of frames to sample per second of video.

    Returns:
        (n_orig, n_flip): number of sampled frames and of flipped copies. Both are
        equal, and are (0, 0) when the video cannot be opened, so callers can always
        unpack two values.

    Raises:
        ValueError: if `target_fps` is not positive.
    """
    if target_fps <= 0:
        raise ValueError(f"target_fps must be positive, got {target_fps}")

    out_dir = Path(out_dir)  # accept str as well as Path; the `/` operator is used below
    out_dir.mkdir(parents=True, exist_ok=True)
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        cap.release()
        tqdm.write(f"[WARN] Cannot open: {video_path}")
        return 0, 0

    jpeg_params = [cv2.IMWRITE_JPEG_QUALITY, int(jpeg_quality)]
    step_t = 1.0 / float(target_fps)
    next_t = 0.0
    idx = 0
    saved = 0

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not (fps and fps > 0):  # also false for NaN
            fps = 30.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A reported frame count of 0 means "unknown". Treating it as a zero duration
        # would stop reading after about one second, so the guard below is skipped then.
        duration = total_frames / fps if total_frames > 0 else None

        while True:
            ok, frame = cap.read()
            if not ok:
                break
            t = idx / fps
            if t + 1e-6 >= next_t:
                canvas = letterbox_resize_bgr(frame, target_size[0], target_size[1], pad_color=(0,0,0))

                fn_orig = out_dir / f"frame_{saved:06d}.jpg"
                if overwrite or (not fn_orig.exists()):
                    cv2.imwrite(str(fn_orig), canvas, jpeg_params)

                # Horizontally mirrored copy (flip code 1) stored next to the original.
                fn_flip = out_dir / f"frame_{saved:06d}_flip.jpg"
                if overwrite or (not fn_flip.exists()):
                    cv2.imwrite(str(fn_flip), cv2.flip(canvas, 1), jpeg_params)

                saved += 1
                next_t += step_t
            idx += 1
            # Safety stop for streams that keep returning frames past the reported length.
            if duration is not None and t > duration + 1.0:
                break
    finally:
        cap.release()  # release the capture even if resizing or writing raised

    return saved, saved

# Read the video index back from disk.
with open(INDEX_JSON, 'r') as f:
    index = json.load(f)

# Frames are stored square, at the model input resolution.
frame_size = (CFG.input_size, CFG.input_size)

for split, items in index.items():
    print(f"Split: {split}, videos: {len(items)}")
    tqdm.write(str(datetime.utcnow() + timedelta(hours=7)))
    for it in tqdm(items, desc=f"{split} videos", leave=False):
        # Output layout: <FRAMES_DIR>/<split>/<label>/<hash of video path>/
        out_dir = Path(FRAMES_DIR).joinpath(split, it['label'], vid_hash(it['path']))
        n_orig, n_flip = extract_resized(
            it['path'],
            out_dir,
            target_size=frame_size,
            jpeg_quality=85,
            overwrite=False,   # keep frames that are already on disk
            target_fps=CFG.frames_per_second,
        )

print("Frame extraction done.")
print(datetime.utcnow() + timedelta(hours=7))


## 5) PyTorch Dataset & Augmentations
Loads extracted frames with strong real-time-friendly augmentations and returns tensors sized `CFG.input_size`×`CFG.input_size` (256×256 by default).


In [None]:
# Shared pieces of both pipelines: square resize target and channel-wise normalisation.
_square_size = (CFG.input_size, CFG.input_size)
_to_normalized_tensor = [
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]

# Training: resize, random crop/zoom, colour jitter and random mirroring, then tensor + normalise.
train_tfms = T.Compose(
    [
        T.Resize(_square_size),
        T.RandomResizedCrop(CFG.input_size, scale=(0.7, 1.0), ratio=(0.8, 1.25)),
        T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),
        T.RandomHorizontalFlip(p=0.5),
    ]
    + _to_normalized_tensor
)

# Validation: deterministic resize only, then tensor + normalise.
val_tfms = T.Compose([T.Resize(_square_size)] + _to_normalized_tensor)

class FrameFolderDataset(Dataset):
    """Per-frame dataset over `FRAMES_DIR/<split>/<label>/<video folder>/*.jpg`.

    Every JPEG becomes one sample `(image, class_id)`, with `class_id` taken from
    LABELS. Label folders that do not exist are skipped.
    """

    def __init__(self, split, transforms=None):
        self.transforms = transforms
        self.items = []
        split_root = os.path.join(FRAMES_DIR, split)
        for label_name, class_id in LABELS.items():
            label_dir = os.path.join(split_root, label_name)
            if not os.path.isdir(label_dir):
                continue
            for video_name in sorted(os.listdir(label_dir)):
                video_dir = os.path.join(label_dir, video_name)
                if not os.path.isdir(video_dir):
                    continue
                jpg_paths = sorted(glob.glob(os.path.join(video_dir, '*.jpg')))
                self.items.extend((jpg, class_id) for jpg in jpg_paths)
        print(f"Loaded {len(self.items)} frames in split={split}")

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        """Load sample `idx` as an RGB image, apply the transforms (if any), return (image, class_id)."""
        frame_path, class_id = self.items[idx]
        image = Image.open(frame_path).convert('RGB')
        if self.transforms:
            image = self.transforms(image)
        return image, class_id

def _make_loader(dataset, shuffle):
    """Wrap `dataset` in a DataLoader using the batch size and worker count from CFG."""
    return DataLoader(dataset, batch_size=CFG.batch_size, shuffle=shuffle,
                      num_workers=CFG.num_workers, pin_memory=True)

train_loader = _make_loader(FrameFolderDataset('train', transforms=train_tfms), shuffle=True)
val_loader   = _make_loader(FrameFolderDataset('val',   transforms=val_tfms),   shuffle=False)

# Evaluate on the held-out test split rather than on validation data, which already
# drives checkpoint selection and early stopping. Fall back to validation only when
# no test frames were extracted.
test_dataset = FrameFolderDataset('test', transforms=val_tfms)
if len(test_dataset) > 0:
    test_loader = _make_loader(test_dataset, shuffle=False)
else:
    print("[WARN] Test split is empty; falling back to the validation loader for evaluation.")
    test_loader = val_loader
print(datetime.utcnow() + timedelta(hours=7))

## 6) Model: MobileViT-S (Hugging Face) with 3-way classifier
- Lightweight backbone (`apple/mobilevit-small` by default; a timm builder for ViT-Tiny, `vit_tiny_patch16_224`, is also defined) for speed.
- Dropout, label smoothing, cosine LR with warmup.
- AMP training + gradient clipping.


In [None]:
def build_model(model_name, num_classes):
    """Create a pretrained timm model whose classifier outputs `num_classes` logits."""
    return timm.create_model(model_name, pretrained=True, num_classes=num_classes)

def build_model_hf_mobilevit(model_name, num_classes):
    """Load a pretrained Hugging Face MobileViT classifier configured for `num_classes` labels.

    `ignore_mismatched_sizes=True` is passed so that loading does not fail when the
    checkpoint's classification head has a different number of outputs.
    """
    load_options = dict(num_labels=num_classes, ignore_mismatched_sizes=True)
    return MobileViTForImageClassification.from_pretrained(model_name, **load_options)

# Choose the builder from the model id, so changing CFG.MODEL_NAME is enough to switch
# backbones: Hugging Face MobileViT ids look like "<org>/mobilevit-*"; every other
# name is treated as a timm model.
if "/" in CFG.MODEL_NAME and "mobilevit" in CFG.MODEL_NAME.lower():
    model = build_model_hf_mobilevit(CFG.MODEL_NAME, CFG.num_classes)
else:
    model = build_model(CFG.MODEL_NAME, CFG.num_classes)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Cross-entropy with label smoothing, optimised by AdamW with weight decay.
criterion = nn.CrossEntropyLoss(label_smoothing=CFG.label_smoothing)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CFG.lr,
    weight_decay=CFG.weight_decay,
)

# Schedule length in optimizer steps; the first 10% of them are warmup.
steps_per_epoch = max(1, len(train_loader))
total_steps = steps_per_epoch * CFG.epochs
warmup_steps = int(0.1 * total_steps)

class CosineWithWarmup(torch.optim.lr_scheduler._LRScheduler):
    """Linear warmup followed by cosine decay to zero; meant to be stepped once per optimizer step.

    Args:
        optimizer: wrapped optimizer; its initial learning rates are the peak values.
        warmup_steps: number of steps over which the LR ramps up linearly.
        total_steps: step at which the cosine decay reaches zero.
        last_epoch: forwarded to the base scheduler (-1 starts a fresh schedule).
    """

    def __init__(self, optimizer, warmup_steps, total_steps, last_epoch=-1):
        # Set before super().__init__, which already calls get_lr() once.
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Steps are counted from 1 so the very first update does not use a zero LR.
        step = self.last_epoch + 1
        lrs = []
        for base_lr in self.base_lrs:
            if step < self.warmup_steps:
                lr = base_lr * step / max(1, self.warmup_steps)
            else:
                progress = (step - self.warmup_steps) / max(1, self.total_steps - self.warmup_steps)
                # Clamp so that stepping past total_steps keeps the LR at zero instead of
                # letting the cosine swing back up.
                progress = min(1.0, progress)
                lr = base_lr * 0.5 * (1 + math.cos(math.pi * progress))
            lrs.append(lr)
        return lrs

# Gradient scaler for mixed-precision training, switched by CFG.amp.
scaler = torch.cuda.amp.GradScaler(enabled=CFG.amp)
# Per-step learning-rate schedule: warmup, then cosine decay.
scheduler = CosineWithWarmup(
    optimizer,
    warmup_steps=warmup_steps,
    total_steps=total_steps,
)
print(datetime.utcnow() + timedelta(hours=7))


## 7) Training Loop (AMP) + Logging
Saves:
- `models/weights/<model_name>_best.pth` (for the default model: `apple__mobilevit-small_best.pth`)
- `training_history.pkl`
- `loss.png`, `accuracy.png`


In [None]:
# Early-stopping bookkeeping: best validation accuracy so far, number of
# non-improving epochs tolerated, and the current count of such epochs.
best_val_acc = 0.0
patience = CFG.early_stopping_patience
no_improve = 0

# One list of per-epoch values for each tracked metric.
history = {metric: [] for metric in ("train_loss", "val_loss", "train_acc", "val_acc")}

def get_logits(m, x):
    """Call `m(x)` and return its `.logits` attribute when present, else the raw output.

    This lets the same code drive models that return an output object (transformers)
    and models that return the logits directly (timm).
    """
    output = m(x)
    return getattr(output, "logits", output)
def safe_name(name: str) -> str:
    """Make a model name usable as a file name.

    Forward and backward slashes become "__", colons become "-" and spaces become "_".
    """
    substitutions = str.maketrans({'/': '__', '\\': '__', ':': '-', ' ': '_'})
    return name.translate(substitutions)

for epoch in tqdm(range(1, CFG.epochs+1), desc="Training epochs"):
    # ===== Train =====
    model.train()
    train_losses, train_preds, train_targets = [], [], []

    for imgs, targets in tqdm(train_loader, desc=f"Epoch {epoch}/{CFG.epochs} [train]", leave=False):
        imgs = imgs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True).long()  # same cast as the validation loop
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=CFG.amp):
            logits = get_logits(model, imgs)
            loss = criterion(logits, targets)
        scaler.scale(loss).backward()
        # Gradients still carry the loss scale here. Unscale them first so that
        # CFG.grad_clip_norm is compared against the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip_norm)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # the schedule is defined per optimizer step, not per epoch

        train_losses.append(loss.item())
        train_preds.extend(torch.argmax(logits, dim=1).detach().cpu().numpy().tolist())
        train_targets.extend(targets.detach().cpu().numpy().tolist())
    tqdm.write(str(datetime.utcnow() + timedelta(hours=7)))
    train_acc = accuracy_score(train_targets, train_preds)
    history['train_loss'].append(float(np.mean(train_losses)))
    history['train_acc'].append(float(train_acc))

    # ===== Val =====
    model.eval()
    val_losses, val_preds, val_targets = [], [], []
    with torch.no_grad():
        for imgs, targets in tqdm(val_loader, desc=f"Epoch {epoch} [val]", leave=False):
            imgs = imgs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True).long()
            with torch.cuda.amp.autocast(enabled=CFG.amp):
                logits = get_logits(model, imgs)
                loss = criterion(logits, targets)
            val_losses.append(loss.item())
            val_preds.extend(torch.argmax(logits, dim=1).detach().cpu().numpy().tolist())
            val_targets.extend(targets.detach().cpu().numpy().tolist())

    val_acc = accuracy_score(val_targets, val_preds)
    history['val_loss'].append(float(np.mean(val_losses)))
    history['val_acc'].append(float(val_acc))

    print(f"Epoch {epoch}/{CFG.epochs} | Train Loss {history['train_loss'][-1]:.4f} Acc {train_acc:.4f} | Val Loss {history['val_loss'][-1]:.4f} Acc {val_acc:.4f}")
    print(datetime.utcnow() + timedelta(hours=7))

    # Early stopping + save best: keep the checkpoint with the highest validation accuracy
    # and stop after `patience` consecutive epochs without improvement.
    if val_acc > best_val_acc:
        best_val_acc, no_improve = val_acc, 0
        best_fname = f"{safe_name(CFG.MODEL_NAME)}_best.pth"
        best_path = WEIGHTS_DIR / best_fname
        torch.save(model.state_dict(), str(best_path))
    else:
        no_improve += 1
        if no_improve >= patience:
            print('Early stopping triggered.')
            break

# Persist the raw per-epoch metrics.
with open(os.path.join(BASE_DIR, 'training_history.pkl'), 'wb') as f:
    pickle.dump(history, f)


def _plot_history_curve(metric, ylabel, filename):
    """Plot the train/val curves of `metric` from `history`, save to BASE_DIR/filename and show."""
    plt.figure(figsize=(8,4))
    for split_name in ('train', 'val'):
        series = f"{split_name}_{metric}"
        plt.plot(history[series], label=series)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(BASE_DIR, filename))
    plt.show()


_plot_history_curve('loss', 'Loss', 'loss.png')
_plot_history_curve('acc', 'Accuracy', 'accuracy.png')

print("Training finished.")
print(datetime.utcnow() + timedelta(hours=7))


## 8) Evaluation & Reports
Generates `classification_report.txt` and confusion matrix plot.


In [None]:
print(datetime.utcnow() + timedelta(hours=7))

# Restore the best checkpoint written by the training loop.
best_w = WEIGHTS_DIR / f"{safe_name(CFG.MODEL_NAME)}_best.pth"
if not best_w.exists():
    raise FileNotFoundError(f"Best weights not found: {best_w}")
model.load_state_dict(torch.load(str(best_w), map_location=device))
model.eval()

class_names = list(LABELS.keys())
class_ids = list(LABELS.values())

all_preds, all_targets = [], []
with torch.no_grad():
    for imgs, targets in tqdm(test_loader, desc="Evaluate [test]"):
        imgs = imgs.to(device)
        with torch.cuda.amp.autocast(enabled=CFG.amp):
            # get_logits unwraps output objects (transformers) as well as plain tensors (timm);
            # calling model(imgs) directly breaks argmax for the Hugging Face model.
            logits = get_logits(model, imgs)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds.tolist())
        all_targets.extend(targets.numpy().tolist())

# Passing the label ids explicitly keeps the report and the matrix at one row per class
# even when a class is absent from this split (otherwise the names and the matrix
# indices would no longer line up).
rep = classification_report(all_targets, all_preds, labels=class_ids,
                            target_names=class_names, digits=4, zero_division=0)
print(rep)
with open(os.path.join(BASE_DIR, 'classification_report.txt'), 'w') as f:
    f.write(rep)

cm = confusion_matrix(all_targets, all_preds, labels=class_ids)
plt.figure(figsize=(5,4))
plt.imshow(cm, cmap='Blues')
plt.title('Confusion Matrix'); plt.colorbar()
plt.xticks(range(len(class_names)), class_names, rotation=45)
plt.yticks(range(len(class_names)), class_names)
for i in range(len(class_names)):
    for j in range(len(class_names)):
        plt.text(j, i, cm[i, j], ha='center', va='center', color='black')
plt.tight_layout(); plt.savefig(os.path.join(BASE_DIR, 'confusion_matrix.png')); plt.show()

print("Evaluation finished.")
print(datetime.utcnow() + timedelta(hours=7))


## 9) Export to ONNX & Quick CPU Inference (ONNX Runtime)
Exports the trained classifier and runs a quick inference timing test on a batch.


In [None]:
print(datetime.utcnow() + timedelta(hours=7))


class _LogitsOnly(nn.Module):
    """Wrap a classifier so that forward() returns only the logits tensor.

    The exported graph then has a single tensor output, whether the wrapped model
    returns an output object (transformers) or a plain tensor (timm).
    """

    def __init__(self, net):
        super().__init__()
        self.net = net

    def forward(self, x):
        return get_logits(self.net, x)


onnx_path = WEIGHTS_DIR / f"{safe_name(CFG.MODEL_NAME)}_best.onnx"
dummy = torch.randn(1, 3, CFG.input_size, CFG.input_size, device=device)
model.eval()
export_model = _LogitsOnly(model).eval()

torch.onnx.export(
    export_model, dummy, str(onnx_path),
    input_names=['input'], output_names=['logits'],
    opset_version=17, do_constant_folding=True,
    dynamic_axes={'input': {0: 'batch'}, 'logits': {0: 'batch'}}  # variable batch size
)
print(f"Saved ONNX to {onnx_path}")

# Plain string paths are accepted by every onnx / onnxruntime version.
onnx_model = onnx.load(str(onnx_path))
onnx.checker.check_model(onnx_model)

sess = ort.InferenceSession(str(onnx_path), providers=['CPUExecutionProvider'])
input_name = sess.get_inputs()[0].name
batch = np.random.randn(32, 3, CFG.input_size, CFG.input_size).astype(np.float32)
# One untimed warm-up run keeps one-off session start-up cost out of the measurement.
sess.run(None, {input_name: batch})
start = time.perf_counter(); out = sess.run(None, {input_name: batch})[0]
lat = (time.perf_counter() - start) * 1000.0
print(f"ONNXRuntime batch=32 latency: {lat:.2f} ms")

print(datetime.utcnow() + timedelta(hours=7))


## 10) Real-time Inference Pipeline (OpenCV)
Reads frames from camera/video, runs model, and applies **EMA smoothing** over per-frame probabilities to stabilize outputs.


In [None]:
# Deterministic single-frame preprocessing: square resize, tensor conversion, normalisation.
_infer_steps = [
    T.Resize((CFG.input_size, CFG.input_size)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
infer_tfms = T.Compose(_infer_steps)

@torch.no_grad()
def infer_frame_bgr(bgr):
    """Classify one BGR frame and return its class probabilities as a 1-D NumPy array.

    The frame is converted to RGB, preprocessed with `infer_tfms`, and run through the
    global `model` on `device`.
    """
    img = Image.fromarray(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    ten = infer_tfms(img).unsqueeze(0).to(device)
    with torch.cuda.amp.autocast(enabled=CFG.amp):
        # get_logits unwraps output objects (transformers); softmax on the raw output would fail.
        logits = get_logits(model, ten)
    # Softmax in float32 so the probabilities are not computed in reduced precision under autocast.
    probs = torch.softmax(logits.float(), dim=1)[0].cpu().numpy()
    return probs

# EMA smoothing over time
class ProbEMA:
    """Exponential moving average over per-frame class probabilities.

    Args:
        alpha: weight of the newest observation, in (0, 1]. When None, `CFG.ema_alpha`
            is used if the config defines it, otherwise 0.3.
        num_classes: length of the probability vector. When None, `CFG.num_classes`.

    The defaults are resolved at construction time, so defining the class no longer
    fails when the config has no `ema_alpha` field.
    """

    def __init__(self, alpha=None, num_classes=None):
        if alpha is None:
            alpha = getattr(CFG, "ema_alpha", 0.3)
        if num_classes is None:
            num_classes = CFG.num_classes
        if not 0.0 < alpha <= 1.0:
            raise ValueError(f"alpha must be in (0, 1], got {alpha}")
        self.alpha = float(alpha)
        self.state = np.zeros(num_classes, dtype=np.float32)
        self._initialized = False

    def update(self, probs):
        """Blend `probs` into the running average and return the smoothed vector."""
        probs = np.asarray(probs, dtype=np.float32)
        if not self._initialized:
            # Seed with the first observation. Starting from zeros would make the early
            # outputs sum to less than 1 instead of being a probability distribution.
            self.state = probs.copy()
            self._initialized = True
        else:
            self.state = self.alpha * probs + (1 - self.alpha) * self.state
        return self.state

# Demo with a sample video (replace with 0 for webcam)
source = 0  # webcam; or provide a path to a video file
cap = cv2.VideoCapture(source)
ema = ProbEMA()
class_names = list(LABELS.keys())
model.eval()  # inference mode for per-frame prediction

if not cap.isOpened():
    print(f"[WARN] Cannot open video source: {source}")

try:
    while cap.isOpened():
        ok, frame = cap.read()
        if not ok:
            break
        t0 = time.perf_counter()
        probs = infer_frame_bgr(frame)
        smoothed = ema.update(probs)
        pred = int(np.argmax(smoothed))
        label = class_names[pred]
        latency_ms = (time.perf_counter() - t0) * 1000.0

        # Overlay text
        cv2.putText(frame, f"Pred: {label} | latency: {latency_ms:.1f}ms", (10, 28),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2, cv2.LINE_AA)
        cv2.imshow('Violence Detection (ViT)', frame)
        # Press "q" to stop.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and close the window even if inference or display raised.
    cap.release()
    cv2.destroyAllWindows()



## 11) Notes on Real-time Optimizations Applied
- **AMP (Mixed Precision)** for faster GPU throughput.
- **MobileViT-S** backbone (ViT-Tiny available as an alternative) for low latency.
- **ONNX export** + **ONNX Runtime** for CPU acceleration.
- **EMA temporal smoothing** to stabilize per-frame predictions.
- **Strong augmentations + label smoothing + weight decay + dropout** to combat overfitting.
- **Cosine LR with warmup**, **early stopping**, **gradient clipping** for stable training.
- **Pinned memory & non-blocking transfers** improve input pipeline speed.

> For multi-stream scalability (10–20 streams), run inference in separate threads/processes, use batched model calls, and share a single model on GPU; on CPU, prefer ONNX Runtime (optionally with the `OpenVINO` execution provider), and on NVIDIA GPUs consider `TensorRT` if available.
