In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import numpy as np
from PIL import Image
import os
import sys
from tqdm import tqdm

from model import FaceDetector

In [8]:
class FaceDataset(Dataset):
    """
    Датасет 
    Ожидает, что в папке root находятся изображения (в корне или в images/) и
    файлы аннотаций .txt с таким же именем, как у изображения.
    Исходный формат строки в .txt: class_id cx cy w h   (все нормализованы в [0,1])
    Если в файле несколько линий (несколько объектов), по умолчанию берётся рамка
    с максимальной площадью (w*h).
    """

    IMG_EXTS = (".jpg", ".jpeg", ".png", ".bmp")

    def __init__(self, root="data", img_size=256, transform=None, take_largest=True, augment=False):
        self.root = root
        self.img_size = img_size
        self.take_largest = take_largest
        self.augment = augment
        
        if transform is None:
            base_transforms = [T.Resize((img_size, img_size))]
            if augment:
                # Аугментации для обучения
                base_transforms = [
                    T.RandomRotation(degrees=15),
                    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
                    T.Resize((img_size, img_size)),
                ]
            else:
                base_transforms = [T.Resize((img_size, img_size))]
            base_transforms.append(T.ToTensor())
            self.transform = T.Compose(base_transforms)
        else:
            self.transform = transform

        possible_dirs = [os.path.join(root, "images"), root]
        self.images = []
        for d in possible_dirs:
            if os.path.isdir(d):
                for fn in os.listdir(d):
                    if fn.lower().endswith(self.IMG_EXTS):
                        self.images.append(os.path.join(d, fn))
        self.images.sort()

    def _label_path_for_image(self, img_path):
        base, _ = os.path.splitext(img_path)
        candidates = [
            base + ".txt",
            os.path.join(self.root, "labels", os.path.basename(base) + ".txt"),
            os.path.join(self.root, os.path.basename(base) + ".txt"),
        ]
        for p in candidates:
            if os.path.isfile(p):
                return p
        return None

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        img = Image.open(img_path).convert("RGB")
        label_path = self._label_path_for_image(img_path)
        if label_path is None:
            raise FileNotFoundError(f"Label file not found for image {img_path}")

        bboxes = []
        with open(label_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split()
                if len(parts) < 5:
                    continue
                try:
                    cls = int(float(parts[0]))
                    cx = float(parts[1])
                    cy = float(parts[2])
                    w = float(parts[3])
                    h = float(parts[4])
                    bboxes.append((cls, cx, cy, w, h))
                except Exception:
                    continue

        if len(bboxes) == 0:
            raise ValueError(f"No valid bbox in label file {label_path}")

        if self.take_largest and len(bboxes) > 1:
            areas = [bb[3] * bb[4] for bb in bboxes]
            idx_max = int(np.argmax(areas))
            chosen = bboxes[idx_max]
        else:
            chosen = bboxes[0]

        _, cx, cy, w, h = chosen
        img_t = self.transform(img)
        bbox = torch.tensor([cx, cy, w, h], dtype=torch.float32)
        return img_t, bbox


In [9]:
def train_epoch(model, dataloader, opt, loss_fn, device, epoch):
    model.train()
    total_loss = 0.0
    # Progress bar для тренировочных батчей
    train_pbar = tqdm(dataloader, desc=f"Train Epoch {epoch}", leave=False)
    for i, (imgs, targets) in enumerate(train_pbar):
        imgs = imgs.to(device)
        targets = targets.to(device)

        preds = model(imgs)
        loss = loss_fn(preds, targets)

        opt.zero_grad()
        loss.backward()
        opt.step()

        total_loss += loss.item() * imgs.size(0)
        # Обновляем описание для отображения текущего loss
        train_pbar.set_postfix({'batch_loss': f'{loss.item():.4f}'})
    
    train_pbar.close()
    return total_loss / len(dataloader.dataset)


def iou_xyxy(boxA, boxB):
    # Вычисление метрики IoU для рамки в формате [x1,y1,x2,y2]
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interW = max(0.0, xB - xA)
    interH = max(0.0, yB - yA)
    interArea = interW * interH

    boxAArea = max(0.0, boxA[2] - boxA[0]) * max(0.0, boxA[3] - boxA[1])
    boxBArea = max(0.0, boxB[2] - boxB[0]) * max(0.0, boxB[3] - boxB[1])

    denom = boxAArea + boxBArea - interArea
    if denom <= 0:
        return 0.0
    return interArea / denom


def cxcywh_to_xyxy_norm(cx, cy, w, h):
    # Преобразование нормализованных координат cx,cy,w,h в нормализованные x1,y1,x2,y2
    x1 = cx - w / 2.0
    y1 = cy - h / 2.0
    x2 = cx + w / 2.0
    y2 = cy + h / 2.0
    return [x1, y1, x2, y2]


def xyxy_norm_to_pixels(box, img_w, img_h):
    # Преобразует нормализованные xyxy в пиксели (int)
    x1 = int(round(box[0] * img_w))
    y1 = int(round(box[1] * img_h))
    x2 = int(round(box[2] * img_w))
    y2 = int(round(box[3] * img_h))
    x1 = max(0, min(x1, img_w - 1))
    x2 = max(0, min(x2, img_w - 1))
    y1 = max(0, min(y1, img_h - 1))
    y2 = max(0, min(y2, img_h - 1))
    return [x1, y1, x2, y2]

def train():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    data_root = "dataset"

    train_dir = os.path.join(data_root, "train")
    val_dir = os.path.join(data_root, "val")

    # Датасеты с аугментацией для обучения, без аугментации для валидации
    train_data = FaceDataset(
        root=train_dir, img_size=256, transform=None, take_largest=True, augment=True
    )
    val_data = FaceDataset(
        root=val_dir, img_size=256, transform=None, take_largest=True, augment=False
    )

    train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=8, shuffle=False)

    model = FaceDetector()
    model.to(device)

    # SmoothL1Loss более устойчив к выбросам, чем L1Loss
    loss_fn = nn.SmoothL1Loss(beta=0.1)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=20, gamma=0.5)

    epochs = 100
    best_val_iou = -1
    best_path = "best_model.pt"
    patience = 40
    epochs_without_improvement = 0

    # УБРАЛ главный progress bar для эпох - используем обычный цикл
    for epoch in range(1, epochs + 1):
        print(f"\nEpoch {epoch}/{epochs}")
        
        # Только ОДИН progress bar для тренировки
        train_loss = 0.0
        model.train()
        train_pbar = tqdm(train_loader, desc=f"Training")
        for i, (imgs, targets) in enumerate(train_pbar):
            imgs = imgs.to(device)
            targets = targets.to(device)

            preds = model(imgs)
            loss = loss_fn(preds, targets)

            opt.zero_grad()
            loss.backward()
            opt.step()

            train_loss += loss.item() * imgs.size(0)
            train_pbar.set_postfix({'loss': f'{loss.item():.4f}'})
        
        train_pbar.close()
        train_loss = train_loss / len(train_loader.dataset)

        # Валидация БЕЗ progress bar (простой цикл)
        model.eval()
        val_loss = 0.0
        iou_total = 0.0
        count = 0
        
        with torch.no_grad():
            # УБРАЛ val_pbar - используем обычный цикл
            for imgs, targets in val_loader:
                imgs = imgs.to(device)
                targets = targets.to(device)
                preds = model(imgs)
                loss = loss_fn(preds, targets)
                val_loss += loss.item() * imgs.size(0)

                for p, t in zip(preds.cpu().numpy(), targets.cpu().numpy()):
                    p_xyxy = cxcywh_to_xyxy_norm(p[0], p[1], p[2], p[3])
                    t_xyxy = cxcywh_to_xyxy_norm(t[0], t[1], t[2], t[3])
                    iou = iou_xyxy(p_xyxy, t_xyxy)
                    iou_total += iou
                    count += 1

        val_loss = val_loss / len(val_loader.dataset)
        avg_iou = iou_total / max(1, count)
        
        print(f"Train loss: {train_loss:.6f}  Val loss: {val_loss:.6f}  Val IoU: {avg_iou:.4f}")

        if avg_iou > best_val_iou:
            best_val_iou = avg_iou
            torch.save(model.state_dict(), best_path)
            print(f"✓ Saved best model to {best_path} (IoU: {avg_iou:.4f})")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
        
        # Early stopping если IoU не улучшается
        if epochs_without_improvement >= patience:
            print(f"Early stopping: no improvement for {patience} epochs")
            break
        
        scheduler.step()

    print(f"\nTraining finished. Best val IoU: {best_val_iou:.4f}")
    print(f"Best model saved to: {best_path}")

In [None]:
train()

Device: cuda

Epoch 1/100


Training: 100%|██████████| 146/146 [00:40<00:00,  3.62it/s, loss=0.0212]



Train loss: 0.021258  Val loss: 0.012495  Val IoU: 0.4519
✓ Saved best model to best_model.pt (IoU: 0.4519)

Epoch 2/100


Training: 100%|██████████| 146/146 [00:29<00:00,  4.88it/s, loss=0.0127]



Train loss: 0.012779  Val loss: 0.013081  Val IoU: 0.4346

Epoch 3/100


Training: 100%|██████████| 146/146 [00:30<00:00,  4.86it/s, loss=0.0185]



Train loss: 0.011891  Val loss: 0.010712  Val IoU: 0.5187
✓ Saved best model to best_model.pt (IoU: 0.5187)

Epoch 4/100


Training:  65%|██████▌   | 95/146 [00:20<00:11,  4.60it/s, loss=0.0077]