## Patella-included (슬개골 포함) ver. Train

In [None]:
import os
import random
import shutil
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, get_linear_schedule_with_warmup
import torchvision.transforms as transforms
from torchvision.models import resnet34, ResNet34_Weights
from PIL import Image
import librosa
import numpy as np
from collections import defaultdict, Counter
from sklearn.utils.class_weight import compute_class_weight
import json
import gc

# =========================
# 0. Settings
# =========================
# Seed the Python, PyTorch and NumPy RNGs so sampling and shuffling repeat across runs.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

# Source dataset roots (one per task) and the directory that receives the
# train/val/test copies made by prepare_dataset().
BEHAVIOR_ROOT = "files/1_Animal_Behavior"
EMOTION_ROOT = "files/2_Animal_emotions"
SOUND_ROOT = "files/3_Animal_Sound"
PATELLA_ROOT = "files/6_Animal_Patella"
WORK_DIR = "files/work/omni_dataset"

# Total-sample caps handed to sample_balanced() for the two image tasks.
MAX_SAMPLES_BEHAVIOR = 100000
MAX_SAMPLES_EMOTION = 100000
# Handed to sample_balanced_audio(), which only prints it.
MIN_SAMPLES_PER_SOUND_CLASS = 50

# Training hyper-parameters.
BATCH_SIZE = 32
EPOCHS = 100
LR_VIDEO = 5e-5  # learning rate of the image-model optimizers
LR_AUDIO = 1e-5  # learning rate of the audio-model optimizer
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
NUM_WORKERS = 24  # DataLoader worker count used by the image loaders
SR = 16000  # sample rate (Hz) used when loading audio
MAX_AUDIO_LEN = SR * 5  # waveforms are cut/padded to this many samples (5 s at SR)

# Per-task loss multipliers.
# NOTE(review): in the visible part of train() only the "sound" entry is applied — confirm the others.
LOSS_WEIGHTS = {
    "behavior": 1.0,
    "emotion": 0.8,
    "sound": 0.6,
    "patella": 1.0
}

# Pretrained checkpoint name; the feature extractor is created once at import
# time and shared by AudioDataset.
AUDIO_MODEL_NAME = "facebook/wav2vec2-base"
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(AUDIO_MODEL_NAME)

print(f"üéØ Device: {DEVICE}")

# =========================
# 🔥 Audio Augmentation
# =========================
def augment_audio(waveform, p=0.5):
    """Randomly perturb a waveform with pitch shift, time stretch and noise.

    With probability roughly ``1 - p`` the input object is returned untouched.
    Otherwise the signal is pitch-shifted by a random amount in [-2, 2] steps,
    time-stretched by a random rate in [0.9, 1.1], cut or zero-padded to
    ``MAX_AUDIO_LEN`` samples, scaled by 0.99 and mixed with Gaussian noise
    (standard deviation 0.003).

    Args:
        waveform: 1-D audio signal sampled at ``SR``.
        p: Probability threshold for applying the augmentation.

    Returns:
        The original ``waveform`` object (no augmentation) or a new array of
        length ``MAX_AUDIO_LEN``.
    """
    apply_augmentation = random.random() <= p
    if not apply_augmentation:
        return waveform

    # Pitch first, then tempo; the random draws happen in this exact order.
    semitones = random.uniform(-2, 2)
    shifted = librosa.effects.pitch_shift(waveform, sr=SR, n_steps=semitones)

    speed = random.uniform(0.9, 1.1)
    resized = librosa.effects.time_stretch(shifted, rate=speed)

    # Stretching changes the length, so bring it back to the fixed size.
    overflow = len(resized) - MAX_AUDIO_LEN
    if overflow > 0:
        resized = resized[:MAX_AUDIO_LEN]
    else:
        resized = np.pad(resized, (0, -overflow))

    hiss = np.random.normal(0, 0.003, len(resized))
    return resized * 0.99 + hiss

# =========================
# 1. Dataset Preparation
# =========================
def collect_samples(root, exts):
    """Collect ``(class_name, file_path)`` pairs from a class-per-folder tree.

    Every immediate sub-directory of ``root`` is treated as one class and is
    searched recursively for files whose name ends with one of ``exts``
    (compared case-insensitively).

    Directories and files are visited in sorted order so the returned list is
    identical on every machine and run; ``os.walk``/``os.listdir`` order is
    otherwise arbitrary, which would make the seeded sampling done by callers
    non-reproducible.

    Args:
        root: Directory containing one sub-directory per class.
        exts: Iterable of file suffixes such as ``['.jpg', '.png']``.

    Returns:
        List of ``(class_dir_name, file_path)`` tuples.

    Raises:
        OSError: If ``root`` cannot be listed.
    """
    suffixes = tuple(ext.lower() for ext in exts)
    samples = []

    for class_dir in sorted(os.listdir(root)):
        class_path = os.path.join(root, class_dir)
        if not os.path.isdir(class_path):
            continue

        for current_dir, subdirs, files in os.walk(class_path):
            # Sorting in place makes os.walk descend in a deterministic order.
            subdirs.sort()
            for filename in sorted(files):
                if filename.lower().endswith(suffixes):
                    samples.append((class_dir, os.path.join(current_dir, filename)))

    print(f"  ‚Üí {len(samples)} samples, {len(set(s[0] for s in samples))} classes")
    return samples

def collect_patella_samples(root):
    """Collect ``(grade, image_path, json_path)`` triples for the patella task.

    Expected layout: ``root/<grade>/<date>/<Back|Front|Left|Right>/<name>.jpg``
    with an annotation file ``<name>.json`` next to each image. Images without
    an annotation file are skipped.

    The annotation path is built by swapping only the trailing image suffix.
    A plain ``str.replace('.jpg', '.json')`` would leave ``X.JPG`` unchanged
    (returning the image itself as its "JSON") and would also rewrite ``.jpg``
    occurring inside directory names.

    Listings are sorted so the result order is deterministic.

    Args:
        root: Directory containing one sub-directory per grade.

    Returns:
        List of ``(grade, image_path, json_path)`` tuples.

    Raises:
        OSError: If ``root`` cannot be listed.
    """
    image_suffix = '.jpg'
    samples = []

    for grade in sorted(os.listdir(root)):
        grade_path = os.path.join(root, grade)
        if not os.path.isdir(grade_path):
            continue

        for date_dir in sorted(os.listdir(grade_path)):
            date_path = os.path.join(grade_path, date_dir)
            if not os.path.isdir(date_path):
                continue

            for direction in ('Back', 'Front', 'Left', 'Right'):
                direction_path = os.path.join(date_path, direction)
                # isdir (not exists): a stray file with this name cannot be listed.
                if not os.path.isdir(direction_path):
                    continue

                for filename in sorted(os.listdir(direction_path)):
                    if not filename.lower().endswith(image_suffix):
                        continue

                    img_path = os.path.join(direction_path, filename)
                    # Replace only the trailing suffix, whatever its letter case.
                    json_name = filename[:-len(image_suffix)] + '.json'
                    json_path = os.path.join(direction_path, json_name)

                    if os.path.exists(json_path):
                        samples.append((grade, img_path, json_path))

    print(f"  ‚Üí {len(samples)} samples, {len(set(s[0] for s in samples))} classes")
    return samples

def sample_balanced(samples, max_total_samples):
    """Randomly down-sample so that no class exceeds an equal share of a budget.

    The budget ``max_total_samples`` is divided evenly (integer division)
    between the classes present in ``samples``; each class contributes at most
    that many randomly chosen items. Classes smaller than their share are kept
    whole, so the result may be smaller than the budget.

    Args:
        samples: Iterable of ``(label, path)`` pairs.
        max_total_samples: Overall sample budget.

    Returns:
        List of ``(label, path)`` pairs. Empty when ``samples`` is empty.
    """
    class_samples = defaultdict(list)
    for label, path in samples:
        class_samples[label].append(path)

    num_classes = len(class_samples)
    if num_classes == 0:
        # Nothing to sample; also avoids dividing the budget by zero below.
        return []

    # Clamp at zero so a negative budget cannot reach random.sample().
    max_per_class = max(0, max_total_samples // num_classes)

    print(f"  üéØ Target: {max_total_samples} samples")
    print(f"  üìä {num_classes} classes ‚Üí max {max_per_class} per class")

    sampled = []
    for label, paths in class_samples.items():
        n_samples = min(len(paths), max_per_class)
        selected = random.sample(paths, n_samples)
        sampled.extend((label, p) for p in selected)
        print(f"    {label}: {n_samples}/{len(paths)}")

    print(f"  ‚úÖ Total sampled: {len(sampled)}")
    return sampled

def sample_balanced_audio(samples, min_per_class):
    """Group audio samples by label and return them ordered by label.

    No sample is added or removed: every input pair is returned, grouped by
    its original class name (classes are not merged) in sorted label order.
    ``min_per_class`` is only printed; it does not influence the result.

    Args:
        samples: Iterable of ``(label, path)`` pairs.
        min_per_class: Value reported in the log line.

    Returns:
        List of ``(label, path)`` pairs grouped by label.
    """
    # Bucket the paths per class, keeping their input order inside a class.
    by_label = defaultdict(list)
    for label, path in samples:
        by_label[label].append(path)

    print(f"  üéØ Min samples per class: {min_per_class}")

    ordered = []
    for label in sorted(by_label):
        paths = by_label[label]
        ordered += [(label, p) for p in paths]
        print(f"    {label}: {len(paths)}")

    print(f"  ‚úÖ Total sampled: {len(ordered)}")
    return ordered

def split_and_copy(samples, task_name, is_patella=False, original_samples=None):
    """Split samples per class and copy the files under ``WORK_DIR``.

    Files are copied to ``WORK_DIR/<split>/<task_name>/<label>/`` with the
    label prefixed to the original file name.

    * Without ``original_samples`` each class is shuffled and split roughly
      80% train / 10% val / remainder test.
    * With ``original_samples`` (sound task) the test set of each class is the
      first ``max(10, n // 5)`` original files, capped at half of the class so
      small classes keep training data. Those files are removed from the
      train/val pool, so the test set never overlaps train or val (the
      leakage this parameter exists to prevent). The remaining pool is split
      80% train / 20% val.

    Args:
        samples: ``(label, path)`` pairs, or ``(label, img_path, json_path)``
            triples when ``is_patella`` is true. The list is not modified.
        task_name: Sub-directory name used under every split directory.
        is_patella: Copy an image together with its JSON annotation.
        original_samples: Optional ``(label, path)`` pairs from which the
            held-out test set is taken.

    NOTE(review): destination names are ``<label>_<basename>``; two source
    files of one class sharing a basename would overwrite each other — confirm
    basenames are unique per class.
    """
    # Shuffle a copy so the caller's list is not reordered as a side effect.
    samples = list(samples)
    random.shuffle(samples)

    class_samples = defaultdict(list)
    if is_patella:
        for label, img_path, json_path in samples:
            class_samples[label].append((img_path, json_path))
    else:
        for label, path in samples:
            class_samples[label].append(path)

    for split in ("train", "val", "test"):
        os.makedirs(os.path.join(WORK_DIR, split, task_name), exist_ok=True)

    # sound: the test set is taken separately from the original samples.
    test_items_by_label = None
    if original_samples is not None:
        orig_class = defaultdict(list)
        for label, path in original_samples:
            orig_class[label].append(path)

        test_items_by_label = {}
        for label, paths in orig_class.items():
            # Never hold out more than half of a class, otherwise a class with
            # ten or fewer files would have nothing left to train on.
            n_test = min(max(10, len(paths) // 5), len(paths) // 2)
            test_items_by_label[label] = paths[:n_test]

    for label, items in class_samples.items():
        if test_items_by_label is not None:
            test_items = test_items_by_label.get(label, [])
            # Exclude held-out files so train/val cannot contain test data.
            held_out = set(test_items)
            pool = [item for item in items if item not in held_out]
            n_train = int(len(pool) * 0.8)
            train_items = pool[:n_train]
            val_items = pool[n_train:]
        else:
            n = len(items)
            n_train = int(n * 0.8)
            n_val = int(n * 0.1)
            train_items = items[:n_train]
            val_items = items[n_train:n_train + n_val]
            test_items = items[n_train + n_val:]

        split_map = {"train": train_items, "val": val_items, "test": test_items}

        for split_name, split_items in split_map.items():
            dst_label_dir = os.path.join(WORK_DIR, split_name, task_name, label)
            os.makedirs(dst_label_dir, exist_ok=True)

            for item in tqdm(split_items, desc=f"{task_name}/{split_name}/{label}", leave=False):
                if is_patella:
                    img_path, json_path = item
                    dst_img = os.path.join(dst_label_dir, f"{label}_{os.path.basename(img_path)}")
                    shutil.copy(img_path, dst_img)
                    # Swap only the final extension; str.replace('.jpg', ...)
                    # misses '.JPG' and would then overwrite the copied image.
                    dst_json = os.path.splitext(dst_img)[0] + '.json'
                    shutil.copy(json_path, dst_json)
                else:
                    dst_path = os.path.join(dst_label_dir, f"{label}_{os.path.basename(item)}")
                    shutil.copy(item, dst_path)

def _task_ready(task_name):
    """Return True when the task's train folder exists and is not empty."""
    train_dir = os.path.join(WORK_DIR, "train", task_name)
    if not os.path.isdir(train_dir):
        return False
    return len(os.listdir(train_dir)) > 0


def prepare_dataset():
    """Build the train/val/test working copies for every task that lacks them.

    Each task (behavior, emotion, sound, patella) is checked independently via
    ``_task_ready``; only the missing ones are collected, sampled, split and
    copied. When every task is already present the function returns at once.
    """
    # Probe every task up front so the early exit sees the full picture.
    behavior_ready = _task_ready("behavior")
    emotion_ready = _task_ready("emotion")
    sound_ready = _task_ready("sound")
    patella_ready = _task_ready("patella")

    if behavior_ready and emotion_ready and sound_ready and patella_ready:
        print("‚úÖ All tasks already prepared, skipping.")
        return

    for split in ("train", "val", "test"):
        os.makedirs(os.path.join(WORK_DIR, split), exist_ok=True)

    image_exts = ['.jpg', '.png', '.jpeg']

    if behavior_ready:
        print("‚úÖ behavior already prepared, skipping.")
    else:
        print("\nüì¶ Collecting behavior...")
        behavior_pool = collect_samples(BEHAVIOR_ROOT, image_exts)
        behavior_pick = sample_balanced(behavior_pool, MAX_SAMPLES_BEHAVIOR)
        print("  üìã Splitting & Copying behavior...")
        split_and_copy(behavior_pick, "behavior")

    if emotion_ready:
        print("‚úÖ emotion already prepared, skipping.")
    else:
        print("\nüì¶ Collecting emotion...")
        emotion_pool = collect_samples(EMOTION_ROOT, image_exts)
        emotion_pick = sample_balanced(emotion_pool, MAX_SAMPLES_EMOTION)
        print("  üìã Splitting & Copying emotion...")
        split_and_copy(emotion_pick, "emotion")

    if sound_ready:
        print("‚úÖ sound already prepared, skipping.")
    else:
        print("\nüì¶ Collecting sound...")
        sound_pool = collect_samples(SOUND_ROOT, ['.wav', '.mp3', '.m4a'])
        sound_pick = sample_balanced_audio(sound_pool, MIN_SAMPLES_PER_SOUND_CLASS)
        print("  üìã Splitting & Copying sound...")
        # The unsampled pool is passed along so the test set is drawn from the
        # original files (labels are used as-is, without merging).
        split_and_copy(sound_pick, "sound", original_samples=sound_pool)

    if patella_ready:
        print("‚úÖ patella already prepared, skipping.")
    else:
        print("\nüì¶ Collecting patella luxation...")
        patella_pool = collect_patella_samples(PATELLA_ROOT)
        print("  ‚ÑπÔ∏è  Patella: Using all samples")
        print("  üìã Splitting & Copying patella...")
        split_and_copy(patella_pool, "patella", is_patella=True)

    print("\n‚úÖ Dataset preparation complete.")


# =========================
# 2. Dataset Classes
# =========================
class ImageDataset(Dataset):
    """Image classification dataset read from a ``<task_dir>/<label>/<file>`` tree.

    Label ids are assigned in sorted label order. Files inside each label
    folder are also listed in sorted order so that sample index -> file is the
    same on every machine (``os.listdir`` order is otherwise arbitrary, which
    defeats the fixed random seed).

    Args:
        task_dir: Directory containing one sub-directory per label.
        augment: When true, use the random crop / flip / colour-jitter
            pipeline; otherwise a plain resize to 224x224.
    """

    def __init__(self, task_dir, augment=False):
        self.samples = []       # list of (file_path, label_name)
        self.label_to_id = {}   # label_name -> integer class id

        for label in sorted(os.listdir(task_dir)):
            label_dir = os.path.join(task_dir, label)
            if not os.path.isdir(label_dir):
                continue

            self.label_to_id[label] = len(self.label_to_id)

            for file in sorted(os.listdir(label_dir)):
                if file.lower().endswith(('.jpg', '.png', '.jpeg')):
                    self.samples.append((os.path.join(label_dir, file), label))

        print(f"  üìä {os.path.basename(task_dir)}: {len(self.samples)} samples, {len(self.label_to_id)} classes")

        if augment:
            self.transform = transforms.Compose([
                transforms.Resize((256,256)),
                transforms.RandomCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ColorJitter(0.2, 0.2, 0.2),
                transforms.ToTensor(),
                transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
            ])
        else:
            self.transform = transforms.Compose([
                transforms.Resize((224,224)),
                transforms.ToTensor(),
                transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
            ])

    def __len__(self):
        """Return the number of image files found."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(transformed_image, class_id)`` for sample ``idx``."""
        path, label = self.samples[idx]
        # Context manager closes the file handle as soon as pixels are decoded,
        # instead of leaving that to garbage collection in long-lived workers.
        with Image.open(path) as raw:
            img = raw.convert("RGB")
        img = self.transform(img)
        return img, self.label_to_id[label]

class PatellaDataset(Dataset):
    """Patella dataset: an image plus keypoints read from a sibling JSON file.

    Reads ``<task_dir>/<label>/<name>.jpg`` together with ``<name>.json``;
    images without an annotation file are skipped. The annotation path is
    built by swapping only the final extension, so ``X.JPG`` maps to
    ``X.json`` (a case-sensitive ``str.replace`` would map it to itself and the
    image would later be parsed as JSON). Listings are sorted for a
    reproducible sample order.

    Args:
        task_dir: Directory containing one sub-directory per label.
        augment: When true, use the random crop / flip / colour-jitter
            pipeline; otherwise a plain resize to 224x224.

    NOTE(review): the image transforms (resize, crop, flip) are not applied to
    the keypoint coordinates, which are passed through as stored in the JSON —
    confirm this is intended.
    """

    # Length of the keypoint vector returned per sample (x, y interleaved).
    KEYPOINT_DIM = 18

    def __init__(self, task_dir, augment=False):
        self.samples = []       # list of (img_path, json_path, label_name)
        self.label_to_id = {}   # label_name -> integer class id

        for label in sorted(os.listdir(task_dir)):
            label_dir = os.path.join(task_dir, label)
            if not os.path.isdir(label_dir):
                continue

            self.label_to_id[label] = len(self.label_to_id)

            for file in sorted(os.listdir(label_dir)):
                if file.lower().endswith('.jpg'):
                    img_path = os.path.join(label_dir, file)
                    # Swap the trailing 4-character suffix regardless of case.
                    json_path = os.path.join(label_dir, file[:-4] + '.json')

                    if os.path.exists(json_path):
                        self.samples.append((img_path, json_path, label))

        print(f"  üìä {os.path.basename(task_dir)}: {len(self.samples)} samples, {len(self.label_to_id)} classes")

        if augment:
            self.transform = transforms.Compose([
                transforms.Resize((256,256)),
                transforms.RandomCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ColorJitter(0.2, 0.2, 0.2),
                transforms.ToTensor(),
                transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
            ])
        else:
            self.transform = transforms.Compose([
                transforms.Resize((224,224)),
                transforms.ToTensor(),
                transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
            ])

    def __len__(self):
        """Return the number of image/annotation pairs found."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(transformed_image, keypoints, class_id)`` for sample ``idx``.

        ``keypoints`` is a float32 tensor of length 18 built from the ``x`` and
        ``y`` fields of each entry in the JSON ``annotation_info`` list,
        zero-padded or truncated to that length.
        """
        img_path, json_path, label = self.samples[idx]

        # Close the file handle deterministically once the pixels are decoded.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        img = self.transform(img)

        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        keypoints = []
        for annotation in data.get('annotation_info', []):
            x = float(annotation.get('x', 0))
            y = float(annotation.get('y', 0))
            keypoints.extend([x, y])

        # Pad with zeros up to the fixed length (no-op when already long enough).
        keypoints.extend([0.0] * (self.KEYPOINT_DIM - len(keypoints)))

        keypoints = torch.tensor(keypoints[:self.KEYPOINT_DIM], dtype=torch.float32)

        return img, keypoints, self.label_to_id[label]

class AudioDataset(Dataset):
    """Audio classification dataset read from a ``<task_dir>/<label>/<file>`` tree.

    Label ids are assigned in sorted label order and files are listed in sorted
    order, so the sample index -> file mapping is reproducible.

    Args:
        task_dir: Directory containing one sub-directory per label.
        augment: When true, ``augment_audio`` is applied to each loaded waveform.
    """

    def __init__(self, task_dir, augment=False):
        self.samples = []       # list of (file_path, label_name)
        self.label_to_id = {}   # label_name -> integer class id
        self.id_to_label = {}   # reverse mapping: class id -> label_name
        self.augment = augment
        next_id = 0

        for label in sorted(os.listdir(task_dir)):
            label_dir = os.path.join(task_dir, label)
            if not os.path.isdir(label_dir):
                continue

            self.label_to_id[label] = next_id
            self.id_to_label[next_id] = label
            next_id += 1

            for file in sorted(os.listdir(label_dir)):
                if file.lower().endswith(('.wav', '.mp3', '.m4a')):
                    self.samples.append((os.path.join(label_dir, file), label))

        print(f"  üìä {os.path.basename(task_dir)}: {len(self.samples)} samples, {len(self.label_to_id)} classes, augment={augment}")

    def __len__(self):
        """Return the number of audio files found."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return a dict with ``input_values`` and ``labels`` for sample ``idx``.

        The waveform is loaded mono at ``SR``, optionally augmented, cut or
        zero-padded to ``MAX_AUDIO_LEN`` samples and run through
        ``FEATURE_EXTRACTOR``. A file that cannot be decoded is replaced by
        silence, with a warning so the bad file can be found.
        """
        path, label = self.samples[idx]

        try:
            waveform, _ = librosa.load(path, sr=SR, mono=True)
        except Exception as exc:
            # Audio backends raise many unrelated exception types, so the
            # catch stays broad; the failure is reported instead of being
            # silently turned into a labelled all-zero training example.
            import warnings
            warnings.warn(
                f"Failed to load audio {path!r} ({exc}); substituting silence.",
                RuntimeWarning,
            )
            # float32 matches what librosa.load returns for readable files.
            waveform = np.zeros(MAX_AUDIO_LEN, dtype=np.float32)

        if self.augment:
            waveform = augment_audio(waveform)

        if len(waveform) > MAX_AUDIO_LEN:
            waveform = waveform[:MAX_AUDIO_LEN]
        else:
            waveform = np.pad(waveform, (0, MAX_AUDIO_LEN - len(waveform)))

        inputs = FEATURE_EXTRACTOR(waveform, sampling_rate=SR, return_tensors="pt")
        # Return a dict so that the collate function can stack fields by name.
        return {
            "input_values": inputs.input_values.squeeze(0),
            "labels": torch.tensor(self.label_to_id[label], dtype=torch.long)
        }


def collate_fn_audio(batch):
    """Stack a list of AudioDataset sample dicts into one dict of batched tensors.

    Args:
        batch: Sequence of dicts, each holding ``input_values`` and ``labels``
            tensors.

    Returns:
        Dict with the same two keys, each value stacked along a new first
        dimension.
    """
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in ("input_values", "labels")
    }

# =========================
# 3. Individual Models (independent models)
# =========================
class BehaviorModel(nn.Module):
    """ResNet-34 feature extractor followed by a linear classification head.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Load ImageNet weights and strip the original classifier so the
        # network emits pooled features instead of ImageNet logits.
        net = resnet34(weights=ResNet34_Weights.IMAGENET1K_V1)
        feature_dim = net.fc.in_features
        net.fc = nn.Identity()

        self.backbone = net
        self.head = nn.Linear(feature_dim, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        return self.head(self.backbone(x))

class EmotionModel(nn.Module):
    """ResNet-34 feature extractor followed by a linear classification head.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Pretrained trunk with its ImageNet classifier replaced by a
        # pass-through layer, so it outputs pooled features.
        trunk = resnet34(weights=ResNet34_Weights.IMAGENET1K_V1)
        width = trunk.fc.in_features
        trunk.fc = nn.Identity()

        self.backbone = trunk
        self.head = nn.Linear(width, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        features = self.backbone(x)
        logits = self.head(features)
        return logits

class PatellaModel(nn.Module):
    """ResNet-34 image features concatenated with 18 keypoint values, then an MLP.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Pretrained trunk with the ImageNet classifier replaced by a
        # pass-through layer.
        trunk = resnet34(weights=ResNet34_Weights.IMAGENET1K_V1)
        image_dim = trunk.fc.in_features
        trunk.fc = nn.Identity()
        self.backbone = trunk

        # The head consumes image features plus the 18-value keypoint vector.
        fused_dim = image_dim + 18
        layers = [
            nn.Linear(fused_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        ]
        self.head = nn.Sequential(*layers)

    def forward(self, x, keypoints):
        """Return class logits from an image batch and its keypoint batch."""
        image_features = self.backbone(x)
        fused = torch.cat([image_features, keypoints], dim=1)
        return self.head(fused)

class AudioModel(nn.Module):
    """Thin wrapper around a pretrained Wav2Vec2 sequence classifier.

    Args:
        num_classes: Number of labels for the classification head.
        freeze_backbone: When true, gradients are disabled for every parameter
            of the wrapped model's ``wav2vec2`` sub-module.
    """

    def __init__(self, num_classes, freeze_backbone=False):
        super().__init__()
        classifier = Wav2Vec2ForSequenceClassification.from_pretrained(
            AUDIO_MODEL_NAME,
            num_labels=num_classes,
            ignore_mismatched_sizes=True,
        )
        self.model = classifier

        if freeze_backbone:
            for weight in classifier.wav2vec2.parameters():
                weight.requires_grad = False

    def forward(self, input_values, labels=None):
        """Run the wrapped model and return its output object.

        ``labels`` is forwarded unchanged, so when it is given the wrapped
        model computes the loss itself.
        """
        return self.model(input_values=input_values, labels=labels)

# =========================
# 4. Helper Functions
# =========================
def mixup_data(x, y, alpha=0.4):
    """Blend each sample of a batch with a randomly chosen partner (mixup).

    Args:
        x: Input batch; the first dimension indexes the samples.
        y: Targets aligned with ``x``.
        alpha: Beta-distribution parameter. When it is not positive the mixing
            coefficient is fixed to 1, i.e. inputs are returned unmixed.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) *
        x[perm]``, ``y_a`` is ``y``, ``y_b`` is ``y[perm]`` and ``lam`` is the
        mixing coefficient.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    # One random partner per sample, moved onto the batch's device.
    n_items = x.size(0)
    perm = torch.randperm(n_items).to(x.device)

    partners = x[perm]
    mixed_x = lam * x + (1 - lam) * partners
    return mixed_x, y, y[perm], lam

def clear_memory():
    """Free memory: run the Python garbage collector, then empty the CUDA cache."""
    # Collect first so tensors that are no longer referenced are released
    # before the CUDA caching allocator is asked to give its blocks back.
    gc.collect()
    torch.cuda.empty_cache()

# =========================
# 5. Sequential Training (memory efficient)
# =========================
def train():
    prepare_dataset()
    
    # üî• label_to_id ÎØ∏Î¶¨ Î°úÎìú (ÎÇòÏ§ëÏóê ÏÇ¨Ïö©)
    print("\nüîÑ Pre-loading label mappings...")
    temp_b = ImageDataset(os.path.join(WORK_DIR, "train", "behavior"), augment=False)
    temp_e = ImageDataset(os.path.join(WORK_DIR, "train", "emotion"), augment=False)
    temp_s = AudioDataset(os.path.join(WORK_DIR, "train", "sound"), augment=False)
    temp_p = PatellaDataset(os.path.join(WORK_DIR, "train", "patella"), augment=False)
    
    behavior_label_to_id = temp_b.label_to_id
    emotion_label_to_id  = temp_e.label_to_id
    sound_label_to_id    = temp_s.label_to_id
    sound_id_to_label    = temp_s.id_to_label    # ‚úÖ Ïó≠Î∞©Ìñ• Îß§Ìïë Ï†ÄÏû•
    patella_label_to_id  = temp_p.label_to_id
    
    del temp_b, temp_e, temp_s, temp_p
    clear_memory()
    
    # üî• Î™®Îç∏ Ï¥àÍ∏∞Ìôî (CPUÏóê Î®ºÏ†Ä ÏÉùÏÑ±)
    print("\nüîÑ Initializing models...")
    behavior_model = BehaviorModel(len(behavior_label_to_id))
    emotion_model = EmotionModel(len(emotion_label_to_id))
    patella_model = PatellaModel(len(patella_label_to_id))
    audio_model = AudioModel(len(sound_label_to_id), freeze_backbone=False)
    
    # Optimizers (Î™®Îç∏Ïù¥ GPUÎ°ú Í∞ÄÍ∏∞ Ï†ÑÏóê ÏÉùÏÑ±)
    behavior_opt = torch.optim.AdamW(behavior_model.parameters(), lr=LR_VIDEO, weight_decay=0.01)
    emotion_opt = torch.optim.AdamW(emotion_model.parameters(), lr=LR_VIDEO, weight_decay=0.01)
    patella_opt = torch.optim.AdamW(patella_model.parameters(), lr=LR_VIDEO, weight_decay=0.01)
    audio_opt = torch.optim.AdamW(audio_model.parameters(), lr=LR_AUDIO, weight_decay=0.01)

    # ‚úÖ Audio LR Warmup Scheduler
    # sound loader ÌÅ¨Í∏∞Î•º ÎØ∏Î¶¨ Ï∂îÏ†ï (epochÎãπ Ìï©ÏÇ∞ÏúºÎ°ú Í≤∞Ï†ïÎê®)
    _temp_sound = AudioDataset(os.path.join(WORK_DIR, "train", "sound"), augment=False)
    _approx_sound_steps = (len(_temp_sound) // BATCH_SIZE) * EPOCHS
    del _temp_sound
    audio_scheduler = get_linear_schedule_with_warmup(
        audio_opt,
        num_warmup_steps=100,
        num_training_steps=_approx_sound_steps
    )
    clear_memory()

    # Scalers
    video_scaler = torch.amp.GradScaler("cuda")
    audio_scaler = torch.amp.GradScaler("cuda")

    # Loss
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    
    best_avg_acc = 0
    history = []
    
    for epoch in range(EPOCHS):
        print(f"\n{'='*60}")
        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"{'='*60}")
        
        loss_b, loss_e, loss_s, loss_p = 0, 0, 0, 0
        
        # ========== 1. Behavior ==========
        print(f"\nüêæ Training Behavior...")
        behavior_model.to(DEVICE)
        behavior_model.train()
        
        behavior_train = ImageDataset(os.path.join(WORK_DIR, "train", "behavior"), augment=True)
        behavior_loader = DataLoader(behavior_train, BATCH_SIZE, True, num_workers=NUM_WORKERS, pin_memory=True)
        
        for imgs, labels in tqdm(behavior_loader, desc="Behavior", leave=False):
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
            
            behavior_opt.zero_grad()  # ‚úÖ zero_grad Ïù¥Îèô: forward ÏïûÏúºÎ°ú
            with torch.amp.autocast("cuda"):
                imgs, labels_a, labels_b, lam = mixup_data(imgs, labels)
                logits = behavior_model(imgs)
                loss = lam * criterion(logits, labels_a) + (1 - lam) * criterion(logits, labels_b)

            video_scaler.scale(loss).backward()
            video_scaler.step(behavior_opt)
            video_scaler.update()
            
            loss_b += loss.item()
        
        loss_b /= len(behavior_loader)
        print(f"  ‚Üí Avg Loss: {loss_b:.4f}")
        
        # üî• Î©îÎ™®Î¶¨ Ìï¥Ï†ú
        behavior_model.cpu()
        del behavior_train, behavior_loader
        clear_memory()
        
        # ========== 2. Emotion ==========
        print(f"\nüòä Training Emotion...")
        emotion_model.to(DEVICE)
        emotion_model.train()
        
        emotion_train = ImageDataset(os.path.join(WORK_DIR, "train", "emotion"), augment=True)
        emotion_loader = DataLoader(emotion_train, BATCH_SIZE, True, num_workers=NUM_WORKERS, pin_memory=True)
        
        for imgs, labels in tqdm(emotion_loader, desc="Emotion", leave=False):
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
            
            emotion_opt.zero_grad()  # ‚úÖ zero_grad Ïù¥Îèô: forward ÏïûÏúºÎ°ú
            with torch.amp.autocast("cuda"):
                imgs, labels_a, labels_b, lam = mixup_data(imgs, labels)
                logits = emotion_model(imgs)
                loss = lam * criterion(logits, labels_a) + (1 - lam) * criterion(logits, labels_b)

            video_scaler.scale(loss).backward()
            video_scaler.step(emotion_opt)
            video_scaler.update()
            
            loss_e += loss.item()
        
        loss_e /= len(emotion_loader)
        print(f"  ‚Üí Avg Loss: {loss_e:.4f}")
        
        emotion_model.cpu()
        del emotion_train, emotion_loader
        clear_memory()
        
        # ========== 3. Sound ==========
        print(f"\nüîä Training Sound...")
        audio_model.to(DEVICE)
        audio_model.train()
        
        sound_train = AudioDataset(os.path.join(WORK_DIR, "train", "sound"), augment=True)

        # ‚úÖ ÌÅ¥ÎûòÏä§ Í∞ÄÏ§ëÏπò (epochÎßàÎã§ Í≥ÑÏÇ∞ Ïú†ÏßÄ ‚Äì ÌÅ¥ÎûòÏä§Î≥Ñ Î≥ÑÎèÑ criterion)
        sound_labels_list = [item[1] for item in sound_train.samples]
        sound_label_ids   = [sound_train.label_to_id[l] for l in sound_labels_list]
        class_weights = compute_class_weight(
            'balanced',
            classes=np.arange(len(sound_train.label_to_id)),
            y=sound_label_ids
        )
        class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

        # ‚úÖ collate_fn_audio Ï†ÅÏö©
        sound_loader = DataLoader(
            sound_train, BATCH_SIZE, True,
            num_workers=2, pin_memory=True,
            collate_fn=collate_fn_audio
        )

        for batch in tqdm(sound_loader, desc="Sound", leave=False):
            audios = batch["input_values"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            audio_opt.zero_grad()

            with torch.amp.autocast("cuda"):
                # ‚úÖ labels ÏßÅÏ†ë Ï†ÑÎã¨ ‚Üí outputs.loss ÏÇ¨Ïö© (padding mask Í≥†Î†§)
                outputs = audio_model(input_values=audios, labels=labels)
                # ‚úÖ LOSS_WEIGHTS Ïú†ÏßÄ, ÌÅ¥ÎûòÏä§ Í∞ÄÏ§ëÏπòÎäî ÏàòÎèôÏúºÎ°ú Ï†ÅÏö©
                loss = outputs.loss * LOSS_WEIGHTS["sound"]
                # class_weightsÎ•º ÌôúÏö©Ìïú Î≥¥Ï†ï Ìï≠ Ï∂îÍ∞Ä
                per_sample_w = class_weights_tensor[labels]
                loss = (loss * per_sample_w.mean())

            audio_scaler.scale(loss).backward()
            audio_scaler.unscale_(audio_opt)
            torch.nn.utils.clip_grad_norm_(audio_model.parameters(), 1.0)
            audio_scaler.step(audio_opt)
            audio_scaler.update()
            # ‚úÖ Ïä§ÏºÄÏ§ÑÎü¨ step
            audio_scheduler.step()

            loss_s += loss.item()

        loss_s /= len(sound_loader)
        print(f"  ‚Üí Avg Loss: {loss_s:.4f}")

        audio_model.cpu()
        del sound_train, sound_loader, class_weights_tensor
        clear_memory()
        
        # ========== 4. Patella ==========
        print(f"\nü¶¥ Training Patella...")
        patella_model.to(DEVICE)
        patella_model.train()
        
        patella_train = PatellaDataset(os.path.join(WORK_DIR, "train", "patella"), augment=True)
        patella_loader = DataLoader(patella_train, BATCH_SIZE, True, num_workers=NUM_WORKERS, pin_memory=True)
        
        for imgs, keypoints, labels in tqdm(patella_loader, desc="Patella", leave=False):
            imgs, keypoints, labels = imgs.to(DEVICE), keypoints.to(DEVICE), labels.to(DEVICE)
            
            patella_opt.zero_grad()  # ‚úÖ zero_grad Ïù¥Îèô: forward ÏïûÏúºÎ°ú
            with torch.amp.autocast("cuda"):
                imgs, labels_a, labels_b, lam = mixup_data(imgs, labels)
                logits = patella_model(imgs, keypoints)
                loss = lam * criterion(logits, labels_a) + (1 - lam) * criterion(logits, labels_b)

            video_scaler.scale(loss).backward()
            video_scaler.step(patella_opt)
            video_scaler.update()
            
            loss_p += loss.item()
        
        loss_p /= len(patella_loader)
        print(f"  ‚Üí Avg Loss: {loss_p:.4f}")
        
        patella_model.cpu()
        del patella_train, patella_loader
        clear_memory()
        
        # ========== Validation ==========
        print(f"\nüîç Validation...")
        
        # Behavior Val
        behavior_model.to(DEVICE)
        behavior_model.eval()
        behavior_val = ImageDataset(os.path.join(WORK_DIR, "val", "behavior"), augment=False)
        behavior_val_loader = DataLoader(behavior_val, BATCH_SIZE, False, num_workers=NUM_WORKERS//2, pin_memory=True)
        
        correct_b, total_b = 0, 0
        with torch.no_grad():
            for imgs, labels in tqdm(behavior_val_loader, desc="Val Behavior", leave=False):
                imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
                logits = behavior_model(imgs)
                pred = logits.argmax(-1)
                correct_b += (pred == labels).sum().item()
                total_b += labels.size(0)
        acc_b = correct_b / total_b
        
        behavior_model.cpu()
        del behavior_val, behavior_val_loader
        clear_memory()
        
        # Emotion Val
        emotion_model.to(DEVICE)
        emotion_model.eval()
        emotion_val = ImageDataset(os.path.join(WORK_DIR, "val", "emotion"), augment=False)
        emotion_val_loader = DataLoader(emotion_val, BATCH_SIZE, False, num_workers=NUM_WORKERS//2, pin_memory=True)
        
        correct_e, total_e = 0, 0
        with torch.no_grad():
            for imgs, labels in tqdm(emotion_val_loader, desc="Val Emotion", leave=False):
                imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
                logits = emotion_model(imgs)
                pred = logits.argmax(-1)
                correct_e += (pred == labels).sum().item()
                total_e += labels.size(0)
        acc_e = correct_e / total_e
        
        emotion_model.cpu()
        del emotion_val, emotion_val_loader
        clear_memory()
        
        # Sound Val
        audio_model.to(DEVICE)
        audio_model.eval()
        sound_val = AudioDataset(os.path.join(WORK_DIR, "val", "sound"), augment=False)
        # ‚úÖ collate_fn_audio Ï†ÅÏö©
        sound_val_loader = DataLoader(
            sound_val, BATCH_SIZE, False,
            num_workers=2, pin_memory=True,
            collate_fn=collate_fn_audio
        )

        correct_s, total_s = 0, 0
        with torch.no_grad():
            for batch in tqdm(sound_val_loader, desc="Val Sound", leave=False):
                audios = batch["input_values"].to(DEVICE)
                labels = batch["labels"].to(DEVICE)
                # ‚úÖ outputs.logits ÏÇ¨Ïö©
                outputs = audio_model(input_values=audios, labels=labels)
                pred = outputs.logits.argmax(-1)
                correct_s += (pred == labels).sum().item()
                total_s   += labels.size(0)
        acc_s = correct_s / total_s
        
        audio_model.cpu()
        del sound_val, sound_val_loader
        clear_memory()
        
        # Patella Val
        patella_model.to(DEVICE)
        patella_model.eval()
        patella_val = PatellaDataset(os.path.join(WORK_DIR, "val", "patella"), augment=False)
        patella_val_loader = DataLoader(patella_val, BATCH_SIZE, False, num_workers=NUM_WORKERS//2, pin_memory=True)
        
        correct_p, total_p = 0, 0
        with torch.no_grad():
            for imgs, keypoints, labels in tqdm(patella_val_loader, desc="Val Patella", leave=False):
                imgs, keypoints, labels = imgs.to(DEVICE), keypoints.to(DEVICE), labels.to(DEVICE)
                logits = patella_model(imgs, keypoints)
                pred = logits.argmax(-1)
                correct_p += (pred == labels).sum().item()
                total_p += labels.size(0)
        acc_p = correct_p / total_p
        
        patella_model.cpu()
        del patella_val, patella_val_loader
        clear_memory()
        
        avg_acc = (acc_b + acc_e + acc_s + acc_p) / 4
        
        print(f"\nüìä Results:")
        print(f"  Behavior: Loss {loss_b:.4f} | Acc {acc_b:.4f} ({acc_b*100:.1f}%)")
        print(f"  Emotion:  Loss {loss_e:.4f} | Acc {acc_e:.4f} ({acc_e*100:.1f}%)")
        print(f"  Sound:    Loss {loss_s:.4f} | Acc {acc_s:.4f} ({acc_s*100:.1f}%)")
        print(f"  Patella:  Loss {loss_p:.4f} | Acc {acc_p:.4f} ({acc_p*100:.1f}%)")
        print(f"  Average Acc: {avg_acc:.4f} ({avg_acc*100:.1f}%)")
        
        history.append({
            'epoch': epoch+1,
            'loss_b': loss_b, 'loss_e': loss_e, 'loss_s': loss_s, 'loss_p': loss_p,
            'acc_b': acc_b, 'acc_e': acc_e, 'acc_s': acc_s, 'acc_p': acc_p,
            'acc_avg': avg_acc
        })
        
        if avg_acc > best_avg_acc:
            best_avg_acc = avg_acc
            
            # üî• Î™®Îç∏Îì§ÏùÑ CPUÎ°ú ÏòÆÍ∏¥ ÌõÑ Ï†ÄÏû•
            torch.save({
                "behavior_model":       behavior_model.state_dict(),
                "emotion_model":        emotion_model.state_dict(),
                "audio_model":          audio_model.state_dict(),
                "patella_model":        patella_model.state_dict(),
                "behavior_label_to_id": behavior_label_to_id,
                "emotion_label_to_id":  emotion_label_to_id,
                "sound_label_to_id":    sound_label_to_id,
                "sound_id_to_label":    sound_id_to_label,    # ‚úÖ Ïó≠Î∞©Ìñ• Îß§Ìïë Ï∂îÍ∞Ä
                "patella_label_to_id":  patella_label_to_id,
                "best_epoch":           epoch + 1,
                "best_acc":             best_avg_acc,
                "history":              history
            }, "pet_normal_omni_best.pth")
            print(f"  üíæ Saved new best model! (Acc: {best_avg_acc:.4f})")
    
    # Í∑∏ÎûòÌîÑ
    print("\nüìà Generating training history plot...")
    plt.figure(figsize=(20, 5))
    
    plt.subplot(141)
    plt.plot([h['acc_b'] for h in history], 'b-', label='Behavior', linewidth=2)
    plt.xlabel('Epoch'); plt.ylabel('Accuracy')
    plt.title('Behavior Accuracy'); plt.ylim(0, 1); plt.grid(True, alpha=0.3); plt.legend()
    
    plt.subplot(142)
    plt.plot([h['acc_e'] for h in history], 'r-', label='Emotion', linewidth=2)
    plt.xlabel('Epoch'); plt.ylabel('Accuracy')
    plt.title('Emotion Accuracy'); plt.ylim(0, 1); plt.grid(True, alpha=0.3); plt.legend()
    
    plt.subplot(143)
    plt.plot([h['acc_s'] for h in history], 'g-', label='Sound', linewidth=2)
    plt.xlabel('Epoch'); plt.ylabel('Accuracy')
    plt.title('Sound Accuracy'); plt.ylim(0, 1); plt.grid(True, alpha=0.3); plt.legend()
    
    plt.subplot(144)
    plt.plot([h['acc_p'] for h in history], 'purple', label='Patella', linewidth=2)
    plt.xlabel('Epoch'); plt.ylabel('Accuracy')
    plt.title('Patella Accuracy'); plt.ylim(0, 1); plt.grid(True, alpha=0.3); plt.legend()
    
    plt.tight_layout()
    plt.savefig('pet_omni_sequential_history.png', dpi=150, bbox_inches='tight')
    print("  ‚úÖ Saved: pet_omni_sequential_history.png")
    
    print(f"\nüéâ Training Finished!")
    print(f"  Best Average Acc: {best_avg_acc:.4f} ({best_avg_acc*100:.1f}%)")

# Script entry point: start the training run only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    train()

  from .autonotebook import tqdm as notebook_tqdm


🎯 Device: cuda:0

📦 Collecting behavior...
  → 757113 samples, 25 classes
  🎯 Target: 100000 samples
  📊 25 classes → max 4000 per class
    CAT_ARCH: 2296/2296
    CAT_ARMSTRETCH: 4000/38483
    CAT_FOOTPUSH: 4000/9517
    CAT_GETDOWN: 4000/13421
    CAT_GROOMING: 4000/65029
    CAT_HEADING: 4000/11237
    CAT_LAYDOWN: 4000/21474
    CAT_LYING: 4000/12119
    CAT_ROLL: 4000/8513
    CAT_SITDOWN: 4000/18401
    CAT_TAILING: 4000/36960
    CAT_WALKRUN: 4000/30498
    DOG_BODYLOWER: 4000/79772
    DOG_BODYSCRATCH: 4000/15783
    DOG_BODYSHAKE: 4000/15296
    DOG_FEETUP: 4000/34365
    DOG_FOOTUP: 4000/52506
    DOG_HEADING: 4000/19052
    DOG_LYING: 4000/32129
    DOG_MOUNTING: 4000/5211
    DOG_SIT: 4000/79182
    DOG_TAILING: 4000/35824
    DOG_TAILLOW: 4000/8376
    DOG_TURN: 4000/21554
    DOG_WALKRUN: 4000/90115
  ✅ Total sampled: 98296
  📋 Splitting & Copying behavior...


                                                                                     


📦 Collecting emotion...
  → 69113 samples, 10 classes
  🎯 Target: 100000 samples
  📊 10 classes → max 10000 per class
    cat_attentive: 997/997
    cat_happy: 1221/1221
    cat_relaxed: 2999/2999
    cat_sad: 171/171
    dog_angry: 8589/8589
    dog_anxious : 10000/11590
    dog_confused: 3286/3286
    dog_happy: 10000/17355
    dog_relaxed: 8699/8699
    dog_sad: 10000/14206
  ✅ Total sampled: 55962
  📋 Splitting & Copying emotion...


                                                                                 


📦 Collecting sound...
  → 1248 samples, 14 classes
  🎯 Min samples per class: 50
    cat_aggressive: 39
    cat_huntingMind: 10
    cat_mating: 10
    cat_paining: 10
    cat_positive: 30
    dog_bark: 316
    dog_breath: 62
    dog_cough: 115
    dog_growl: 65
    dog_howling: 151
    dog_playing: 91
    dog_sneeze: 110
    dog_tracheal_collapse: 89
    dog_whining: 150
  ✅ Total sampled: 1248
  📋 Splitting & Copying sound...


                                                                         


📦 Collecting patella luxation...
  → 100873 samples, 5 classes
  ℹ️  Patella: Using all samples
  📋 Splitting & Copying patella...


                                                                            


✅ Dataset preparation complete.

🔄 Pre-loading label mappings...
  📊 behavior: 11843 samples, 25 classes
  📊 emotion: 44766 samples, 10 classes
  📊 sound: 995 samples, 14 classes, augment=False
  📊 patella: 80696 samples, 5 classes

🔄 Initializing models...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 211/211 [00:00<00:00, 546.70it/s, Materializing param=wav2vec2.masked_spec_embed]                                            
[1mWav2Vec2ForSequenceClassification LOAD REPORT[0m from: facebook/wav2vec2-base
Key                          | Status     | 
-----------------------------+------------+-
project_q.bias               | UNEXPECTED | 
project_hid.bias             | UNEXPECTED | 
quantizer.weight_proj.weight | UNEXPECTED | 
quantizer.weight_proj.bias   | UNEXPECTED | 
quantizer.codevectors        | UNEXPECTED | 
project_hid.weight           | UNEXPECTED | 
project_q.weight             | UNEXPECTED | 
projector.weight             | MISSING    | 
classifier.bias              | MISSING    | 
projector.bias               | MISSING    | 
classifier.weight            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those pa

  📊 sound: 995 samples, 14 classes, augment=False

Epoch 1/100

🐾 Training Behavior...
  📊 behavior: 11843 samples, 25 classes


                                                           

  → Avg Loss: 2.6111

😊 Training Emotion...
  📊 emotion: 44766 samples, 10 classes


                                                            

  → Avg Loss: 1.4937

🔊 Training Sound...
  📊 sound: 995 samples, 14 classes, augment=True


  audio_scheduler.step()
                                                      

  → Avg Loss: 1.5670

🦴 Training Patella...
  📊 patella: 80696 samples, 5 classes


                                                            

  → Avg Loss: 1.1043

🔍 Validation...
  📊 behavior: 5281 samples, 25 classes


                                                               

  📊 emotion: 5592 samples, 10 classes


                                                              

  📊 sound: 121 samples, 14 classes, augment=False


                                                        

  📊 patella: 10086 samples, 5 classes


                                                              


📊 Results:
  Behavior: Loss 2.6111 | Acc 0.4579 (45.8%)
  Emotion:  Loss 1.4937 | Acc 0.6477 (64.8%)
  Sound:    Loss 1.5670 | Acc 0.2066 (20.7%)
  Patella:  Loss 1.1043 | Acc 0.8072 (80.7%)
  Average Acc: 0.5298 (53.0%)
  💾 Saved new best model! (Acc: 0.5298)

Epoch 2/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 2.0962

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.3883

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.5122

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.8965

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 2.0962 | Acc 0.5472 (54.7%)
  Emotion:  Loss 1.3883 | Acc 0.6726 (67.3%)
  Sound:    Loss 1.5122 | Acc 0.2562 (25.6%)
  Patella:  Loss 0.8965 | Acc 0.8866 (88.7%)
  Average Acc: 0.5906 (59.1%)
  üíæ Saved new best model! (Acc: 0.5906)

Epoch 3/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.8676

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.3309

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.4512

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.8193

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.8676 | Acc 0.5940 (59.4%)
  Emotion:  Loss 1.3309 | Acc 0.6931 (69.3%)
  Sound:    Loss 1.4512 | Acc 0.2562 (25.6%)
  Patella:  Loss 0.8193 | Acc 0.9094 (90.9%)
  Average Acc: 0.6132 (61.3%)
  üíæ Saved new best model! (Acc: 0.6132)

Epoch 4/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.7872

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.2912

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.3728

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.7712

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.7872 | Acc 0.6192 (61.9%)
  Emotion:  Loss 1.2912 | Acc 0.7148 (71.5%)
  Sound:    Loss 1.3728 | Acc 0.3223 (32.2%)
  Patella:  Loss 0.7712 | Acc 0.9249 (92.5%)
  Average Acc: 0.6453 (64.5%)
  üíæ Saved new best model! (Acc: 0.6453)

Epoch 5/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.6643

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.2694

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.2622

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.7564

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.6643 | Acc 0.6378 (63.8%)
  Emotion:  Loss 1.2694 | Acc 0.7010 (70.1%)
  Sound:    Loss 1.2622 | Acc 0.3636 (36.4%)
  Patella:  Loss 0.7564 | Acc 0.9293 (92.9%)
  Average Acc: 0.6579 (65.8%)
  üíæ Saved new best model! (Acc: 0.6579)

Epoch 6/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.5383

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.2308

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.2335

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.7252

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.5383 | Acc 0.6677 (66.8%)
  Emotion:  Loss 1.2308 | Acc 0.7124 (71.2%)
  Sound:    Loss 1.2335 | Acc 0.4050 (40.5%)
  Patella:  Loss 0.7252 | Acc 0.9536 (95.4%)
  Average Acc: 0.6847 (68.5%)
  üíæ Saved new best model! (Acc: 0.6847)

Epoch 7/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.5958

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.2159

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.1762

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.7177

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.5958 | Acc 0.6698 (67.0%)
  Emotion:  Loss 1.2159 | Acc 0.7142 (71.4%)
  Sound:    Loss 1.1762 | Acc 0.4463 (44.6%)
  Patella:  Loss 0.7177 | Acc 0.9561 (95.6%)
  Average Acc: 0.6966 (69.7%)
  üíæ Saved new best model! (Acc: 0.6966)

Epoch 8/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.5329

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1984

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.1428

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.7010

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.5329 | Acc 0.6804 (68.0%)
  Emotion:  Loss 1.1984 | Acc 0.7266 (72.7%)
  Sound:    Loss 1.1428 | Acc 0.5041 (50.4%)
  Patella:  Loss 0.7010 | Acc 0.9600 (96.0%)
  Average Acc: 0.7178 (71.8%)
  üíæ Saved new best model! (Acc: 0.7178)

Epoch 9/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.4929

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1916

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.2921

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6970

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.4929 | Acc 0.6838 (68.4%)
  Emotion:  Loss 1.1916 | Acc 0.7278 (72.8%)
  Sound:    Loss 1.2921 | Acc 0.5372 (53.7%)
  Patella:  Loss 0.6970 | Acc 0.9558 (95.6%)
  Average Acc: 0.7261 (72.6%)
  üíæ Saved new best model! (Acc: 0.7261)

Epoch 10/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.4343

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1784

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.1791

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6929

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.4343 | Acc 0.6891 (68.9%)
  Emotion:  Loss 1.1784 | Acc 0.7328 (73.3%)
  Sound:    Loss 1.1791 | Acc 0.5372 (53.7%)
  Patella:  Loss 0.6929 | Acc 0.9578 (95.8%)
  Average Acc: 0.7292 (72.9%)
  üíæ Saved new best model! (Acc: 0.7292)

Epoch 11/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.4172

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1626

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.0418

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6865

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.4172 | Acc 0.7004 (70.0%)
  Emotion:  Loss 1.1626 | Acc 0.7357 (73.6%)
  Sound:    Loss 1.0418 | Acc 0.6116 (61.2%)
  Patella:  Loss 0.6865 | Acc 0.9694 (96.9%)
  Average Acc: 0.7543 (75.4%)
  üíæ Saved new best model! (Acc: 0.7543)

Epoch 12/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.4223

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1494

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.9880

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6878

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.4223 | Acc 0.6953 (69.5%)
  Emotion:  Loss 1.1494 | Acc 0.7486 (74.9%)
  Sound:    Loss 0.9880 | Acc 0.5702 (57.0%)
  Patella:  Loss 0.6878 | Acc 0.9659 (96.6%)
  Average Acc: 0.7450 (74.5%)

Epoch 13/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3880

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1472

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 1.1041

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6831

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3880 | Acc 0.7038 (70.4%)
  Emotion:  Loss 1.1472 | Acc 0.7416 (74.2%)
  Sound:    Loss 1.1041 | Acc 0.6033 (60.3%)
  Patella:  Loss 0.6831 | Acc 0.9705 (97.0%)
  Average Acc: 0.7548 (75.5%)
  üíæ Saved new best model! (Acc: 0.7548)

Epoch 14/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3690

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1400

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.9525

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6814

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3690 | Acc 0.7107 (71.1%)
  Emotion:  Loss 1.1400 | Acc 0.7428 (74.3%)
  Sound:    Loss 0.9525 | Acc 0.5868 (58.7%)
  Patella:  Loss 0.6814 | Acc 0.9641 (96.4%)
  Average Acc: 0.7511 (75.1%)

Epoch 15/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3925

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1221

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.9243

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6782

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3925 | Acc 0.7054 (70.5%)
  Emotion:  Loss 1.1221 | Acc 0.7432 (74.3%)
  Sound:    Loss 0.9243 | Acc 0.5620 (56.2%)
  Patella:  Loss 0.6782 | Acc 0.9781 (97.8%)
  Average Acc: 0.7472 (74.7%)

Epoch 16/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.4008

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1198

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.9139

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6800

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.4008 | Acc 0.7093 (70.9%)
  Emotion:  Loss 1.1198 | Acc 0.7371 (73.7%)
  Sound:    Loss 0.9139 | Acc 0.6198 (62.0%)
  Patella:  Loss 0.6800 | Acc 0.9754 (97.5%)
  Average Acc: 0.7604 (76.0%)
  üíæ Saved new best model! (Acc: 0.7604)

Epoch 17/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3693

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1110

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.8509

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6776

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3693 | Acc 0.7118 (71.2%)
  Emotion:  Loss 1.1110 | Acc 0.7369 (73.7%)
  Sound:    Loss 0.8509 | Acc 0.6612 (66.1%)
  Patella:  Loss 0.6776 | Acc 0.9778 (97.8%)
  Average Acc: 0.7719 (77.2%)
  üíæ Saved new best model! (Acc: 0.7719)

Epoch 18/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3369

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1073

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.8261

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6678

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3369 | Acc 0.7078 (70.8%)
  Emotion:  Loss 1.1073 | Acc 0.7321 (73.2%)
  Sound:    Loss 0.8261 | Acc 0.6529 (65.3%)
  Patella:  Loss 0.6678 | Acc 0.9734 (97.3%)
  Average Acc: 0.7666 (76.7%)

Epoch 19/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.4254

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.1097

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.9297

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6707

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.4254 | Acc 0.7054 (70.5%)
  Emotion:  Loss 1.1097 | Acc 0.7409 (74.1%)
  Sound:    Loss 0.9297 | Acc 0.6612 (66.1%)
  Patella:  Loss 0.6707 | Acc 0.9756 (97.6%)
  Average Acc: 0.7708 (77.1%)

Epoch 20/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3324

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.0966

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.7954

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6760

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3324 | Acc 0.7211 (72.1%)
  Emotion:  Loss 1.0966 | Acc 0.7502 (75.0%)
  Sound:    Loss 0.7954 | Acc 0.6529 (65.3%)
  Patella:  Loss 0.6760 | Acc 0.9772 (97.7%)
  Average Acc: 0.7753 (77.5%)
  üíæ Saved new best model! (Acc: 0.7753)

Epoch 21/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3531

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.0999

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.7702

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6654

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3531 | Acc 0.7186 (71.9%)
  Emotion:  Loss 1.0999 | Acc 0.7538 (75.4%)
  Sound:    Loss 0.7702 | Acc 0.6446 (64.5%)
  Patella:  Loss 0.6654 | Acc 0.9760 (97.6%)
  Average Acc: 0.7733 (77.3%)

Epoch 22/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3485

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.0858

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.7576

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6626

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3485 | Acc 0.7190 (71.9%)
  Emotion:  Loss 1.0858 | Acc 0.7486 (74.9%)
  Sound:    Loss 0.7576 | Acc 0.6446 (64.5%)
  Patella:  Loss 0.6626 | Acc 0.9757 (97.6%)
  Average Acc: 0.7720 (77.2%)

Epoch 23/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3635

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.0796

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.7225

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6566

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3635 | Acc 0.7182 (71.8%)
  Emotion:  Loss 1.0796 | Acc 0.7318 (73.2%)
  Sound:    Loss 0.7225 | Acc 0.6364 (63.6%)
  Patella:  Loss 0.6566 | Acc 0.9784 (97.8%)
  Average Acc: 0.7662 (76.6%)

Epoch 24/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3452

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.0690

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.7000

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6640

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3452 | Acc 0.7171 (71.7%)
  Emotion:  Loss 1.0690 | Acc 0.7398 (74.0%)
  Sound:    Loss 0.7000 | Acc 0.6446 (64.5%)
  Patella:  Loss 0.6640 | Acc 0.9794 (97.9%)
  Average Acc: 0.7702 (77.0%)

Epoch 25/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3904

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.0820

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.6902

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6637

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3904 | Acc 0.7156 (71.6%)
  Emotion:  Loss 1.0820 | Acc 0.7325 (73.2%)
  Sound:    Loss 0.6902 | Acc 0.6612 (66.1%)
  Patella:  Loss 0.6637 | Acc 0.9782 (97.8%)
  Average Acc: 0.7719 (77.2%)

Epoch 26/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3012

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.0823

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.6590

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6571

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3012 | Acc 0.7215 (72.1%)
  Emotion:  Loss 1.0823 | Acc 0.7364 (73.6%)
  Sound:    Loss 0.6590 | Acc 0.6446 (64.5%)
  Patella:  Loss 0.6571 | Acc 0.9788 (97.9%)
  Average Acc: 0.7703 (77.0%)

Epoch 27/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3184

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.0738

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                      

  ‚Üí Avg Loss: 0.6632

ü¶¥ Training Patella...
  üìä patella: 80696 samples, 5 classes


                                                            

  ‚Üí Avg Loss: 0.6615

üîç Validation...
  üìä behavior: 5281 samples, 25 classes


                                                               

  üìä emotion: 5592 samples, 10 classes


                                                              

  üìä sound: 121 samples, 14 classes, augment=False


                                                        

  üìä patella: 10086 samples, 5 classes


                                                              


üìä Results:
  Behavior: Loss 1.3184 | Acc 0.7215 (72.1%)
  Emotion:  Loss 1.0738 | Acc 0.7305 (73.1%)
  Sound:    Loss 0.6632 | Acc 0.6446 (64.5%)
  Patella:  Loss 0.6615 | Acc 0.9801 (98.0%)
  Average Acc: 0.7692 (76.9%)

Epoch 28/100

üêæ Training Behavior...
  üìä behavior: 11843 samples, 25 classes


                                                           

  ‚Üí Avg Loss: 1.3733

üòä Training Emotion...
  üìä emotion: 44766 samples, 10 classes


                                                            

  ‚Üí Avg Loss: 1.0826

üîä Training Sound...
  üìä sound: 995 samples, 14 classes, augment=True


                                                     

OutOfMemoryError: CUDA out of memory. Tried to allocate 1000.00 MiB. GPU 0 has a total capacity of 23.57 GiB of which 830.25 MiB is free. Including non-PyTorch memory, this process has 7.05 GiB memory in use. Of the allocated memory 4.88 GiB is allocated by PyTorch, and 1.84 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Test

In [None]:
# import os
# import random
# import shutil
# from tqdm import tqdm
# import matplotlib.pyplot as plt
# import torch
# import torch.nn as nn
# from torch.utils.data import Dataset, DataLoader
# from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, get_linear_schedule_with_warmup
# import torchvision.transforms as transforms
# from torchvision.models import resnet34, ResNet34_Weights
# from PIL import Image
# import librosa
# import numpy as np
# from collections import defaultdict, Counter
# from sklearn.utils.class_weight import compute_class_weight

# AUDIO_MODEL_NAME = "facebook/wav2vec2-base"
# FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(AUDIO_MODEL_NAME)

# class VideoMultiBackbone(nn.Module):
#     def __init__(self, num_b, num_e):
#         super().__init__()
        
#         backbone_b = resnet34(weights=ResNet34_Weights.IMAGENET1K_V1)
#         in_features_b = backbone_b.fc.in_features
#         backbone_b.fc = nn.Identity()
#         self.behavior_backbone = backbone_b
#         self.behavior_head = nn.Linear(in_features_b, num_b)
        
#         backbone_e = resnet34(weights=ResNet34_Weights.IMAGENET1K_V1)
#         in_features_e = backbone_e.fc.in_features
#         backbone_e.fc = nn.Identity()
#         self.emotion_backbone = backbone_e
#         self.emotion_head = nn.Linear(in_features_e, num_e)
    
#     def forward(self, x, task):
#         if task == "behavior":
#             feat = self.behavior_backbone(x)
#             return self.behavior_head(feat)
#         elif task == "emotion":
#             feat = self.emotion_backbone(x)
#             return self.emotion_head(feat)
#         else:
#             raise ValueError("Task must be 'behavior' or 'emotion'")
        
# class AudioModel(nn.Module):
#     def __init__(self, num_classes, freeze_backbone=False):  # default False: the wav2vec2 backbone stays trainable
#         super().__init__()
#         self.model = Wav2Vec2ForSequenceClassification.from_pretrained(
#             AUDIO_MODEL_NAME,
#             num_labels=num_classes,
#             ignore_mismatched_sizes=True
#         )
        
#         # Freeze option (default: train the whole model)
#         if freeze_backbone:
#             for param in self.model.wav2vec2.parameters():
#                 param.requires_grad = False
    
#     def forward(self, x):
#         return self.model(input_values=x).logits

# def test():
#     from transformers import Wav2Vec2FeatureExtractor
#     FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(
#         "facebook/wav2vec2-base"
#     )

#     DEVICE = "cuda:1" if torch.cuda.is_available() else "cpu"
#     BATCH_SIZE = 16
#     SR = 16000
#     MAX_AUDIO_LEN = SR * 5

#     print("üîé Loading best model...")
#     checkpoint = torch.load("pet_omni_best.pth", map_location=DEVICE)

#     behavior_label_to_id = checkpoint["behavior_label_to_id"]
#     emotion_label_to_id = checkpoint["emotion_label_to_id"]
#     sound_label_to_id = checkpoint["sound_label_to_id"]

#     # -----------------------------
#     # Restore the models
#     # -----------------------------
#     video_model = VideoMultiBackbone(
#         len(behavior_label_to_id),
#         len(emotion_label_to_id)
#     ).to(DEVICE)

#     audio_model = AudioModel(
#         len(sound_label_to_id)
#     ).to(DEVICE)

#     video_model.load_state_dict(checkpoint["video_model"])
#     audio_model.load_state_dict(checkpoint["audio_model"])

#     video_model.eval()
#     audio_model.eval()

#     print("üì¶ Loading TEST datasets...")

#     TEST_DIR = os.path.join("files", "work", "omni_dataset", "test")

#     # -----------------------------
#     # Dataset definitions that do not depend on the training code
#     # -----------------------------
#     class TestImageDataset(Dataset):
#         def __init__(self, task_dir, label_to_id):
#             self.samples = []
#             self.label_to_id = label_to_id

#             for label in os.listdir(task_dir):
#                 if label not in label_to_id:
#                     continue

#                 label_dir = os.path.join(task_dir, label)
#                 for file in os.listdir(label_dir):
#                     if file.lower().endswith(('.jpg','.png','.jpeg')):
#                         self.samples.append(
#                             (os.path.join(label_dir,file),
#                              label_to_id[label])
#                         )

#             self.transform = transforms.Compose([
#                 transforms.Resize((224,224)),
#                 transforms.ToTensor(),
#                 transforms.Normalize(
#                     [0.485,0.456,0.406],
#                     [0.229,0.224,0.225]
#                 )
#             ])

#         def __len__(self):
#             return len(self.samples)

#         def __getitem__(self, idx):
#             path, label_id = self.samples[idx]
#             img = Image.open(path).convert("RGB")
#             img = self.transform(img)
#             return img, label_id


#     class TestAudioDataset(Dataset):
#         def __init__(self, task_dir, label_to_id):
#             self.samples = []
#             self.label_to_id = label_to_id

#             for label in os.listdir(task_dir):
#                 if label not in label_to_id:
#                     continue

#                 label_dir = os.path.join(task_dir, label)
#                 for file in os.listdir(label_dir):
#                     if file.lower().endswith(('.wav','.mp3','.m4a')):
#                         self.samples.append(
#                             (os.path.join(label_dir,file),
#                              label_to_id[label])
#                         )

#         def __len__(self):
#             return len(self.samples)

#         def __getitem__(self, idx):
#             path, label_id = self.samples[idx]
#             waveform, _ = librosa.load(path, sr=SR, mono=True)

#             if len(waveform) > MAX_AUDIO_LEN:
#                 waveform = waveform[:MAX_AUDIO_LEN]
#             else:
#                 waveform = np.pad(
#                     waveform,
#                     (0, MAX_AUDIO_LEN - len(waveform))
#                 )

#             inputs = FEATURE_EXTRACTOR(
#                 waveform,
#                 sampling_rate=SR,
#                 return_tensors="pt"
#             )

#             return inputs.input_values.squeeze(0), label_id


#     # -----------------------------
#     # Loader
#     # -----------------------------
#     behavior_loader = DataLoader(
#         TestImageDataset(
#             os.path.join(TEST_DIR,"behavior"),
#             behavior_label_to_id
#         ),
#         BATCH_SIZE, False
#     )

#     emotion_loader = DataLoader(
#         TestImageDataset(
#             os.path.join(TEST_DIR,"emotion"),
#             emotion_label_to_id
#         ),
#         BATCH_SIZE, False
#     )

#     sound_loader = DataLoader(
#         TestAudioDataset(
#             os.path.join(TEST_DIR,"sound"),
#             sound_label_to_id
#         ),
#         BATCH_SIZE, False
#     )

#     # -----------------------------
#     # Evaluation
#     # -----------------------------
#     def evaluate(loader, task):
#         correct, total = 0, 0
#         with torch.no_grad():
#             for x, y in loader:
#                 x, y = x.to(DEVICE), y.to(DEVICE)

#                 if task in ["behavior","emotion"]:
#                     logits = video_model(x, task)
#                 else:
#                     logits = audio_model(x)

#                 pred = logits.argmax(-1)
#                 correct += (pred == y).sum().item()
#                 total += y.size(0)

#         return correct / total if total > 0 else 0


#     acc_b = evaluate(behavior_loader, "behavior")
#     acc_e = evaluate(emotion_loader, "emotion")
#     acc_s = evaluate(sound_loader, "sound")

#     avg_acc = (acc_b + acc_e + acc_s) / 3

#     print("\nüìä TEST Results:")
#     print(f"  Behavior Acc: {acc_b:.4f} ({acc_b*100:.1f}%)")
#     print(f"  Emotion Acc:  {acc_e:.4f} ({acc_e*100:.1f}%)")
#     print(f"  Sound Acc:    {acc_s:.4f} ({acc_s*100:.1f}%)")
#     print(f"  Average Acc:  {avg_acc:.4f} ({avg_acc*100:.1f}%)")


# if __name__ == "__main__":
#     test()



üîé Loading best model...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 211/211 [00:00<00:00, 658.49it/s, Materializing param=wav2vec2.masked_spec_embed]                                            
[1mWav2Vec2ForSequenceClassification LOAD REPORT[0m from: facebook/wav2vec2-base
Key                          | Status     | 
-----------------------------+------------+-
project_hid.weight           | UNEXPECTED | 
project_q.weight             | UNEXPECTED | 
quantizer.codevectors        | UNEXPECTED | 
quantizer.weight_proj.bias   | UNEXPECTED | 
quantizer.weight_proj.weight | UNEXPECTED | 
project_hid.bias             | UNEXPECTED | 
project_q.bias               | UNEXPECTED | 
projector.weight             | MISSING    | 
classifier.bias              | MISSING    | 
projector.bias               | MISSING    | 
classifier.weight            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those pa

üì¶ Loading TEST datasets...

üìä TEST Results:
  Behavior Acc: 0.7273 (72.7%)
  Emotion Acc:  0.7525 (75.2%)
  Sound Acc:    0.9138 (91.4%)
  Average Acc:  0.7979 (79.8%)
