# SAMO DeBERTa-v3-Large Optimized Training
## Target: >60% F1 Macro on GoEmotions

### Key Optimizations (original plan; superseded by the recovery configuration in Cell 2):
- ✅ Correct learning rate for DeBERTa (5e-6 vs 3e-4)
- ✅ Better bias initialization strategy
- ✅ Increased LoRA capacity (rank 64)
- ✅ More training data (90% subset)
- ✅ Gradient checkpointing for memory efficiency
- ✅ Improved ASL parameters

In [10]:
# ===== A) PRE-FLIGHT SANITY (run first) =====
import os, shutil, subprocess, sys
import torch

# Safer defaults for single-node, PCIe multi-GPU (2x3090).
# setdefault() is used throughout so values already exported by the launch
# environment are never overwritten.
_PREFLIGHT_ENV_DEFAULTS = {
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0,1",  # use both GPUs
    "NCCL_DEBUG": "WARN",
    "NCCL_ASYNC_ERROR_HANDLING": "1",
    # No Infiniband on Vast.ai by default; disable to avoid NCCL picking it
    "NCCL_IB_DISABLE": "1",
    # Start with P2P enabled; if you still crash, set to "1" in Section D
    "NCCL_P2P_DISABLE": "0",
    # "0" keeps NCCL's shared-memory transport enabled (this is the library
    # default). Switch it to "1" when the container was not launched with
    # --ipc=host and /dev/shm is too small (common on Vast).
    "NCCL_SHM_DISABLE": "0",
}
for _env_name, _env_value in _PREFLIGHT_ENV_DEFAULTS.items():
    os.environ.setdefault(_env_name, _env_value)

print("🔧 Torch:", torch.__version__)
print("🧪 CUDA available:", torch.cuda.is_available())
print("🖥️ GPUs:", torch.cuda.device_count())

for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f" • {i}: {props.name} | {props.total_memory/1e9:.1f} GB | CC {props.major}.{props.minor}")

# Check basic P2P capability (both directions must be available)
if torch.cuda.device_count() >= 2:
    p2p = torch.cuda.can_device_access_peer(0, 1) and torch.cuda.can_device_access_peer(1, 0)
    print("🔗 GPU0<->GPU1 P2P:", p2p)
    if not p2p:
        # Make the consequence explicit instead of leaving a bare "False".
        print("   ↳ No peer access between GPU0 and GPU1; set NCCL_P2P_DISABLE=1 if multi-GPU runs crash.")
else:
    print("⚠️ Only one GPU visible. Check your Vast.ai template env.")

# Optional: quick /dev/shm heads-up (use --shm-size=16g for dataloaders)
try:
    shm = shutil.disk_usage("/dev/shm").total / (1024**3)
except OSError as shm_error:
    # Best-effort probe: /dev/shm may not exist on this platform. Report it
    # rather than swallowing every exception silently.
    print(f"📦 /dev/shm not inspectable: {shm_error}")
else:
    print(f"📦 /dev/shm size: {shm:.1f} GiB")


🔧 Torch: 2.5.1+cu124
🧪 CUDA available: True
🖥️ GPUs: 2
 • 0: NVIDIA GeForce RTX 3090 | 25.4 GB | CC 8.6
 • 1: NVIDIA GeForce RTX 3090 | 25.4 GB | CC 8.6
🔗 GPU0<->GPU1 P2P: False
📦 /dev/shm size: 41.0 GiB


In [17]:
# ===== B) DDP LAUNCHER (import once) =====
from accelerate import notebook_launcher
from accelerate.utils import write_basic_config


# Make sure accelerate has a default config on disk (re-running this is harmless)
write_basic_config(mixed_precision="fp16") # 3090 works well in fp16; switch to "bf16" if desired


def ddp_launch(fn, num_procs=None):
    """Run ``fn`` through accelerate's ``notebook_launcher``.

    Args:
        fn: Callable taking no arguments; it is handed to the launcher as-is.
        num_procs: Number of processes to spawn. Any falsy value (``None``,
            ``0``) falls back to the number of visible CUDA devices, and to a
            single process when no device is visible.
    """
    import torch

    if num_procs:
        world_size = num_procs
    else:
        world_size = torch.cuda.device_count() or 1

    print(f"🚀 Spawning {world_size} DDP processes...")
    notebook_launcher(fn, args=(), num_processes=world_size)

Configuration already exists at /workspace/.cache/huggingface/accelerate/default_config.yaml, will not override. Run `accelerate config` manually or pass a different `save_location`.


In [None]:
# SAMO Recovery Plan: Step 1 - Hardware Diagnostic & Cleanup
# Run this BEFORE attempting training to identify hardware/memory issues

import torch
import gc
import psutil
import os

print("🔍 SAMO Recovery Diagnostic")
print("=" * 50)

# 1. GPU Hardware Check
# Inspects GPU 0 only and derives `hardware_type` / `recommended_batch`, which
# the rest of this cell reads when building RECOVERY_CONFIG.
print("\n🎮 GPU INFORMATION:")
if not torch.cuda.is_available():
    print("   ❌ No CUDA GPU available!")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # installed by `site` for interactive use (it is not guaranteed to exist,
    # and in a notebook it may shut the kernel down), and it would also report
    # a success status for what is a failure.
    raise SystemExit(1)

gpu_name = torch.cuda.get_device_name(0)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"   GPU: {gpu_name}")
print(f"   Total Memory: {gpu_memory:.1f}GB")

# Check if it's T4 vs 3090 (substring match on the reported device name)
if "T4" in gpu_name:
    recommended_batch = 8
    hardware_type = "T4"
elif "3090" in gpu_name or "A100" in gpu_name:
    recommended_batch = 20
    hardware_type = "High-end"
else:
    # Unrecognised card: fall back to a middle-of-the-road batch size.
    recommended_batch = 12
    hardware_type = "Unknown"

print(f"   Hardware Type: {hardware_type}")
print(f"   Recommended Batch Size: {recommended_batch}")

# 2. Memory Cleanup
# Frees what can be freed and then reports the remaining GPU / host memory.
print("\n🧹 MEMORY CLEANUP:")
# Clear Python objects
gc.collect()

# Clear CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # NOTE: despite the `_before` suffix, both values are read AFTER the cache
    # was emptied, i.e. they describe the state going into training.
    allocated_before = torch.cuda.memory_allocated() / 1e9
    reserved_before = torch.cuda.memory_reserved() / 1e9
    print(f"   GPU Memory Allocated: {allocated_before:.2f}GB")
    print(f"   GPU Memory Reserved: {reserved_before:.2f}GB")

# Check system RAM (reported in decimal GB, i.e. bytes / 1e9)
ram = psutil.virtual_memory()
print(f"   System RAM: {ram.available / 1e9:.1f}GB available / {ram.total / 1e9:.1f}GB total")

# 3. Environment Check
visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', 'All')
print(
    "\n🔧 ENVIRONMENT:",
    f"   PyTorch: {torch.__version__}",
    f"   CUDA Version: {torch.version.cuda}",
    f"   Visible Devices: {visible_devices}",
    sep="\n",
)

# 4. Previous Training Issue Analysis (static recap of the failed run)
print(
    "\n⚠️  FAILURE ANALYSIS:",
    "   Previous failure indicators:",
    "   - Training speed: 0.61 it/s (abnormally slow)",
    "   - F1 score: 6.97% (model collapse)",
    "   - Learning rate: 5e-6 (likely too low for LoRA)",
    sep="\n",
)

# 5. Recovery Recommendations
print(
    "\n✅ RECOVERY RECOMMENDATIONS:",
    f"   - Use batch_size: {recommended_batch} (for {hardware_type} GPU)",
    "   - Learning rate: 1e-4 (conservative middle ground)",
    "   - LoRA rank: 32 (proven working configuration)",
    "   - Data subset: 70% (between working 60% and failed 90%)",
    "   - Gradient checkpointing: True (for memory)",
    sep="\n",
)

# 6. Hardware-Specific Configuration
# Combines the values detected above with the fixed recovery hyperparameters.
RECOVERY_CONFIG = dict(
    hardware_type=hardware_type,
    gpu_memory_gb=gpu_memory,
    recommended_batch_size=recommended_batch,
    learning_rate=1e-4,
    lora_r=32,
    lora_alpha=64,
    subset_ratio=0.70,
    num_epochs=3,
    gradient_checkpointing=True,
    fp16=True,
)

print("\n📋 RECOMMENDED CONFIG:")
for key, value in RECOVERY_CONFIG.items():
    print(f"   {key}: {value}")

print(
    "\n🎯 NEXT STEPS:",
    "1. Use the RECOVERY_CONFIG above in your training",
    "2. Run 1 epoch first as validation",
    "3. Monitor F1 score - should be >30% after epoch 1",
    "4. If successful, continue with full training",
    sep="\n",
)
print(f"\n{'=' * 50}")

# Export config for next cell
globals().update(RECOVERY_CONFIG=RECOVERY_CONFIG)

🔍 SAMO Recovery Diagnostic

🎮 GPU INFORMATION:
   GPU: NVIDIA GeForce RTX 3090
   Total Memory: 25.4GB
   Hardware Type: High-end
   Recommended Batch Size: 20

🧹 MEMORY CLEANUP:


In [2]:
# Cell 1: Environment Setup
import os
import sys
import subprocess
import warnings
warnings.filterwarnings('ignore')

# Set environment variables BEFORE importing torch
# These variables are meant to be in place before torch/CUDA start up. If an
# earlier cell already imported torch, the device restriction below may be
# ignored by this process, so say so instead of failing later in training.
if "torch" in sys.modules:
    print("⚠️  torch is already imported in this process; CUDA_VISIBLE_DEVICES=0 "
          "may not take effect until the kernel is restarted.")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Determine cache directory: Kaggle, then a /workspace volume, then local.
if os.path.exists("/kaggle"):
    cache_dir = "/kaggle/working/hf_cache"
elif os.path.exists("/workspace"):
    cache_dir = "/workspace/.cache/huggingface"
else:
    cache_dir = "./hf_cache"

os.makedirs(cache_dir, exist_ok=True)
os.environ["HF_HOME"] = cache_dir
os.environ["TRANSFORMERS_CACHE"] = cache_dir
# `datasets` reads HF_DATASETS_CACHE; the unprefixed DATASETS_CACHE is kept
# only so anything that already relied on it keeps seeing the same value.
os.environ["HF_DATASETS_CACHE"] = cache_dir
os.environ["DATASETS_CACHE"] = cache_dir

print(f"📁 Cache directory: {cache_dir}")

# Install packages
def pip_install(packages):
    """Install ``packages`` quietly with the pip of the running interpreter.

    Args:
        packages: Requirement specifiers (e.g. ``"peft==0.10.0"``). Any
            iterable of strings is accepted; a single string is treated as
            one requirement.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    requirements = [packages] if isinstance(packages, str) else list(packages)
    if not requirements:
        # pip errors out when given no requirement, so an empty request is a no-op.
        return
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *requirements])

# Install the training stack. The first six packages are pinned to exact
# versions; sentencepiece and tokenizers only have lower bounds.
print("📦 Installing required packages...")
pip_install([
    "transformers==4.41.2",
    "datasets==2.19.0",
    "accelerate==0.31.0",
    "peft==0.10.0",
    "evaluate==0.4.2",
    "scikit-learn==1.5.0",
    "sentencepiece>=0.1.99",
    "tokenizers>=0.15.2"
])

# torch is imported only after the environment variables above were set;
# the report below describes device 0 as seen by this process.
import torch
print(f"\n🔥 PyTorch: {torch.__version__}")
print(f"🎮 CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

📁 Cache directory: /workspace/.cache/huggingface
📦 Installing required packages...

🔥 PyTorch: 2.5.1+cu124
🎮 CUDA available: True
   GPU: NVIDIA GeForce RTX 3090
   Memory: 25.4GB


[0m

In [None]:
# Cell 2: RECOVERY Configuration (Conservative & Stable)
# This replaces the failed "optimized" config with proven stable settings

import random
import numpy as np
import torch

# RECOVERY CONFIGURATION - Based on what worked + conservative improvements
# Single source of hyperparameters for the dataset, model, trainer and loss
# cells below; every later cell reads its settings from this dict.
CONFIG = {
    "seed": 42,
    "output_dir": "./samo_deberta_recovery",

    # DATA - Conservative increase from working 60%
    "use_subset": True,
    "subset_ratio": 0.70,  # Between working 60% and failed 90%

    # MODEL
    "model_name": "microsoft/deberta-v3-large",
    "gradient_checkpointing": True,  # Memory efficiency
    "fp16": True,

    # LoRA - Use proven working settings
    # NOTE(review): "use_lora" is not read by the model-building cell visible
    # in this notebook (LoRA is applied unconditionally there).
    "use_lora": True,
    "lora_r": 32,  # Back to working value (not 64)
    "lora_alpha": 64,  # 2x ratio
    "lora_dropout": 0.1,
    "lora_target_modules": ["query_proj", "key_proj", "value_proj"],

    # TRAINING - CRITICAL FIX
    "num_train_epochs": 3,  # Conservative start
    # Batch size will be set based on hardware detection
    "per_device_train_batch_size": 16,  # Default, will adjust
    "per_device_eval_batch_size": 32,
    "gradient_accumulation_steps": 2,   # Conservative
    "max_length": 96,  # Shorter for stability

    # CRITICAL: CORRECT LEARNING RATE!
    "learning_rate": 1e-4,  # Conservative middle (3e-4 worked, 5e-6 failed)
    "weight_decay": 0.01,
    "warmup_ratio": 0.1,
    "lr_scheduler_type": "cosine",

    # EVALUATION
    # eval_steps and save_steps are kept equal so every evaluated step also
    # has a checkpoint that load_best_model_at_end can restore.
    "evaluation_strategy": "steps",
    "eval_steps": 300,  # More frequent monitoring
    "save_steps": 300,
    "logging_steps": 50,
    "save_total_limit": 2,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "early_stopping_patience": 2,  # Stop if no improvement

    # ASL - Conservative settings (passed to AsymmetricLoss by ASLTrainer)
    "asl_gamma_neg": 2.0,
    "asl_gamma_pos": 1.0,
    "asl_clip": 0.05,
    "asl_pos_alpha": 1.0,
}

# Hardware-specific adjustments
# RECOVERY_CONFIG only exists if the diagnostic cell was run in this kernel.
if 'RECOVERY_CONFIG' in globals():
    # Use hardware-detected batch size
    CONFIG["per_device_train_batch_size"] = RECOVERY_CONFIG["recommended_batch_size"]
    print(f"🔧 Adjusted batch size to {CONFIG['per_device_train_batch_size']} for detected hardware")

# Set seed for reproducibility
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, every CUDA device is seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(CONFIG["seed"])

# Learning rate of the failed run; used to report the real increase factor
# instead of a hard-coded (and previously wrong) multiplier.
failed_learning_rate = 5e-6
lr_increase = CONFIG['learning_rate'] / failed_learning_rate

print("✅ Recovery Configuration Loaded")
print("🎯 KEY CHANGES FROM FAILED CONFIG:")
print(f"   ❌ Learning rate: 5e-6 → ✅ {CONFIG['learning_rate']} ({lr_increase:.0f}x increase!)")
print(f"   ❌ LoRA rank: 64 → ✅ {CONFIG['lora_r']} (back to proven)")
print(f"   ❌ Data subset: 90% → ✅ {CONFIG['subset_ratio']*100:.0f}% (conservative)")
print(f"   ✅ Epochs: {CONFIG['num_train_epochs']} (quick validation)")

# Per-device effective batch: micro-batch size times accumulation steps.
effective_batch = CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']
print(f"\n📊 TRAINING PARAMETERS:")
print(f"   Effective batch size: {effective_batch}")
print(f"   Learning rate: {CONFIG['learning_rate']} (CRITICAL FIX)")
# 43000 is a rounded stand-in for the size of the GoEmotions train split.
print(f"   Data samples: ~{int(CONFIG['subset_ratio'] * 43000)} train samples")
print(f"   Expected F1 target: >30% (validation), >40% (full training)")

# Validation check
if CONFIG['learning_rate'] <= 1e-5:
    print("⚠️  WARNING: Learning rate still too low! Consider 5e-5 or higher.")
if CONFIG['per_device_train_batch_size'] > 32:
    print("⚠️  WARNING: Batch size might be too large. Monitor memory usage.")

print("\n🚀 Ready for recovery training!")

In [4]:
# Cell 3: Load and Prepare Dataset
import gc
import pandas as pd
import urllib.request
import io
from datasets import Dataset, DatasetDict
from transformers import DebertaV2Tokenizer

print("💾 Loading GoEmotions dataset...")

# Load from GitHub
base_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/"
# Upper bound (seconds) for each HTTP request so a stalled connection cannot
# hang the notebook indefinitely.
download_timeout_s = 60

# Download train and dev sets
dfs = {}
for split_name, filename in {"train": "train.tsv", "validation": "dev.tsv"}.items():
    url = base_url + filename
    print(f"   Downloading {split_name}...")
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url, timeout=download_timeout_s) as response:
        content = response.read().decode('utf-8')
    # The TSV files have no header row: text, comma-separated label ids, id.
    df = pd.read_csv(io.StringIO(content), sep='\t', header=None, names=['text', 'labels', 'id'])
    dfs[split_name] = df

# Load emotion labels (one name per line; the line number is the label id)
emotions_url = base_url + "emotions.txt"
with urllib.request.urlopen(emotions_url, timeout=download_timeout_s) as response:
    label_lines = response.read().decode('utf-8').splitlines()
# splitlines() copes with \r\n endings; blank lines would otherwise become
# phantom labels and inflate NUM_LABELS.
LABEL_NAMES = [line.strip() for line in label_lines if line.strip()]
NUM_LABELS = len(LABEL_NAMES)

print(f"✅ Loaded {len(dfs['train'])} train, {len(dfs['validation'])} validation samples")
print(f"   {NUM_LABELS} emotion labels")

# Process labels to multi-hot
def process_labels(labels_str, num_labels=None):
    """Convert a comma-separated list of label ids into a multi-hot vector.

    Args:
        labels_str: Raw cell from the ``labels`` column, e.g. ``"3,10"``.
            Missing values (``None``/NaN) and empty strings give an all-zero
            vector. A bare integer cell is accepted as a single label id.
        num_labels: Length of the returned vector. Defaults to the
            module-level ``NUM_LABELS``.

    Returns:
        A list of ``num_labels`` floats, 1.0 at every valid label index.
        Tokens that are not decimal integers, or that fall outside
        ``[0, num_labels)``, are ignored.
    """
    if num_labels is None:
        num_labels = NUM_LABELS

    multi_hot = np.zeros(num_labels, dtype=np.float32)
    if pd.isna(labels_str):
        return multi_hot.tolist()

    # str() lets integer cells through (pandas may parse a column that way).
    for token in str(labels_str).split(','):
        token = token.strip()  # tolerate "3, 10" as well as "3,10"
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        if token.isdecimal():
            label_idx = int(token)
            if label_idx < num_labels:
                multi_hot[label_idx] = 1.0
    return multi_hot.tolist()

# Create datasets
# One `Dataset` per split, holding the raw text and its multi-hot labels.
datasets = {}
for split, df in dfs.items():
    columns = {
        'text': df['text'].tolist(),
        'labels': list(map(process_labels, df['labels'])),
    }
    datasets[split] = Dataset.from_dict(columns)

# Apply subset if configured
# Every split (validation included) is shuffled with the configured seed and
# cut down to the first `subset_ratio` fraction of its rows.
if CONFIG.get("use_subset"):
    ratio = CONFIG["subset_ratio"]
    for split, full_split in list(datasets.items()):
        orig_size = len(full_split)
        new_size = int(orig_size * ratio)
        shuffled = full_split.shuffle(seed=CONFIG["seed"])
        datasets[split] = shuffled.select(range(new_size))
        print(f"   {split}: {orig_size} → {new_size} samples ({ratio*100:.0f}%)")

# Tokenize
tokenizer = DebertaV2Tokenizer.from_pretrained(CONFIG["model_name"], use_fast=False)

def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch.

    Sequences are truncated to ``CONFIG["max_length"]`` and left unpadded.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": CONFIG["max_length"],
        "padding": False,
    }
    return tokenizer(examples["text"], **tokenizer_options)

print("\n🔤 Tokenizing dataset...")
# Tokenize all splits in batches; the raw text column is dropped afterwards.
raw_splits = DatasetDict(datasets)
tokenized_datasets = raw_splits.map(tokenize_function, batched=True, remove_columns=["text"])

# Clean memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("✅ Dataset ready for training")

# Export for later use
DATASETS = tokenized_datasets
# Index <-> name lookups handed to the model config in the next cell.
ID2LABEL = dict(enumerate(LABEL_NAMES))
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

💾 Loading GoEmotions dataset...
   Downloading train...
   Downloading validation...
✅ Loaded 43410 train, 5426 validation samples
   28 emotion labels
   train: 43410 → 30386 samples (70%)
   validation: 5426 → 3798 samples (70%)

🔤 Tokenizing dataset...


Map:   0%|          | 0/30386 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

✅ Dataset ready for training


In [None]:
# Cell 4: Build Model with LoRA
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType

print("🤖 Building DeBERTa-v3 model with optimized LoRA...")

# Load base model
# A classification head with NUM_LABELS outputs is attached; the
# multi-label problem type and the label maps are stored on the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG["model_name"],
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

# Enable gradient checkpointing
# Done on the base model, i.e. before it is wrapped by PEFT below.
if CONFIG["gradient_checkpointing"]:
    model.gradient_checkpointing_enable()
    print("   ✅ Gradient checkpointing enabled")

# Apply LoRA
# Adapters are attached to the attention projections listed in
# CONFIG["lora_target_modules"]; bias terms get no adapter (bias="none").
lora_config = LoraConfig(
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    lora_dropout=CONFIG["lora_dropout"],
    target_modules=CONFIG["lora_target_modules"],
    bias="none",
    task_type=TaskType.SEQ_CLS,
)

# From here on `model` is the PEFT-wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Initialize bias with better strategy
def initialize_bias_balanced(model, train_labels):
    """Initialize classifier bias with balanced strategy.

    The bias of every ``nn.Linear`` whose output width equals the number of
    labels is set to a damped log-odds of the label's training frequency, so
    the untrained head starts out predicting roughly the label prior.

    All matching layers are initialized, not just the first one found: a
    wrapped model can hold more than one copy of the classifier head (PEFT
    keeps the original head next to a trainable copy when it saves modules),
    and stopping at the first match can leave the copy that is actually used
    in the forward pass untouched.

    Args:
        model: Module tree to search for classifier layers.
        train_labels: 2-D multi-hot labels, shape ``(num_samples, num_labels)``.

    Raises:
        ValueError: If ``train_labels`` is empty or not 2-D (the frequencies
            would be NaN and would poison the bias).
    """
    Y = np.asarray(train_labels, dtype=np.float32)
    if Y.ndim != 2 or Y.shape[0] == 0:
        raise ValueError("train_labels must be a non-empty 2-D multi-hot array")
    num_labels = Y.shape[1]
    p = Y.mean(axis=0)

    # Wider clipping range preserves relative frequencies
    p_clipped = np.clip(p, 0.001, 0.999)
    logits = np.log(p_clipped / (1.0 - p_clipped))

    # Scale based on prevalence: the rarer the label, the more its (very
    # negative) log-odds is damped.
    scale = np.where(p < 0.01, 0.2,
            np.where(p < 0.05, 0.3,
            np.where(p < 0.1, 0.5, 0.7)))

    prior_logits = logits * scale

    # Find and set bias on every matching classifier layer
    initialized = 0
    with torch.no_grad():
        for module in model.modules():
            if not (isinstance(module, nn.Linear) and module.out_features == num_labels):
                continue
            # Use the layer's own device/dtype rather than that of an
            # arbitrary model parameter.
            bias = torch.from_numpy(prior_logits).to(module.weight.device, dtype=module.weight.dtype)
            if module.bias is None:
                module.bias = nn.Parameter(torch.zeros_like(bias))
            module.bias.copy_(bias)
            initialized += 1

    if initialized:
        print(f"✅ Bias initialized (balanced strategy)")
        print(f"   Range: [{prior_logits.min():.2f}, {prior_logits.max():.2f}]")
    else:
        print(f"⚠️  No Linear layer with {num_labels} outputs found; bias left unchanged")

# Get training labels for bias init
# Iterates the tokenized train split row by row and collects each row's
# multi-hot label vector.
train_labels = [row["labels"] for row in DATASETS["train"]]
initialize_bias_balanced(model, train_labels)

print("✅ Model ready for training")

In [None]:
# Cell 5: Setup Trainer with Optimized Parameters
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.metrics import f1_score
import torch.nn as nn

print("🎯 Setting up optimized trainer...")

# Data collator
# The dataset was tokenized with padding=False, so padding happens here, per
# batch; padded lengths are rounded up to a multiple of 8.
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# Metrics computation
def compute_metrics(eval_pred):
    """Compute micro/macro F1 at decision thresholds 0.3 and 0.5.

    Args:
        eval_pred: Object exposing ``predictions`` (raw logits) and
            ``label_ids`` (multi-hot targets).

    Returns:
        Dict with ``f1_micro_t3``, ``f1_macro_t3``, ``f1_micro_t5``,
        ``f1_macro_t5`` and ``f1_macro`` (an alias of ``f1_macro_t5`` used
        for model selection).
    """
    scores = eval_pred.predictions
    targets = eval_pred.label_ids
    # Element-wise sigmoid turns logits into per-label probabilities.
    probabilities = 1.0 / (1.0 + np.exp(-scores))

    results = {}
    for cutoff in (0.3, 0.5):
        tag = f"_t{int(cutoff*10)}"
        binarized = (probabilities >= cutoff).astype(int)
        for averaging in ("micro", "macro"):
            results[f"f1_{averaging}{tag}"] = f1_score(
                targets, binarized, average=averaging, zero_division=0
            )

    # Primary metric for model selection
    results["f1_macro"] = results["f1_macro_t5"]
    return results

# Asymmetric Loss
class AsymmetricLoss(nn.Module):
    """Asymmetric focal-style loss for multi-label classification.

    Positives and negatives get separate focusing exponents, negatives get a
    probability margin (``clip``) so that easy negatives contribute nothing,
    and positives can be up-weighted with ``pos_alpha``.

    Args:
        gamma_neg: Focusing exponent applied to negative targets.
        gamma_pos: Focusing exponent applied to positive targets.
        clip: Margin added to the negative-class probability (capped at 1).
            ``None`` or ``0`` disables it.
        eps: Lower bound applied to probabilities before taking the log.
        pos_alpha: Multiplicative weight of the positive term.
    """

    def __init__(self, gamma_neg=2.0, gamma_pos=0.5, clip=0.03, eps=1e-8, pos_alpha=1.5):
        super().__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.eps = eps
        self.pos_alpha = pos_alpha

    def forward(self, x, y):
        """Return the mean loss over all elements.

        Args:
            x: Raw logits.
            y: Multi-hot targets with the same shape as ``x``.
        """
        # Work in float32: with half-precision logits the eps clamp rounds to
        # zero, log(0) becomes -inf, and -inf * 0 turns the loss into NaN.
        x = x.float()
        y = y.to(x.dtype)

        x_sigmoid = torch.sigmoid(x)
        xs_pos = x_sigmoid
        xs_neg = 1 - x_sigmoid

        # Probability margin for negatives
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)

        los_pos = self.pos_alpha * y * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))

        loss = los_pos + los_neg

        if self.gamma_neg > 0 or self.gamma_pos > 0:
            # pt is the probability assigned to the true state of each label;
            # (1 - pt) ** gamma down-weights well-classified elements.
            pt = xs_pos * y + xs_neg * (1 - y)
            gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
            loss = loss * torch.pow(1 - pt, gamma)

        return -loss.mean()

# Custom trainer with ASL
class ASLTrainer(Trainer):
    """``Trainer`` that replaces the model's built-in loss with ``AsymmetricLoss``.

    The loss hyperparameters are read from the ``asl_*`` entries of ``CONFIG``
    when the trainer is constructed.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = AsymmetricLoss(
            gamma_neg=CONFIG["asl_gamma_neg"],
            gamma_pos=CONFIG["asl_gamma_pos"],
            clip=CONFIG["asl_clip"],
            pos_alpha=CONFIG["asl_pos_alpha"]
        )

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the asymmetric loss for one batch.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch dict; ``"labels"`` is removed from it so the model
                does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted and ignored. Newer ``Trainer``
                releases pass this keyword to ``compute_loss``; without it the
                override would fail with a ``TypeError`` there.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = self.loss_fct(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training arguments
# Every tunable value comes from CONFIG; only the dataloader worker count,
# reporting target and column pruning are fixed here.
training_args = TrainingArguments(
    output_dir=CONFIG["output_dir"],
    num_train_epochs=CONFIG["num_train_epochs"],
    per_device_train_batch_size=CONFIG["per_device_train_batch_size"],
    per_device_eval_batch_size=CONFIG["per_device_eval_batch_size"],
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
    learning_rate=CONFIG["learning_rate"],
    weight_decay=CONFIG["weight_decay"],
    warmup_ratio=CONFIG["warmup_ratio"],
    lr_scheduler_type=CONFIG["lr_scheduler_type"],
    evaluation_strategy=CONFIG["evaluation_strategy"],
    eval_steps=CONFIG["eval_steps"],
    save_steps=CONFIG["save_steps"],
    logging_steps=CONFIG["logging_steps"],
    save_total_limit=CONFIG["save_total_limit"],
    load_best_model_at_end=CONFIG["load_best_model_at_end"],
    metric_for_best_model=CONFIG["metric_for_best_model"],
    # F1 is a "higher is better" metric, so the best checkpoint is the max.
    greater_is_better=True,
    fp16=CONFIG["fp16"],
    gradient_checkpointing=CONFIG["gradient_checkpointing"],
    dataloader_num_workers=2,
    report_to="none",  # no experiment-tracking integration
    remove_unused_columns=True,
)

# Initialize trainer
# Early stopping watches metric_for_best_model ("f1_macro" from
# compute_metrics) with the patience configured in CONFIG.
trainer = ASLTrainer(
    model=model,
    args=training_args,
    train_dataset=DATASETS["train"],
    eval_dataset=DATASETS["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=CONFIG["early_stopping_patience"])],
)

print("✅ Trainer configured with optimized settings")
print(f"   Learning rate: {CONFIG['learning_rate']} (critical for DeBERTa!)")
print(f"   Effective batch size: {CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']}")
# Rough estimate: whole optimizer steps per epoch times the number of epochs.
# It does not account for the number of devices taking part in training.
print(f"   Total training steps: ~{len(DATASETS['train']) // (CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']) * CONFIG['num_train_epochs']}")

In [None]:
# SAMO Validation Test: 1-Epoch Quick Check
# Run this BEFORE full training to validate the recovery approach

print("🧪 SAMO 1-Epoch Validation Test")
print("🎯 Target: F1 Macro >30% to confirm recovery")
print("=" * 50)

# Override config for quick test
# Shallow copy of CONFIG with a shorter, more frequently evaluated schedule.
# NOTE(review): nothing in the cells visible here consumes VALIDATION_CONFIG;
# the `trainer` and the datasets built earlier were created from CONFIG, so
# these overrides only take effect if those cells are re-run with this dict.
VALIDATION_CONFIG = CONFIG.copy()
VALIDATION_CONFIG.update({
    "num_train_epochs": 1,  # Just 1 epoch for validation
    "subset_ratio": 0.3,    # Even smaller subset for speed
    "eval_steps": 100,      # Frequent evaluation
    "save_steps": 100,
    "logging_steps": 25,
    "output_dir": "./samo_validation_test",
})

print(f"📊 VALIDATION SETTINGS:")
print(f"   Epochs: {VALIDATION_CONFIG['num_train_epochs']}")
print(f"   Data subset: {VALIDATION_CONFIG['subset_ratio']*100:.0f}%")
print(f"   Learning rate: {VALIDATION_CONFIG['learning_rate']}")
print(f"   Batch size: {VALIDATION_CONFIG['per_device_train_batch_size']}")

# Create simple prediction distribution monitor
class PredictionMonitor:
    """Keeps a running log of prediction statistics to spot model collapse."""

    def __init__(self):
        self.step_count = 0
        self.prediction_stats = []

    def log_predictions(self, probs):
        """Record summary statistics for one array of predicted probabilities.

        Stores the share of values above 0.5, the mean and the maximum, prints
        a warning when the positive share is below 1% or the mean is below
        0.1, and then advances the step counter.
        """
        step = self.step_count
        positive_rate = (probs > 0.5).mean()
        mean_prob = probs.mean()
        peak_prob = probs.max()

        record = {
            'step': step,
            'avg_positive_rate': float(positive_rate),
            'avg_confidence': float(mean_prob),
            'max_confidence': float(peak_prob),
        }
        self.prediction_stats.append(record)

        # Early warning signs
        if positive_rate < 0.01:  # Less than 1% positive predictions
            print(f"⚠️  WARNING Step {step}: Very low positive rate ({positive_rate:.1%})")
        if mean_prob < 0.1:  # Very low confidence overall
            print(f"⚠️  WARNING Step {step}: Very low confidence ({mean_prob:.3f})")

        self.step_count = step + 1

    def get_summary(self):
        """Return a one-line summary of the most recent record."""
        if self.prediction_stats:
            last = self.prediction_stats[-1]
            return f"Final - Pos Rate: {last['avg_positive_rate']:.1%}, Avg Conf: {last['avg_confidence']:.3f}"
        return "No predictions logged"

# Success criteria for validation
# Reference thresholds only: they are printed and exported below, but no code
# in this cell checks results against them.
SUCCESS_CRITERIA = {
    "min_f1_macro": 0.30,      # Must exceed 30% F1 macro
    "min_f1_micro": 0.35,      # Must exceed 35% F1 micro
    "min_pos_rate": 0.05,      # At least 5% positive predictions
    "max_training_loss": 0.1,  # Training loss should drop
}

print(f"\n✅ SUCCESS CRITERIA:")
for criterion, value in SUCCESS_CRITERIA.items():
    print(f"   {criterion}: {value}")

print(f"\n⚠️  FAILURE INDICATORS TO WATCH:")
print(f"   - Training speed <1.0 it/s (memory issues)")
print(f"   - F1 scores <20% (model collapse)")
print(f"   - All predictions negative (bias issues)")
print(f"   - Loss not decreasing (gradient issues)")

print(f"\n🔄 NEXT STEPS AFTER VALIDATION:")
print(f"   ✅ If SUCCESS: Use full CONFIG with 3+ epochs")
print(f"   ❌ If FAILURE: Increase learning rate to 2e-4 or 3e-4")
print(f"   📊 Monitor: Use prediction monitor throughout")

# Export validation config
# These names are already module-level in a notebook cell; the explicit
# globals() writes just make the export visible to the reader.
globals()['VALIDATION_CONFIG'] = VALIDATION_CONFIG
globals()['PredictionMonitor'] = PredictionMonitor
globals()['SUCCESS_CRITERIA'] = SUCCESS_CRITERIA

print("\n🚀 Ready to run validation test!")
print("Use VALIDATION_CONFIG instead of CONFIG for the quick test.")

In [8]:
# Cell 6: TRAIN THE MODEL
print("🚀 Starting optimized training...")
print("⏰ This should take ~1-2 hours on a T4 GPU")
print("\n" + "="*50)

# Clean memory before training
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Train!
# Runs the full loop configured in `training_args` (periodic evaluation,
# checkpointing and early stopping included).
train_result = trainer.train()

print("\n✅ Training completed!")
print(f"   Final loss: {train_result.metrics['train_loss']:.4f}")

# Save best model
# With load_best_model_at_end=True in CONFIG, `model` is expected to hold the
# best checkpoint's weights at this point. The tokenizer is saved alongside
# so the directory can be loaded on its own.
best_model_path = os.path.join(CONFIG["output_dir"], "best_model")
model.save_pretrained(best_model_path)
tokenizer.save_pretrained(best_model_path)
print(f"   Model saved to: {best_model_path}")

🚀 Starting optimized training...
⏰ This should take ~1-2 hours on a T4 GPU



RuntimeError: NCCL Error 5: invalid usage (run with NCCL_DEBUG=WARN for details)

In [None]:
# Cell 7: Evaluate and Diagnose
from sklearn.metrics import f1_score, classification_report

print("🔍 Evaluating model performance...\n")

# Get predictions
predictions = trainer.predict(DATASETS["validation"])
logits = predictions.predictions
labels = predictions.label_ids
# Element-wise sigmoid: logits -> per-label probabilities
probs = 1.0 / (1.0 + np.exp(-logits))

# Evaluate at different thresholds
print("📊 Performance at Different Thresholds:")
print("-" * 50)
print(f"{'Threshold':<10} {'F1 Micro':<12} {'F1 Macro':<12} {'Avg Preds/Sample':<15}")
print("-" * 50)

# Defaults used when no threshold achieves a macro F1 above zero
best_threshold = 0.5
best_f1_macro = 0

# One shared threshold for all labels; the one with the highest macro F1 wins
# (ties keep the earlier, i.e. lower, threshold).
for threshold in [0.2, 0.3, 0.4, 0.5, 0.6]:
    preds = (probs >= threshold).astype(int)
    f1_micro = f1_score(labels, preds, average='micro', zero_division=0)
    f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
    # Average number of labels predicted per sample at this threshold
    avg_preds = preds.sum(axis=1).mean()

    print(f"{threshold:<10.1f} {f1_micro:<12.4f} {f1_macro:<12.4f} {avg_preds:<15.2f}")

    if f1_macro > best_f1_macro:
        best_f1_macro = f1_macro
        best_threshold = threshold

print("-" * 50)
print(f"\n🎯 Best threshold: {best_threshold} with F1 Macro: {best_f1_macro:.4f}")

# Per-class performance
print("\n📈 Per-Class Performance (at best threshold):")
best_preds = (probs >= best_threshold).astype(int)

# (label name, F1, prevalence) per label, ordered from lowest to highest F1.
# sorted() is stable, so ties keep their label order.
per_class_f1 = sorted(
    (
        (
            label_name,
            f1_score(labels[:, i], best_preds[:, i], zero_division=0),
            labels[:, i].mean(),
        )
        for i, label_name in enumerate(LABEL_NAMES)
    ),
    key=lambda entry: entry[1],
)

for heading, rows in (
    ("\n5 Worst Performing Classes:", per_class_f1[:5]),
    ("\n5 Best Performing Classes:", per_class_f1[-5:]),
):
    print(heading)
    for name, f1, prev in rows:
        print(f"  {name:<15} F1: {f1:.3f}  Prevalence: {prev:.3f}")

# Save optimal thresholds
# Start every label at the best shared threshold, then refine per label.
optimal_thresholds = np.full(NUM_LABELS, best_threshold)
candidate_thresholds = np.linspace(0.1, 0.9, 9)


def _label_f1(column, cutoff):
    """Binary F1 of one label column when its probabilities are cut at `cutoff`."""
    return f1_score(labels[:, column], (probs[:, column] >= cutoff).astype(int), zero_division=0)


# Fine-tune per-class thresholds
for j in range(NUM_LABELS):
    best_t = best_threshold
    best_f = _label_f1(j, best_t)

    # Only a strictly better F1 replaces the current choice.
    for t in candidate_thresholds:
        f = _label_f1(j, t)
        if f > best_f:
            best_f, best_t = f, t

    optimal_thresholds[j] = best_t

# Apply optimized thresholds
final_preds = (probs >= optimal_thresholds[np.newaxis, :]).astype(int)
final_f1_micro = f1_score(labels, final_preds, average='micro', zero_division=0)
final_f1_macro = f1_score(labels, final_preds, average='macro', zero_division=0)

banner = "=" * 50
print(f"\n{banner}")
print("🏆 FINAL RESULTS WITH OPTIMIZED THRESHOLDS:")
print(banner)
print(f"F1 Micro: {final_f1_micro:.4f}")
print(f"F1 Macro: {final_f1_macro:.4f}")

# Verdict against the notebook's 60% macro-F1 target.
# NOTE(review): the per-class thresholds were tuned on the same validation
# predictions they are scored on here, so this figure is likely optimistic.
if final_f1_macro >= 0.60:
    print("\n✅ SUCCESS! Achieved >60% F1 Macro target!")
elif final_f1_macro >= 0.50:
    print("\n⚠️ Good progress! Close to 60% target. Consider:")
    print("   - Training for more epochs")
    print("   - Using full dataset (100%)")
    print("   - Further hyperparameter tuning")
else:
    print("\n❌ Below target. Check training logs for issues.")

# Save thresholds
# Written as a plain JSON list, one threshold per label in LABEL_NAMES order.
import json
threshold_path = os.path.join(CONFIG["output_dir"], "optimal_thresholds.json")
with open(threshold_path, 'w') as f:
    json.dump(optimal_thresholds.tolist(), f)
print(f"\n💾 Thresholds saved to: {threshold_path}")

In [None]:
# Cell 8: Test Inference
print("🎯 Testing inference on sample texts...\n")

test_texts = [
    "I'm so happy and grateful for this amazing day!",
    "This is really frustrating and annoying.",
    "I feel anxious but also excited about tomorrow.",
    "The weather is nice today.",
    "I love spending time with my family.",
    "I'm disappointed with how things turned out."
]

# Inference mode: disables dropout; inputs are moved to the model's device.
model.eval()
device = next(model.parameters()).device

with torch.no_grad():
    for text in test_texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True,
                          max_length=CONFIG["max_length"], padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        outputs = model(**inputs)
        # Batch of one: take row 0 to get a 1-D vector of label probabilities.
        probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]

        # Apply optimized thresholds
        # `>=` matches the comparison used when the thresholds were tuned and
        # evaluated, so a probability exactly at the threshold counts as detected.
        detected_emotions = [LABEL_NAMES[i] for i, p in enumerate(probs)
                           if p >= optimal_thresholds[i]]

        # Get top 3 emotions by probability (highest first)
        top_indices = np.argsort(probs)[-3:][::-1]
        top_emotions = [(LABEL_NAMES[i], probs[i]) for i in top_indices]

        print(f"Text: \"{text}\"")
        # When nothing clears its threshold, the output falls back to 'neutral'.
        print(f"  Detected: {detected_emotions if detected_emotions else ['neutral']}")
        print(f"  Top 3: {', '.join([f'{e}({p:.2f})' for e, p in top_emotions])}")
        print()

print("\n✅ Inference test complete!")