=====================================
SAMO GoEmotions - DeBERTa-v3-large 
=====================================

In [None]:
import langchain_mistralai
import langchain_openai
import langchain_aws
import langchain_cohere
import langchain_google_vertexai
import langchain_ollama
print("All required langchain integrations imported successfully!")


In [None]:
# --- Vast.ai 2x3090 bootstrap: force 1 GPU + fast caches (fixed) ---
import os

# Hide the 2nd GPU before touching torch/cuda
# (torch is deliberately imported only after these variables are set, further down).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Keep HF/Datasets caches on fast NVMe (Vast) or fall back if not present
# The fallback path is used whenever "/workspace" is not a directory.
os.environ["HF_HOME"] = "/workspace/.cache/huggingface" if os.path.isdir("/workspace") else "/kaggle/working/hf"
os.environ["TRANSFORMERS_CACHE"] = os.environ["HF_HOME"]
# NOTE(review): the `datasets` library documents `HF_DATASETS_CACHE`; confirm that
# `DATASETS_CACHE` is actually honoured (HF_HOME above may be what takes effect).
os.environ["DATASETS_CACHE"] = os.environ["HF_HOME"]

# CUDA allocator options (colon syntax). Keep broadly compatible.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch
# Report what torch can see after the environment has been restricted.
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Visible GPUs:", torch.cuda.device_count())
    print("GPU 0:", torch.cuda.get_device_name(0))
else:
    print("No CUDA device visible — check the Vast template/image.")


In [None]:
# ===================================================================================
# 1. ENVIRONMENT SETUP WITH GPU MEMORY MANAGEMENT (Vast/Kaggle SAFE)
# ===================================================================================
import os, sys, subprocess, warnings, gc, json, random
import numpy as np
import torch

print("🚀 [1/8] Setting up the environment...")

# -------- Paths & caches (Vast first, Kaggle fallback) --------
# Same cache selection as the bootstrap cell, but here the directory is also created.
cache_root = "/workspace/.cache/huggingface" if os.path.isdir("/workspace") else "/kaggle/working/hf"
os.makedirs(cache_root, exist_ok=True)
os.environ["HF_HOME"] = cache_root
os.environ["TRANSFORMERS_CACHE"] = cache_root
os.environ["DATASETS_CACHE"] = cache_root
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# CUDA allocator: use correct colon syntax; keep it simple & compatible
# NOTE(review): torch is already imported at this point; confirm the allocator
# still picks this value up when this cell is run without the bootstrap cell.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# -------- GPU memory hygiene --------
if torch.cuda.is_available():
    try:
        torch.cuda.empty_cache()
        # Not all drivers support this; ignore if it errors
        torch.cuda.set_per_process_memory_fraction(0.95)
    except Exception:
        # Deliberate best-effort: memory tuning is optional.
        pass

# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

def pip_install(pkgs):
    """Quietly pip-install ``pkgs`` (a list of requirement strings/flags).

    Runs pip through the current interpreter so packages land in the active
    environment. Raises ``subprocess.CalledProcessError`` if pip exits non-zero.
    """
    pip_cmd = [sys.executable, "-m", "pip", "install", "-q", "--no-input"]
    subprocess.check_call(pip_cmd + pkgs)

# -------- Core libs (do NOT reinstall torch unless we must) --------
# Install the pinned training stack; any failure is reported and then re-raised
# so the notebook stops instead of continuing with missing/mismatched packages.
try:
    pip_install([
        "transformers==4.41.2",
        "datasets==2.19.0",
        "accelerate==0.31.0",
        "peft==0.10.0",
        "evaluate==0.4.2",
        "scikit-learn==1.5.0",
        "pandas>=2.0.0",
        "sentencepiece>=0.1.99",
    ])
    print("   ✅ Dependencies installed.")
except Exception as e:
    print("   ❌ Dependency install error:", e)
    raise

# Optional: if Torch is CPU-only, install a CUDA wheel automatically
def ensure_cuda_torch():
    """Best-effort replacement of a CPU-only PyTorch with a CUDA wheel.

    Does nothing when CUDA is already usable and the installed version is not
    a ``+cpu`` build. Otherwise tries the cu121 and then the cu118 wheel index,
    reloading ``torch`` after each install and returning as soon as CUDA is
    available. Never raises: every failed attempt is reported and the next
    index is tried, and a final warning is printed if none worked.
    """
    import importlib

    cuda_ready = torch.cuda.is_available() and "+cpu" not in torch.__version__
    if cuda_ready:
        return
    print("   ⚠️ Detected CPU-only PyTorch; installing CUDA build...")
    for cu_tag in ("cu121", "cu118"):  # try CUDA 12.1, then 11.8
        try:
            pip_install([
                f"--index-url=https://download.pytorch.org/whl/{cu_tag}",
                "torch", "torchvision", "torchaudio",
            ])
            # NOTE(review): reloading an already-imported torch in-process may not
            # fully take effect; a kernel restart may be needed — confirm.
            importlib.reload(torch)
            if torch.cuda.is_available():
                print(f"   ✅ Installed CUDA PyTorch ({cu_tag}).")
                return
        except Exception as exc:
            # Still best-effort, but surface the reason instead of swallowing it.
            print(f"   ⚠️ CUDA PyTorch install attempt ({cu_tag}) failed: {exc}")
    print("   ⚠️ Still no CUDA-enabled torch. Check base image / drivers.")

# Attempt the CUDA-wheel fix-up before reporting versions.
ensure_cuda_torch()

# -------- Print versions & GPU info --------
# Imported here (after the pip install above) so the freshly installed versions are reported.
import transformers, datasets, accelerate, peft, pandas as pd  # noqa: E402

print("\n--- Library Versions ---")
print(f"   - Transformers: {transformers.__version__}, Datasets: {datasets.__version__}")
print(f"   - Accelerate:   {accelerate.__version__}, PEFT: {peft.__version__}")
print(f"   - PyTorch:      {torch.__version__}")
print("------------------------")
print(f"   - CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    try:
        print(f"   - GPU Device:     {torch.cuda.get_device_name(0)}")
        # total_memory is reported in bytes; shown here in decimal GB.
        print(f"   - GPU Memory:     {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    except Exception:
        # Purely informational output; ignore device-query failures.
        pass
print("-" * 25)


In [None]:
# --- SAFE CONFIG DEFAULTS + REPRO SEED (Vast 3090 tuned) ---
import os, random, numpy as np, torch

# Reset CONFIG to these defaults (avoid stale values overriding)
# Baseline hyper-parameters for the run. CONFIG (below) is rebuilt from these
# every time the cell executes so stale notebook state cannot leak in.
DEFAULTS = {
    "seed": 42,
    "output_dir": "/workspace/out/fast_vast3090" if os.path.isdir("/workspace") else "./out/fast_vast3090",

    # DATA
    "dataset_config": "raw",
    "use_subset": True,
    "subset_ratio": 0.60,

    # PRECISION / MEMORY
    "bf16": False,
    "fp16": True,
    "gradient_checkpointing": False,

    # MODEL / LORA
    "model_name_or_path": "microsoft/deberta-v3-large",
    "use_lora": True,
    "lora_r": 32,
    "lora_alpha": 64,
    "lora_dropout": 0.05,
    "lora_target_modules": ["query_proj", "key_proj", "value_proj"],

    # TRAINING CORE
    "num_train_epochs": 5,
    "per_device_train_batch_size": 20,   # 24GB-friendly at max_length 96
    "per_device_eval_batch_size": 64,
    "gradient_accumulation_steps": 2,
    "max_length": 96,
    "learning_rate": 3e-4,
    "weight_decay": 0.01,
    "warmup_ratio": 0.02,
    "lr_scheduler_type": "cosine",

    # DATALOADER
    "dataloader_num_workers": 0,

    # LOGGING/CHECKPOINTS
    "logging_steps": 200,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "greater_is_better": True,

    # ASL (tuned)
    "asl_gamma_neg": 3.0,
    "asl_gamma_pos": 1.0,
    "asl_clip": 0.05,
    "asl_pos_alpha": 2.0,

    # MISC
    "fp16_full_eval": True,
    "report_to": "none",
}

# Deep copy: a shallow .copy() would share the mutable "lora_target_modules"
# list, so editing CONFIG in place would silently edit DEFAULTS as well.
import copy

CONFIG = copy.deepcopy(DEFAULTS)

def set_seed_all(seed: int):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` and, when CUDA is available, seeds every
    CUDA device. cuDNN autotuning (``benchmark``) is switched off.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

# Apply the configured seed immediately and echo the key settings for the run log.
set_seed_all(CONFIG["seed"])
print("✅ Config ok. Seed:", CONFIG["seed"])
print("   Output dir:", CONFIG["output_dir"])
print("   fp16:", CONFIG["fp16"], "| grad ckpt:", CONFIG["gradient_checkpointing"])


In [None]:
# --- 2.5 Tokenizer prerequisites (must run before Cell 3) ---
import sys, subprocess, importlib

def _pip_install(pkgs):
    """Quietly pip-install the requirement list ``pkgs`` with the running interpreter.

    Raises ``subprocess.CalledProcessError`` when pip fails.
    """
    command = [sys.executable, "-m", "pip", "install", "-q", "--no-input"]
    subprocess.check_call(command + pkgs)

# Install/upgrade sentencepiece and tokenizers
_pip_install(["sentencepiece>=0.1.99", "tokenizers>=0.15.2"])

# Import after installing so a missing/broken package fails here rather than in Cell 3.
import sentencepiece, tokenizers
print("sentencepiece:", sentencepiece.__version__, "| tokenizers:", tokenizers.__version__)


In [None]:
# ===================================================================================
# 3. DATA LOADING WITH SMART SAMPLING FOR SPEED  (ROBUST: raw vs simplified)
# ===================================================================================
print("\n💾 [3/8] Loading and preparing GoEmotions dataset...")

import gc, sys, subprocess, importlib, importlib.util
import numpy as np
import torch
from datasets import load_dataset, DatasetDict, disable_progress_bar, Sequence, Value

# Turn off the `datasets` progress bars for the rest of the session.
disable_progress_bar()

# --------- GPU cache hygiene ---------
# Collect Python garbage first, then ask torch to empty its CUDA cache.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --------- Helper: ensure sentencepiece + hard refresh transformers ---------
def ensure_sp_and_refresh_tf():
    """Guarantee sentencepiece is importable, then re-import transformers from scratch.

    Installs sentencepiece via pip only when it cannot be found. Afterwards
    every cached ``transformers`` module is evicted from ``sys.modules`` so the
    next import re-runs its optional-backend checks with sentencepiece present.

    Returns:
        The freshly imported ``transformers`` module.
    """
    sp_missing = importlib.util.find_spec("sentencepiece") is None
    if sp_missing:
        print("   - Installing sentencepiece (required for DeBERTa-v3 SPM)...")
        install_cmd = [sys.executable, "-m", "pip", "install", "-q", "--no-input", "sentencepiece>=0.1.99"]
        subprocess.check_call(install_cmd)
    import sentencepiece  # noqa: F401

    # Iterate over a snapshot of the keys because entries are removed while looping.
    for mod_name in tuple(sys.modules):
        if mod_name.partition(".")[0] == "transformers":
            sys.modules.pop(mod_name, None)

    import transformers as fresh_tf  # fresh import
    return fresh_tf


transformers = ensure_sp_and_refresh_tf()

# --------- Load dataset (raw vs simplified) ---------
# Pick the GoEmotions configuration and download/load it.
CFG_NAME = CONFIG.get("dataset_config", "raw")   # "raw" → per-emotion 0/1 columns, "simplified" → list[int]
ds_raw = load_dataset("go_emotions", CFG_NAME)

# The validation set is carved out of the dataset's own "train" split (seeded,
# so the split is reproducible); no other split of ds_raw is used here.
print("   - Creating 90/10 train/validation split...")
train_validation_split = ds_raw["train"].train_test_split(test_size=0.1, seed=CONFIG["seed"])
ds = DatasetDict({
    "train": train_validation_split["train"],
    "validation": train_validation_split["test"],
})

# --------- Optional subsampling ---------
# The same ratio is applied to both train and validation; each keeps at least one row.
if CONFIG.get("use_subset", False):
    subset_ratio = float(CONFIG.get("subset_ratio", 0.3))
    original_train_size = len(ds["train"])
    original_val_size   = len(ds["validation"])
    train_subset_size = max(1, int(original_train_size * subset_ratio))
    val_subset_size   = max(1, int(original_val_size * subset_ratio))
    ds["train"] = ds["train"].shuffle(seed=CONFIG["seed"]).select(range(train_subset_size))
    ds["validation"] = ds["validation"].shuffle(seed=CONFIG["seed"]).select(range(val_subset_size))
    print(f"   - Using {subset_ratio*100:.0f}% subset for faster training:")
    print(f"     • Train: {original_train_size:,} → {len(ds['train']):,} samples")
    print(f"     • Valid: {original_val_size:,} → {len(ds['validation']):,} samples")
else:
    print(f"   - Full dataset: {len(ds['train']):,} train, {len(ds['validation']):,} validation samples.")

# --------- Detect label schema & build label list ---------
features = ds_raw["train"].features
# Canonical label order used for the raw schema; only names that exist as
# columns are kept, so the label count follows what the dataset provides.
EXPECTED = [
    'admiration','amusement','anger','annoyance','approval','caring','confusion','curiosity','desire',
    'disappointment','disapproval','disgust','embarrassment','excitement','fear','gratitude','grief','joy',
    'love','nervousness','optimism','pride','realization','relief','remorse','sadness','surprise','neutral'
]

if "labels" in features:  # simplified schema
    LABEL_NAMES = list(features["labels"].feature.names)
    SCHEMA = "simplified"
else:                     # raw schema: per-emotion columns
    LABEL_NAMES = [name for name in EXPECTED if name in features]
    SCHEMA = "raw"

# Index <-> name maps shared with the model config in later cells.
ID2LABEL = {i: n for i, n in enumerate(LABEL_NAMES)}
LABEL2ID = {n: i for i, n in enumerate(LABEL_NAMES)}
NUM_LABELS = len(LABEL_NAMES)
print(f"   - Detected schema: {SCHEMA}. Using {NUM_LABELS} emotion labels.")

# --------- Tokenizer (reliable slow path; SP ensured; TF reloaded) ---------
# Imported here, after transformers has been re-imported above.
from transformers import DebertaV2Tokenizer
# "model_name" is not part of DEFAULTS, so on a fresh CONFIG this resolves to
# "model_name_or_path" (or the literal fallback if that is missing too).
tok_name = (CONFIG.get("model_name") or
            CONFIG.get("model_name_or_path") or
            "microsoft/deberta-v3-large")
tokenizer = DebertaV2Tokenizer.from_pretrained(tok_name, use_fast=False)
print("   - Using slow DebertaV2Tokenizer (SentencePiece).")

# --------- Preprocess functions ---------
def preprocess_simplified(examples):
    """Tokenize a batch in the "simplified" schema and attach multi-hot labels.

    ``examples["labels"]`` holds one list of integer label ids per row. Ids
    outside ``[0, NUM_LABELS)`` are ignored. Tokenization truncates to
    ``CONFIG["max_length"]`` (default 128) and does not pad.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        ``NUM_LABELS`` floats (0.0/1.0) per row.
    """
    texts = examples["text"]
    enc = tokenizer(
        texts,
        truncation=True,
        max_length=int(CONFIG.get("max_length", 128)),
        padding=False,
    )
    # list[list[int]] -> multi-hot
    multi_hot = np.zeros((len(texts), NUM_LABELS), dtype=np.float32)
    in_range = (
        (row_idx, label_id)
        for row_idx, row_label_ids in enumerate(examples["labels"])
        for label_id in row_label_ids
        if 0 <= label_id < NUM_LABELS
    )
    for row_idx, label_id in in_range:
        multi_hot[row_idx, label_id] = 1.0
    enc["labels"] = multi_hot.tolist()
    return enc

def preprocess_raw(examples):
    """Tokenize a batch in the "raw" schema and attach multi-hot labels.

    Each name in ``LABEL_NAMES`` is read as its own column of ``examples``;
    the columns are laid side by side in ``LABEL_NAMES`` order. Tokenization
    truncates to ``CONFIG["max_length"]`` (default 128) and does not pad.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        ``len(LABEL_NAMES)`` floats per row.
    """
    max_len = int(CONFIG.get("max_length", 128))
    enc = tokenizer(examples["text"], truncation=True, max_length=max_len, padding=False)

    per_label = [np.asarray(examples[label], dtype=np.float32) for label in LABEL_NAMES]
    # Stack as [NUM_LABELS, B] and transpose to [B, NUM_LABELS].
    label_matrix = np.stack(per_label, axis=0).T
    enc["labels"] = label_matrix.tolist()
    return enc

# Choose the preprocessing function that matches the detected schema.
preprocess_fn = preprocess_simplified if SCHEMA == "simplified" else preprocess_raw

# --------- Map & format (NumPy 2.0 safe) ---------
print("   - Tokenizing and encoding dataset...")
# Remove ALL original columns; keep only what preprocess returns
all_cols = ds["train"].column_names
encoded_ds = ds.map(preprocess_fn, batched=True, remove_columns=all_cols)

# Safety: if anything extra survived, drop it
# (e.g. a tokenizer-produced column other than the three listed here).
needed = {"input_ids", "attention_mask", "labels"}
extra  = [c for c in encoded_ds["train"].column_names if c not in needed]
if extra:
    encoded_ds = encoded_ds.remove_columns(extra)

# Fix labels to fixed-width list so metrics ops stay clean
encoded_ds = encoded_ds.cast_column("labels", Sequence(Value("float32"), length=NUM_LABELS))

# 🚫 Do NOT use torch formatter (NumPy 2.0 + ragged lists => object arrays)
# Keep python format and let DataCollatorWithPadding tensorize/pad at batch time.
encoded_ds = encoded_ds.with_format("python")

print("   ✅ Dataset successfully processed (python format; collator will pad/tensorize).")

# --------- Expose for downstream cells ---------
# DATASETS and LABEL_META are the names later cells read.
DATASETS = encoded_ds
LABEL_META = {"names": LABEL_NAMES, "id2label": ID2LABEL, "label2id": LABEL2ID, "num_labels": NUM_LABELS}


In [None]:
# ===================================================================================
# 4A. MODEL SETUP WITH MEMORY OPTIMIZATION (robust + LoRA autodetect)
# ===================================================================================
print("\n🤖 [4/8] Building DeBERTa-v3 model with memory-optimized LoRA...")

import os, gc, torch, torch.nn as nn, transformers
from transformers import AutoModelForSequenceClassification

# Free Python garbage and cached CUDA blocks before loading the large model.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Resolve the checkpoint name and record it in CONFIG under "model_name".
model_name = (CONFIG.get("model_name") or CONFIG.get("model_name_or_path") or "microsoft/deberta-v3-large")
CONFIG["model_name"] = model_name

# Use LABEL_META if present (from Cell 3)
# Falls back to the individual globals from Cell 3 otherwise.
num_labels = LABEL_META["num_labels"] if "LABEL_META" in globals() else NUM_LABELS
id2label  = LABEL_META["id2label"]  if "LABEL_META" in globals() else ID2LABEL
label2id  = LABEL_META["label2id"]  if "LABEL_META" in globals() else LABEL2ID

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

if hasattr(model.config, "use_cache"):
    model.config.use_cache = False

# NOTE: the fallback here is True, whereas DEFAULTS sets the key to False;
# the fallback only applies when the key is absent from CONFIG.
if CONFIG.get("gradient_checkpointing", True):
    model.gradient_checkpointing_enable()
    print("   - Gradient checkpointing enabled for memory efficiency.")

if CONFIG.get("use_lora", True):
    from peft import LoraConfig, get_peft_model, TaskType
    requested = CONFIG.get("lora_target_modules") or ["query_proj","key_proj","value_proj","dense"]
    # Keep only requested targets that occur as a substring of some nn.Linear module name.
    linear_names = [n for n, m in model.named_modules() if isinstance(m, nn.Linear)]
    active = [t for t in requested if any(t in n for n in linear_names)]
    if not active:
        # None of the requested names matched: retry with the attention projections.
        active = [t for t in ["query_proj","key_proj","value_proj"] if any(t in n for n in linear_names)]
    if not active:
        raise RuntimeError("LoRA target detection failed. Provide CONFIG['lora_target_modules'].")

    lora_config = LoraConfig(
        r=int(CONFIG.get("lora_r", 16)),
        lora_alpha=int(CONFIG.get("lora_alpha", 32)),
        lora_dropout=float(CONFIG.get("lora_dropout", 0.05)),
        target_modules=sorted(set(active)),
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    # Save the model config next to the future checkpoints.
    os.makedirs(CONFIG["output_dir"], exist_ok=True)
    model.config.save_pretrained(CONFIG["output_dir"])
    print(f"   - LoRA applied: {sorted(set(active))}")

print("   ✅ Model built successfully.")


In [None]:
# === 4B. GENTLER BIAS INIT (fixes over-conservative predictions) ===
import numpy as np, torch, torch.nn as nn

print("🔧 Setting gentler classifier bias...")

# 1) Get label prevalence
train_py = DATASETS["train"].with_format("python")
labels_list = [row["labels"] for row in train_py]
Y = np.asarray(labels_list, dtype=np.float32)
del labels_list, train_py

# 2) Compute per-class prior but CLIP to prevent extreme biases
p = Y.mean(axis=0).clip(0.01, 0.5)  # ← CLIPPED to [1%, 50%] range
prior_logits = np.log(p / (1.0 - p)).astype(np.float32)

# CRITICAL: Scale down the biases to be less extreme
prior_logits = prior_logits * 0.5  # ← Reduce strength by 50%

# 3) Find the classification head(s)
# Collect EVERY nn.Linear whose output width equals the label count. When the
# model is PEFT-wrapped the head can exist more than once (a frozen original
# plus the trainable copy that forward() uses); stopping at the first match
# would initialise only the unused one.
head_candidates = [
    (name, module)
    for name, module in model.named_modules()
    if isinstance(module, nn.Linear) and getattr(module, "out_features", None) == Y.shape[1]
]
if not head_candidates:
    raise RuntimeError("Could not locate classification head")

# Report the trainable head when there is one, otherwise the first match.
trainable_heads = [(n, m) for n, m in head_candidates if m.weight.requires_grad]
head_name, head = (trainable_heads or head_candidates)[0]

# 4) Set the bias on every matching head so they stay in agreement
with torch.no_grad():
    device = next(model.parameters()).device
    for _, head_module in head_candidates:
        head_bias = torch.from_numpy(prior_logits).to(
            device=head_module.weight.device, dtype=head_module.weight.dtype
        )
        if head_module.bias is None:
            head_module.bias = torch.nn.Parameter(torch.zeros_like(head_bias))
        head_module.bias.copy_(head_bias)
    # Kept for parity with the previous cell contents: bias tensor of the reported head.
    b = torch.from_numpy(prior_logits).to(device=device, dtype=head.weight.dtype)

print(f"✅ Gentler bias set on '{head_name}'")
if len(head_candidates) > 1:
    print(f"   Also applied to: {[n for n, _ in head_candidates if n != head_name]}")
print(f"   Bias range: [{prior_logits.min():.2f}, {prior_logits.max():.2f}]")
print(f"   Bias mean: {prior_logits.mean():.3f}")

In [None]:
# ===================================================================================
# 5. TRAINER WITH MEMORY-EFFICIENT SETTINGS (FIXED for proper evaluation)
# ===================================================================================
print("\n📈 [5/8] Configuring memory-efficient Trainer...")

import os, numpy as np, torch, torch.nn as nn
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.metrics import f1_score

# ---------- Collator with safety checks ----------
# Pads each batch dynamically (to a multiple of 8 tokens) and returns torch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
    return_tensors="pt"
)

# Prune any non-tensor keys before collation
TOKEN_KEYS = {"input_ids", "attention_mask", "token_type_ids", "labels"}
def safe_collate(features):
    # Drop every key outside TOKEN_KEYS from each example, then delegate to data_collator.
    pruned = [{k: v for k, v in f.items() if k in TOKEN_KEYS} for f in features]
    return data_collator(pruned)

# Check if we need the safe collator
# It is only selected when the train split carries columns beyond the three needed ones.
needed_cols = {"input_ids", "attention_mask", "labels"}
train_cols = set(DATASETS["train"].column_names)
use_safe = not train_cols.issubset(needed_cols)
trainer_collate = safe_collate if use_safe else data_collator

# ---------- Metrics with SANE thresholds ----------
# Use 0.50 for model selection (matches your ~1.2 positives/sample distribution)
# Also log 0.30 for visibility but DON'T select on it
THRESH_SELECT = 0.50  # For model selection - matches your data distribution
THRESH_LOG = 0.30     # For logging only - more sensitive

def compute_metrics(eval_pred):
    """Compute micro/macro F1 at the selection and logging thresholds.

    Args:
        eval_pred: Either a ``(logits, labels)`` tuple/list or an object
            exposing ``predictions`` and ``label_ids``.

    Returns:
        dict with ``f1_micro``/``f1_macro`` (thresholded at ``THRESH_SELECT``,
        used for model selection) and ``f1_micro_t30``/``f1_macro_t30``
        (thresholded at ``THRESH_LOG``, monitoring only).
    """
    if isinstance(eval_pred, (tuple, list)):
        logits, labels = eval_pred
    else:
        logits, labels = eval_pred.predictions, eval_pred.label_ids

    # Half-precision logits would overflow in np.exp and lose precision right
    # around the decision thresholds, so evaluate the sigmoid in float64.
    logits = np.asarray(logits, dtype=np.float64)
    # Labels are multi-hot 0/1 floats; use ints so y_true and y_pred share a dtype.
    labels = np.asarray(labels).astype(int)

    probs = 1.0 / (1.0 + np.exp(-logits))

    # Primary metrics for model selection (0.50)
    preds_50 = (probs >= THRESH_SELECT).astype(int)
    # Secondary metrics for visibility (0.30)
    preds_30 = (probs >= THRESH_LOG).astype(int)

    return {
        # PRIMARY - for model selection
        "f1_micro": f1_score(labels, preds_50, average="micro", zero_division=0),
        "f1_macro": f1_score(labels, preds_50, average="macro", zero_division=0),

        # SECONDARY - just for monitoring
        "f1_micro_t30": f1_score(labels, preds_30, average="micro", zero_division=0),
        "f1_macro_t30": f1_score(labels, preds_30, average="macro", zero_division=0),
    }

# ---------- Asymmetric Loss ----------
class AsymmetricLoss(nn.Module):
    """Asymmetric focal-style loss for multi-label classification.

    Positives and negatives get separate focusing exponents (``gamma_pos`` /
    ``gamma_neg``); negative probabilities are shifted by ``clip`` (capped at
    1) so very easy negatives contribute nothing; the positive term is scaled
    by ``pos_alpha``. The result is the mean over all elements.

    Args:
        gamma_neg: Focusing exponent for negative targets.
        gamma_pos: Focusing exponent for positive targets.
        clip: Margin added to the negative probability; ``0`` disables it.
        eps: Lower clamp applied before ``log``.
        pos_alpha: Multiplier on the positive log-likelihood term.
    """

    def __init__(self, gamma_neg=3.0, gamma_pos=1.0, clip=0.05, eps=1e-8, pos_alpha=2.0):
        super().__init__()
        self.gamma_neg = float(gamma_neg)
        self.gamma_pos = float(gamma_pos)
        self.clip = float(clip)
        self.eps = float(eps)
        self.pos_alpha = float(pos_alpha)

    def forward(self, x, y):
        """Return the scalar loss for logits ``x`` and multi-hot targets ``y`` (same shape)."""
        # Always compute in float32. With half-precision logits the sigmoid can
        # underflow to exactly 0 and eps=1e-8 is not representable, so the
        # clamp would not protect the log and 0 * log(0) would produce NaN.
        x = x.float()
        y = y.to(dtype=x.dtype)

        xs_pos = torch.sigmoid(x)
        xs_neg = 1 - xs_pos
        if self.clip and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)

        los_pos = self.pos_alpha * y * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
        loss = los_pos + los_neg

        if self.gamma_neg > 0 or self.gamma_pos > 0:
            # pt is the probability assigned to the true outcome of each element.
            pt = xs_pos * y + xs_neg * (1 - y)
            gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
            loss = loss * torch.pow(1 - pt, gamma)
        return -loss.mean()

class AsymmetricLossTrainer(Trainer):
    """Trainer variant that scores logits with ``AsymmetricLoss``.

    The loss hyper-parameters are read from the global ``CONFIG`` once, at
    construction time.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        asl_settings = {
            "gamma_neg": CONFIG.get("asl_gamma_neg", 3.0),
            "gamma_pos": CONFIG.get("asl_gamma_pos", 1.0),
            "clip": CONFIG.get("asl_clip", 0.05),
            "pos_alpha": CONFIG.get("asl_pos_alpha", 2.0),
        }
        self.loss_fct = AsymmetricLoss(**asl_settings)

    def compute_loss(self, model, inputs, return_outputs=False):
        """Pop ``labels`` from ``inputs``, run the model, and return the loss.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        loss = self.loss_fct(outputs.logits, targets)
        if return_outputs:
            return loss, outputs
        return loss

# ---------- Training Arguments (FIXED) ----------
# Hyper-parameters are read from CONFIG so that editing CONFIG actually changes
# the run; each fallback equals the value that used to be hard-coded here.
training_args = TrainingArguments(
    output_dir=CONFIG["output_dir"],
    overwrite_output_dir=True,

    # Training duration
    num_train_epochs=CONFIG.get("num_train_epochs", 5),
    per_device_train_batch_size=CONFIG.get("per_device_train_batch_size", 20),
    per_device_eval_batch_size=CONFIG.get("per_device_eval_batch_size", 64),
    gradient_accumulation_steps=CONFIG.get("gradient_accumulation_steps", 2),

    # Optimization
    learning_rate=CONFIG.get("learning_rate", 3e-4),
    weight_decay=CONFIG.get("weight_decay", 0.01),
    warmup_ratio=CONFIG.get("warmup_ratio", 0.02),
    lr_scheduler_type=CONFIG.get("lr_scheduler_type", "cosine"),

    # CRITICAL FIX: Evaluate by EPOCH, not steps!
    # This prevents premature stopping mid-epoch
    evaluation_strategy=CONFIG.get("evaluation_strategy", "epoch"),
    save_strategy=CONFIG.get("save_strategy", "epoch"),
    # NOTE(review): CONFIG["save_total_limit"] is 1 but 2 was hard-coded here;
    # the existing value is kept to preserve behaviour — confirm which is intended.
    save_total_limit=2,
    load_best_model_at_end=CONFIG.get("load_best_model_at_end", True),

    # CRITICAL FIX: Select on f1_macro at 0.50 threshold!
    metric_for_best_model=CONFIG.get("metric_for_best_model", "f1_macro"),  # ← Uses 0.50 threshold now
    greater_is_better=CONFIG.get("greater_is_better", True),

    # Memory optimization
    # NOTE(review): CONFIG["fp16_full_eval"] exists but was never passed here;
    # left unset to preserve behaviour — confirm.
    fp16=CONFIG.get("fp16", True),
    gradient_checkpointing=CONFIG.get("gradient_checkpointing", False),
    max_grad_norm=1.0,

    # Logging
    # NOTE(review): CONFIG["logging_steps"] is 200 but 100 was hard-coded here;
    # the existing value is kept to preserve behaviour — confirm which is intended.
    logging_steps=100,
    report_to=CONFIG.get("report_to", "none"),

    # DataLoader
    dataloader_num_workers=CONFIG.get("dataloader_num_workers", 0),
    dataloader_pin_memory=True,
    dataloader_drop_last=True,
    remove_unused_columns=True,
)

# ---------- Build Trainer ----------
train_ds = DATASETS["train"]
val_ds = DATASETS["validation"]

# Early stopping with PATIENCE PER EPOCH (not per 500 steps!)
# Patience is counted in evaluations; evaluation runs once per epoch here.
EARLY_STOPPING_PATIENCE = 2
callbacks = [EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)]

trainer = AsymmetricLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=trainer_collate,
    compute_metrics=compute_metrics,
    callbacks=callbacks,
)

# Summarise from the live arguments instead of repeating literals. The old
# f-string evaluated `20 * 2` inside the braces and printed "40 = 40".
_summary_bs = training_args.per_device_train_batch_size
_summary_accum = training_args.gradient_accumulation_steps
print(f"   ✅ Trainer configured correctly:")
print(f"   - Evaluation: Every EPOCH (not every 500 steps)")
print(f"   - Selection metric: {training_args.metric_for_best_model} @ {THRESH_SELECT:.2f} threshold")
print(f"   - Early stopping: {EARLY_STOPPING_PATIENCE} epochs patience")
print(f"   - Effective batch size: {_summary_bs} * {_summary_accum} = {_summary_bs * _summary_accum}")

In [None]:
# === 5.5 Rare-label boosted sampler (optional, recommended) ===
import numpy as np, torch
from torch.utils.data import WeightedRandomSampler

# Iterate the training split as plain Python rows.
train_py = DATASETS["train"].with_format("python")
L = LABEL_META["num_labels"]  # number of labels (note: unrelated to any line marker)
names = LABEL_META["names"]   # not used further in this cell
label2id = LABEL_META["label2id"]
neutral_idx = label2id.get("neutral", None)

# per-label prevalence
# First pass over the data: fraction of rows in which each label is positive.
sums = np.zeros(L, dtype=np.float64)
for row in train_py:
    sums += np.asarray(row["labels"], dtype=np.float64)
p = sums / len(train_py)

# inverse frequency emphasis (don’t over-boost 'neutral')
# The clip keeps labels that never occur from producing a division by zero.
inv = 1.0 / np.clip(p, 1e-6, 1.0)
if neutral_idx is not None:
    inv[neutral_idx] = 1.0

# per-sample weight = sum inv for positive labels (min 1.0), then normalize
# Second pass: rows without any positive label get weight 1.0.
weights = []
for row in train_py:
    y = np.asarray(row["labels"], dtype=np.float32)
    w = float((y * inv).sum())
    if w <= 0.0:
        w = 1.0
    weights.append(w)
weights = np.asarray(weights, dtype=np.float64)
weights /= weights.mean()

# Draws len(weights) indices per epoch WITH replacement, so rows can repeat
# within an epoch and some rows may not be drawn at all.
sampler = WeightedRandomSampler(
    torch.as_tensor(weights, dtype=torch.double),
    num_samples=len(weights),
    replacement=True
)

# Patch trainer to use our sampler (python-format dataset + your collator)
def _get_train_dataloader(self):
    """Training DataLoader that draws rows through the rare-label ``sampler``.

    Batch size, collator, worker count, pin-memory and drop-last come from the
    trainer itself (``self.args`` / ``self.data_collator``), so this loader
    cannot drift from what the Trainer was configured with. The dataset stays
    ``DATASETS["train"]`` because the sampler weights were built from it.
    """
    return torch.utils.data.DataLoader(
        DATASETS["train"],
        batch_size=int(self.args.per_device_train_batch_size),
        sampler=sampler,
        # Use the collator chosen for the trainer (may be the key-pruning one).
        collate_fn=self.data_collator,
        drop_last=self.args.dataloader_drop_last,
        pin_memory=self.args.dataloader_pin_memory,
        num_workers=self.args.dataloader_num_workers,
    )


# Bind the function to this trainer instance only (the class is left untouched).
trainer.get_train_dataloader = _get_train_dataloader.__get__(trainer, type(trainer))
print("✅ Using WeightedRandomSampler (rare-label boosted).")


In [None]:
# ===================================================================================
# DIAGNOSTIC CELL - Run this AFTER Cell 5.5 and BEFORE Cell 6
# This will verify everything is configured correctly
# ===================================================================================
print("\n🔍 Running Pre-Training Diagnostics...")

import numpy as np
import torch

# 1. Check data distribution
val_ds = DATASETS["validation"]
train_ds = DATASETS["train"]

# Get label arrays safely
# One row per example, one column per label.
Yt = np.stack([row["labels"] for row in train_ds.with_format("python")])
Yv = np.stack([row["labels"] for row in val_ds.with_format("python")])

print("\n📊 Data Distribution Check:")
print(f"   Train samples: {len(Yt):,}")
print(f"   Val samples: {len(Yv):,}")
# Row sums give the number of positive labels per example.
print(f"   Positives/sample (train): {Yt.sum(axis=1).mean():.3f}")
print(f"   Positives/sample (val): {Yv.sum(axis=1).mean():.3f}")

# Per-class prevalence
# Column means over the validation split only.
prevalence = Yv.mean(axis=0)
print(f"\n   Class prevalence (val):")
print(f"   - Min: {prevalence.min():.4f}")
print(f"   - Median: {np.median(prevalence):.4f}")
print(f"   - Max: {prevalence.max():.4f}")

# 2. Check trainer configuration
print("\n⚙️ Trainer Configuration Check:")
print(f"   Evaluation strategy: {trainer.args.evaluation_strategy}")
print(f"   Save strategy: {trainer.args.save_strategy}")
print(f"   Metric for best model: {trainer.args.metric_for_best_model}")
print(f"   Load best model at end: {trainer.args.load_best_model_at_end}")
# This line is a fixed message; the patience value is not read back from the trainer.
print(f"   Early stopping: Configured with patience=2")

# 3. Verify metrics configuration
print("\n📈 Metrics Configuration:")
# Check if these variables exist from Cell 5
# When a threshold global is missing, only a literal default is printed.
if 'THRESH_SELECT' in globals():
    print(f"   Selection threshold: {THRESH_SELECT}")
else:
    print(f"   Selection threshold: 0.50 (default)")
    
if 'THRESH_LOG' in globals():
    print(f"   Logging threshold: {THRESH_LOG}")
else:
    print(f"   Logging threshold: 0.30 (default)")

# 4. Check model parameters
# Counts elements of parameters with requires_grad=True versus all parameters.
try:
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"\n🤖 Model Configuration:")
    print(f"   Trainable params: {trainable_params:,} ({100*trainable_params/total_params:.2f}%)")
    print(f"   Total params: {total_params:,}")
except Exception as e:
    print(f"\n🤖 Model Configuration: Error - {e}")

# 5. Test a single forward pass
print("\n🧪 Testing Forward Pass:")
# Smoke test: one hand-made sentence through the model without gradients.
# Any failure is reported and does not stop the cell.
try:
    # Create a small test batch manually
    test_texts = ["This is a test sentence."]
    # NOTE(review): max_length is hard-coded to 96 here rather than read from
    # CONFIG["max_length"] — confirm they are meant to stay in sync.
    test_inputs = tokenizer(
        test_texts, 
        truncation=True, 
        max_length=96, 
        padding="max_length",
        return_tensors="pt"
    )
    
    # Move to device
    # `device` is assigned here and read again by the loss check that follows.
    device = next(model.parameters()).device
    test_inputs = {k: v.to(device) for k, v in test_inputs.items()}
    
    # Forward pass
    # NOTE(review): the model's train/eval mode is not changed here, so any
    # active dropout would affect these numbers — confirm whether that matters.
    with torch.no_grad():
        outputs = model(**test_inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits)
    
    print(f"   ✓ Forward pass successful")
    print(f"   - Output shape: {logits.shape}")
    print(f"   - Logits range: [{logits.min():.2f}, {logits.max():.2f}]")
    print(f"   - Probs mean: {probs.mean():.3f}")
    print(f"   - Predictions at 0.5: {(probs > 0.5).float().sum():.0f} labels")
    print(f"   - Predictions at 0.3: {(probs > 0.3).float().sum():.0f} labels")
        
except Exception as e:
    print(f"   ❌ Forward pass failed: {e}")

# 6. Verify loss function
print("\n💰 Loss Function Check:")
# Smoke test of the trainer's custom loss on random logits. Self-contained:
# the device and label count are resolved here instead of relying on a
# `device` left behind by the forward-pass check or a hard-coded 28.
try:
    # Check if trainer has the custom loss
    if hasattr(trainer, 'loss_fct'):
        device = next(model.parameters()).device
        n_labels = Yt.shape[1]

        # Create dummy data
        dummy_logits = torch.randn(4, n_labels).to(device)
        dummy_labels = torch.zeros(4, n_labels).to(device)
        # Some positive labels (indices beyond the label count are skipped)
        positive_idx = [i for i in (0, 5, 10) if i < n_labels]
        dummy_labels[0, positive_idx] = 1

        # Calculate loss
        loss = trainer.loss_fct(dummy_logits, dummy_labels)
        print(f"   ✓ AsymmetricLoss working: {loss.item():.4f}")

        # Check loss parameters
        print(f"   - gamma_neg: {trainer.loss_fct.gamma_neg}")
        print(f"   - gamma_pos: {trainer.loss_fct.gamma_pos}")
        print(f"   - clip: {trainer.loss_fct.clip}")
        print(f"   - pos_alpha: {trainer.loss_fct.pos_alpha}")
    else:
        print(f"   ⚠️ No custom loss function found - using default")

except Exception as e:
    print(f"   ❌ Loss function test failed: {e}")

# 7. Memory check
# `allocated` and `total` are only defined when CUDA is available; the
# readiness check further down guards its use of them the same way.
if torch.cuda.is_available():
    print("\n💾 GPU Memory Status:")
    # All figures are in decimal GB (bytes / 1e9).
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   Allocated: {allocated:.2f}GB / {total:.2f}GB ({100*allocated/total:.1f}%)")
    print(f"   Reserved: {reserved:.2f}GB")
    # "Free" is total minus allocated; reserved-but-unallocated memory is not subtracted.
    print(f"   Free: {(total - allocated):.2f}GB")

# Final readiness check
print("\n" + "="*50)
# Collect human-readable problems; an empty list means "ready to train".
issues = []

# Check critical settings
if hasattr(trainer, 'args'):
    if trainer.args.metric_for_best_model not in ["f1_macro", "f1_macro_t05"]:
        issues.append(f"❌ Metric for selection is '{trainer.args.metric_for_best_model}' (should be 'f1_macro')")
    if trainer.args.evaluation_strategy != "epoch":
        issues.append("❌ Evaluation strategy is not 'epoch'")
        
# Check data distribution
# Flags an average of more than 2 positive labels per training row.
if Yt.sum(axis=1).mean() > 2.0:
    issues.append("⚠️ Unusual data distribution (>2 labels per sample)")
    
# Check memory
# Short-circuits on CPU-only machines, where `allocated`/`total` are undefined.
if torch.cuda.is_available() and allocated/total > 0.9:
    issues.append("⚠️ GPU memory usage >90% before training")

if issues:
    print("⚠️ ISSUES FOUND:")
    for issue in issues:
        print(f"   {issue}")
    print("\n   Fix these issues before training!")
else:
    print("✅ ALL CHECKS PASSED - Ready to train!")
    # The expectations below are fixed text, not derived from the trainer arguments.
    print("\n   Expected behavior:")
    print("   • Training will run for 5 epochs")
    print("   • Each epoch will take ~10-12 minutes")
    print("   • Evaluation after each epoch")
    print("   • Early stopping if no improvement for 2 epochs")
print("="*50)

In [None]:
# ===================================================================================
# 6. TRAINING (with safety checks and proper cleanup)
# ===================================================================================
# Removes stale threshold files, prints an estimate of the training schedule,
# runs `trainer.train()` from scratch and saves model + tokenizer to
# <output_dir>/best_model.
print("\n🏋️ [6/8] Starting FAST model training...")

import os, math, gc, torch

# CRITICAL: Clean up any old threshold files that could contaminate metrics
print("   - Cleaning up old threshold files...")
for filename in ["val_thresholds.npy", "val_thresholds.json", "optimal_thresholds.json"]:
    filepath = os.path.join(CONFIG["output_dir"], filename)
    if os.path.exists(filepath):
        try:
            os.remove(filepath)
            print(f"     ✓ Removed old {filename}")
        except OSError as e:
            print(f"     ⚠️ Could not remove {filename}: {e}")

# Also check best_model directory
best_model_dir = os.path.join(CONFIG["output_dir"], "best_model")
if os.path.exists(best_model_dir):
    threshold_file = os.path.join(best_model_dir, "optimal_thresholds.json")
    if os.path.exists(threshold_file):
        try:
            os.remove(threshold_file)
            print(f"     ✓ Removed old best_model/optimal_thresholds.json")
        except OSError as e:
            # Still best-effort (training proceeds), but report the failure
            # instead of silently leaving a stale thresholds file behind.
            print(f"     ⚠️ Could not remove best_model/optimal_thresholds.json: {e}")

train_ds = DATASETS["train"]

# Calculate training statistics
# These three values are hard-coded for the printed estimate only; they are not
# read from (or written to) the trainer configuration.
num_epochs = 5  # Full 5 epochs as configured
train_bs = 20    # Per device batch size
grad_accum = 2   # Gradient accumulation steps

# Optimizer steps per epoch = samples / (batch size * accumulation), rounded up.
steps_per_epoch = max(1, math.ceil(len(train_ds) / train_bs / grad_accum))
total_steps_est = int(steps_per_epoch * num_epochs)

print(f"\n   📊 Training Statistics:")
print(f"   - Dataset size: {len(train_ds):,} samples")
print(f"   - Batch size: {train_bs} × {grad_accum} = {train_bs * grad_accum} effective")
print(f"   - Steps/epoch: {steps_per_epoch:,}")
print(f"   - Total steps: {total_steps_est:,} ({num_epochs} epochs)")
print(f"   - Evaluation: After each epoch (not every 500 steps!)")

# Memory cleanup before training
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # Show GPU memory status
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    print(f"\n   🎮 GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

# Start training (NO resume from checkpoint - fresh start)
print(f"\n   🚀 Starting training from scratch...")
print(f"   ⏰ Estimated time: ~50-60 minutes for 5 epochs")

train_result = trainer.train(resume_from_checkpoint=False)

print("\n   🎉 Training finished successfully!")
print("   - Final Metrics:", train_result.metrics)

# Save the best model
# NOTE(review): this saves whatever weights `model` holds after training; it is
# the *best* checkpoint only if the Trainer was configured to reload it at the
# end (e.g. load_best_model_at_end=True) — confirm in the trainer setup cell.
best_model_path = os.path.join(CONFIG["output_dir"], "best_model")
os.makedirs(best_model_path, exist_ok=True)
model.save_pretrained(best_model_path)
tokenizer.save_pretrained(best_model_path)
print(f"   - Best model saved to: {best_model_path}")

In [None]:
# IPython shell escape: run the `nvidia-smi` CLI and show its output in the cell.
!nvidia-smi

In [None]:
# RUN ANYTIME after DATASETS/encoded_ds exist
# Label-distribution sanity check: average number of active labels per sample
# for train/validation, plus the spread of per-class prevalence on validation.
import numpy as np
val_ds = DATASETS["validation"] if "DATASETS" in globals() else encoded_ds["validation"]
train_ds = DATASETS["train"] if "DATASETS" in globals() else encoded_ds["train"]

# Stack the per-sample label vectors into one 2-D array per split.
Yt = np.stack(train_ds["labels"])
Yv = np.stack(val_ds["labels"])

# "Positives per sample" is a count, so sum across the label axis before
# averaging over samples. (A row-wise mean would instead report the fraction of
# labels that are active, which is smaller by a factor of the label count.)
print("Positives per sample -> train:", Yt.sum(axis=1).mean().round(3), " | val:", Yv.sum(axis=1).mean().round(3))

# Per-class prevalence = fraction of validation samples in which the class is on.
val_prevalence = Yv.mean(axis=0)
print("Per-class prevalence (val) min/median/max:",
      float(val_prevalence.min()), float(np.median(val_prevalence)), float(val_prevalence.max()))


In [None]:
# === 6.1 VAL DIAGNOSTICS ===
# Predicts on the validation split, compares the average number of true vs.
# predicted labels per sample at a 0.5 cut-off, and lists the ten classes with
# the lowest F1 at that cut-off.
import numpy as np
from sklearn.metrics import f1_score, classification_report

if "DATASETS" in globals():
    val_ds = DATASETS["validation"]
else:
    val_ds = encoded_ds["validation"]

pred = trainer.predict(val_ds)
logits = pred.predictions
labels = pred.label_ids.astype(int)
# Element-wise logistic sigmoid turns logits into independent per-label scores.
probs = 1.0 / (1.0 + np.exp(-logits))

n_samples = labels.shape[0]
true_pps = labels.sum() / n_samples              # avg positives per sample (truth)
pred_pps_05 = (probs >= 0.5).sum() / n_samples   # avg positives/sample @0.5

print(f"Avg true positives/sample: {true_pps:.3f}")
print(f"Avg preds/sample @0.5:     {pred_pps_05:.3f}")

# Per-class F1 @ 0.5 to spot dead classes
preds05 = (probs >= 0.5).astype(int)
class_names = LABEL_META["names"] if "LABEL_META" in globals() else LABEL_NAMES
# One (name, f1, prevalence) triple per class, in label order.
per_class_f1 = [
    (
        class_name,
        float(f1_score(labels[:, col], preds05[:, col], zero_division=0)),
        float(labels[:, col].mean()),
    )
    for col, class_name in enumerate(class_names)
]
per_class_f1_sorted = sorted(per_class_f1, key=lambda row: row[1])

print("\nWorst 10 classes @0.5 threshold (name, f1, prevalence):")
for class_name, score, prevalence in per_class_f1_sorted[:10]:
    print(f"{class_name:14s}  f1={score:0.3f}  prev={prevalence:0.3f}")


In [None]:
# ===================================================================================
# 7. THRESHOLD OPTIMIZATION (self-contained + robust)
# ===================================================================================
# Predicts on the validation split, grid-searches one decision threshold per
# label to maximise that label's F1, reports micro/macro F1 at 0.5 vs. the
# tuned thresholds, and writes the thresholds to disk.
# NOTE: the tuned scores are measured on the same split the thresholds were
# fitted on, so they are an optimistic estimate.
print("\n🔍 [7/8] Optimizing per-label decision thresholds...")

import os, gc, json, numpy as np
import torch  # needed for the CUDA cache flush below; imported here so the cell does not rely on an earlier cell
from sklearn.metrics import f1_score

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

val_ds = DATASETS["validation"] if "DATASETS" in globals() else encoded_ds["validation"]

print("   - Predicting on the validation set...")
pred = trainer.predict(val_ds)
logits = pred.predictions
labels = pred.label_ids.astype(int)
probs  = 1.0 / (1.0 + np.exp(-logits))  # element-wise sigmoid

if "LABEL_META" in globals():
    label_names = LABEL_META["names"]
    num_labels  = LABEL_META["num_labels"]
else:
    label_names = LABEL_NAMES
    num_labels  = len(label_names)

# Baseline: the same 0.5 cut-off for every label.
base_preds = (probs >= 0.5).astype(int)
base_f1_micro = f1_score(labels, base_preds, average="micro", zero_division=0)
base_f1_macro = f1_score(labels, base_preds, average="macro", zero_division=0)

print("   - Searching best F1 threshold per label...")
grid = np.linspace(0.05, 0.60, 12)  # 0.05, 0.10, ..., 0.60
optimal_thresholds = np.full(num_labels, 0.5, dtype=np.float32)

for j in range(num_labels):
    y = labels[:, j]
    if y.sum() == 0:
        # No positive example for this label in validation: keep the 0.5 default.
        continue
    pj = probs[:, j]
    fmax, tbest = -1.0, 0.5
    for t in grid:
        score = f1_score(y, (pj >= t).astype(int), zero_division=0)
        # Strict '>' keeps the lowest threshold among ties.
        if score > fmax:
            fmax, tbest = score, t
    optimal_thresholds[j] = tbest

# Broadcast the per-label thresholds across all samples.
tuned_preds = (probs >= optimal_thresholds.reshape(1, -1)).astype(int)
tuned_f1_micro = f1_score(labels, tuned_preds, average="micro", zero_division=0)
tuned_f1_macro = f1_score(labels, tuned_preds, average="macro", zero_division=0)

print("\n--- Validation (0.5 vs tuned thresholds) ---")
print(f"   F1 Micro @0.5 : {base_f1_micro:.4f}")
print(f"   F1 Macro @0.5 : {base_f1_macro:.4f}")
print(f"   F1 Micro tuned: {tuned_f1_micro:.4f}")
print(f"   F1 Macro tuned: {tuned_f1_macro:.4f}")
if "neutral" in label_names:
    idx = [i for i, n in enumerate(label_names) if n != "neutral"]
    f1_macro_no_neutral = f1_score(labels[:, idx], tuned_preds[:, idx], average="macro", zero_division=0)
    print(f"   F1 Macro tuned (no 'neutral'): {f1_macro_no_neutral:.4f}")
print("--------------------------------------------")

# Persist the thresholds: a .npy and a .json copy in the output directory, and
# a second .json copy inside best_model/.
os.makedirs(CONFIG["output_dir"], exist_ok=True)
np.save(os.path.join(CONFIG["output_dir"], "val_thresholds.npy"), optimal_thresholds)

best_dir = os.path.join(CONFIG["output_dir"], "best_model")
os.makedirs(best_dir, exist_ok=True)

thresholds_as_list = optimal_thresholds.tolist()
json_targets = (
    os.path.join(CONFIG["output_dir"], "val_thresholds.json"),
    os.path.join(best_dir, "optimal_thresholds.json"),
)
for json_path in json_targets:
    with open(json_path, "w") as fh:
        json.dump(thresholds_as_list, fh)

print("   ✅ Thresholds saved: val_thresholds.npy/json and best_model/optimal_thresholds.json")

In [None]:
# ===================================================================================
# 8. INFERENCE DEMO
# ===================================================================================
# Reloads the base model plus the saved adapter from <output_dir>/best_model,
# loads the tuned per-label thresholds, and prints the top-3 scores and the
# detected labels for a few example sentences.
# Relies on names from earlier cells: os, gc, json, np, torch, CONFIG,
# tokenizer, AutoModelForSequenceClassification, NUM_LABELS, ID2LABEL,
# LABEL2ID and LABEL_NAMES.
print("\n🎯 [8/8] Running inference demo...")

from peft import PeftModel

# Clear memory for inference
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

final_model_path = os.path.join(CONFIG["output_dir"], "best_model")
with open(os.path.join(final_model_path, "optimal_thresholds.json")) as f:
    final_thresholds = np.array(json.load(f))

# Load model for inference
print("   - Loading model for inference...")
base_model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG["model_name"],
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
    id2label=ID2LABEL,
    label2id=LABEL2ID,
    # Don't force dtype
)

# NOTE(review): assumes best_model/ contains a PEFT adapter saved from the
# trained model — confirm against the model setup cell.
inference_model = PeftModel.from_pretrained(base_model, final_model_path)
inference_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inference_model.to(device)

test_texts = [
    "I'm so happy and grateful for this amazing day!",
    "This is really frustrating and annoying.",
    "I feel a bit anxious but also excited about tomorrow.",
    "The weather is nice today.",
]

print("\n--- Inference Results ---")
with torch.no_grad():
    for text in test_texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, 
                          max_length=CONFIG["max_length"], padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        outputs = inference_model(**inputs)
        # Single-text batch: take row 0 of the sigmoid scores.
        probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]
        
        # Use '>=' so the decision rule matches the one the thresholds were
        # tuned and evaluated with (probs >= threshold).
        detected_emotions = [LABEL_NAMES[i] for i, p in enumerate(probs) 
                           if p >= final_thresholds[i]]
        
        # Indices of the three highest scores, highest first.
        top_indices = np.argsort(probs)[-3:][::-1]
        top_scores = {LABEL_NAMES[i]: f"{probs[i]:.3f}" for i in top_indices}
        
        print(f"\nText: \"{text}\"")
        print(f"   -> Top: {top_scores}")
        print(f"   -> Detected: {detected_emotions if detected_emotions else ['none']}")

print("\n✅ All steps completed successfully!")


In [None]:
# === 7B. Diagnostics ===
# Compares the average number of true labels per validation sample with the
# average number predicted at a flat 0.3 cut-off.
import numpy as np

if "DATASETS" in globals():
    val_ds = DATASETS["validation"]
else:
    val_ds = encoded_ds["validation"]

Yv = np.stack(val_ds["labels"])
pred = trainer.predict(val_ds)
probs = 1/(1+np.exp(-pred.predictions))  # element-wise sigmoid of the logits

# Row-wise label counts, averaged over samples.
truth_per_sample = Yv.sum(axis=1).mean()
predicted_per_sample = (probs>=0.3).sum(axis=1).mean()

print("Positives per sample (truth):", truth_per_sample.round(3))
print("Positives per sample @0.3  :", predicted_per_sample.round(3))
