In [1]:
!nvidia-smi
!pip -q install --upgrade pip
!pip -q install torch==2.3.1+cu118 torchvision==0.18.1+cu118 torchaudio==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118
!pip -q install "transformers>=4.44" "accelerate>=0.31" "datasets>=2.20" "evaluate" "scikit-learn" "peft>=0.11" "tensorboard" "pyarrow<18"


Tue Sep  2 22:03:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:C1:00.0 Off |                  N/A |
| 30%   26C    P8             38W /  350W |       2MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        On  |   00

In [2]:
# Write train_samo.py to the working directory
SCRIPT_PATH = "train_samo.py"
SCRIPT_CONTENT = r"""#!/usr/bin/env python
# -*- coding: utf-8 -*-
# SAMO - GoEmotions Multi-Label Trainer (2x3090-ready)
#
# Launch examples (2 GPUs):
#   accelerate launch --num_processes=2 --mixed_precision=fp16 \
#     train_samo.py --train_json /path/train.jsonl --val_json /path/val.jsonl \
#     --output_dir ./samo_out --thresholds_json ./optimal_thresholds.json
#
# Or with torchrun:
#   torchrun --standalone --nproc_per_node=2 train_samo.py \
#     --train_json /path/train.jsonl --val_json /path/val.jsonl
import os, json, random, argparse, warnings
from typing import List, Dict, Any

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

from sklearn.metrics import f1_score, precision_recall_fscore_support


def set_seeds(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def enable_tf32():
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    except Exception:
        pass
    try:
        torch.set_float32_matmul_precision("high")
    except Exception:
        pass


class JsonlMultiLabelDataset(Dataset):
    """Assumes JSONL with {"text": str, "labels": List[int]} per line; labels are multi-hot."""
    def __init__(self, path: str, tokenizer, max_length: int):
        self.examples = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                obj = json.loads(line)
                text = obj.get("text", None)
                labels = obj.get("labels", None)
                if text is None or labels is None:
                    continue
                self.examples.append({"text": text, "labels": labels})
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        item = self.examples[idx]
        enc = self.tokenizer(
            item["text"],
            truncation=True,
            max_length=self.max_length,
            padding=False,
            return_tensors="pt",
        )
        enc = {k: v.squeeze(0) for k, v in enc.items()}
        labels = torch.tensor(item["labels"], dtype=torch.float)
        enc["labels"] = labels
        return enc


class AsymmetricLoss(nn.Module):
    def __init__(self, gamma_pos=0.0, gamma_neg=4.0, clip=0.05, eps=1e-8):
        super().__init__()
        self.gamma_pos = gamma_pos
        self.gamma_neg = gamma_neg
        self.clip = clip
        self.eps = eps

    def forward(self, logits, targets):
        x = torch.sigmoid(logits)
        if self.clip and self.clip > 0:
            x = (x - self.clip).clamp(min=0) / (1 - self.clip)
        xs_pos = x
        xs_neg = 1 - x
        pos_loss = targets * torch.log(xs_pos.clamp(min=self.eps)) * ((1 - xs_pos) ** self.gamma_pos)
        neg_loss = (1 - targets) * torch.log(xs_neg.clamp(min=self.eps)) * (xs_pos ** self.gamma_neg)
        loss = -(pos_loss + neg_loss)
        return loss.mean()


class ASLTrainer(Trainer):
    """Trainer subclass with ASL and optional class-balanced positive weighting."""
    def __init__(self, *args, class_weights: torch.Tensor | None = None, **kwargs):
        super().__init__(*args, **kwargs)
        self.criterion = AsymmetricLoss(gamma_pos=0.0, gamma_neg=4.0, clip=0.05)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.criterion(logits, labels)
        if self.class_weights is not None:
            pos_mask = (labels > 0.5).float()
            cw = self.class_weights.to(logits.device)
            weights = 1.0 + pos_mask * (cw - 1.0)  # emphasize positives of rare classes
            loss = (loss * weights.mean(dim=1)).mean()
        return (loss, outputs) if return_outputs else loss


def compute_thresholded_metrics(logits: np.ndarray, y_true: np.ndarray, thresholds: np.ndarray):
    probs = 1.0 / (1.0 + np.exp(-logits))
    y_pred = (probs >= thresholds[None, :]).astype(int)
    f1_micro = f1_score(y_true, y_pred, average="micro", zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
    per_class = precision_recall_fscore_support(y_true, y_pred, average=None, zero_division=0)
    return f1_micro, f1_macro, per_class


def effective_num_weights(pos_counts: np.ndarray, beta: float = 0.999) -> np.ndarray:
    eff_num = 1.0 - np.power(beta, pos_counts)
    w = (1.0 - beta) / np.maximum(eff_num, 1e-8)
    return w / (w.mean() + 1e-12)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="microsoft/deberta-v3-large")
    parser.add_argument("--train_json", type=str, required=True)
    parser.add_argument("--val_json", type=str, required=True)
    parser.add_argument("--max_length", type=int, default=160)

    parser.add_argument("--use_lora", type=lambda x: str(x).lower() == 'true', default=True)
    parser.add_argument("--lora_r", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=32)
    parser.add_argument("--lora_dropout", type=float, default=0.1)

    parser.add_argument("--per_device_train_batch_size", type=int, default=16)
    parser.add_argument("--per_device_eval_batch_size", type=int, default=32)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=2)
    parser.add_argument("--num_train_epochs", type=int, default=4)

    parser.add_argument("--learning_rate", type=float, default=2e-5)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
    parser.add_argument("--warmup_ratio", type=float, default=0.1)

    parser.add_argument("--fp16", type=lambda x: str(x).lower() == 'true', default=True)
    parser.add_argument("--tf32", type=lambda x: str(x).lower() == 'true', default=True)
    parser.add_argument("--gradient_checkpointing", type=lambda x: str(x).lower() == 'true', default=True)

    parser.add_argument("--ddp_backend", type=str, default="nccl")

    parser.add_argument("--thresholds_json", type=str, default=None)
    parser.add_argument("--min_threshold", type=float, default=0.25)

    parser.add_argument("--oversample_rare", type=lambda x: str(x).lower() == 'true', default=True)
    parser.add_argument("--effective_beta", type=float, default=0.999)

    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--output_dir", type=str, default="./samo_out")

    args = parser.parse_args()

    set_seeds(args.seed)
    if args.tf32:
        enable_tf32()

    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    config = AutoConfig.from_pretrained(
        args.model_name,
        problem_type="multi_label_classification",
    )

    train_ds = JsonlMultiLabelDataset(args.train_json, tokenizer, args.max_length)
    val_ds = JsonlMultiLabelDataset(args.val_json, tokenizer, args.max_length)

    # Infer num_labels from first sample
    first_labels = train_ds[0]["labels"]
    num_labels = int(first_labels.numel())
    config.num_labels = num_labels
    id2label = {i: f"label_{i}" for i in range(num_labels)}
    label2id = {v: k for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = label2id

    base_model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, config=config
    )

    model = base_model
    if args.use_lora:
        try:
            from peft import LoraConfig, get_peft_model, TaskType
            lcfg = LoraConfig(
                r=args.lora_r,
                lora_alpha=args.lora_alpha,
                lora_dropout=args.lora_dropout,
                bias="none",
                task_type=TaskType.SEQ_CLS,
                target_modules=["query_proj", "key_proj", "value_proj", "dense"],
            )
            model = get_peft_model(base_model, lcfg)
        except Exception as e:
            warnings.warn(f"PEFT not available or failed ({e}); using base model")

    # Build label stats for class-balanced weights and oversampling
    train_labels = []
    for i in range(len(train_ds)):
        train_labels.append(train_ds[i]["labels"].numpy())
    Yt = np.stack(train_labels)  # [N, C]
    pos_counts = Yt.sum(axis=0)  # [C]

    class_w = effective_num_weights(pos_counts, beta=args.effective_beta)
    class_w_t = torch.tensor(class_w, dtype=torch.float)

    example_weights = (Yt * (1.0 / np.maximum(pos_counts, 1))).sum(axis=1)
    example_weights = example_weights / (example_weights.mean() + 1e-8)

    sampler = None
    if args.oversample_rare:
        sampler = WeightedRandomSampler(weights=example_weights, num_samples=len(example_weights), replacement=True)

    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Patch Trainer dataloader to inject our sampler
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            sampler=(sampler if sampler is not None else None),
            shuffle=(sampler is None),
            collate_fn=self.data_collator,
            drop_last=False,
            num_workers=2,
            pin_memory=True,
        )
    ASLTrainer.get_train_dataloader = get_train_dataloader

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        lr_scheduler_type=args.lr_scheduler_type,
        warmup_ratio=args.warmup_ratio,
        num_train_epochs=args.num_train_epochs,
        weight_decay=args.weight_decay,
        fp16=args.fp16,
        tf32=args.tf32,
        gradient_checkpointing=args.gradient_checkpointing,
        logging_steps=100,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        ddp_backend=args.ddp_backend,
        ddp_find_unused_parameters=False,
        report_to=["tensorboard"],
    )

    trainer = ASLTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        tokenizer=tokenizer,
        class_weights=class_w_t,
    )

    train_out = trainer.train()
    trainer.save_model(args.output_dir)

    eval_out = trainer.predict(val_ds)
    logits = eval_out.predictions  # [N, C]
    y_true = np.stack([val_ds[i]["labels"].numpy() for i in range(len(val_ds))])

    if args.thresholds_json and os.path.exists(args.thresholds_json):
        with open(args.thresholds_json, "r") as f:
            thr = json.load(f)
        thr = np.asarray(thr, dtype=np.float32)
        if thr.shape[0] != logits.shape[1]:
            warnings.warn("Thresholds length mismatch - falling back to 0.4")
            thr = np.full((logits.shape[1],), 0.4, dtype=np.float32)
    else:
        thr = np.full((logits.shape[1],), 0.4, dtype=np.float32)

    thr = np.maximum(thr, args.min_threshold)

    grid = [0.2, 0.3, 0.4, 0.5, 0.6]
    print("\nPerformance at Different Thresholds:")
    print("Threshold  F1 Micro     F1 Macro")
    best_macro, best_t = -1, None
    for t in grid:
        tvec = np.maximum(np.full_like(thr, t), args.min_threshold)
        f1_micro, f1_macro, _ = compute_thresholded_metrics(logits, y_true, tvec)
        print(f"{t:<10} {f1_micro:>8.4f}    {f1_macro:>8.4f}")
        if f1_macro > best_macro:
            best_macro, best_t = f1_macro, tvec.copy()

    f1_micro, f1_macro, per_class = compute_thresholded_metrics(logits, y_true, best_t)
    print("\nBest: f1_micro=%.4f, f1_macro=%.4f" % (f1_micro, f1_macro))

    os.makedirs(args.output_dir, exist_ok=True)
    with open(os.path.join(args.output_dir, "optimal_thresholds.json"), "w") as f:
        json.dump(best_t.tolist(), f)

    per_prec, per_rec, per_f1, per_support = per_class
    report = {
        "f1_micro": float(f1_micro),
        "f1_macro": float(f1_macro),
        "per_class": {
            f"label_{i}": {
                "precision": float(per_prec[i]),
                "recall": float(per_rec[i]),
                "f1": float(per_f1[i]),
                "support": int(per_support[i]),
            } for i in range(len(per_f1))
        }
    }
    with open(os.path.join(args.output_dir, "eval_report.json"), "w") as f:
        json.dump(report, f, indent=2)

    print("\nSaved:")
    print("  -", os.path.join(args.output_dir, "pytorch_model.bin"))
    print("  -", os.path.join(args.output_dir, "optimal_thresholds.json"))
    print("  -", os.path.join(args.output_dir, "eval_report.json"))


if __name__ == "__main__":
    main()
"""
open(SCRIPT_PATH, "w", encoding="utf-8").write(SCRIPT_CONTENT)
import os, hashlib
print("Wrote", SCRIPT_PATH, "size:", os.path.getsize(SCRIPT_PATH), "bytes")
print("sha256:", hashlib.sha256(SCRIPT_CONTENT.encode()).hexdigest())


SyntaxError: invalid syntax (4222739976.py, line 55)

In [None]:
%%bash
accelerate config default
python - <<'PY'
from pathlib import Path
p = Path("~/.cache/huggingface/accelerate/default_config.yaml").expanduser()
txt = p.read_text()
txt = txt.replace("distributed_type: NO", "distributed_type: MULTI_GPU")
txt = txt.replace("mixed_precision: no", "mixed_precision: fp16")
txt = txt.replace("num_processes: 1", "num_processes: 2")
p.write_text(txt)
print("Accelerate config:", p)
print(p.read_text())
PY


In [None]:
THR_JSON = "/mnt/data/optimal_thresholds.json"  # adjust if needed
import os
print("Using thresholds exists?", os.path.exists(THR_JSON), "->", THR_JSON)


In [None]:
TRAIN_JSON = "/workspace/data/train.jsonl"  # <-- set your dataset path
VAL_JSON   = "/workspace/data/val.jsonl"    # <-- set your dataset path
OUT_DIR    = "./samo_out"
!mkdir -p "$OUT_DIR"
print("Train:", TRAIN_JSON)
print("Val  :", VAL_JSON)
print("Out  :", OUT_DIR)


In [None]:
!accelerate launch --num_processes=2 --mixed_precision=fp16   train_samo.py   --train_json "$TRAIN_JSON" --val_json "$VAL_JSON"   --output_dir "$OUT_DIR"   --thresholds_json "$THR_JSON"   --per_device_train_batch_size 16 --per_device_eval_batch_size 32   --gradient_accumulation_steps 2   --num_train_epochs 4   --learning_rate 2e-5 --lr_scheduler_type cosine --warmup_ratio 0.1   --weight_decay 0.01 --fp16 true --tf32 true --gradient_checkpointing true   --ddp_backend nccl


In [None]:
import json, os
rep_path = os.path.join("samo_out", "eval_report.json")
if os.path.exists(rep_path):
    rep = json.load(open(rep_path))
    print("F1_micro:", rep["f1_micro"], " F1_macro:", rep["f1_macro"])
    pc = rep["per_class"]
    items = sorted(pc.items(), key=lambda kv: kv[1]["f1"])
    print("\nWorst 5:")
    for k,v in items[:5]: print(k, v)
    print("\nBest 5:")
    for k,v in items[-5:]: print(k, v)
else:
    print("eval_report.json not found at", rep_path)
