# scGPT → Scores → CellSentences → C2S LoRA Fine-tuning (2 Varianten)

Dieses Notebook macht (rechen-sparsam) genau das:

1. Lädt die **Top‑K Gene + Werte** pro Zelle aus `processed/scgpt_inputs_topk.npz` (aus dem Preprocessing-Notebook).
2. Lädt **scGPT**, berechnet pro Zelle `mlm_output` für diese Top‑K Positionen.
3. Baut daraus **scGPT-CellSentences** (Gene nach `mlm_output` sortiert).
4. Fine-tuned **C2S** (als Causal LM) mit **LoRA** für *Cell-Label-Prediction* in zwei Varianten:
   - **Baseline:** CellSentences nach Expression-Ranking
   - **scGPT:** CellSentences nach scGPT-Score-Ranking

Am Ende werden beide Modelle auf dem **gleichen Testsplit** evaluiert.

> Hinweis: Modellpfade/Label-Spalte können je nach Setup variieren. Die Parameter unten sind bewusst als Variablen gesetzt.


In [1]:
# ======================
# 0) Imports
# ======================
import os
import json
import numpy as np
import pandas as pd

import scanpy as sc
from scipy import sparse
from tqdm.auto import tqdm

# HF / LoRA
import torch
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)

from peft import LoraConfig, get_peft_model

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)


  from .autonotebook import tqdm as notebook_tqdm
Disabling PyTorch because PyTorch >= 2.4 is required but found 2.3.1+cu121
PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


NameError: name 'LRScheduler' is not defined

## 1) Pfade & Parameter

Passe diese Variablen an dein Setup an.

- `PROCESSED_DIR` kommt aus dem Preprocessing-Notebook.
- `SCGPT_MODEL_DIR` muss auf einen lokalen scGPT-Checkpoint zeigen.
- `C2S_MODEL_NAME_OR_PATH` ist das Basismodell für C2S (Causal LM).  
  Wenn du ein lokales Modell nutzt, gib den Pfad an. Wenn es ein HF Hub Modell ist, den Namen.

**Compute-Tipp:** halte `K_TOP` klein (256/512) und LoRA rank niedrig (8/16).


In [None]:
# ======================
# 1) Konfiguration
# ======================
PROCESSED_DIR = "processed"

# aus Preprocessing
NPZ_TOPK = os.path.join(PROCESSED_DIR, "scgpt_inputs_topk.npz")
C2S_SENT_BASELINE = os.path.join(PROCESSED_DIR, "c2s_sentences_expression.txt")
ADATA_HVG = os.path.join(PROCESSED_DIR, "processed_adata_hvg.h5ad")
SPLITS_CSV = os.path.join(PROCESSED_DIR, "splits.csv")

# scGPT: lokaler Modellordner (Checkpoint + ggf. vocab config)
SCGPT_MODEL_DIR = "../models/scGPT"

# C2S Basismodell (Causal LM) – muss ein HF-kompatibles AutoModelForCausalLM sein
C2S_MODEL_NAME_OR_PATH = "path/to/c2s_base_model"

# Training
K_TOP = 512              # muss zum Preprocessing passen
MAX_LEN = 768            # Prompt+Answer Gesamtlänge (ggf. anpassen)

# scGPT masking (Inference)
MLM_PROB = 0.40        # z.B. 40% der Gene maskieren
MASK_VALUE = -1        # scGPT default mask_value
# Falls dein Checkpoint andere Konventionen hat:
# MASK_VALUE = 0
MASK_N_RUNS = 1        # später ggf. 3 und mitteln
BATCH_SIZE = 2
GRAD_ACCUM = 16
EPOCHS = 2
LR = 2e-4
WARMUP_RATIO = 0.03

# LoRA (sparsam)
LORA_R = 8 # Rang der Low-Rank-Approximation
LORA_ALPHA = 16 # Skalierungsfaktor für die LoRA-Updates
LORA_DROPOUT = 0.05 # Dropout für die LoRA-Adapter, um Überanpassung zu vermeiden

OUT_BASELINE = os.path.join(PROCESSED_DIR, "c2s_lora_baseline")
OUT_SCGPT = os.path.join(PROCESSED_DIR, "c2s_lora_scgpt")

os.makedirs(OUT_BASELINE, exist_ok=True)
os.makedirs(OUT_SCGPT, exist_ok=True)


## 2) Labels finden (aus `.h5ad`)

Wir laden die AnnData und versuchen automatisch eine sinnvolle Label-Spalte in `adata.obs` zu finden.  
Wenn der Automatismus die falsche Spalte wählt, setze `LABEL_COL` manuell.


In [None]:
# ======================
# 2) Lade AnnData und finde Label-Spalte
# ======================
adata = sc.read_h5ad(ADATA_HVG)
splits = pd.read_csv(SPLITS_CSV)

candidate_label_cols = [
    "cell_type", "celltype", "cell_type_original", "celltype_major",
    "celltype_minor", "cell_type_fine", "annotation", "labels", "label",
]

LABEL_COL = None
for c in candidate_label_cols:
    if c in adata.obs.columns:
        LABEL_COL = c
        break

if LABEL_COL is None:
    for c in adata.obs.columns:
        if (pd.api.types.is_categorical_dtype(adata.obs[c]) or adata.obs[c].dtype == object):
            nunique = adata.obs[c].nunique()
            if 2 <= nunique <= 200:
                LABEL_COL = c
                break

print("Chosen LABEL_COL:", LABEL_COL)
if LABEL_COL is None:
    raise ValueError("Keine Label-Spalte gefunden. Bitte LABEL_COL manuell setzen.")

if "cell_id" not in adata.obs.columns:
    adata.obs["cell_id"] = adata.obs_names.astype(str)

obs = adata.obs[["cell_id", LABEL_COL]].copy()
obs = obs.merge(splits, on="cell_id", how="inner")
print(obs["split"].value_counts())
print("n_labels:", obs[LABEL_COL].nunique())


Chosen LABEL_COL: cell_type
split
train    20841
val       4466
test      4466
Name: count, dtype: int64
n_labels: 35


## 3) Baseline-Sentences laden
Format: `cell_id<TAB>GENE1 GENE2 ...`


In [None]:
def load_sentence_tsv(path: str) -> pd.DataFrame:
    rows = []
    with open(path, "r") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            cid, sent = line.split("\t", 1)
            rows.append((cid, sent))
    return pd.DataFrame(rows, columns=["cell_id", "sentence"])

df_base = load_sentence_tsv(C2S_SENT_BASELINE)
print(df_base.head())
print("n_sentences:", len(df_base))


                         cell_id  \
0  Pan_T7935490_AAACCTGCAAATTGCC   
1  Pan_T7935490_AAACGGGCATCTGGTA   
2  Pan_T7935490_AAACGGGTCTTGCATT   
3  Pan_T7935490_AAAGCAATCATCGCTC   
4  Pan_T7935490_AAAGTAGCAGTCACTA   

                                            sentence  
0  HSP90AA1 FTH1 KLF6 HSPA1B MALAT1 ATF3 DUSP1 FO...  
1  MALAT1 HSPA1A HSP90AA1 RPS27 JUNB HSPA1B CD69 ...  
2  HSPA1A MALAT1 HSP90AA1 FOS DNAJB1 RPS27 CCL5 H...  
3  CCL4 MALAT1 CCL4L2 JUN CCL5 FOS IFNG DNAJB1 HS...  
4  GNLY GZMA MALAT1 CD7 CCL5 HSPA1A HSP90AA1 DNAJ...  
n_sentences: 29773


## 4) scGPT: `mlm_output` berechnen und scGPT-Sentences bauen

Wir nutzen **dieselben Top‑K Gene** wie die Baseline (aus `scgpt_inputs_topk.npz`), berechnen `mlm_output` und sortieren danach.


In [None]:
# ======================
# 4A) Lade Top-K Gene/Werte pro Zelle
# ======================
npz = np.load(NPZ_TOPK, allow_pickle=True)
cell_ids = npz["cell_id"]
topk_gene_symbols = npz["topk_gene_symbols"]   # (n_cells, K_TOP) strings
topk_values = npz["topk_values"]               # (n_cells, K_TOP) floats

assert topk_gene_symbols.shape[1] == K_TOP
assert topk_values.shape[1] == K_TOP

print("TopK arrays:", topk_gene_symbols.shape, topk_values.shape)


TopK arrays: (29773, 512) (29773, 512)


In [None]:
# ======================
# 4B) scGPT laden
# ======================
import os
import json
import inspect
import torch
from scgpt.tokenizer import GeneVocab

# je nach scgpt-Version
try:
    from scgpt.model import TransformerModel
except ImportError:
    from scgpt.models import TransformerModel

# 1) Vocab
vocab_path = os.path.join(SCGPT_MODEL_DIR, "vocab.json")
if not os.path.exists(vocab_path):
    raise FileNotFoundError(f"vocab.json nicht gefunden: {vocab_path}")
vocab = GeneVocab.from_file(vocab_path)

# 2) Config (config.json ODER args.json)
config_path = os.path.join(SCGPT_MODEL_DIR, "config.json")
args_path = os.path.join(SCGPT_MODEL_DIR, "args.json")

if os.path.exists(config_path):
    with open(config_path, "r") as f:
        cfg = json.load(f)
elif os.path.exists(args_path):
    with open(args_path, "r") as f:
        cfg = json.load(f)
else:
    raise FileNotFoundError(
        f"Weder config.json noch args.json gefunden in {SCGPT_MODEL_DIR}"
    )

# Key-Mapping zwischen config.json- und args.json-Schema
d_model = cfg.get("d_model", cfg.get("embsize", 512))
nhead = cfg.get("nhead", cfg.get("nheads", 8))
d_hid = cfg.get("d_hid", 512)
nlayers = cfg.get("nlayers", 12)
dropout = cfg.get("dropout", 0.0)
pad_token = cfg.get("pad_token", "<pad>")
pad_value = cfg.get("pad_value", -2)
do_mvc = cfg.get("do_mvc", cfg.get("MVC", False))
do_dab = cfg.get("do_dab", cfg.get("DAB", False))
use_batch_labels = cfg.get("use_batch_labels", False)
explicit_zero_prob = cfg.get("explicit_zero_prob", False)
use_fast_transformer = cfg.get("fast_transformer", True)
nlayers_cls = cfg.get("nlayers_cls", cfg.get("n_layers_cls", 3))
n_cls = cfg.get("n_cls", 1)

# 3) Modell bauen (versionsrobust: nur unterstützte kwargs übergeben)
candidate_kwargs = {
    "ntokens": len(vocab),
    "ntoken": len(vocab),
    "d_model": d_model,
    "nhead": nhead,
    "d_hid": d_hid,
    "nlayers": nlayers,
    "dropout": dropout,
    "pad_token_id": vocab[pad_token] if pad_token in vocab else 0,
    "pad_token": pad_token,
    "pad_value": pad_value,
    "do_mvc": do_mvc,
    "do_dab": do_dab,
    "use_batch_labels": use_batch_labels,
    "explicit_zero_prob": explicit_zero_prob,
    "use_fast_transformer": use_fast_transformer,
    "nlayers_cls": nlayers_cls,
    "n_cls": n_cls,
    "vocab": vocab,
}

sig = inspect.signature(TransformerModel.__init__)
supported = set(sig.parameters.keys())
model_kwargs = {k: v for k, v in candidate_kwargs.items() if k in supported}

model = TransformerModel(**model_kwargs)

# 4) Checkpoint laden
ckpt_candidates = [
    os.path.join(SCGPT_MODEL_DIR, "model.pt"),
    os.path.join(SCGPT_MODEL_DIR, "best_model.pt"),
]
ckpt_path = next((p for p in ckpt_candidates if os.path.exists(p)), None)
if ckpt_path is None:
    raise FileNotFoundError(
        f"Kein Checkpoint gefunden. Erwartet: {ckpt_candidates}"
    )

state = torch.load(ckpt_path, map_location="cpu")
if isinstance(state, dict) and "model_state_dict" in state:
    state = state["model_state_dict"]

missing, unexpected = model.load_state_dict(state, strict=False)

model.to(DEVICE)
model.eval()
print(f"Loaded scGPT from: {ckpt_path}")
print(f"Missing keys: {len(missing)} | Unexpected keys: {len(unexpected)}")




Loaded scGPT from: ../models/scGPT/best_model.pt
Missing keys: 34 | Unexpected keys: 25


In [None]:
# ======================
# 4C) Batch inference -> mlm_output
# ======================
PAD_ID = vocab[pad_token] if "pad_token" in globals() and pad_token in vocab else (vocab["<pad>"] if "<pad>" in vocab else 0)
UNK_ID = vocab["<unk>"] if "<unk>" in vocab else PAD_ID

def genes_to_ids(gene_sym_row):
    ids = []
    for g in gene_sym_row:
        g = str(g)
        if g in vocab:
            ids.append(vocab[g])
        else:
            ids.append(UNK_ID)
    return np.array(ids, dtype=np.int64)

def apply_value_mask(values_batch, mlm_prob, mask_value, seed):
    """
    values_batch: np.ndarray (B, K) float32
    maskt zufaellig mlm_prob-Anteil der Positionen.
    Gibt masked_values und mask_matrix zurueck.
    """
    rng = np.random.default_rng(seed)
    masked = values_batch.copy()

    B, K = masked.shape
    mask_matrix = rng.random((B, K)) < mlm_prob

    # Optional: falls du ein spezielles 1. Token haettest,
    # koenntest du mask_matrix[:, 0] = False setzen.
    masked[mask_matrix] = mask_value
    return masked, mask_matrix

def scgpt_mlm_scores_batch(gene_syms_batch, values_batch):
    src = np.stack([genes_to_ids(row) for row in gene_syms_batch], axis=0)  # (B,K)
    vals = values_batch.astype(np.float32)

    # deterministisches masking pro batch-call
    vals_masked, mask_matrix = apply_value_mask(
        vals, mlm_prob=MLM_PROB, mask_value=MASK_VALUE, seed=SEED + int(torch.randint(0, 10_000, (1,)).item())
    )

    src_t = torch.from_numpy(src).to(DEVICE)
    vals_t = torch.from_numpy(vals_masked).to(DEVICE)

    pad_mask = torch.zeros(src_t.shape, dtype=torch.bool, device=DEVICE)

    with torch.no_grad():
        out = model(
            src=src_t,
            values=vals_t,
            src_key_padding_mask=pad_mask,
            output_hidden_states=False,
            CLS=False,
        )
        mlm = out.get("mlm_output", None)
        if mlm is None:
            raise KeyError("mlm_output nicht im scGPT output. Prüfe scGPT-Version/Config.")
        if mlm.ndim == 3 and mlm.shape[-1] == 1:
            mlm = mlm.squeeze(-1)
        mlm = mlm.detach().float().cpu().numpy()
    return mlm  # (B,K)


In [None]:
# ======================
# 4D) Für alle Zellen scGPT-Scores berechnen und Sentence bauen
# ======================
B = 32 if DEVICE == "cuda" else 8
all_scores = np.zeros((len(cell_ids), K_TOP), dtype=np.float32)

for start in tqdm(range(0, len(cell_ids), B)):
    end = min(len(cell_ids), start + B)
    scores_acc = 0
    for r in range(MASK_N_RUNS):
        scores_acc = scores_acc + scgpt_mlm_scores_batch(
            topk_gene_symbols[start:end],
            topk_values[start:end],
        )
    scores = scores_acc / MASK_N_RUNS
    all_scores[start:end] = scores

scgpt_sentences = []
for i in range(len(cell_ids)):
    scores = all_scores[i]
    idx = np.argsort(-scores)[:K_TOP]
    genes_sorted = topk_gene_symbols[i, idx]
    scgpt_sentences.append(" ".join(map(str, genes_sorted)))

df_scgpt = pd.DataFrame({"cell_id": cell_ids, "sentence": scgpt_sentences})
scgpt_sent_path = os.path.join(PROCESSED_DIR, "c2s_sentences_scgpt_mlm.txt")
with open(scgpt_sent_path, "w") as f:
    for cid, sent in zip(df_scgpt["cell_id"].values, df_scgpt["sentence"].values):
        f.write(f"{cid}\t{sent}\n")

print("Wrote:", scgpt_sent_path)
df_scgpt.head()


  0%|          | 0/931 [00:00<?, ?it/s]


TypeError: TransformerModel.forward() got an unexpected keyword argument 'output_hidden_states'

## 5) Supervised Fine-tuning Daten bauen (Prompt → Label)

Wir trainieren ein Causal LM so, dass es nach `Answer:` das Label ausgibt.
Loss wird nur auf den Answer-Teil gerechnet (Prompt-Masking).


In [None]:
PROMPT_TEMPLATE = "Cell: {sentence}\nTask: predict cell type.\nAnswer:"

def build_supervised_df(df_sent: pd.DataFrame) -> pd.DataFrame:
    df = obs.merge(df_sent, on="cell_id", how="inner")
    df["prompt"] = df["sentence"].apply(lambda s: PROMPT_TEMPLATE.format(sentence=s))
    df["answer"] = df[LABEL_COL].astype(str)
    df["text_full"] = df["prompt"] + " " + df["answer"]
    return df[["cell_id", "split", "prompt", "answer", "text_full"]]

df_train_base = build_supervised_df(df_base)
df_train_scgpt = build_supervised_df(df_scgpt)

print(df_train_base.head(2))


## 6) Tokenisierung + Collator mit Prompt-Masking


In [None]:
from transformers import AutoTokenizer

class SFTDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_len: int):
        self.df = df.reset_index(drop=True)
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        full = row["text_full"]
        prompt = row["prompt"]

        enc_full = self.tok(full, truncation=True, max_length=self.max_len, padding=False)
        enc_prompt = self.tok(prompt, truncation=True, max_length=self.max_len, padding=False)

        input_ids = enc_full["input_ids"]
        attn = enc_full["attention_mask"]

        labels = np.array(input_ids, dtype=np.int64)
        prompt_len = len(enc_prompt["input_ids"])
        labels[:prompt_len] = -100

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attn, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
            "answer": row["answer"],
        }

def make_collate_fn(tokenizer):
    def collate(batch):
        max_len = max(len(x["input_ids"]) for x in batch)
        input_ids = []
        attention_mask = []
        labels = []
        answers = []
        for x in batch:
            pad = max_len - len(x["input_ids"])
            input_ids.append(torch.cat([x["input_ids"], torch.full((pad,), tokenizer.pad_token_id, dtype=torch.long)]))
            attention_mask.append(torch.cat([x["attention_mask"], torch.zeros((pad,), dtype=torch.long)]))
            labels.append(torch.cat([x["labels"], torch.full((pad,), -100, dtype=torch.long)]))
            answers.append(x["answer"])
        return {
            "input_ids": torch.stack(input_ids),
            "attention_mask": torch.stack(attention_mask),
            "labels": torch.stack(labels),
            "answers": answers,
        }
    return collate


## 7) C2S Basismodell laden + LoRA anhängen

Falls dein Basismodell keine `q_proj/v_proj` Module hat (z. B. GPT2), musst du `target_modules` anpassen.


In [None]:
tokenizer = AutoTokenizer.from_pretrained(C2S_MODEL_NAME_OR_PATH, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def make_lora_model():
    base = AutoModelForCausalLM.from_pretrained(
        C2S_MODEL_NAME_OR_PATH,
        torch_dtype=torch.float16 if (DEVICE == "cuda") else torch.float32,
    ).to(DEVICE)

    target_modules = ["q_proj", "v_proj"]  # ggf. anpassen!

    lora_cfg = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=target_modules,
    )
    model = get_peft_model(base, lora_cfg)
    model.print_trainable_parameters()
    return model


## 8) Training-Funktion (gleiche Settings für beide Varianten)


In [None]:
def train_one(df_all: pd.DataFrame, out_dir: str):
    df_tr = df_all[df_all["split"] == "train"].copy()
    df_va = df_all[df_all["split"] == "val"].copy()

    ds_tr = SFTDataset(df_tr, tokenizer, MAX_LEN)
    ds_va = SFTDataset(df_va, tokenizer, MAX_LEN)

    model = make_lora_model()
    collate_fn = make_collate_fn(tokenizer)

    args = TrainingArguments(
        output_dir=out_dir,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        warmup_ratio=WARMUP_RATIO,
        evaluation_strategy="steps",
        eval_steps=200,
        save_steps=200,
        save_total_limit=2,
        logging_steps=50,
        fp16=(DEVICE == "cuda"),
        report_to="none",
        seed=SEED,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tr,
        eval_dataset=ds_va,
        data_collator=collate_fn,
        tokenizer=tokenizer,
    )

    trainer.train()
    trainer.save_model(out_dir)
    tokenizer.save_pretrained(out_dir)
    return trainer


## 9) Train Baseline-LoRA


In [None]:
trainer_base = train_one(df_train_base, OUT_BASELINE)


## 10) Train scGPT-LoRA


In [None]:
trainer_scgpt = train_one(df_train_scgpt, OUT_SCGPT)


## 11) Evaluation: Label generieren auf Testset

Wir generieren kurz nach `Answer:` und vergleichen mit Ground Truth.


In [None]:
from sklearn.metrics import accuracy_score, f1_score

@torch.no_grad()
def predict_labels(model_dir: str, df_all: pd.DataFrame, max_new_tokens: int = 12):
    tok = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        torch_dtype=torch.float16 if (DEVICE == "cuda") else torch.float32,
    ).to(DEVICE)
    model.eval()

    df_te = df_all[df_all["split"] == "test"].copy().reset_index(drop=True)

    preds = []
    trues = df_te["answer"].tolist()

    for i in tqdm(range(len(df_te))):
        prompt = df_te.loc[i, "prompt"]
        enc = tok(prompt, return_tensors="pt", truncation=True, max_length=MAX_LEN).to(DEVICE)
        gen = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
        )
        out = tok.decode(gen[0], skip_special_tokens=True)

        pred = out.split("Answer:", 1)[1].strip() if "Answer:" in out else out.strip()
        pred = pred.split("\n")[0].strip()
        pred = pred.split(".")[0].split(",")[0].strip()
        preds.append(pred)

    return trues, preds

def evaluate(trues, preds):
    return accuracy_score(trues, preds), f1_score(trues, preds, average="macro")


In [None]:
y_true_b, y_pred_b = predict_labels(OUT_BASELINE, df_train_base)
acc_b, f1_b = evaluate(y_true_b, y_pred_b)
print("BASELINE  acc:", acc_b, "macroF1:", f1_b)


In [None]:
y_true_s, y_pred_s = predict_labels(OUT_SCGPT, df_train_scgpt)
acc_s, f1_s = evaluate(y_true_s, y_pred_s)
print("scGPT     acc:", acc_s, "macroF1:", f1_s)


## Troubleshooting (kurz)

**OOM / zu langsam**
- `K_TOP=256`
- `MAX_LEN` runter
- `BATCH_SIZE=1`, `GRAD_ACCUM` hoch
- `LORA_R=4`

**LoRA target_modules passt nicht**
- Für GPT2-artige Modelle: oft `"c_attn"` / `"c_proj"` statt `q_proj/v_proj`.
- Einfach `print(model)` und nach Modulnamen suchen.

**Labels passen nicht**
- `LABEL_COL` manuell setzen.
