## Hate speech detection with guardrail models using class prototypes

## LLaMA-Guard

In [None]:
!wget 'https://github.com/upunaprosk/hate-speech-interpretability/raw/refs/heads/master/all_data_hate.zip'
!huggingface-cli login --token "hf_xxx" # to use llama model
!unzip "all_data_hate.zip"

In [None]:
import os, random, numpy as np, pandas as pd, torch
from typing import List, Tuple, Dict, DefaultDict
from collections import defaultdict
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModel

# Datasets used both as prototype sources (train split) and as evaluation
# targets (test split).
DATASETS = ["hatexplain", "olid", "sbic", "ihc"]

# One full run per seed; the seed drives the RNGs and the prototype sampling.
SEEDS = list(range(10))
MODEL_PATTERN = "meta-llama/Llama-Guard-3-1B"
# File-name templates; "{ds}" is filled with a dataset name.
CSV_PATTERN_TRAIN = "{ds}_train.csv"
CSV_PATTERN_TEST = "{ds}_test.csv"

# Column names expected in the CSV files.
TEXT_COL = "sentence"
LABEL_COL = "label"

BATCH_SIZE = 8
# Length every tokenized example is truncated/padded to.
MAX_LENGTH = 500
# When True (and CUDA is available) the model is cast to half precision.
USE_FP16 = False
# Upper bound on the number of training examples sampled per class.
MAX_PROTOS_PER_CLASS = 500

def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch random number generators.

    The same ``seed`` is handed, in order, to ``random``, ``numpy.random``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def normalize_labels(series: pd.Series) -> pd.Series:
    """Coerce a column of binary labels to plain Python ints (0 or 1).

    Accepted inputs per element:
      * strings holding an integer literal (surrounding whitespace ignored),
      * Python / NumPy integers,
      * Python / NumPy floats with an integral value (a label column that
        contained missing values is read by pandas as floats, e.g. ``1.0``).

    Args:
        series: Raw label column.

    Returns:
        A Series of the same length whose values are 0 or 1.

    Raises:
        ValueError: If an element cannot be parsed, or parses to something
            other than 0 or 1.
    """
    def _to_int(x):
        if isinstance(x, str):
            cleaned = x.strip()
            try:
                value = int(cleaned)
            except ValueError as err:
                raise ValueError(f"Unrecognized label: {x}") from err
        elif isinstance(x, (int, np.integer)):
            value = int(x)
        elif isinstance(x, (float, np.floating)) and float(x).is_integer():
            # NaN and infinities are not "integer" and fall through below.
            value = int(x)
        else:
            raise ValueError(f"Unsupported label value: {x}")
        # The same 0/1 constraint applies whatever the input type was.
        if value not in (0, 1):
            raise ValueError(f"Unsupported label value: {x}")
        return value

    return series.apply(_to_int)

class TextDS(torch.utils.data.Dataset):
    """Dataset that tokenizes one text per item, lazily on access.

    Each item is a dict with the tokenizer's outputs (leading batch dimension
    squeezed away) plus the integer label stored under ``"labels"``.
    """

    def __init__(self, texts, labels, tok, max_len):
        self.texts = texts
        self.labels = labels
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        # The tokenizer is asked to truncate and pad to exactly max_len.
        encoded = self.tok(
            str(self.texts[i]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        item = {}
        for key, tensor in encoded.items():
            # return_tensors="pt" yields a leading dimension of size 1.
            item[key] = tensor.squeeze(0)
        item["labels"] = torch.tensor(int(self.labels[i])).long()
        return item

def make_loader(texts, labels, tok, max_len, bs, shuffle=False):
    """Build a DataLoader over a ``TextDS`` made from ``texts``/``labels``.

    Batches have size ``bs``; memory pinning is turned on only when CUDA is
    available.
    """
    dataset = TextDS(texts, labels, tok, max_len)
    use_pinned_memory = torch.cuda.is_available()
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=bs,
        shuffle=shuffle,
        pin_memory=use_pinned_memory,
    )

@torch.no_grad()
def collect_last_token(model, loader, device) -> Tuple[np.ndarray, List[int]]:
    """
    Extract one feature vector per example: the last hidden layer taken at
    the last non-padding token.

    The position is derived from the attention mask, so it works for both
    left- and right-padded batches.

    Returns a ``(features, labels)`` pair: a 2-D array with one row per
    example (``(0, hidden_size)`` when the loader is empty) and the list of
    labels in iteration order.
    """
    model.eval()
    chunks: List[np.ndarray] = []
    targets: List[int] = []
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)  # 1 on tokens, 0 on pads
        targets.extend(batch["labels"].tolist())

        outputs = model(
            input_ids=input_ids,
            attention_mask=mask,
            output_hidden_states=True,
            return_dict=True,
        )
        final_layer = outputs.hidden_states[-1]  # (B, T, D)

        # argmax over the reversed mask gives the distance from the right
        # edge to the last position where the mask is 1.
        distance_from_end = torch.argmax(mask.flip(1), dim=1)
        last_pos = mask.size(1) - 1 - distance_from_end  # (B,)
        rows = torch.arange(final_layer.size(0), device=final_layer.device)
        picked = final_layer[rows, last_pos, :]  # (B, D)
        chunks.append(picked.detach().float().cpu().numpy())

    if chunks:
        stacked = np.concatenate(chunks, axis=0)
    else:
        stacked = np.zeros((0, model.config.hidden_size))
    return stacked, targets

def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-8) -> np.ndarray:
    """Divide ``x`` by its L2 norm along ``axis``.

    ``eps`` is added to the norm so that all-zero vectors do not cause a
    division by zero; they come back as zeros.
    """
    return x / (np.linalg.norm(x, axis=axis, keepdims=True) + eps)

def build_class_means(feats: np.ndarray, labels: List[int]) -> Dict[int, np.ndarray]:
    """Compute one unit-norm prototype per class (0 and 1).

    Rows of ``feats`` belonging to a class are L2-normalized, averaged, and
    the average is L2-normalized again. A class with no rows gets an all-zero
    float32 vector.
    """
    label_arr = np.array(labels)
    dim = feats.shape[1] if feats.ndim == 2 else 0
    prototypes = {}
    for cls in (0, 1):
        members = feats[label_arr == cls]
        if members.size == 0:
            prototypes[cls] = np.zeros((dim,), dtype=np.float32)
            continue
        centroid = l2_normalize(members, axis=1).mean(axis=0)
        # Re-normalize: the mean of unit vectors is generally shorter than 1.
        prototypes[cls] = l2_normalize(centroid[None, :], axis=1)[0]
    return prototypes

def cosine_classify(x: np.ndarray, p0: np.ndarray, p1: np.ndarray) -> np.ndarray:
    """Assign each row of ``x`` to the prototype with the higher cosine score.

    Returns an array of 0/1 predictions, one per row; on an exact tie the
    argmax picks class 0.
    """
    unit_rows = l2_normalize(x, axis=1)
    unit_protos = [proto / (np.linalg.norm(proto) + 1e-8) for proto in (p0, p1)]
    scores = np.stack([unit_rows @ proto for proto in unit_protos], axis=1)
    return scores.argmax(axis=1)

def load_csv(ds: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the train and test CSV files of dataset ``ds``.

    Rows missing the text or label column are dropped, then two columns are
    (re)written on each frame: ``"label"`` with normalized integer labels and
    ``"text"`` with the text cast to ``str``.

    Returns:
        ``(train_frame, test_frame)``.
    """
    train_frame, test_frame = (
        pd.read_csv(pattern.format(ds=ds))
        for pattern in (CSV_PATTERN_TRAIN, CSV_PATTERN_TEST)
    )
    required = [TEXT_COL, LABEL_COL]
    for frame in (train_frame, test_frame):
        frame.dropna(subset=required, inplace=True)
        frame["label"] = normalize_labels(frame[LABEL_COL])
        frame["text"] = frame[TEXT_COL].astype(str)
    return train_frame, test_frame

def fmt_mean_std(vals: List[float]) -> str:
    """Format scores as ``"mean±std"`` in percent with two decimals.

    Returns ``"n/a"`` when ``vals`` is empty.
    """
    if vals:
        mean_val, std_val = np.mean(vals), np.std(vals)
        return f"{mean_val*100:.2f}±{std_val*100:.2f}"
    return "n/a"


# Load every dataset once: DATA[name] is the (train, test) pair returned by
# load_csv.
DATA = {ds: load_csv(ds) for ds in DATASETS}  

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `dtype` is not referenced by the visible code; half precision
# is applied through `model.half()` in the loop below.
dtype  = torch.float16 if (USE_FP16 and torch.cuda.is_available()) else torch.float32

# Score accumulators indexed as Results[S][P][T] -> list of scores, one entry
# appended per seed. Nested defaultdicts create missing levels on first access.
ResultsF1: DefaultDict[str, DefaultDict[str, DefaultDict[str, List[float]]]] = \
    defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
ResultsAcc: DefaultDict[str, DefaultDict[str, DefaultDict[str, List[float]]]] = \
    defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

# Prototype-based evaluation sweep for the checkpoint named by MODEL_PATTERN.
#
# For every seed: load the model, build one unit-norm prototype per class
# from up to MAX_PROTOS_PER_CLASS sampled training examples of each dataset
# P, extract test features of each dataset T, then classify T's test set by
# cosine similarity to P's prototypes. Accuracy and macro-F1 are accumulated
# per (S, P, T) and the raw predictions are written to disk.
#
# NOTE(review): nothing loaded or computed below depends on S -- the same
# checkpoint is used on every pass, so S only labels the result dictionaries,
# the printed tables and the prediction file names.

# Directory receiving one gzip-compressed CSV per (S, seed, P, T). It is
# created once, up front, and the same name is used to build the output
# paths so that the directory that exists is the one being written to.
pred_dir = "predictions-llama-full-protos"
os.makedirs(pred_dir, exist_ok=True)

for S in DATASETS:
    print(f"\n=== {S.upper()} ===")
    for seed in SEEDS:
        set_seed(seed)
        model_name = MODEL_PATTERN
        try:
            tok = AutoTokenizer.from_pretrained(model_name, use_fast=False)
            # Padding to a fixed length needs a pad token; fall back to EOS.
            if tok.pad_token_id is None and tok.eos_token_id is not None:
                tok.pad_token = tok.eos_token
            model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
        except Exception as e:
            # Best effort: skip this seed instead of aborting the whole sweep.
            print(f"[WARN] Could not load {model_name}: {e}")
            continue

        if USE_FP16 and device.type == "cuda":
            model = model.half()
        model.to(device)
        model.eval()

        # Class prototypes for every source dataset P.
        proto_means: Dict[str, Dict[int, np.ndarray]] = {}
        for P in DATASETS:
            train_P, _ = DATA[P]
            tr0 = train_P[train_P["label"] == 0]
            tr1 = train_P[train_P["label"] == 1]
            # Seeded sampling keeps the prototype set reproducible per seed.
            p0  = tr0.sample(n=min(MAX_PROTOS_PER_CLASS, len(tr0)), random_state=seed) if len(tr0) else tr0
            p1  = tr1.sample(n=min(MAX_PROTOS_PER_CLASS, len(tr1)), random_state=seed) if len(tr1) else tr1
            protos_df = pd.concat([p0, p1], ignore_index=True)

            if len(protos_df) == 0:
                # No training rows at all: fall back to all-zero prototypes.
                D = model.config.hidden_size
                proto_means[P] = {0: np.zeros((D,), np.float32), 1: np.zeros((D,), np.float32)}
                continue

            proto_loader = make_loader(protos_df["text"].tolist(), protos_df["label"].tolist(),
                                       tok, MAX_LENGTH, BATCH_SIZE, shuffle=False)
            proto_feats, proto_labels = collect_last_token(model, proto_loader, device)
            proto_means[P] = build_class_means(proto_feats, proto_labels)

        # Extract test features once per target T
        test_cache: Dict[str, Tuple[np.ndarray, List[int]]] = {}
        for T in DATASETS:
            _, test_T = DATA[T]
            test_loader = make_loader(test_T["text"].tolist(), test_T["label"].tolist(),
                                      tok, MAX_LENGTH, BATCH_SIZE, shuffle=False)
            test_feats, test_labels = collect_last_token(model, test_loader, device)
            test_cache[T] = (test_feats, test_labels)

        # Evaluate for every P -> T
        for P in DATASETS:
            p0 = proto_means[P][0]; p1 = proto_means[P][1]
            for T in DATASETS:
                feats_T, labels_T = test_cache[T]
                if len(feats_T) == 0:
                    continue
                preds = cosine_classify(feats_T, p0, p1)
                acc = float(accuracy_score(labels_T, preds))
                f1m = float(f1_score(labels_T, preds, average="macro"))
                ResultsAcc[S][P][T].append(acc)
                ResultsF1[S][P][T].append(f1m)
                out_df = pd.DataFrame({
                    "pred": preds,
                    "true": labels_T,
                })
                out_path = f"{pred_dir}/preds_{S}_s{seed}_proto{P}_to_{T}.csv.gz"
                out_df.to_csv(out_path, index=False, compression="gzip")

        # Release the model before the next seed loads a fresh copy.
        del model
        torch.cuda.empty_cache()

    print("\nMacro-F1 (mean±std %, rows = prototype from P, cols = evaluated on T)")
    print("P→T\t" + "\t".join(DATASETS))
    for P in DATASETS:
        row = [P]
        for T in DATASETS:
            row.append(fmt_mean_std(ResultsF1[S][P][T]))
        print("\t".join(row))

    print("\nAccuracy (mean±std %, rows = prototype from P, cols = evaluated on T)")
    print("P→T\t" + "\t".join(DATASETS))
    for P in DATASETS:
        row = [P]
        for T in DATASETS:
            row.append(fmt_mean_std(ResultsAcc[S][P][T]))
        print("\t".join(row))

    print("\nLaTeX rows (F1):")
    for P in DATASETS:
        cells = [fmt_mean_std(ResultsF1[S][P][T]) for T in DATASETS]
        print(f"{S} (protos={P}) & " + " & ".join(cells) + r" \\")



=== Encoder family: fine-tuned on HATEXPLAIN ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.00G [00:00<?, ?B/s]


Macro-F1 (mean±std %, rows = prototype from P, cols = evaluated on T)
P→T	hatexplain	olid	sbic	ihc
hatexplain	67.59±0.46	51.76±2.76	41.89±3.19	59.03±0.82
olid	49.33±7.40	60.96±1.28	61.11±3.51	47.96±4.56
sbic	46.97±4.86	59.09±2.17	70.33±2.08	43.35±3.74
ihc	59.41±1.66	56.26±1.65	58.15±2.82	62.27±1.34

Accuracy (mean±std %, rows = prototype from P, cols = evaluated on T)
P→T	hatexplain	olid	sbic	ihc
hatexplain	67.95±0.48	72.77±0.45	42.98±2.72	61.68±1.49
olid	61.81±1.39	71.26±1.02	63.01±4.45	48.58±4.33
sbic	61.39±1.18	68.67±4.77	74.07±2.57	44.87±2.59
ihc	60.96±2.00	62.07±4.05	65.88±3.06	64.28±1.40

LaTeX rows (F1):
hatexplain (protos=hatexplain) & 67.59±0.46 & 51.76±2.76 & 41.89±3.19 & 59.03±0.82 \\
hatexplain (protos=olid) & 49.33±7.40 & 60.96±1.28 & 61.11±3.51 & 47.96±4.56 \\
hatexplain (protos=sbic) & 46.97±4.86 & 59.09±2.17 & 70.33±2.08 & 43.35±3.74 \\
hatexplain (protos=ihc) & 59.41±1.66 & 56.26±1.65 & 58.15±2.82 & 62.27±1.34 \\

=== Encoder family: fine-tuned on OLID ===

Macro-F1 (

## BLOOMZ-Guard

In [None]:
import os, random, numpy as np, pandas as pd, torch
from typing import List, Tuple, Dict, DefaultDict
from collections import defaultdict
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModel

# Datasets used both as prototype sources (train split) and as evaluation
# targets (test split).
DATASETS = ["hatexplain", "olid", "sbic", "ihc"]

# One full run per seed; the seed drives the RNGs and the prototype sampling.
SEEDS = [0, 1, 2, 3, 4]
MODEL_PATTERN = "cmarkea/bloomz-3b-guardrail"
# File-name templates; "{ds}" is filled with a dataset name.
CSV_PATTERN_TRAIN = "{ds}_train.csv"
CSV_PATTERN_TEST = "{ds}_test.csv"

# Column names expected in the CSV files.
TEXT_COL = "sentence"
LABEL_COL = "label"

BATCH_SIZE = 8
# Length every tokenized example is truncated/padded to.
MAX_LENGTH = 500
# When True (and CUDA is available) the model is cast to half precision.
USE_FP16 = False
# Upper bound on the number of training examples sampled per class.
MAX_PROTOS_PER_CLASS = 500

def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch random number generators.

    The same ``seed`` is handed, in order, to ``random``, ``numpy.random``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def normalize_labels(series: pd.Series) -> pd.Series:
    """Coerce a column of binary labels to plain Python ints (0 or 1).

    Accepted inputs per element:
      * strings holding an integer literal (surrounding whitespace ignored),
      * Python / NumPy integers,
      * Python / NumPy floats with an integral value (a label column that
        contained missing values is read by pandas as floats, e.g. ``1.0``).

    Args:
        series: Raw label column.

    Returns:
        A Series of the same length whose values are 0 or 1.

    Raises:
        ValueError: If an element cannot be parsed, or parses to something
            other than 0 or 1.
    """
    def _to_int(x):
        if isinstance(x, str):
            cleaned = x.strip()
            try:
                value = int(cleaned)
            except ValueError as err:
                raise ValueError(f"Unrecognized label: {x}") from err
        elif isinstance(x, (int, np.integer)):
            value = int(x)
        elif isinstance(x, (float, np.floating)) and float(x).is_integer():
            # NaN and infinities are not "integer" and fall through below.
            value = int(x)
        else:
            raise ValueError(f"Unsupported label value: {x}")
        # The same 0/1 constraint applies whatever the input type was.
        if value not in (0, 1):
            raise ValueError(f"Unsupported label value: {x}")
        return value

    return series.apply(_to_int)

class TextDS(torch.utils.data.Dataset):
    """Dataset that tokenizes one text per item, lazily on access.

    Each item is a dict with the tokenizer's outputs (leading batch dimension
    squeezed away) plus the integer label stored under ``"labels"``.
    """

    def __init__(self, texts, labels, tok, max_len):
        self.texts = texts
        self.labels = labels
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        # The tokenizer is asked to truncate and pad to exactly max_len.
        encoded = self.tok(
            str(self.texts[i]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        item = {}
        for key, tensor in encoded.items():
            # return_tensors="pt" yields a leading dimension of size 1.
            item[key] = tensor.squeeze(0)
        item["labels"] = torch.tensor(int(self.labels[i])).long()
        return item

def make_loader(texts, labels, tok, max_len, bs, shuffle=False):
    """Build a DataLoader over a ``TextDS`` made from ``texts``/``labels``.

    Batches have size ``bs``; memory pinning is turned on only when CUDA is
    available.
    """
    dataset = TextDS(texts, labels, tok, max_len)
    use_pinned_memory = torch.cuda.is_available()
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=bs,
        shuffle=shuffle,
        pin_memory=use_pinned_memory,
    )

@torch.no_grad()
def collect_last_token(model, loader, device) -> Tuple[np.ndarray, List[int]]:
    """
    Extract one feature vector per example: the last hidden layer taken at
    the last non-padding token.

    The position is derived from the attention mask, so it works for both
    left- and right-padded batches.

    Returns a ``(features, labels)`` pair: a 2-D array with one row per
    example (``(0, hidden_size)`` when the loader is empty) and the list of
    labels in iteration order.
    """
    model.eval()
    chunks: List[np.ndarray] = []
    targets: List[int] = []
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)  # 1 on tokens, 0 on pads
        targets.extend(batch["labels"].tolist())

        outputs = model(
            input_ids=input_ids,
            attention_mask=mask,
            output_hidden_states=True,
            return_dict=True,
        )
        final_layer = outputs.hidden_states[-1]  # (B, T, D)

        # argmax over the reversed mask gives the distance from the right
        # edge to the last position where the mask is 1.
        distance_from_end = torch.argmax(mask.flip(1), dim=1)
        last_pos = mask.size(1) - 1 - distance_from_end  # (B,)
        rows = torch.arange(final_layer.size(0), device=final_layer.device)
        picked = final_layer[rows, last_pos, :]  # (B, D)
        chunks.append(picked.detach().float().cpu().numpy())

    if chunks:
        stacked = np.concatenate(chunks, axis=0)
    else:
        stacked = np.zeros((0, model.config.hidden_size))
    return stacked, targets

def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-8) -> np.ndarray:
    """Divide ``x`` by its L2 norm along ``axis``.

    ``eps`` is added to the norm so that all-zero vectors do not cause a
    division by zero; they come back as zeros.
    """
    return x / (np.linalg.norm(x, axis=axis, keepdims=True) + eps)

def build_class_means(feats: np.ndarray, labels: List[int]) -> Dict[int, np.ndarray]:
    """Compute one unit-norm prototype per class (0 and 1).

    Rows of ``feats`` belonging to a class are L2-normalized, averaged, and
    the average is L2-normalized again. A class with no rows gets an all-zero
    float32 vector.
    """
    label_arr = np.array(labels)
    dim = feats.shape[1] if feats.ndim == 2 else 0
    prototypes = {}
    for cls in (0, 1):
        members = feats[label_arr == cls]
        if members.size == 0:
            prototypes[cls] = np.zeros((dim,), dtype=np.float32)
            continue
        centroid = l2_normalize(members, axis=1).mean(axis=0)
        # Re-normalize: the mean of unit vectors is generally shorter than 1.
        prototypes[cls] = l2_normalize(centroid[None, :], axis=1)[0]
    return prototypes

def cosine_classify(x: np.ndarray, p0: np.ndarray, p1: np.ndarray) -> np.ndarray:
    """Assign each row of ``x`` to the prototype with the higher cosine score.

    Returns an array of 0/1 predictions, one per row; on an exact tie the
    argmax picks class 0.
    """
    unit_rows = l2_normalize(x, axis=1)
    unit_protos = [proto / (np.linalg.norm(proto) + 1e-8) for proto in (p0, p1)]
    scores = np.stack([unit_rows @ proto for proto in unit_protos], axis=1)
    return scores.argmax(axis=1)

def load_csv(ds: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the train and test CSV files of dataset ``ds``.

    Rows missing the text or label column are dropped, then two columns are
    (re)written on each frame: ``"label"`` with normalized integer labels and
    ``"text"`` with the text cast to ``str``.

    Returns:
        ``(train_frame, test_frame)``.
    """
    train_frame, test_frame = (
        pd.read_csv(pattern.format(ds=ds))
        for pattern in (CSV_PATTERN_TRAIN, CSV_PATTERN_TEST)
    )
    required = [TEXT_COL, LABEL_COL]
    for frame in (train_frame, test_frame):
        frame.dropna(subset=required, inplace=True)
        frame["label"] = normalize_labels(frame[LABEL_COL])
        frame["text"] = frame[TEXT_COL].astype(str)
    return train_frame, test_frame

def fmt_mean_std(vals: List[float]) -> str:
    """Format scores as ``"mean±std"`` in percent with two decimals.

    Returns ``"n/a"`` when ``vals`` is empty.
    """
    if vals:
        mean_val, std_val = np.mean(vals), np.std(vals)
        return f"{mean_val*100:.2f}±{std_val*100:.2f}"
    return "n/a"

# Load every dataset once: DATA[name] is the (train, test) pair returned by
# load_csv.
DATA = {ds: load_csv(ds) for ds in DATASETS}  

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `dtype` is not referenced by the visible code; half precision
# is applied through `model.half()` in the loop below.
dtype  = torch.float16 if (USE_FP16 and torch.cuda.is_available()) else torch.float32

# Score accumulators indexed as Results[S][P][T] -> list of scores, one entry
# appended per seed. Nested defaultdicts create missing levels on first access.
ResultsF1: DefaultDict[str, DefaultDict[str, DefaultDict[str, List[float]]]] = \
    defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
ResultsAcc: DefaultDict[str, DefaultDict[str, DefaultDict[str, List[float]]]] = \
    defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

# Prototype-based evaluation sweep for the checkpoint named by MODEL_PATTERN.
#
# For every seed: load the model, build one unit-norm prototype per class
# from up to MAX_PROTOS_PER_CLASS sampled training examples of each dataset
# P, extract test features of each dataset T, then classify T's test set by
# cosine similarity to P's prototypes. Accuracy and macro-F1 are accumulated
# per (S, P, T) and the raw predictions are written to disk.
#
# NOTE(review): nothing loaded or computed below depends on S -- the same
# checkpoint is used on every pass, so S only labels the result dictionaries,
# the printed tables and the prediction file names.

# Directory receiving one gzip-compressed CSV per (S, seed, P, T); created
# once instead of on every inner-loop iteration.
pred_dir = "predictions-bloomz-full-protos"
os.makedirs(pred_dir, exist_ok=True)

for S in DATASETS:
    print(f"\n=== Encoder family: fine-tuned on {S.upper()} ===")
    for seed in SEEDS:
        set_seed(seed)
        # Single source of truth for the checkpoint: the same name is used
        # for the tokenizer, the model and the warning message.
        model_name = MODEL_PATTERN
        try:
            tok = AutoTokenizer.from_pretrained(model_name)
            # Padding to a fixed length needs a pad token; fall back to EOS.
            if tok.pad_token_id is None and tok.eos_token_id is not None:
                tok.pad_token = tok.eos_token
            model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
        except Exception as e:
            # Best effort: skip this seed instead of aborting the whole sweep.
            print(f"[WARN] Could not load {model_name}: {e}")
            continue

        if USE_FP16 and device.type == "cuda":
            model = model.half()
        model.to(device)
        model.eval()

        # Class prototypes for every source dataset P.
        proto_means: Dict[str, Dict[int, np.ndarray]] = {}
        for P in DATASETS:
            train_P, _ = DATA[P]
            tr0 = train_P[train_P["label"] == 0]
            tr1 = train_P[train_P["label"] == 1]
            # Seeded sampling keeps the prototype set reproducible per seed.
            p0  = tr0.sample(n=min(MAX_PROTOS_PER_CLASS, len(tr0)), random_state=seed) if len(tr0) else tr0
            p1  = tr1.sample(n=min(MAX_PROTOS_PER_CLASS, len(tr1)), random_state=seed) if len(tr1) else tr1
            protos_df = pd.concat([p0, p1], ignore_index=True)

            if len(protos_df) == 0:
                # No training rows at all: fall back to all-zero prototypes.
                D = model.config.hidden_size
                proto_means[P] = {0: np.zeros((D,), np.float32), 1: np.zeros((D,), np.float32)}
                continue

            proto_loader = make_loader(protos_df["text"].tolist(), protos_df["label"].tolist(),
                                       tok, MAX_LENGTH, BATCH_SIZE, shuffle=False)
            proto_feats, proto_labels = collect_last_token(model, proto_loader, device)
            proto_means[P] = build_class_means(proto_feats, proto_labels)

        # Extract test features once per target T.
        test_cache: Dict[str, Tuple[np.ndarray, List[int]]] = {}
        for T in DATASETS:
            _, test_T = DATA[T]
            test_loader = make_loader(test_T["text"].tolist(), test_T["label"].tolist(),
                                      tok, MAX_LENGTH, BATCH_SIZE, shuffle=False)
            test_feats, test_labels = collect_last_token(model, test_loader, device)
            test_cache[T] = (test_feats, test_labels)

        # Evaluate every P -> T combination.
        for P in DATASETS:
            p0 = proto_means[P][0]; p1 = proto_means[P][1]
            for T in DATASETS:
                feats_T, labels_T = test_cache[T]
                if len(feats_T) == 0:
                    continue
                preds = cosine_classify(feats_T, p0, p1)
                acc = float(accuracy_score(labels_T, preds))
                f1m = float(f1_score(labels_T, preds, average="macro"))
                ResultsAcc[S][P][T].append(acc)
                ResultsF1[S][P][T].append(f1m)
                out_df = pd.DataFrame({
                    "pred": preds,
                    "true": labels_T,
                })
                out_path = f"{pred_dir}/preds_{S}_s{seed}_proto{P}_to_{T}.csv.gz"
                out_df.to_csv(out_path, index=False, compression="gzip")

        # Release the model before the next seed loads a fresh copy.
        del model
        torch.cuda.empty_cache()

    print("\nMacro-F1 (mean±std %, rows = prototype from P, cols = evaluated on T)")
    print("P→T\t" + "\t".join(DATASETS))
    for P in DATASETS:
        row = [P]
        for T in DATASETS:
            row.append(fmt_mean_std(ResultsF1[S][P][T]))
        print("\t".join(row))

    print("\nAccuracy (mean±std %, rows = prototype from P, cols = evaluated on T)")
    print("P→T\t" + "\t".join(DATASETS))
    for P in DATASETS:
        row = [P]
        for T in DATASETS:
            row.append(fmt_mean_std(ResultsAcc[S][P][T]))
        print("\t".join(row))
    print("\nLaTeX rows (F1):")
    for P in DATASETS:
        cells = [fmt_mean_std(ResultsF1[S][P][T]) for T in DATASETS]
        print(f"{S} (protos={P}) & " + " & ".join(cells) + r" \\")



=== Encoder family: fine-tuned on HATEXPLAIN ===

Macro-F1 (mean±std %, rows = prototype from P, cols = evaluated on T)
P→T	hatexplain	olid	sbic	ihc
hatexplain	61.43±1.01	69.55±0.60	48.32±0.49	48.44±0.17
olid	62.02±0.44	73.50±0.39	58.01±0.27	55.22±0.34
sbic	63.32±0.27	71.32±1.06	54.45±1.12	53.02±1.42
ihc	61.32±0.98	70.24±2.33	66.00±0.37	60.92±0.54

Accuracy (mean±std %, rows = prototype from P, cols = evaluated on T)
P→T	hatexplain	olid	sbic	ihc
hatexplain	61.75±1.03	79.56±0.51	48.33±0.50	66.15±0.22
olid	66.76±0.21	77.49±0.24	59.59±0.35	61.60±0.56
sbic	65.28±0.43	78.60±0.42	55.01±1.28	65.62±0.19
ihc	66.29±0.54	71.81±2.50	69.28±0.51	63.36±0.55

LaTeX rows (F1):
hatexplain (protos=hatexplain) & 61.43±1.01 & 69.55±0.60 & 48.32±0.49 & 48.44±0.17 \\
hatexplain (protos=olid) & 62.02±0.44 & 73.50±0.39 & 58.01±0.27 & 55.22±0.34 \\
hatexplain (protos=sbic) & 63.32±0.27 & 71.32±1.06 & 54.45±1.12 & 53.02±1.42 \\
hatexplain (protos=ihc) & 61.32±0.98 & 70.24±2.33 & 66.00±0.37 & 60.92±0.54 \\

=== 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-276367315.py", line 193, in <cell line: 0>
    os.makedirs("predictions-bloomz-full-protos", exist_ok=True)
  File "<frozen os>", line 225, in makedirs
OSError: [Errno 107] Transport endpoint is not connected: 'predictions-bloomz-full-protos'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
          ^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'OSError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py

In [None]:
# Re-run of the sweep restricted to IHC: with a single entry in DATASETS,
# S, the prototype source P and the evaluation target T are all "ihc".
# This cell reuses names defined by the earlier cells (MODEL_PATTERN, DATA,
# device, ResultsF1/ResultsAcc and the helper functions) and overwrites the
# module-level DATASETS and SEEDS.
DATASETS = ["ihc"] 
SEEDS    = list(range(5))

# Directory receiving one gzip-compressed CSV per (S, seed, P, T); created
# once instead of on every inner-loop iteration.
pred_dir = "predictions-bloomz-full-protos"
os.makedirs(pred_dir, exist_ok=True)

for S in DATASETS:
    print(f"\n=== Encoder family: fine-tuned on {S.upper()} ===")
    for seed in SEEDS:
        set_seed(seed)
        # Single source of truth for the checkpoint: the same name is used
        # for the tokenizer, the model and the warning message.
        # NOTE(review): relies on MODEL_PATTERN still holding the BLOOMZ
        # checkpoint name set by the preceding setup cell.
        model_name = MODEL_PATTERN
        try:
            tok = AutoTokenizer.from_pretrained(model_name)
            # Padding to a fixed length needs a pad token; fall back to EOS.
            if tok.pad_token_id is None and tok.eos_token_id is not None:
                tok.pad_token = tok.eos_token
            model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
        except Exception as e:
            # Best effort: skip this seed instead of aborting the whole sweep.
            print(f"[WARN] Could not load {model_name}: {e}")
            continue

        if USE_FP16 and device.type == "cuda":
            model = model.half()
        model.to(device)
        model.eval()

        # Class prototypes for every source dataset P.
        proto_means: Dict[str, Dict[int, np.ndarray]] = {}
        for P in DATASETS:
            train_P, _ = DATA[P]
            tr0 = train_P[train_P["label"] == 0]
            tr1 = train_P[train_P["label"] == 1]
            # Seeded sampling keeps the prototype set reproducible per seed.
            p0  = tr0.sample(n=min(MAX_PROTOS_PER_CLASS, len(tr0)), random_state=seed) if len(tr0) else tr0
            p1  = tr1.sample(n=min(MAX_PROTOS_PER_CLASS, len(tr1)), random_state=seed) if len(tr1) else tr1
            protos_df = pd.concat([p0, p1], ignore_index=True)

            if len(protos_df) == 0:
                # No training rows at all: fall back to all-zero prototypes.
                D = model.config.hidden_size
                proto_means[P] = {0: np.zeros((D,), np.float32), 1: np.zeros((D,), np.float32)}
                continue

            proto_loader = make_loader(protos_df["text"].tolist(), protos_df["label"].tolist(),
                                       tok, MAX_LENGTH, BATCH_SIZE, shuffle=False)
            proto_feats, proto_labels = collect_last_token(model, proto_loader, device)
            proto_means[P] = build_class_means(proto_feats, proto_labels)

        # Extract test features once per target T.
        test_cache: Dict[str, Tuple[np.ndarray, List[int]]] = {}
        for T in DATASETS:
            _, test_T = DATA[T]
            test_loader = make_loader(test_T["text"].tolist(), test_T["label"].tolist(),
                                      tok, MAX_LENGTH, BATCH_SIZE, shuffle=False)
            test_feats, test_labels = collect_last_token(model, test_loader, device)
            test_cache[T] = (test_feats, test_labels)

        # Evaluate every P -> T combination.
        for P in DATASETS:
            p0 = proto_means[P][0]; p1 = proto_means[P][1]
            for T in DATASETS:
                feats_T, labels_T = test_cache[T]
                if len(feats_T) == 0:
                    continue
                preds = cosine_classify(feats_T, p0, p1)
                acc = float(accuracy_score(labels_T, preds))
                f1m = float(f1_score(labels_T, preds, average="macro"))
                ResultsAcc[S][P][T].append(acc)
                ResultsF1[S][P][T].append(f1m)
                out_df = pd.DataFrame({
                    "pred": preds,
                    "true": labels_T,
                })
                out_path = f"{pred_dir}/preds_{S}_s{seed}_proto{P}_to_{T}.csv.gz"
                out_df.to_csv(out_path, index=False, compression="gzip")

        # Release the model before the next seed loads a fresh copy.
        del model
        torch.cuda.empty_cache()

    print("\nMacro-F1 (mean±std %, rows = prototype from P, cols = evaluated on T)")
    print("P→T\t" + "\t".join(DATASETS))
    for P in DATASETS:
        row = [P]
        for T in DATASETS:
            row.append(fmt_mean_std(ResultsF1[S][P][T]))
        print("\t".join(row))

    print("\nAccuracy (mean±std %, rows = prototype from P, cols = evaluated on T)")
    print("P→T\t" + "\t".join(DATASETS))
    for P in DATASETS:
        row = [P]
        for T in DATASETS:
            row.append(fmt_mean_std(ResultsAcc[S][P][T]))
        print("\t".join(row))
    print("\nLaTeX rows (F1):")
    for P in DATASETS:
        cells = [fmt_mean_std(ResultsF1[S][P][T]) for T in DATASETS]
        print(f"{S} (protos={P}) & " + " & ".join(cells) + r" \\")



=== Encoder family: fine-tuned on IHC ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/6.01G [00:00<?, ?B/s]


Macro-F1 (mean±std %, rows = prototype from P, cols = evaluated on T)
P→T	ihc
ihc	60.92±0.54

Accuracy (mean±std %, rows = prototype from P, cols = evaluated on T)
P→T	ihc
ihc	63.36±0.55

LaTeX rows (F1):
ihc (protos=ihc) & 60.92±0.54 \\
