In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from statsmodels.stats.contingency_tables import mcnemar

LABELS_PATH = "/content/drive/Shareddrives/cs685/preds/labels_mixed_test7.npy"

PRED_A_PATH = "/content/drive/Shareddrives/cs685/preds/preds_sft4_domain_tokens_mixed_test4.npy"
PRED_B_PATH = "/content/drive/Shareddrives/cs685/preds/preds_sft7_lora_fixed_merged_mixed_test7.npy"

# Gold labels first, then the prediction vectors of system A and system B.
y, pa, pb = (np.load(path) for path in (LABELS_PATH, PRED_A_PATH, PRED_B_PATH))

# The paired tests below compare the three arrays position by position,
# so they must all have the same number of entries.
assert len(pa) == len(y) and len(pb) == len(y)

# ---- Report metrics for both ----
def report(name, pred):
    """Score `pred` against the notebook-level label array `y`.

    Args:
        name: Label for the system being scored. Accepted for call-site
            readability only; it is not used in the computation.
        pred: Predicted labels, aligned with `y`.

    Returns:
        dict with keys "acc" (accuracy), "macro" (macro-averaged F1) and
        "weighted" (weighted-averaged F1), in that order.
    """
    scores = {"acc": accuracy_score(y, pred)}
    for averaging in ("macro", "weighted"):
        scores[averaging] = f1_score(y, pred, average=averaging)
    return scores

# Score both systems, then show the raw A-minus-B gaps.
A, B = report("A", pa), report("B", pb)
print("A:", A)
print("B:", B)
acc_gap = A["acc"] - B["acc"]
macro_gap = A["macro"] - B["macro"]
print("ΔAcc   :", acc_gap)
print("ΔMacro :", macro_gap)

# ---- McNemar (paired) for Accuracy ----
# Per-example correctness masks for the two systems.
a_correct = (pa == y)
b_correct = (pb == y)

# Full 2x2 paired contingency table (rows: A right/wrong, cols: B right/wrong).
# The concordant cells were previously hard-coded to 0; they are filled in
# here so `table` is the real contingency table. Only the discordant counts
# b and c are reported below.
both_right = np.sum(a_correct & b_correct)
b = np.sum(a_correct & ~b_correct)  # A right, B wrong
c = np.sum(~a_correct & b_correct)  # A wrong, B right
both_wrong = np.sum(~a_correct & ~b_correct)
table = [[both_right, b],
         [c, both_wrong]]
res = mcnemar(table, exact=True)
print(f"McNemar exact: b={b}, c={c}, p={res.pvalue:.4g}")

# ---- Paired bootstrap for ΔAccuracy and ΔMacro-F1 ----
def paired_bootstrap(y_true, pred_a, pred_b, B=10000, seed=42):
    """Percentile bootstrap CIs for the A-minus-B gap in accuracy and macro-F1.

    Each replicate draws `n` example indices with replacement and applies the
    same indices to the labels and to both prediction vectors, so the
    comparison stays paired.

    Args:
        y_true: Gold labels (any 1-D array-like).
        pred_a: Predictions of system A, aligned with `y_true`.
        pred_b: Predictions of system B, aligned with `y_true`.
        B: Number of bootstrap replicates.
        seed: Seed for `np.random.default_rng`.

    Returns:
        (ci_acc, ci_f1): two (lower, upper) tuples holding the 2.5% and 97.5%
        quantiles of the replicate differences.

    Raises:
        ValueError: If the inputs are empty or their lengths differ.
    """
    # Coerce so that plain Python lists work with the fancy indexing below.
    y_true = np.asarray(y_true)
    pred_a = np.asarray(pred_a)
    pred_b = np.asarray(pred_b)

    n = len(y_true)
    if len(pred_a) != n or len(pred_b) != n:
        raise ValueError(
            f"length mismatch: y_true={n}, pred_a={len(pred_a)}, pred_b={len(pred_b)}"
        )
    if n == 0:
        raise ValueError("cannot bootstrap an empty sample")

    rng = np.random.default_rng(seed)
    diffs_acc = np.empty(B)
    diffs_f1  = np.empty(B)

    for i in range(B):
        idx = rng.integers(0, n, size=n)
        yt = y_true[idx]
        pa_ = pred_a[idx]
        pb_ = pred_b[idx]
        diffs_acc[i] = accuracy_score(yt, pa_) - accuracy_score(yt, pb_)
        # NOTE(review): no `labels=` is passed, so the label set used for the
        # macro average is whatever sklearn infers from each resample -- confirm
        # this is intended if some classes are rare.
        diffs_f1[i]  = f1_score(yt, pa_, average="macro") - f1_score(yt, pb_, average="macro")

    ci_acc = (np.quantile(diffs_acc, 0.025), np.quantile(diffs_acc, 0.975))
    ci_f1  = (np.quantile(diffs_f1,  0.025), np.quantile(diffs_f1,  0.975))
    return ci_acc, ci_f1

# Run the paired bootstrap on the loaded labels/predictions and show both intervals.
ci_acc, ci_f1 = paired_bootstrap(y_true=y, pred_a=pa, pred_b=pb, B=10000)
print("Bootstrap 95% CI for ΔAcc  :", ci_acc)
print("Bootstrap 95% CI for ΔMacro:", ci_f1)


A: {'acc': 0.64499659632403, 'macro': 0.6372314136208436, 'weighted': 0.645966614283667}
B: {'acc': 0.6249149081007488, 'macro': 0.6177390837235109, 'weighted': 0.6269296773032487}
ΔAcc   : 0.020081688223281158
ΔMacro : 0.019492329897332628
McNemar exact: b=324, c=265, p=0.01678
Bootstrap 95% CI for ΔAcc  : (np.float64(0.0037440435670524908), np.float64(0.036427842069434925))
Bootstrap 95% CI for ΔMacro: (np.float64(0.003254709501336876), np.float64(0.03597877149775931))


In [1]:
import math

def wilson_ci(k, n, z=1.96):
    """Wilson score confidence interval for a binomial proportion.

    Args:
        k: Number of successes (e.g. correct predictions).
        n: Number of trials.
        z: Normal critical value; the default 1.96 gives a ~95% interval.

    Returns:
        (lower, upper) bounds, each clamped to [0, 1]. When n == 0 the
        sentinel (0.0, 0.0) is returned.

    Raises:
        ValueError: If n is negative or k is outside [0, n].
    """
    if n == 0:
        # No trials: keep the historical sentinel instead of raising.
        return (0.0, 0.0)
    if n < 0 or k < 0 or k > n:
        raise ValueError(f"need 0 <= k <= n, got k={k}, n={n}")
    p = k / n
    denom = 1 + (z**2)/n
    center = (p + (z**2)/(2*n)) / denom
    half = (z * math.sqrt((p*(1-p) + (z**2)/(4*n)) / n)) / denom
    # At k == 0 or k == n the exact bound sits on 0 or 1; clamp so that
    # floating-point rounding cannot push it outside the valid range.
    return max(0.0, center - half), min(1.0, center + half)

# Example: if N=5000 and acc=0.5852
# The success count k is rebuilt from the reported accuracy by rounding acc * n.
n = 5000
k = round(0.5852 * n)
print(f"zero shot for sft4 accuacy confidence interval {wilson_ci(k, n)}")

zero shot for sft4 accuacy confidence interval (0.5714830755873965, 0.5987861031971597)


In [2]:
import math

def wilson_ci(k, n, z=1.96):
    """Wilson score confidence interval for a binomial proportion.

    Args:
        k: Number of successes (e.g. correct predictions).
        n: Number of trials.
        z: Normal critical value; the default 1.96 gives a ~95% interval.

    Returns:
        (lower, upper) bounds, each clamped to [0, 1]. When n == 0 the
        sentinel (0.0, 0.0) is returned.

    Raises:
        ValueError: If n is negative or k is outside [0, n].
    """
    if n == 0:
        # No trials: keep the historical sentinel instead of raising.
        return (0.0, 0.0)
    if n < 0 or k < 0 or k > n:
        raise ValueError(f"need 0 <= k <= n, got k={k}, n={n}")
    p = k / n
    denom = 1 + (z**2)/n
    center = (p + (z**2)/(2*n)) / denom
    half = (z * math.sqrt((p*(1-p) + (z**2)/(4*n)) / n)) / denom
    # At k == 0 or k == n the exact bound sits on 0 or 1; clamp so that
    # floating-point rounding cannot push it outside the valid range.
    return max(0.0, center - half), min(1.0, center + half)

# Example: if N=5000 and acc=0.5486
# The success count k is rebuilt from the reported accuracy by rounding acc * n.
n = 5000
k = round(0.5486 * n)
print(f"zero shot for sft7 accuacy confidence interval {wilson_ci(k, n)}")

zero shot for sft7 accuacy confidence interval (0.5347742664084661, 0.5623511102221609)


In [1]:
# Mount Google Drive at /content/drive; the model, dataset and prediction
# paths used in this notebook all live under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
!pip -q install transformers datasets accelerate scikit-learn statsmodels

import json
import numpy as np
from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

MODEL_DIR = "/content/drive/MyDrive/models/sft4_domain_tokens"
TEST_PATH = "/content/drive/Shareddrives/cs685/final_data_SFT/label_mixed_3_test.jsonl"

# -------- Load JSONL test set --------
# One JSON object per line; each must provide "text" and "label".
texts, labels = [], []
with open(TEST_PATH, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # would otherwise make json.loads raise.
        if not line.strip():
            continue
        ex = json.loads(line)
        texts.append(ex["text"])
        labels.append(int(ex["label"]))
labels = np.array(labels)

# -------- Load model/tokenizer --------
# Prefer the GPU when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tok = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model = model.to(device)
model.eval()

# -------- Batched inference --------
def predict(texts, batch_size=32, max_len=256):
    """Return the argmax class index for every entry of `texts`.

    Relies on the notebook-level `tok`, `model` and `device`.

    Args:
        texts: Sequence of input strings.
        batch_size: Number of texts tokenized and scored per forward pass.
        max_len: `max_length` handed to the tokenizer (truncation is enabled).

    Returns:
        np.ndarray built from the per-example argmax over the model logits.
    """
    collected = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tok(
            chunk,
            truncation=True,
            padding=True,
            max_length=max_len,
            return_tensors="pt",
        )
        encoded = encoded.to(device)
        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            outputs = model(**encoded)
        batch_ids = outputs.logits.argmax(dim=-1)
        collected.extend(batch_ids.cpu().numpy().tolist())
    return np.array(collected)

preds = predict(texts, batch_size=32)

# -------- Metrics --------
acc = accuracy_score(labels, preds)
macro, weighted = (
    f1_score(labels, preds, average=averaging)
    for averaging in ("macro", "weighted")
)
print("N =", len(labels))
print("Accuracy   :", acc)
print("Macro F1   :", macro)
print("Weighted F1:", weighted)

# -------- Save predictions (paired testing later) --------
OUT_DIR = "/content/drive/Shareddrives/cs685/preds"
out_dir = Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)

# Predictions are saved under a name derived from the model directory;
# the labels are saved next to them.
out_path = f"{OUT_DIR}/preds_{Path(MODEL_DIR).name}_mixed_test4.npy"
np.save(out_path, preds)
np.save(f"{OUT_DIR}/labels_mixed_test4.npy", labels)
print("Saved:", out_path)


N = 2938
Accuracy   : 0.64499659632403
Macro F1   : 0.6372314136208436
Weighted F1: 0.645966614283667
Saved: /content/drive/Shareddrives/cs685/preds/preds_sft4_domain_tokens_mixed_test4.npy


In [5]:
import json
import numpy as np
from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

MODEL_DIR = "/content/drive/MyDrive/models/sft7_lora_fixed_merged"
TEST_PATH = "/content/drive/Shareddrives/cs685/final_data_SFT/label_mixed_3_test.jsonl"

# -------- Load JSONL test set --------
# One JSON object per line; each must provide "text" and "label".
texts, labels = [], []
with open(TEST_PATH, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # would otherwise make json.loads raise.
        if not line.strip():
            continue
        ex = json.loads(line)
        texts.append(ex["text"])
        labels.append(int(ex["label"]))
labels = np.array(labels)

# -------- Load model/tokenizer --------
# Prefer the GPU when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tok = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model = model.to(device)
model.eval()

# -------- Batched inference --------
def predict(texts, batch_size=32, max_len=256):
    """Return the argmax class index for every entry of `texts`.

    Relies on the notebook-level `tok`, `model` and `device`.

    Args:
        texts: Sequence of input strings.
        batch_size: Number of texts tokenized and scored per forward pass.
        max_len: `max_length` handed to the tokenizer (truncation is enabled).

    Returns:
        np.ndarray built from the per-example argmax over the model logits.
    """
    collected = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tok(
            chunk,
            truncation=True,
            padding=True,
            max_length=max_len,
            return_tensors="pt",
        )
        encoded = encoded.to(device)
        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            outputs = model(**encoded)
        batch_ids = outputs.logits.argmax(dim=-1)
        collected.extend(batch_ids.cpu().numpy().tolist())
    return np.array(collected)

preds = predict(texts, batch_size=32)

# -------- Metrics --------
acc = accuracy_score(labels, preds)
macro, weighted = (
    f1_score(labels, preds, average=averaging)
    for averaging in ("macro", "weighted")
)
print("N =", len(labels))
print("Accuracy   :", acc)
print("Macro F1   :", macro)
print("Weighted F1:", weighted)

# -------- Save predictions (paired testing later) --------
OUT_DIR = "/content/drive/Shareddrives/cs685/preds"
out_dir = Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)

# Predictions are saved under a name derived from the model directory;
# the labels are saved next to them.
out_path = f"{OUT_DIR}/preds_{Path(MODEL_DIR).name}_mixed_test7.npy"
np.save(out_path, preds)
np.save(f"{OUT_DIR}/labels_mixed_test7.npy", labels)
print("Saved:", out_path)


N = 2938
Accuracy   : 0.6249149081007488
Macro F1   : 0.6177390837235109
Weighted F1: 0.6269296773032487
Saved: /content/drive/Shareddrives/cs685/preds/preds_sft7_lora_fixed_merged_mixed_test7.npy
