# Cross-Validated Multi-Label IPV Prediction (Qwen)
End-to-end notebook for sampling test sets, running Qwen multilabel predictions, computing metrics, and persisting cross-validated runs to JSON.

In [None]:
from __future__ import annotations

import json
import random
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Sequence

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model identifier handed to the tokenizer and model loaders.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
# CSV read by the data-loading cell; the path is relative to the working directory.
DATA_PATH = Path("../Dataset/reddit_data.csv")
# JSON file that stores the run records and their aggregate.
RESULTS_PATH = Path("results_crossval.json")
# Default key into `multilabel_prompts` (which template the helpers use).
PROMPT_KEY = "zeroshot"
# Default number of sampled evaluation runs per invocation.
N_RUNS = 10
# Default cap on the number of tokens generated per sentence.
MAX_NEW_TOKENS = 128

# Inference only: switch autograd off globally for the rest of the session.
torch.set_grad_enabled(False)


In [None]:
# 1) Load data and derive label arrays

# Columns this cell reads: the sentence text plus the four label flags.
REQUIRED_COLUMNS = ("items", "Tag", "Physical Abuse", "Emotional Abuse", "Sexual Abuse")

df = pd.read_csv(DATA_PATH)

# Validate every required column up front so a malformed CSV fails with one
# clear message instead of a bare KeyError a few lines further down.
missing_columns = [name for name in REQUIRED_COLUMNS if name not in df.columns]
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {missing_columns}")

# NOTE(review): astype(bool) maps NaN and any non-empty string to True --
# confirm the label columns hold clean 0/1 or boolean values.
physical = df["Physical Abuse"].astype(bool)
emotional = df["Emotional Abuse"].astype(bool)
sexual = df["Sexual Abuse"].astype(bool)
tag = df["Tag"].astype(bool)
# A row is "not IPV" only when the overall Tag and all three subtype flags are off.
notipv = (~tag) & (~physical) & (~emotional) & (~sexual)

# 0/1 integer arrays per label, in dataset row order.
label_arrays = {
    "physical": physical.astype(int).to_numpy(),
    "emotional": emotional.astype(int).to_numpy(),
    "sexual": sexual.astype(int).to_numpy(),
    "not_ipv": notipv.astype(int).to_numpy(),
}

# k is the number of sexual-abuse positives in the dataset.
k = int(sexual.sum())
print(f"Loaded {len(df)} rows | sexual positives (k) = {k}")


In [None]:
# 2) Multilabel prompts (copied from w4_qwen.ipynb)

# Zero-shot template. `{text}` and `{sample_id}` are filled in by str.format;
# the doubled braces render as the literal braces of the JSON skeleton.
# The "Decide independently ..." instruction is stated exactly once. An earlier
# revision repeated that sentence back-to-back, so generations may not be
# directly comparable with runs produced from that revision.
prompt_multilabel_zeroshot = """
You are identifying which forms of Intimate Partner Violence (IPV) appear in a sentence.

Decide independently for emotional, physical, and sexual abuse. If it is a particular type of IPV, set emotional, physical, or sexual to 1, otherwise set it to 0. Multiple IPV types can be true or none at all.
Return ONLY one JSON object enclosed between <json> and </json> with the keys
'id', 'emotional', 'physical', and 'sexual'.

Sentence: "{text}"
Sample ID: "{sample_id}"
<json>
{{
  "id": "{sample_id}",
  "emotional": 0 or 1,
  "physical": 0 or 1,
  "sexual": 0 or 1
}}
</json>
""".strip()

# Few-shot template: four worked examples (emotional, physical, sexual, and an
# all-zero case) precede the target sentence. Placeholders and the JSON skeleton
# follow the same str.format conventions as the other templates.
prompt_multilabel_fewshot = """
You are labeling sentences for types of Intimate Partner Violence (IPV).
Use the examples to stay calibrated. For each category (emotional, physical, sexual), output 1 if it is clearly present, else 0.

Examples:
1. "He insults me daily and forbids me from leaving the house." -> emotional: 1, physical: 0, sexual: 0
2. "She slapped me when I disagreed with her." -> emotional: 0, physical: 1, sexual: 0
3. "They pressured me into intimacy when I said no." -> emotional: 0, physical: 0, sexual: 1
4. "We spent the evening cooking together peacefully." -> emotional: 0, physical: 0, sexual: 0

Sentence: "{text}"
Sample ID: "{sample_id}"
<json>
{{
  "id": "{sample_id}",
  "emotional": 0 or 1,
  "physical": 0 or 1,
  "sexual": 0 or 1
}}
</json>
""".strip()

# Chain-of-thought variant: asks the model to reason silently and then emit
# only the JSON block.
prompt_multilabel_cot = """
Reason silently about whether emotional, physical, or sexual IPV occurs in the sentence.
Use relationship context, threats, coercion, and bodily harm cues.
After your hidden reasoning, output ONLY the JSON block specified below.

Sentence: "{text}"
Sample ID: "{sample_id}"
<json>
{{
  "id": "{sample_id}",
  "emotional": 0 or 1,
  "physical": 0 or 1,
  "sexual": 0 or 1
}}
</json>
""".strip()

# Role-prompt variant: analyst persona that is told to favor precision.
prompt_multilabel_meta = """
Act as a cautious social-behavioral analyst.
Label a subtype as 1 only when the text clearly shows that form of IPV; otherwise return 0.
Favor precision to avoid false positives.

Sentence: "{text}"
Sample ID: "{sample_id}"
<json>
{{
  "id": "{sample_id}",
  "emotional": 0 or 1,
  "physical": 0 or 1,
  "sexual": 0 or 1
}}
</json>
""".strip()

# Self-consistency variant: the "voting" is requested inside a single prompt.
# NOTE(review): qwen_predict as written issues one greedy generation per
# sentence, so no actual multi-sample voting happens in code -- confirm intent.
prompt_multilabel_selfconsistency = """
Independently evaluate the sentence multiple times to reduce uncertainty.
After internal self-consistency voting, output the majority decision for each subtype in the JSON schema below.
Do not reveal the intermediate thoughts.

Sentence: "{text}"
Sample ID: "{sample_id}"
<json>
{{
  "id": "{sample_id}",
  "emotional": 0 or 1,
  "physical": 0 or 1,
  "sexual": 0 or 1
}}
</json>
""".strip()

# Registry of templates, keyed by the names accepted as `prompt_key`.
multilabel_prompts = {
    "zeroshot": prompt_multilabel_zeroshot,
    "fewshot": prompt_multilabel_fewshot,
    "cot": prompt_multilabel_cot,
    "meta": prompt_multilabel_meta,
    "selfconsistency": prompt_multilabel_selfconsistency,
}


In [None]:
# 3) Load Qwen model

# Load the tokenizer and causal-LM weights identified by MODEL_NAME.
print(f"Loading {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",  # leave device placement to the loader
    torch_dtype=torch.bfloat16,  # request bfloat16 weights
)
# Put the module in evaluation mode before generating.
model.eval()
print("Model ready.")


In [None]:
# 4) Prediction helpers using Qwen

def format_prompt(text: str, sample_id: int, prompt_key: str = PROMPT_KEY) -> str:
    """Render the template registered under *prompt_key* for one sentence.

    Args:
        text: Sentence substituted for the ``{text}`` placeholder.
        sample_id: Identifier substituted for every ``{sample_id}`` placeholder.
        prompt_key: Name of the template in ``multilabel_prompts``. A key that
            is not registered falls back to the zero-shot template instead of
            raising.

    Returns:
        The fully formatted prompt string.
    """
    if prompt_key in multilabel_prompts:
        chosen = multilabel_prompts[prompt_key]
    else:
        chosen = prompt_multilabel_zeroshot
    return chosen.format(sample_id=sample_id, text=text)

def parse_multilabel_response(raw_response: str, sample_id: int) -> Dict[str, int]:
    """Extract the sample id and the three subtype labels from a model reply.

    The reply is expected to contain a JSON object, ideally wrapped in
    ``<json>...</json>``. Parsing is best-effort: a reply that cannot be decoded
    (including one that merely echoes the ``0 or 1`` template) produces all-zero
    labels and the caller's ``sample_id`` rather than an exception, so a single
    malformed generation cannot abort a whole evaluation run.

    Args:
        raw_response: Decoded text generated by the model.
        sample_id: Id to report when the reply carries no usable ``id``.

    Returns:
        A dict with integer values under ``id``, ``emotional``, ``physical``
        and ``sexual``; each label is 0 or 1.
    """
    match = re.search(r"<json[^>]*>(.*?)</json>", raw_response, re.DOTALL | re.IGNORECASE)
    block = match.group(1).strip() if match else raw_response

    payload: object = None
    try:
        payload = json.loads(block)
    except (ValueError, RecursionError):
        # Second chance: try the outermost {...} span in case the object is
        # surrounded by extra prose. This attempt can fail as well, and must
        # not propagate.
        fallback = re.search(r"\{.*\}", block, re.DOTALL)
        if fallback:
            try:
                payload = json.loads(fallback.group())
            except (ValueError, RecursionError):
                payload = None

    # Valid JSON that is not an object (list, number, ...) carries no labels.
    if not isinstance(payload, dict):
        payload = {}

    def normalize_label(value: object) -> int:
        """Map a loosely-typed label value onto 0/1."""
        if isinstance(value, bool):
            return int(value)
        try:
            return 1 if int(value) == 1 else 0
        except (TypeError, ValueError, OverflowError):
            if isinstance(value, str) and value.strip().lower() in {"true", "yes", "y", "ipv"}:
                return 1
        return 0

    def normalize_id(value: object) -> int:
        """Return the echoed id as an int, or the caller's id when unusable."""
        if isinstance(value, (int, str)):
            try:
                return int(value)
            except ValueError:
                # e.g. the model answered with a non-numeric id string.
                return sample_id
        return sample_id

    return {
        "id": normalize_id(payload.get("id", sample_id)),
        "emotional": normalize_label(payload.get("emotional", 0)),
        "physical": normalize_label(payload.get("physical", 0)),
        "sexual": normalize_label(payload.get("sexual", 0)),
    }

def qwen_predict(
    items: Sequence[str],
    sample_ids: Sequence[int] | None = None,
    prompt_key: str = PROMPT_KEY,
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> List[Dict[str, int]]:
    """Label each sentence with one greedy generation from the loaded model.

    Args:
        items: Sentences to classify.
        sample_ids: Ids paired positionally with ``items``; defaults to
            ``0..len(items)-1``.
        prompt_key: Template name forwarded to ``format_prompt``.
        max_new_tokens: Generation budget per sentence.

    Returns:
        One parsed prediction dict per sentence, in input order, each extended
        with the decoded model text under ``raw_response``.

    Raises:
        ValueError: If ``sample_ids`` and ``items`` differ in length.
    """
    ids = list(range(len(items))) if sample_ids is None else sample_ids
    if len(ids) != len(items):
        raise ValueError("sample_ids length must match items length")

    # NOTE(review): the prompt is tokenized as plain text; the tokenizer's chat
    # template is not applied -- confirm that is intended for an Instruct model.
    collected: List[Dict[str, int]] = []
    for sentence, raw_id in zip(items, ids):
        rendered = format_prompt(sentence, sample_id=int(raw_id), prompt_key=prompt_key)
        encoded = tokenizer(rendered, return_tensors="pt").to(model.device)

        with torch.no_grad():
            sequences = model.generate(
                **encoded,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        # Drop the echoed prompt tokens so only the continuation is decoded.
        prompt_length = encoded["input_ids"].shape[-1]
        reply = tokenizer.decode(sequences[0][prompt_length:], skip_special_tokens=True)

        record = parse_multilabel_response(reply, sample_id=int(raw_id))
        record["raw_response"] = reply
        collected.append(record)

    return collected


In [None]:
# 5) Sampling, evaluation, and aggregation utilities

def compute_confusion_stats(y_true: Sequence[int], y_pred: Sequence[int]) -> Dict[str, float]:
    """Return confusion counts plus precision, recall and accuracy for 0/1 labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same shape as ``y_true``.

    Returns:
        Dict with integer ``TP``/``FP``/``TN``/``FN`` and ``Precision``,
        ``Recall``, ``Accuracy`` rounded to four decimals. Any ratio whose
        denominator is zero is reported as 0.0.

    Raises:
        ValueError: If the two inputs have different shapes.
    """
    truth = np.array(y_true, dtype=int)
    guess = np.array(y_pred, dtype=int)
    if truth.shape != guess.shape:
        raise ValueError("Mismatched shapes for ground truth and predictions.")

    def cell(true_value: int, pred_value: int) -> int:
        # Number of positions holding this (truth, prediction) pair.
        return int(((truth == true_value) & (guess == pred_value)).sum())

    def ratio(numerator: int, denominator: int) -> float:
        return numerator / denominator if denominator else 0.0

    tp = cell(1, 1)
    fp = cell(0, 1)
    tn = cell(0, 0)
    fn = cell(1, 0)
    total = len(truth)

    stats: Dict[str, float] = {"TP": tp, "FP": fp, "TN": tn, "FN": fn}
    stats["Precision"] = round(ratio(tp, tp + fp), 4)
    stats["Recall"] = round(ratio(tp, tp + fn), 4)
    stats["Accuracy"] = round(ratio(tp + tn, total), 4)
    return stats

def sample_test_set(df: pd.DataFrame, k: int, seed: int) -> tuple[pd.DataFrame, List[int]]:
    """Assemble one evaluation subset from the labelled dataset.

    The subset is every sexual-abuse positive, up to ``k`` emotional and up to
    ``k`` physical positives, and a random number (between ``3k`` and ``5k``
    inclusive, capped by availability) of rows with no flag set. Rows whose
    ``items`` text repeats are dropped, keeping the first occurrence.

    Args:
        df: Dataset with ``items``, ``Tag`` and the three abuse columns.
        k: Per-class sample size; must be positive.
        seed: Seed for both the negative-count draw and every row sample.

    Returns:
        The sampled frame (original index labels preserved) and its index as a list.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be > 0 (requires sexual positives present in the dataset).")

    rng = np.random.default_rng(seed)

    def draw(pool: pd.DataFrame, wanted: int) -> pd.DataFrame:
        # Sample without replacement, never asking for more rows than exist.
        return pool.sample(n=min(wanted, len(pool)), random_state=seed, replace=False)

    sexual_rows = df[df["Sexual Abuse"].astype(bool)]
    emotional_rows = df[df["Emotional Abuse"].astype(bool)]
    physical_rows = df[df["Physical Abuse"].astype(bool)]

    # Negatives: the overall Tag and every subtype flag compare equal to False.
    is_negative = df["Tag"] == False  # noqa: E712
    for column in ("Physical Abuse", "Emotional Abuse", "Sexual Abuse"):
        is_negative = is_negative & (df[column] == False)  # noqa: E712
    negative_rows = df[is_negative]

    emotional_pick = draw(emotional_rows, k)
    physical_pick = draw(physical_rows, k)
    negative_pick = draw(negative_rows, int(rng.integers(3 * k, 5 * k + 1)))

    pooled = pd.concat(
        [sexual_rows, emotional_pick, physical_pick, negative_pick],
        axis=0,
    )
    repeated_text = pooled["items"].duplicated(keep="first")
    pooled = pooled.loc[~repeated_text]
    return pooled, pooled.index.tolist()

def evaluate_run(test_df: pd.DataFrame, predictions: List[Dict[str, int]]) -> Dict[str, Dict[str, float]]:
    """Score one run's predictions against the labels in ``test_df``.

    Predictions are matched to rows purely by position; the ``id`` field of a
    prediction is not consulted. A derived ``not_ipv`` label is scored as well:
    true when ``Tag`` and all three abuse columns equal False, predicted when
    all three predicted subtypes are 0.

    Args:
        test_df: Sampled rows with ``Tag`` and the three abuse columns.
        predictions: One dict per row carrying ``physical``/``emotional``/``sexual``.

    Returns:
        Mapping of label name to the dict produced by ``compute_confusion_stats``.

    Raises:
        ValueError: If the number of predictions differs from the number of rows.
    """
    if len(test_df) != len(predictions):
        raise ValueError("Number of predictions must match the test set size.")

    column_for = {
        "physical": "Physical Abuse",
        "emotional": "Emotional Abuse",
        "sexual": "Sexual Abuse",
    }

    truth = {
        label: test_df[column].astype(int).to_numpy()
        for label, column in column_for.items()
    }
    untagged = test_df["Tag"] == False  # noqa: E712
    for column in column_for.values():
        untagged = untagged & (test_df[column] == False)  # noqa: E712
    truth["not_ipv"] = untagged.astype(int).to_numpy()

    # Missing keys count as 0; any truthy value counts as 1.
    preds = {
        label: np.array([int(bool(entry.get(label, 0))) for entry in predictions])
        for label in column_for
    }
    all_clear = (preds["physical"] == 0) & (preds["emotional"] == 0) & (preds["sexual"] == 0)
    preds["not_ipv"] = all_clear.astype(int)

    return {
        label: compute_confusion_stats(truth[label], preds[label])
        for label in ("physical", "emotional", "sexual", "not_ipv")
    }

def compute_aggregate_metrics(runs: List[Dict[str, object]]) -> Dict[str, Dict[str, float]]:
    """Summarise per-label metrics across all stored runs.

    Confusion counts are summed over runs; precision, recall and accuracy are
    reported as the mean and (population) standard deviation of the per-run
    values.

    Args:
        runs: Run records, each holding per-label stats under ``"labels"``.

    Returns:
        Mapping of label name to its aggregate dict, or ``{}`` when ``runs``
        is empty.
    """
    if not runs:
        return {}

    summary: Dict[str, Dict[str, float]] = {}
    for label in ("physical", "emotional", "sexual", "not_ipv"):
        per_run = [run["labels"][label] for run in runs]

        entry: Dict[str, float] = {}
        for count_key in ("TP", "FP", "TN", "FN"):
            entry[count_key] = int(sum(int(stats[count_key]) for stats in per_run))
        for score_key in ("Precision", "Recall", "Accuracy"):
            scores = [float(stats[score_key]) for stats in per_run]
            entry[f"{score_key}_mean"] = float(np.mean(scores))
            entry[f"{score_key}_std"] = float(np.std(scores))

        summary[label] = entry

    return summary

def run_evaluation(
    N: int = N_RUNS,
    start_run_id: int = 1,
    prompt_key: str = PROMPT_KEY,
    base_seed: int | None = None,
) -> List[Dict[str, object]]:
    """Run ``N`` independent sample -> predict -> score rounds.

    Each round draws its own seed from a master generator, samples a test set
    from the module-level ``df``, labels it with ``qwen_predict`` and scores it
    with ``evaluate_run``.

    Args:
        N: Number of rounds to execute.
        start_run_id: ``run_id`` given to the first round; later rounds count up.
        prompt_key: Prompt template name forwarded to ``qwen_predict``.
        base_seed: Seed for the master generator. Every integer, including 0,
            is honoured; only ``None`` falls back to the current timestamp.

    Returns:
        One record per round with ``run_id``, ``seed``, ``n_samples``,
        ``labels`` (per-label metrics) and ``samples_used_indices``.
    """
    k_local = int(df["Sexual Abuse"].astype(bool).sum())

    # Test against None explicitly: a truthiness check would discard
    # base_seed=0 and silently make that run non-reproducible.
    master_seed = int(datetime.now().timestamp()) if base_seed is None else base_seed
    rng = np.random.default_rng(master_seed)

    results: List[Dict[str, object]] = []
    for i in range(N):
        seed = int(rng.integers(0, 1_000_000_000))
        test_df, sample_indices = sample_test_set(df, k=k_local, seed=seed)
        preds = qwen_predict(
            test_df["items"].tolist(),
            sample_ids=test_df.index.tolist(),
            prompt_key=prompt_key,
        )
        label_metrics = evaluate_run(test_df, preds)
        results.append(
            {
                "run_id": int(start_run_id + i),
                "seed": seed,
                "n_samples": int(len(test_df)),
                "labels": label_metrics,
                # Plain ints so the record is JSON-serialisable.
                "samples_used_indices": [int(idx) for idx in sample_indices],
            }
        )
    return results

def ensure_results_file(path: Path = RESULTS_PATH) -> Dict[str, object]:
    """Return the stored results document, or an empty skeleton if none exists.

    Nothing is written to disk by this function.

    Args:
        path: Location of the results JSON file.

    Returns:
        The parsed JSON content when ``path`` exists; otherwise a new dict with
        ``model_name``, an empty ``runs`` list and an empty ``aggregate`` dict.
    """
    if not path.exists():
        return {"model_name": "qwen_ipv_classifier", "runs": [], "aggregate": {}}

    with open(path, "r", encoding="utf-8") as handle:
        stored = json.load(handle)
    return stored

def save_results_to_json(run_results: List[Dict[str, object]], path: Path = RESULTS_PATH) -> Dict[str, object]:
    """Append new run records to the results file and refresh its aggregate.

    Args:
        run_results: Run records to add to the stored ``runs`` list.
        path: Results JSON file; parent directories are created if missing.

    Returns:
        The full document as written: existing runs plus ``run_results``, with
        ``aggregate`` recomputed over all of them.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    document = ensure_results_file(path)
    # Fill in any top-level key an older or hand-edited file might lack.
    for key, empty_value in (
        ("model_name", "qwen_ipv_classifier"),
        ("runs", []),
        ("aggregate", {}),
    ):
        document.setdefault(key, empty_value)

    all_runs = document["runs"]
    all_runs.extend(run_results)
    document["aggregate"] = compute_aggregate_metrics(all_runs)

    with open(path, "w", encoding="utf-8") as handle:
        json.dump(document, handle, indent=2)
    return document


In [None]:
# 6) Execute cross-validated evaluation and persist results

# Continue the run_id sequence after the runs already stored
# (assumes stored runs are numbered 1..len(runs)).
existing = ensure_results_file(RESULTS_PATH)
start_id = len(existing.get("runs", [])) + 1
print(f"Existing runs: {start_id - 1}. Starting new runs at id {start_id}.")

# Execute N_RUNS fresh rounds, then append them to the file; the aggregate is
# recomputed over every stored run, not just the new ones.
new_runs = run_evaluation(N=N_RUNS, start_run_id=start_id, prompt_key=PROMPT_KEY)
results_json = save_results_to_json(new_runs, path=RESULTS_PATH)

print(f"Saved {len(new_runs)} runs to {RESULTS_PATH.resolve()}")
print("Aggregate metrics (across all stored runs):")
print(json.dumps(results_json.get("aggregate", {}), indent=2))
