# Multilabel IPV Prediction Generation with Qwen

This notebook generates multilabel predictions (emotional, physical, and sexual abuse) for Intimate Partner Violence (IPV) classification using a Qwen instruction-tuned model.


## Configuration

Edit the settings below before running the notebook.


In [None]:
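# Optional setup: uncomment the two lines below to clone the project repository
# and change into the evaluation folder before running the rest of the notebook.
# !git clone https://github.com/zelaneroz/ipvresearch25
# %cd ipvresearch25/1_LLM_Eval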

In [None]:
# ========== CONFIGURATION - EDIT THESE SETTINGS ==========

# Prompt template, filled in once per sample with
# str.format(text=..., sample_id=...):
#   {text}      -> the sentence to classify
#   {sample_id} -> the sample's ID
# Literal JSON braces are written as {{ and }} so that str.format emits them as
# single braces. The trailing .strip() drops the leading/trailing newline that
# the triple-quoted literal would otherwise carry.
PROMPT_TEMPLATE = """
You are identifying which forms of Intimate Partner Violence (IPV) appear in a sentence.

Decide independently for emotional, physical, and sexual abuse. If it is a particular type of IPV, set emotional, physical, or sexual to 1, otherwise set it to 0. Multiple IPV types can be true or none at all.

Return ONLY one JSON object enclosed between <json> and </json> with the keys 'id', 'emotional', 'physical', and 'sexual'.

Sentence: "{text}"
Sample ID: "{sample_id}"

<json>
{{
  "id": "{sample_id}",
  "emotional": 0 or 1,
  "physical": 0 or 1,
  "sexual": 0 or 1
}}
</json>
""".strip()

# Results directory, given as a relative path; the save cell creates it
# (including parent directories) if it doesn't exist.
RESULTS_DIR = "w4/qwen"

# Dataset path: CSV read with pandas. The prediction loop takes the sentence
# from the "items" column when present, otherwise from the first column.
DATASET_PATH = "../Dataset/reddit_data_fortesting.csv"

# Model name: identifier passed to from_pretrained for both the tokenizer and
# the model.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Number of samples to process: only the first NUM_SAMPLES rows of the dataset
# are kept (set to None to process all).
NUM_SAMPLES = None

# Output filename (without extension); the save cell appends a timestamp and
# ".json".
OUTPUT_FILENAME = "multilabel_predictions"

# =========================================================


## Install Dependencies


In [None]:
# Install required packages
%pip install -q transformers torch accelerate


## Load Dataset


In [None]:
import pandas as pd
from pathlib import Path

# Read the evaluation sentences from the configured CSV
df = pd.read_csv(DATASET_PATH)
print(f"Dataset loaded: {len(df)} rows")

# When a sample limit is configured, keep only the leading rows
if NUM_SAMPLES is not None:
    df = df.iloc[:NUM_SAMPLES]
    print(f"Processing {len(df)} samples")

# Preview: the last expression is rendered as the cell output
df.head()


## Load Qwen Model


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print(f"Loading {MODEL_NAME}...")

# The tokenizer and the model are both resolved from the same identifier
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Weights are loaded in bfloat16 with device placement left to device_map="auto".
# .eval() returns the module itself, so `model` is the loaded model in eval mode.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()

print("Model loaded and ready!")


## Generate Predictions


In [None]:
import json
import re
from typing import Dict, Any, Optional

def extract_json_from_response(response: str) -> Optional[Dict[str, Any]]:
    """Extract the prediction JSON object from a model response.

    Candidates are tried in order and the first one that parses to a JSON
    *object* is returned:

    1. the contents of every ``<json>...</json>`` block (tag match is
       case-insensitive),
    2. every brace-delimited span without nested braces that mentions one of
       the expected keys (``id``, ``emotional``, ``physical``, ``sexual``).

    Args:
        response: Raw text generated by the model.

    Returns:
        The parsed dict, or ``None`` when no candidate parses to a dict.
    """
    tagged = re.finditer(r"<json[^>]*>\s*(.*?)\s*</json>", response, re.DOTALL | re.IGNORECASE)
    bare = re.finditer(r"\{[^{}]*\"(id|emotional|physical|sexual)\"[^{}]*\}", response, re.DOTALL)
    candidates = [m.group(1).strip() for m in tagged] + [m.group(0) for m in bare]

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # e.g. an echoed template line such as `"emotional": 0 or 1`;
            # keep looking, a later candidate may be the real answer.
            continue
        # Valid JSON that is not an object (list, number, string) cannot carry
        # the labels, and callers index the result by key.
        if isinstance(parsed, dict):
            return parsed

    return None

def make_prediction(text: str, sample_id: int) -> Dict[str, Any]:
    """Classify one sentence and return its emotional/physical/sexual labels.

    The sentence is inserted into ``PROMPT_TEMPLATE``, the model generates a
    completion with greedy decoding, and the labels are read from the JSON
    object found by ``extract_json_from_response``. When no JSON object can be
    parsed, each label is recovered from a ``<label>: 1`` / ``<label>: true``
    pattern in the raw completion.

    Args:
        text: Sentence to classify.
        sample_id: Identifier inserted into the prompt and echoed in the result.

    Returns:
        A dict with keys ``id``, ``text``, ``emotional``, ``physical``,
        ``sexual`` (each 0 or 1) and ``raw_response``. If anything raises during
        generation or parsing, all labels are 0 and ``raw_response`` holds
        ``"ERROR: <message>"``.
    """
    label_keys = ("emotional", "physical", "sexual")

    # Format the prompt
    prompt = PROMPT_TEMPLATE.format(text=text, sample_id=sample_id)

    try:
        # Tokenize and generate
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            # Greedy decoding. `temperature` is intentionally not passed: it is
            # only consulted when sampling, so combining it with
            # do_sample=False has no effect on the output.
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
            )

        # Decode only the newly generated tokens (everything after the prompt)
        generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # Extract JSON from response
        prediction = extract_json_from_response(response)

        if not isinstance(prediction, dict):
            # Fallback: read each label from its own "<label>: 1|true" pattern.
            # A per-label match avoids flagging every mentioned label just
            # because some unrelated "1" (e.g. in the sample ID) appears.
            prediction = {
                key: 1 if re.search(
                    rf'"?\b{key}\b"?\s*[:=]\s*"?(?:1|true)\b', response, re.IGNORECASE
                ) else 0
                for key in label_keys
            }

        # Normalise every label to an integer 0 or 1
        labels = {}
        for key in label_keys:
            val = prediction.get(key, 0)
            if isinstance(val, str):
                # The model may quote the value ("1", "true")
                val = val.strip().lower() in ("1", "true")
            if isinstance(val, bool):
                labels[key] = 1 if val else 0
            elif isinstance(val, (int, float)):
                labels[key] = 1 if val > 0 else 0
            else:
                labels[key] = 0

        return {
            "id": sample_id,
            "text": text,
            "emotional": labels["emotional"],
            "physical": labels["physical"],
            "sexual": labels["sexual"],
            "raw_response": response,
        }

    except Exception as e:
        # Best effort: keep the run going and record the failure in raw_response
        return {
            "id": sample_id,
            "text": text,
            "emotional": 0,
            "physical": 0,
            "sexual": 0,
            "raw_response": f"ERROR: {str(e)}",
        }

print("Starting prediction generation...")
predictions = []

# One model call per dataset row; the row's index label is used as the sample ID
for idx, row in df.iterrows():
    # The sentence comes from the "items" column when the dataset has one,
    # otherwise from the first column rendered as a string
    if "items" in df.columns:
        text = row["items"]
    else:
        text = str(row.iloc[0])
    sample_id = int(idx)

    print(f"Processing sample {sample_id}...", end=" ")
    pred = make_prediction(text, sample_id)
    predictions.append(pred)
    print(f"Done (emotional={pred['emotional']}, physical={pred['physical']}, sexual={pred['sexual']})")

print(f"\nGenerated {len(predictions)} predictions!")


In [None]:
from pathlib import Path
from datetime import datetime

# Create results directory if it doesn't exist
results_path = Path(RESULTS_DIR)
results_path.mkdir(parents=True, exist_ok=True)

# Generate output filename with timestamp so earlier runs are never overwritten
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
output_file = results_path / f"{OUTPUT_FILENAME}_{timestamp}.json"

# Save predictions to JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(predictions, f, indent=2, ensure_ascii=False)

print(f"Results saved to: {output_file}")
print(f"Total predictions: {len(predictions)}")

# Display summary statistics
emotional_count = sum(1 for p in predictions if p["emotional"] == 1)
physical_count = sum(1 for p in predictions if p["physical"] == 1)
sexual_count = sum(1 for p in predictions if p["sexual"] == 1)

print(f"\nSummary:")
if len(predictions) == 0:
    # Percentages are undefined for an empty run; avoid dividing by zero
    print("  No predictions were generated; nothing to summarize.")
else:
    print(f"  Emotional abuse: {emotional_count} ({100*emotional_count/len(predictions):.1f}%)")
    print(f"  Physical abuse: {physical_count} ({100*physical_count/len(predictions):.1f}%)")
    print(f"  Sexual abuse: {sexual_count} ({100*sexual_count/len(predictions):.1f}%)")


## Generate Predictions with Chain-of-Thought (COT) / Meta Prompts

This section is specifically for prompts that include reasoning/chain-of-thought output, such as COT or Meta prompt styles. The reasoning steps will be extracted and saved along with the predictions.


In [None]:
# ========== CONFIGURATION FOR COT/META PROMPTS - EDIT THESE SETTINGS ==========

# Prompt template with reasoning steps, filled in once per sample with
# str.format(text=..., sample_id=...):
#   {text}      -> the sentence to classify
#   {sample_id} -> the sample's ID
# Literal JSON braces are written as {{ and }} so that str.format emits them as
# single braces. The trailing .strip() drops the leading/trailing newline that
# the triple-quoted literal would otherwise carry.
PROMPT_TEMPLATE_COT = """
You are tasked to determine whether the sentence contains emotional abuse, physical abuse, sexual abuse, or none. A sentence may contain multiple types or none at all.

Your chain-of-thought should check for:
• Physical IPV cues: hitting, kicking, choking, slapping, pushing, grabbing, burning, using weapons, restraining, threats of physical harm.
• Sexual IPV cues: coercion, pressuring for sex, unwanted touching, sexual name-calling, lack of consent, intoxicated/incapacitated scenarios, forced penetration, sexual intimidation.
• Emotional IPV cues: manipulation, humiliation, gaslighting, isolation, jealousy, threatening abandonment, unpredictable anger, degradation, economic control, insults.

Other cues involve:
• relationship context (partner, ex, spouse, boyfriend/girlfriend)
• threats, coercion, domination, intimidation
• whether harm or risk is implied even without explicit force
• whether the described act is consensual or not

After your reasoning, output the JSON block below.

Sentence: "{text}"
Sample ID: "{sample_id}"

<json>
{{
  "id": "{sample_id}",
  "emotional": 0 or 1,
  "physical": 0 or 1,
  "sexual": 0 or 1,
  "reasoning_steps": "One short, high-level explanation of why each label was chosen, describing a summary of your chain-of-thought."
}}
</json>
""".strip()

# Results directory, given as a relative path; the COT save cell creates it
# (including parent directories) if it doesn't exist.
RESULTS_DIR_COT = "w4/qwen"

# Dataset path for the COT run (same as above, or can be different)
DATASET_PATH_COT = "../Dataset/reddit_data_fortesting.csv"

# Model name (same as above).
# NOTE(review): no other visible cell reads MODEL_NAME_COT; the COT prediction
# function uses the `tokenizer` and `model` objects that are already loaded.
MODEL_NAME_COT = "Qwen/Qwen2.5-7B-Instruct"

# Number of samples to process: only the first NUM_SAMPLES_COT rows are kept
# (set to None to process all).
NUM_SAMPLES_COT = None

# Output filename (without extension); the COT save cell appends a timestamp
# and ".json".
OUTPUT_FILENAME_COT = "multi_cot"

# =========================================================


In [None]:
import json
import re
from typing import Dict, Any, Optional

def extract_json_from_response_cot(response: str) -> Optional[Dict[str, Any]]:
    """Extract the prediction JSON object (labels plus reasoning_steps) from a model response.

    Candidates are tried in order and the first one that parses to a JSON
    *object* is returned:

    1. the contents of every ``<json>...</json>`` block (tag match is
       case-insensitive),
    2. every brace-delimited span without nested braces that mentions one of
       the expected keys (``id``, ``emotional``, ``physical``, ``sexual``,
       ``reasoning_steps``).

    Args:
        response: Raw text generated by the model.

    Returns:
        The parsed dict, or ``None`` when no candidate parses to a dict.
    """
    tagged = re.finditer(r"<json[^>]*>\s*(.*?)\s*</json>", response, re.DOTALL | re.IGNORECASE)
    bare = re.finditer(r"\{[^{}]*\"(id|emotional|physical|sexual|reasoning_steps)\"[^{}]*\}", response, re.DOTALL)
    candidates = [m.group(1).strip() for m in tagged] + [m.group(0) for m in bare]

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # e.g. an echoed template line such as `"emotional": 0 or 1`;
            # keep looking, a later candidate may be the real answer.
            continue
        # Valid JSON that is not an object (list, number, string) cannot carry
        # the labels, and callers index the result by key.
        if isinstance(parsed, dict):
            return parsed

    return None

def make_prediction_with_reasoning(text: str, sample_id: int) -> Dict[str, Any]:
    """Classify one sentence with the COT prompt and keep the model's reasoning.

    The sentence is inserted into ``PROMPT_TEMPLATE_COT``, the model generates
    a completion with greedy decoding, and the labels and ``reasoning_steps``
    are read from the JSON object found by ``extract_json_from_response_cot``.
    When no JSON object can be parsed, each label is recovered from a
    ``<label>: 1`` / ``<label>: true`` pattern in the raw completion and the
    reasoning is taken from the text following the word "reasoning".

    Args:
        text: Sentence to classify.
        sample_id: Identifier inserted into the prompt and echoed in the result.

    Returns:
        A dict with keys ``id``, ``text``, ``emotional``, ``physical``,
        ``sexual`` (each 0 or 1), ``reasoning_steps`` and ``raw_response``.
        If anything raises during generation or parsing, all labels are 0 and
        both ``reasoning_steps`` and ``raw_response`` hold ``"ERROR: <message>"``.
    """
    label_keys = ("emotional", "physical", "sexual")

    # Format the prompt
    prompt = PROMPT_TEMPLATE_COT.format(text=text, sample_id=sample_id)

    try:
        # Tokenize and generate
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            # Greedy decoding. `temperature` is intentionally not passed: it is
            # only consulted when sampling, so combining it with
            # do_sample=False has no effect on the output.
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,  # Increased for reasoning steps
                do_sample=False,
            )

        # Decode only the newly generated tokens (everything after the prompt)
        generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # Extract JSON from response; anything that is not a dict is unusable
        prediction = extract_json_from_response_cot(response)
        if not isinstance(prediction, dict):
            prediction = None

        # Extract reasoning_steps if available (either from JSON or from raw response)
        reasoning_steps = ""
        if prediction and "reasoning_steps" in prediction:
            raw_reasoning = prediction.get("reasoning_steps", "")
            # A JSON null must count as "no reasoning", not as the text "None"
            reasoning_steps = "" if raw_reasoning is None else str(raw_reasoning)
        elif not prediction:
            # Try to extract reasoning from raw response if JSON extraction failed:
            # take the text after a "reasoning" marker, up to <json> or the end
            reasoning_match = re.search(r"reasoning[:\s]*(.*?)(?:<json>|$)", response, re.DOTALL | re.IGNORECASE)
            if reasoning_match:
                reasoning_steps = reasoning_match.group(1).strip()

        if prediction is None:
            # Fallback: read each label from its own "<label>: 1|true" pattern.
            # A per-label match avoids flagging every mentioned label just
            # because some unrelated "1" (e.g. in the sample ID) appears.
            prediction = {
                key: 1 if re.search(
                    rf'"?\b{key}\b"?\s*[:=]\s*"?(?:1|true)\b', response, re.IGNORECASE
                ) else 0
                for key in label_keys
            }

        # Normalise every label to an integer 0 or 1
        labels = {}
        for key in label_keys:
            val = prediction.get(key, 0)
            if isinstance(val, str):
                # The model may quote the value ("1", "true")
                val = val.strip().lower() in ("1", "true")
            if isinstance(val, bool):
                labels[key] = 1 if val else 0
            elif isinstance(val, (int, float)):
                labels[key] = 1 if val > 0 else 0
            else:
                labels[key] = 0

        return {
            "id": sample_id,
            "text": text,
            "emotional": labels["emotional"],
            "physical": labels["physical"],
            "sexual": labels["sexual"],
            "reasoning_steps": reasoning_steps,
            "raw_response": response,
        }

    except Exception as e:
        # Best effort: keep the run going and record the failure in the record
        return {
            "id": sample_id,
            "text": text,
            "emotional": 0,
            "physical": 0,
            "sexual": 0,
            "reasoning_steps": f"ERROR: {str(e)}",
            "raw_response": f"ERROR: {str(e)}",
        }

print("Starting prediction generation with COT/Meta reasoning...")
predictions_cot = []

# Always read the COT dataset from its own configured path. Reusing the `df`
# left behind by an earlier cell would make this cell depend on kernel state:
# `df` may already be truncated by NUM_SAMPLES, and which branch ran used to
# depend on whether this cell had been executed before.
df_cot = pd.read_csv(DATASET_PATH_COT)
if NUM_SAMPLES_COT is not None:
    df_cot = df_cot.head(NUM_SAMPLES_COT)
print(f"Dataset loaded for COT: {len(df_cot)} rows")

# One model call per dataset row; the row's index label is used as the sample ID
for idx, row in df_cot.iterrows():
    # Sentence comes from the "items" column when present, else the first column
    text = row["items"] if "items" in df_cot.columns else str(row.iloc[0])
    sample_id = int(idx)

    print(f"Processing sample {sample_id}...", end=" ")
    pred = make_prediction_with_reasoning(text, sample_id)
    predictions_cot.append(pred)
    has_reasoning = len(pred.get('reasoning_steps', '')) > 0
    print(f"Done (emotional={pred['emotional']}, physical={pred['physical']}, sexual={pred['sexual']}, reasoning={has_reasoning})")

print(f"\nGenerated {len(predictions_cot)} predictions with reasoning steps!")


## Save COT/Meta Results


In [None]:
from pathlib import Path
from datetime import datetime

# Create results directory if it doesn't exist
results_path = Path(RESULTS_DIR_COT)
results_path.mkdir(parents=True, exist_ok=True)

# Generate output filename with timestamp so earlier runs are never overwritten
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
output_file = results_path / f"{OUTPUT_FILENAME_COT}_{timestamp}.json"

# Save predictions to JSON (including reasoning_steps)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(predictions_cot, f, indent=2, ensure_ascii=False)

print(f"Results saved to: {output_file}")
print(f"Total predictions: {len(predictions_cot)}")

# Display summary statistics
emotional_count = sum(1 for p in predictions_cot if p["emotional"] == 1)
physical_count = sum(1 for p in predictions_cot if p["physical"] == 1)
sexual_count = sum(1 for p in predictions_cot if p["sexual"] == 1)
# A record "has reasoning" when its reasoning_steps value is non-empty
reasoning_count = sum(1 for p in predictions_cot if p.get("reasoning_steps"))

print(f"\nSummary:")
if len(predictions_cot) == 0:
    # Percentages are undefined for an empty run; avoid dividing by zero
    print("  No predictions were generated; nothing to summarize.")
else:
    print(f"  Emotional abuse: {emotional_count} ({100*emotional_count/len(predictions_cot):.1f}%)")
    print(f"  Physical abuse: {physical_count} ({100*physical_count/len(predictions_cot):.1f}%)")
    print(f"  Sexual abuse: {sexual_count} ({100*sexual_count/len(predictions_cot):.1f}%)")
    print(f"  Predictions with reasoning steps: {reasoning_count} ({100*reasoning_count/len(predictions_cot):.1f}%)")

# Show example with reasoning
if reasoning_count > 0:
    print(f"\nExample prediction with reasoning:")
    example = next((p for p in predictions_cot if p.get("reasoning_steps")), None)
    if example:
        print(f"  ID: {example['id']}")
        # str() guards the slice against non-string text (e.g. a missing CSV value)
        print(f"  Text: {str(example['text'])[:100]}...")
        print(f"  Labels: emotional={example['emotional']}, physical={example['physical']}, sexual={example['sexual']}")
        print(f"  Reasoning: {str(example['reasoning_steps'])[:200]}...")


## Evaluate COT/Meta Predictions and Save Metrics to multitype_result.json

This cell evaluates the COT predictions against the ground-truth labels and appends the resulting metrics, together with summary statistics about the extracted `reasoning_steps`, to `multitype_result.json`.


In [None]:
# Evaluate COT/Meta predictions and save metrics to multitype_result.json
import json
from pathlib import Path
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Configuration
GT_PATH_COT = "../Dataset/reddit_data.csv"  # Ground truth CSV path
MULTITYPE_RESULT_JSON_PATH = "../results/multitype_result.json"  # Path to results JSON
PROMPT_TYPE_COT = "cot"  # Update this to match your prompt type (e.g., "cot", "meta")


def compute_binary_metrics(y_true, y_pred):
    """Compute detailed binary classification metrics for one label.

    Args:
        y_true: Ground-truth labels (0/1).
        y_pred: Predicted labels (0/1), aligned with ``y_true``.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1`` (floats;
        precision/recall/F1 are 0 when undefined) and the confusion-matrix
        counts ``true_positives``, ``false_positives``, ``true_negatives``,
        ``false_negatives`` (ints).
    """
    # labels=[0, 1] pins the matrix to 2x2 even when only one class occurs,
    # so the four counts can always be unpacked directly.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    return {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "true_positives": int(tp),
        "false_positives": int(fp),
        "true_negatives": int(tn),
        "false_negatives": int(fn),
    }


# Load ground truth
print("Loading ground truth...")
df_gt_cot = pd.read_csv(GT_PATH_COT)

# Ensure ID column exists; without one, the row position serves as the ID.
# NOTE(review): predictions use the row index of the prediction dataset as ID,
# so this assumes both CSVs list the samples in the same order - confirm.
if "id" not in df_gt_cot.columns:
    df_gt_cot = df_gt_cot.reset_index().rename(columns={"index": "id"})
else:
    df_gt_cot["id"] = df_gt_cot["id"].astype(int)

print(f"Ground truth loaded: {len(df_gt_cot)} samples")

# Convert predictions to DataFrame
if len(predictions_cot) == 0:
    print("ERROR: No predictions found. Please run the prediction generation cell first.")
else:
    print(f"Predictions loaded: {len(predictions_cot)} samples")

    # Create predictions DataFrame
    preds_df = pd.DataFrame(predictions_cot)
    preds_df["id"] = preds_df["id"].astype(int)

    # Ensure binary values (0/1) in predictions
    for col in ["emotional", "physical", "sexual"]:
        if col in preds_df.columns:
            preds_df[col] = preds_df[col].astype(int).clip(0, 1)

    # Merge with ground truth; the inner join keeps only IDs present in both
    merged_df = df_gt_cot.merge(
        preds_df[["id", "emotional", "physical", "sexual", "reasoning_steps"]],
        on='id',
        how='inner',
        suffixes=('_true', '_pred')
    )

    print(f"Merged dataset: {len(merged_df)} samples")

    # Fail early with a clear message instead of an obscure error further down
    # (metric computation and the coverage percentage both need >= 1 sample).
    if len(merged_df) == 0:
        raise ValueError(
            "No prediction IDs matched the ground-truth IDs; cannot compute metrics. "
            "Check that GT_PATH_COT corresponds to the dataset the predictions were generated from."
        )

    # Prepare ground truth columns (convert boolean to int)
    merged_df["emotional_true"] = merged_df["Emotional Abuse"].astype(int)
    merged_df["physical_true"] = merged_df["Physical Abuse"].astype(int)
    merged_df["sexual_true"] = merged_df["Sexual Abuse"].astype(int)

    # Prepare prediction columns
    merged_df["emotional_pred"] = merged_df["emotional"].astype(int)
    merged_df["physical_pred"] = merged_df["physical"].astype(int)
    merged_df["sexual_pred"] = merged_df["sexual"].astype(int)

    # Compute metrics for each abuse type
    physical_metrics = compute_binary_metrics(merged_df["physical_true"], merged_df["physical_pred"])
    emotional_metrics = compute_binary_metrics(merged_df["emotional_true"], merged_df["emotional_pred"])
    sexual_metrics = compute_binary_metrics(merged_df["sexual_true"], merged_df["sexual_pred"])

    # Compute reasoning_steps statistics over the non-empty reasoning strings
    reasoning_steps_list = merged_df["reasoning_steps"].fillna("").astype(str)
    valid_reasoning = reasoning_steps_list[reasoning_steps_list.str.len() > 0]

    reasoning_stats = {
        "total_samples": len(merged_df),
        "samples_with_reasoning": int(valid_reasoning.count()),
        "reasoning_coverage_percent": float(100 * valid_reasoning.count() / len(merged_df)),
        "avg_reasoning_length": float(valid_reasoning.str.len().mean()) if len(valid_reasoning) > 0 else 0.0,
        "min_reasoning_length": int(valid_reasoning.str.len().min()) if len(valid_reasoning) > 0 else 0,
        "max_reasoning_length": int(valid_reasoning.str.len().max()) if len(valid_reasoning) > 0 else 0,
    }

    # Format the results entry
    result_entry = {
        "model": "qwen2.5-7b",
        "prompt_type": PROMPT_TYPE_COT,
        "date_tested": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "total_samples": len(merged_df),
        "metrics": {
            "physical": physical_metrics,
            "emotional": emotional_metrics,
            "sexual": sexual_metrics,
        },
        "reasoning_steps_stats": reasoning_stats,
        "notes": f"Evaluation of {PROMPT_TYPE_COT} prompt with chain-of-thought reasoning"
    }

    # Load existing results or create new list
    result_json_path = Path(MULTITYPE_RESULT_JSON_PATH)
    result_json_path.parent.mkdir(parents=True, exist_ok=True)

    if result_json_path.exists():
        with open(result_json_path, 'r', encoding='utf-8') as f:
            existing_results = json.load(f)
        # Ensure it's a list (a single stored entry is wrapped, an empty value dropped)
        if not isinstance(existing_results, list):
            existing_results = [existing_results] if existing_results else []
    else:
        existing_results = []

    # Append new result so earlier entries in the file are preserved
    existing_results.append(result_entry)

    # Save to JSON file
    with open(result_json_path, 'w', encoding='utf-8') as f:
        json.dump(existing_results, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*80}")
    print(f"Results saved to: {result_json_path.resolve()}")
    print(f"{'='*80}")
    print(f"Entry added: qwen2.5-7b - {PROMPT_TYPE_COT}")
    print(f"\nMetrics Summary:")
    print(f"  Physical: Accuracy={physical_metrics['accuracy']:.4f}, F1={physical_metrics['f1']:.4f}")
    print(f"  Emotional: Accuracy={emotional_metrics['accuracy']:.4f}, F1={emotional_metrics['f1']:.4f}")
    print(f"  Sexual: Accuracy={sexual_metrics['accuracy']:.4f}, F1={sexual_metrics['f1']:.4f}")
    print(f"\nReasoning Steps Statistics:")
    print(f"  Samples with reasoning: {reasoning_stats['samples_with_reasoning']}/{reasoning_stats['total_samples']} ({reasoning_stats['reasoning_coverage_percent']:.1f}%)")
    print(f"  Average reasoning length: {reasoning_stats['avg_reasoning_length']:.0f} characters")
    print(f"  Min/Max reasoning length: {reasoning_stats['min_reasoning_length']}/{reasoning_stats['max_reasoning_length']} characters")
    print(f"{'='*80}")
