# Error Analysis: Binary & Multilabel Predictions

This notebook analyzes model errors for both binary and multilabel IPV classification.

In [None]:
import pandas as pd
import json
import numpy as np
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Optional

# ========== CONFIGURATION ==========
# Choose analysis mode: "binary" or "multilabel"
ANALYSIS_MODE = "binary"  # Change to "multilabel" for multilabel analysis

# Ground truth CSV path
TRUTH_CSV_PATH = "../../Dataset/reddit_data.csv"

# Predictions JSON path for every supported analysis mode
PREDICTIONS_JSON_PATHS = {
    "binary": "../w3/qwen/binary_fewshot.json",
    "multilabel": "../w4/qwen/multilabel_predictions.json",  # Update with your path
}

# Fail fast on a mistyped mode: otherwise PREDICTIONS_JSON_PATH would never be
# assigned and the notebook would stop later with an unrelated NameError.
if ANALYSIS_MODE not in PREDICTIONS_JSON_PATHS:
    raise ValueError(
        f"Unsupported ANALYSIS_MODE {ANALYSIS_MODE!r}; "
        f"expected one of {sorted(PREDICTIONS_JSON_PATHS)}"
    )
PREDICTIONS_JSON_PATH = PREDICTIONS_JSON_PATHS[ANALYSIS_MODE]

# ====================================

print(f"Analysis mode: {ANALYSIS_MODE}")
print(f"Truth CSV: {TRUTH_CSV_PATH}")
print(f"Predictions JSON: {PREDICTIONS_JSON_PATH}")

In [None]:
## 1. Load Ground Truth & Predictions


Unnamed: 0,id,prompt_type,extracted_label
0,0,fewshot,NOT_IPV
1,1,fewshot,NOT_IPV
2,2,fewshot,NOT_IPV
3,3,fewshot,NOT_IPV
4,4,fewshot,NOT_IPV
...,...,...,...
613,613,fewshot,NOT_IPV
614,614,fewshot,NOT_IPV
615,615,fewshot,NOT_IPV
616,616,fewshot,NOT_IPV


In [None]:
# Read the labelled dataset and map its column names onto the short names
# used throughout the rest of the notebook.
truth = pd.read_csv(TRUTH_CSV_PATH).rename(
    columns={
        "items": "text",
        "Physical Abuse": "physical",
        "Emotional Abuse": "emotional",
        "Sexual Abuse": "sexual",
    }
)

# Fall back to the row position as the join key when the CSV has no "id"
if "id" not in truth.columns:
    truth["id"] = truth.index

# Binary mode: a sample is positive when any of the three abuse flags is set
if ANALYSIS_MODE == "binary":
    any_abuse = truth["emotional"] | truth["physical"] | truth["sexual"]
    truth["Tag"] = any_abuse.astype(bool)

print(f"Ground truth loaded: {len(truth)} samples")
print(f"Columns: {truth.columns.tolist()}")
truth.head()

Unnamed: 0.1,Unnamed: 0,items,Physical Abuse,Emotional Abuse,Sexual Abuse,Tag,type
0,0,I'm sitting here with a goofy smile and feelin...,False,False,False,False,soft
1,1,It's really boosting my confidence when he say...,False,False,False,False,soft
2,2,I never imagined that someone could make me s...,False,False,False,False,soft
3,3,He motivates me to become the best version of ...,False,False,False,False,soft
4,4,He’s like a best friend that I can also live w...,False,False,False,False,soft
...,...,...,...,...,...,...,...
613,613,He has never said anything negative about the ...,False,False,False,False,soft
614,614,You'd think that in theory online dating would...,False,False,False,False,soft
615,615,I searched for a similar type post because I a...,False,False,False,False,soft
616,616,"So let me explain first, my fiance and me have...",False,False,False,False,soft


In [None]:
# Load predictions JSON. The encoding is explicit so the result does not
# depend on the platform's default text encoding.
with open(PREDICTIONS_JSON_PATH, encoding="utf-8") as f:
    preds_data = json.load(f)

preds = pd.DataFrame(preds_data)

# Both modes join on "id"; fall back to the row position when it is absent
if "id" not in preds.columns:
    preds["id"] = preds.index

# Handle binary vs multilabel differently
if ANALYSIS_MODE == "binary":
    # Binary predictions: use the first label column that is present
    label_col = next(
        (col for col in ("extracted_label", "label") if col in preds.columns),
        None,
    )
    if label_col is None:
        raise ValueError("Binary predictions must have 'extracted_label' or 'label' column")

    # Every value other than "IPV" is counted as NOT_IPV below, so report
    # unparseable or missing labels instead of hiding them in that bucket.
    unrecognized = (preds[label_col] != "IPV") & (preds[label_col] != "NOT_IPV")
    if unrecognized.any():
        print(
            f"Warning: {unrecognized.sum()} predictions have a label other than "
            "'IPV'/'NOT_IPV' and are treated as NOT_IPV"
        )

    preds["predicted_ipv"] = (preds[label_col] == "IPV").astype(bool)

    # Merge on ID
    df = truth.merge(preds[["id", "predicted_ipv"]], on="id", how="inner")

    # Compute wrong predictions (binary)
    df["wrong"] = df["Tag"] != df["predicted_ipv"]
    df["predicted_label"] = df["predicted_ipv"].map({True: "IPV", False: "NOT_IPV"})

elif ANALYSIS_MODE == "multilabel":
    # Ensure binary values (0/1) in predictions; a label column that is
    # missing from the file is treated as "never predicted".
    for col in ["emotional", "physical", "sexual"]:
        if col in preds.columns:
            preds[f"{col}_pred"] = preds[col].astype(int).clip(0, 1)
        else:
            preds[f"{col}_pred"] = 0

    # Prepare truth columns (convert boolean to int)
    truth_for_merge = truth[["id", "text", "emotional", "physical", "sexual"]].copy()
    truth_for_merge["emotional_true"] = truth_for_merge["emotional"].astype(int)
    truth_for_merge["physical_true"] = truth_for_merge["physical"].astype(int)
    truth_for_merge["sexual_true"] = truth_for_merge["sexual"].astype(int)

    # Merge on ID
    df = truth_for_merge.merge(
        preds[["id", "emotional_pred", "physical_pred", "sexual_pred"]],
        on="id",
        how="inner"
    )

    # Compute wrong predictions (any mismatch)
    df["wrong"] = (
        (df["emotional_true"] != df["emotional_pred"]) |
        (df["physical_true"] != df["physical_pred"]) |
        (df["sexual_true"] != df["sexual_pred"])
    )

    # Create error pattern column: one letter per mismatching label,
    # "None" when all three labels agree
    df["error_pattern"] = (
        (df["physical_true"] != df["physical_pred"]).map({True: "P", False: ""}) +
        (df["emotional_true"] != df["emotional_pred"]).map({True: "E", False: ""}) +
        (df["sexual_true"] != df["sexual_pred"]).map({True: "S", False: ""})
    ).replace({"": "None"})

# An empty inner join means the two files share no ids; stop with a clear
# message rather than a ZeroDivisionError in the percentage below.
if df.empty:
    raise ValueError(
        "No rows matched between ground truth and predictions on 'id'; "
        "check TRUTH_CSV_PATH and PREDICTIONS_JSON_PATH"
    )

print(f"Predictions loaded: {len(preds)} samples")
print(f"Merged dataset: {len(df)} samples")
print(f"Wrong predictions: {df['wrong'].sum()} ({100*df['wrong'].sum()/len(df):.1f}%)")
preds.head()

Unnamed: 0,items,Physical Abuse,Emotional Abuse,Sexual Abuse,Prediction
16,"Sometimes when my partner speaks to me, I feel...",False,True,False,NOT_IPV
18,I know that if she senses frustration directed...,False,True,False,NOT_IPV
20,"As she was yelling, I continued to ask why she...",False,True,False,NOT_IPV
22,She got angry and we blew up again.,False,True,False,NOT_IPV
28,"He says a lot of this was insecurity, validati...",False,True,False,NOT_IPV
...,...,...,...,...,...
390,"Don't come at me saying you're the ""relationsh...",False,False,False,IPV
424,Because calling them sexless virgins is an eas...,False,False,False,IPV
452,You can't be out here getting offended when I ...,False,False,False,IPV
540,"I am in bed now, tearing up after being hung u...",False,False,False,IPV


## 2. Extract Wrong Predictions

In [None]:
# Keep only the rows the model got wrong; the copy protects df from later edits
wrongs = df.loc[df["wrong"]].copy()

print(f"Total wrong predictions: {len(wrongs)}")
print(f"Wrong percentage: {100*len(wrongs)/len(df):.2f}%")

if ANALYSIS_MODE == "multilabel":
    # One row per combination of mislabelled abuse types
    print(f"\nMultilabel Error Patterns:")
    print(wrongs["error_pattern"].value_counts())

elif ANALYSIS_MODE == "binary":
    # Every row here is an error, so the predicted label alone tells a
    # missed IPV case apart from a false alarm.
    predicted = wrongs["predicted_label"]
    false_negatives = wrongs.loc[predicted == "NOT_IPV"]
    false_positives = wrongs.loc[predicted == "IPV"]

    print(f"\nBinary Classification Errors:")
    print(f"  False Negatives (IPV → NOT_IPV): {len(false_negatives)}")
    print(f"  False Positives (NOT_IPV → IPV): {len(false_positives)}")

wrongs.head(20)

59


Unnamed: 0,items,Physical Abuse,Emotional Abuse,Sexual Abuse,Prediction
16,"Sometimes when my partner speaks to me, I feel...",False,True,False,NOT_IPV
18,I know that if she senses frustration directed...,False,True,False,NOT_IPV
20,"As she was yelling, I continued to ask why she...",False,True,False,NOT_IPV
22,She got angry and we blew up again.,False,True,False,NOT_IPV
28,"He says a lot of this was insecurity, validati...",False,True,False,NOT_IPV
29,The lies are what killed me.,False,True,False,NOT_IPV
37,She has a very short fuse and once she gets an...,False,True,False,NOT_IPV
39,Shes still angry at me about it now hours on.,False,True,False,NOT_IPV
45,"We’ve also fought about a lot of other stuff,...",False,True,False,NOT_IPV
50,He ended up breaking his phone out of anger.,False,True,False,NOT_IPV


## 3. Binary Analysis: False Negatives

In [None]:
if ANALYSIS_MODE == "binary":
    # False Negatives: IPV cases predicted as NOT_IPV
    fn = wrongs[wrongs["predicted_label"] == "NOT_IPV"].copy()
    print(f"False Negatives ({len(fn)} cases):")
    print("=" * 80)
    fn_display = fn[["text", "emotional", "physical", "sexual", "predicted_label"]].copy()
    # Jupyter only auto-renders a bare expression at the top level of a cell;
    # inside this `if` block the table has to be displayed explicitly.
    # `display` is provided as a builtin by the IPython kernel.
    display(fn_display.head(20))

7
If your answer to a direct consequence of your own willing actions is "dismemberment" or "poisoning" of anyone who is inconveniencing you (aka fetus) then you can't be trusted with the reins of your own life, let alone with the (indirect) reins of 4 years of an entire nation's future.What will change my view?Let's start with what will not change it, since I fear most of you will try at least one of these:1.
Some of his rules are;- don’t get married or have kids until you are 30- always wear a condom and make sure you supply the condom- put hot sauce in your condoms after use so if the bitch tries to put your semen in her after sex, she gets quite the burn - never spend more than $40 on a date- never take a woman to get food after the bar because you don’t want her to sober up nor will most women put out with jumbo jack breath- never answer your phones on weekends.
Don't come at me saying you're the "relationship  kind of guy".
Because calling them sexless virgins is an easy way to sh

## 4. Binary Analysis: False Positives

In [None]:
if ANALYSIS_MODE == "binary":
    # False positives: errors where the model answered IPV
    fp = wrongs.loc[wrongs["predicted_label"] == "IPV"].copy()
    print(f"False Positives ({len(fp)} cases):")
    print("=" * 80)

    # Walk the three needed columns in lockstep; only the first 200
    # characters of each text are shown.
    for sample_id, text, predicted in zip(fp["id"], fp["text"], fp["predicted_label"]):
        print(f"\nSample {sample_id}:")
        print(f"Text: {text[:200]}...")
        print(f"Predicted: {predicted}")
        print("-" * 80)


## 5. Multilabel Analysis: Error Patterns by Abuse Type

In [None]:
if ANALYSIS_MODE == "multilabel":
    # One boolean mask per label: True where prediction and truth disagree
    physical_miss = wrongs["physical_true"] != wrongs["physical_pred"]
    emotional_miss = wrongs["emotional_true"] != wrongs["emotional_pred"]
    sexual_miss = wrongs["sexual_true"] != wrongs["sexual_pred"]
    misses_per_row = (
        physical_miss.astype(int) + emotional_miss.astype(int) + sexual_miss.astype(int)
    )

    # Partition the errors: exactly one label wrong, or more than one
    error_counts = {
        "physical_only": wrongs[physical_miss & ~emotional_miss & ~sexual_miss],
        "emotional_only": wrongs[emotional_miss & ~physical_miss & ~sexual_miss],
        "sexual_only": wrongs[sexual_miss & ~physical_miss & ~emotional_miss],
        "multiple_errors": wrongs[misses_per_row > 1],
    }

    display_cols = [
        "text",
        "emotional_true", "emotional_pred",
        "physical_true", "physical_pred",
        "sexual_true", "sexual_pred",
    ]

    print("Error breakdown by abuse type:")
    print("=" * 80)
    for error_type, df_err in error_counts.items():
        print(f"\n{error_type}: {len(df_err)} cases")
        if len(df_err) == 0:
            continue
        print(df_err[display_cols].head(10))


## 6. Error Pattern Distribution


In [None]:
if ANALYSIS_MODE == "multilabel":
    # Frequency of each mismatch pattern among the wrong predictions
    error_pattern_counts = wrongs["error_pattern"].value_counts()

    print("Error Pattern Distribution:")
    print("=" * 80)
    print(error_pattern_counts)
    for legend_line in (
        "\nLegend:",
        "  P = Physical abuse error",
        "  E = Emotional abuse error",
        "  S = Sexual abuse error",
        "  PE = Physical + Emotional errors",
        "  etc.",
    ):
        print(legend_line)

    # Same counts shown twice: as shares (pie) and as absolute numbers (bar)
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    pie_ax, bar_ax = axes[0], axes[1]

    error_pattern_counts.plot.pie(ax=pie_ax, autopct="%1.1f%%", startangle=90)
    pie_ax.set_title("Error Pattern Distribution")
    pie_ax.set_ylabel("")

    error_pattern_counts.plot.bar(ax=bar_ax)
    bar_ax.set_title("Error Pattern Counts")
    bar_ax.set_xlabel("Error Pattern")
    bar_ax.set_ylabel("Count")
    bar_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()


## 7. Visualize Errors by Abuse Type

In [None]:
if ANALYSIS_MODE == "multilabel":
    # Single-row frame: number of mismatches for each abuse label
    wrong_by_type = pd.DataFrame(
        {
            label.capitalize(): (wrongs[f"{label}_true"] != wrongs[f"{label}_pred"]).sum()
            for label in ("emotional", "physical", "sexual")
        },
        index=[0],
    )
    # Transposed so the abuse types become the index used by the plots
    counts_by_type = wrong_by_type.T

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    bar_ax, pie_ax = axes[0], axes[1]

    # Absolute number of errors per label
    counts_by_type.plot.bar(ax=bar_ax, legend=False)
    bar_ax.set_title("Wrong Predictions per Abuse Type")
    bar_ax.set_xlabel("Abuse Type")
    bar_ax.set_ylabel("Number of Errors")
    bar_ax.tick_params(axis='x', rotation=0)

    # Share of errors per label
    counts_by_type[0].plot.pie(ax=pie_ax, autopct="%1.1f%%", legend=False)
    pie_ax.set_title("Error Distribution by Abuse Type")
    pie_ax.set_ylabel("")

    plt.tight_layout()
    plt.show()

    print("Error counts by type:")
    print(wrong_by_type)


## 8. Linguistic Pattern Analysis: Soft vs Hard Statements

In [None]:
# Attach the statement type ("type" column of the ground truth) to each error
if "type" in truth.columns:
    if "type" in wrongs.columns:
        # Binary mode builds df from the full truth frame, so "type" is already
        # present. Merging it again would produce type_x/type_y and make the
        # "type" lookups below raise KeyError.
        wrongs_with_type = wrongs.copy()
    else:
        wrongs_with_type = wrongs.merge(
            truth[["id", "type"]],
            on="id",
            how="left"
        )

    print("Error distribution by statement type:")
    print("=" * 80)

    if ANALYSIS_MODE == "binary":
        type_errors = wrongs_with_type.groupby("type").size()
        print(type_errors)

        # Visualize
        type_errors.plot(kind="bar", figsize=(10, 5))
        plt.title("Error Counts by Statement Type")
        plt.xlabel("Statement Type")
        plt.ylabel("Number of Errors")
        plt.tick_params(axis='x', rotation=45)
        plt.tight_layout()
        plt.show()

    elif ANALYSIS_MODE == "multilabel":
        # Cross-tabulation of type vs error pattern
        type_error_cross = pd.crosstab(wrongs_with_type["type"], wrongs_with_type["error_pattern"])
        print("\nCross-tabulation: Statement Type vs Error Pattern")
        print(type_error_cross)

        # Visualize
        type_error_cross.plot(kind="bar", stacked=True, figsize=(12, 6))
        plt.title("Error Patterns by Statement Type")
        plt.xlabel("Statement Type")
        plt.ylabel("Number of Errors")
        plt.legend(title="Error Pattern", bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tick_params(axis='x', rotation=45)
        plt.tight_layout()
        plt.show()
else:
    print("'type' column not found in ground truth. Skipping linguistic pattern analysis.")


## 9. Detailed Wrong Examples by Category

In [None]:
if ANALYSIS_MODE == "multilabel":
    # Print up to five sample errors for each mismatch pattern
    print("Examples of Wrong Predictions by Error Pattern:")
    print("=" * 80)

    # Patterns in order of first appearance; the "None" placeholder is skipped
    patterns_to_show = [p for p in wrongs["error_pattern"].unique() if p != "None"]

    for pattern in patterns_to_show:
        pattern_wrongs = wrongs.loc[wrongs["error_pattern"] == pattern]
        print(f"\n{'='*80}")
        print(f"Error Pattern: {pattern} ({len(pattern_wrongs)} cases)")
        print(f"{'='*80}")

        examples = pattern_wrongs.head(5)
        for _, row in examples.iterrows():
            # Text is cut to 300 characters to keep the listing readable
            print(f"\nSample ID: {row['id']}")
            print(f"Text: {row['text'][:300]}...")
            print(f"True:  E={row['emotional_true']}, P={row['physical_true']}, S={row['sexual_true']}")
            print(f"Pred:  E={row['emotional_pred']}, P={row['physical_pred']}, S={row['sexual_pred']}")
            print("-" * 80)


## 10. Summary Statistics

In [None]:
# Headline numbers: totals first, then the mode-specific breakdown
print("Summary Statistics", "=" * 80, sep="\n")
print(f"Total samples: {len(df)}")
print(f"Correct predictions: {len(df) - len(wrongs)} ({100*(len(df)-len(wrongs))/len(df):.2f}%)")
print(f"Wrong predictions: {len(wrongs)} ({100*len(wrongs)/len(df):.2f}%)")

if ANALYSIS_MODE == "multilabel":
    print(f"\nMultilabel Classification:")
    print(f"  Physical abuse errors: {(wrongs['physical_true'] != wrongs['physical_pred']).sum()}")
    print(f"  Emotional abuse errors: {(wrongs['emotional_true'] != wrongs['emotional_pred']).sum()}")
    print(f"  Sexual abuse errors: {(wrongs['sexual_true'] != wrongs['sexual_pred']).sum()}")

    print(f"\nMost common error pattern: {wrongs['error_pattern'].mode().iloc[0] if len(wrongs) > 0 else 'N/A'}")

    # Accuracy of each label on its own, over the whole merged dataset
    print(f"\nPer-Label Accuracy:")
    for label in ("emotional", "physical", "sexual"):
        correct = (df[f"{label}_true"] == df[f"{label}_pred"]).sum()
        accuracy = 100 * correct / len(df)
        print(f"  {label.capitalize()}: {accuracy:.2f}% ({correct}/{len(df)})")

elif ANALYSIS_MODE == "binary":
    # wrongs contains only errors, so the predicted label identifies the kind
    print(f"\nBinary Classification:")
    print(f"  False Negatives: {(wrongs['predicted_label'] == 'NOT_IPV').sum()}")
    print(f"  False Positives: {(wrongs['predicted_label'] == 'IPV').sum()}")


## 11. Confusion Matrix Analysis

from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score

# Every matrix is forced to the full 2x2 layout. Without an explicit label
# list, scikit-learn sizes the matrix from the classes actually observed, so a
# label that never occurs would give a 1x1 matrix that no longer matches the
# two tick labels drawn on the heatmap.
CLASS_IDS = [0, 1]

if ANALYSIS_MODE == "binary":
    # Binary confusion matrix
    y_true = df["Tag"].astype(int)
    y_pred = df["predicted_ipv"].astype(int)

    cm = confusion_matrix(y_true, y_pred, labels=CLASS_IDS)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=['NOT_IPV', 'IPV'],
                yticklabels=['NOT_IPV', 'IPV'])
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Binary Classification Confusion Matrix')
    plt.tight_layout()
    plt.show()

    print("\nClassification Report:")
    print("=" * 80)
    # labels= keeps target_names aligned even if one class is absent;
    # zero_division=0 matches the per-label metrics in the multilabel branch.
    print(classification_report(
        y_true, y_pred,
        labels=CLASS_IDS,
        target_names=['NOT_IPV', 'IPV'],
        zero_division=0,
    ))

elif ANALYSIS_MODE == "multilabel":
    # Confusion matrices for each label
    labels = ["emotional", "physical", "sexual"]
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    for idx, label in enumerate(labels):
        y_true = df[f"{label}_true"]
        y_pred = df[f"{label}_pred"]

        cm = confusion_matrix(y_true, y_pred, labels=CLASS_IDS)

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                    xticklabels=['0', '1'],
                    yticklabels=['0', '1'])
        axes[idx].set_xlabel('Predicted')
        axes[idx].set_ylabel('True')
        axes[idx].set_title(f'{label.capitalize()} Abuse Confusion Matrix')

    plt.tight_layout()
    plt.show()

    # Per-label metrics (positive class = 1)
    print("\nPer-Label Classification Metrics:")
    print("=" * 80)
    for label in labels:
        y_true = df[f"{label}_true"]
        y_pred = df[f"{label}_pred"]

        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        print(f"\n{label.capitalize()}:")
        print(f"  Precision: {precision:.3f}")
        print(f"  Recall: {recall:.3f}")
        print(f"  F1-Score: {f1:.3f}")


## 12. Text Length Analysis: Does Text Length Correlate with Errors?


In [None]:
# Add text length columns (characters and whitespace-separated words).
# These are stored on df because the final summary cell reads them.
df["text_length"] = df["text"].str.len()
df["word_count"] = df["text"].str.split().str.len()

# Tick labels follow the outcome groups actually present. Hard-coding both
# labels fails when every prediction is correct (or every one is wrong),
# because the boxplot then has a single tick.
outcome_labels = [
    "Wrong" if is_wrong else "Correct"
    for is_wrong in sorted(df["wrong"].unique())
]

# Compare text lengths for correct vs wrong predictions
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = [
    ("text_length", "Text Length (characters) by Prediction Accuracy", "Character Count"),
    ("word_count", "Word Count by Prediction Accuracy", "Word Count"),
]
for ax, (column, title, ylabel) in zip(axes, panels):
    df.boxplot(column=column, by="wrong", ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Wrong Prediction")
    ax.set_ylabel(ylabel)
    ax.set_xticklabels(outcome_labels)

plt.tight_layout()
plt.show()

# Statistical summary
print("Text Length Statistics:")
print("=" * 80)
print("\nCorrect predictions:")
print(df[~df["wrong"]][["text_length", "word_count"]].describe())
print("\nWrong predictions:")
print(df[df["wrong"]][["text_length", "word_count"]].describe())


## 13. Save Wrong Predictions for Further Analysis


In [None]:
from datetime import datetime

# Prepare wrong predictions for export
output_dir = Path("error_analysis_output")
output_dir.mkdir(exist_ok=True)

# The timestamp keeps repeated runs from overwriting earlier exports
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

if ANALYSIS_MODE == "binary":
    # Rename by explicit mapping rather than by position so the export
    # headers cannot drift if the column selection is reordered.
    wrongs_export = wrongs[[
        "id", "text", "emotional", "physical", "sexual",
        "Tag", "predicted_ipv", "predicted_label"
    ]].rename(columns={
        "emotional": "emotional_true",
        "physical": "physical_true",
        "sexual": "sexual_true",
        "Tag": "ipv_true",
        "predicted_ipv": "ipv_predicted",
    })

elif ANALYSIS_MODE == "multilabel":
    wrongs_export = wrongs[[
        "id", "text", "emotional_true", "physical_true", "sexual_true",
        "emotional_pred", "physical_pred", "sexual_pred", "error_pattern"
    ]].copy()

# Save to CSV
output_file = output_dir / f"wrong_predictions_{ANALYSIS_MODE}_{timestamp}.csv"
wrongs_export.to_csv(output_file, index=False)
print(f"Wrong predictions saved to: {output_file}")
print(f"Total wrong predictions exported: {len(wrongs_export)}")

# Also save full analysis data
full_output_file = output_dir / f"full_analysis_{ANALYSIS_MODE}_{timestamp}.csv"
df_export = df.copy()
if "text" in df_export.columns:
    # Truncate long texts for CSV readability. The ellipsis is added only to
    # texts that were actually cut, so short texts are exported unchanged.
    max_text_chars = 500
    truncated_text = df_export["text"].str[:max_text_chars]
    was_cut = df_export["text"].str.len() > max_text_chars
    df_export["text"] = truncated_text.where(~was_cut, truncated_text + "...")
df_export.to_csv(full_output_file, index=False)
print(f"Full analysis data saved to: {full_output_file}")


## 14. Keyword Analysis: Common Words in Wrong Predictions


In [None]:
import re
from collections import Counter

# Simple word frequency analysis
# Basic English stop words, built once instead of on every get_words() call.
DEFAULT_STOP_WORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
    'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
    'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that',
    'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they',
    'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'its',
    'our', 'their', 'what', 'which', 'who', 'whom', 'whose', 'where',
    'when', 'why', 'how', 'all', 'each', 'every', 'both', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
    'same', 'so', 'than', 'too', 'very', 'just',
})


def get_words(text, min_length=3, stop_words=None):
    """Extract lowercase words from text, filtering short words and stop words.

    Args:
        text: String to tokenize. It is lowercased before matching.
        min_length: Minimum number of characters a word needs to be kept.
        stop_words: Optional collection of lowercase words to drop. Defaults
            to DEFAULT_STOP_WORDS when None.

    Returns:
        List of kept words in order of appearance, duplicates included.
    """
    if stop_words is None:
        stop_words = DEFAULT_STOP_WORDS

    # Only runs of a-z count as words, so a contraction is split at the
    # apostrophe: "don't" yields "don" and "t" (the latter is then dropped by
    # the default min_length).
    words = re.findall(r'\b[a-z]+\b', text.lower())
    return [w for w in words if len(w) >= min_length and w not in stop_words]

# Route each sample's tokens into the bucket matching its prediction outcome
wrong_words = []
correct_words = []

for text, is_wrong in zip(df["text"], df["wrong"]):
    bucket = wrong_words if is_wrong else correct_words
    bucket.extend(get_words(str(text)))

# Word frequencies per outcome
wrong_word_counts = Counter(wrong_words)
correct_word_counts = Counter(correct_words)

# Words in the wrong-prediction top 50 that are absent from the
# correct-prediction top 50
top_wrong = {word for word, _ in wrong_word_counts.most_common(50)}
top_correct = {word for word, _ in correct_word_counts.most_common(50)}
unique_wrong_words = top_wrong.difference(top_correct)

print("Top 20 words in WRONG predictions:")
print("=" * 80)
for word, count in wrong_word_counts.most_common(20):
    print(f"{word}: {count}")

print(f"\nTop 20 words unique to WRONG predictions (not in top correct):")
print("=" * 80)
unique_wrong_counts = {word: wrong_word_counts[word] for word in unique_wrong_words}
ranked_unique = sorted(unique_wrong_counts.items(), key=lambda item: item[1], reverse=True)
for word, count in ranked_unique[:20]:
    print(f"{word}: {count}")

# Side-by-side horizontal bars: wrong predictions (left), correct (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
wrong_ax, correct_ax = axes[0], axes[1]

top_wrong_df = pd.DataFrame(wrong_word_counts.most_common(15), columns=['word', 'count'])
top_wrong_df.plot.barh(x='word', y='count', ax=wrong_ax, legend=False)
wrong_ax.set_title('Top 15 Words in Wrong Predictions')
wrong_ax.set_xlabel('Frequency')

top_correct_df = pd.DataFrame(correct_word_counts.most_common(15), columns=['word', 'count'])
top_correct_df.plot.barh(x='word', y='count', ax=correct_ax, legend=False, color='green')
correct_ax.set_title('Top 15 Words in Correct Predictions')
correct_ax.set_xlabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Final text report. It reuses names created by earlier cells: df / wrongs,
# the text_length and word_count columns on df, and the datetime import.
print("=" * 80)
print("COMPREHENSIVE ERROR ANALYSIS SUMMARY")
print("=" * 80)
print(f"\nAnalysis Mode: {ANALYSIS_MODE.upper()}")
print(f"Predictions File: {PREDICTIONS_JSON_PATH}")
print(f"Ground Truth File: {TRUTH_CSV_PATH}")
print(f"\nDate: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

print("\n" + "=" * 80)
print("DATASET OVERVIEW")
print("=" * 80)
print(f"Total samples: {len(df)}")
print(f"Correct predictions: {len(df) - len(wrongs)} ({100*(len(df)-len(wrongs))/len(df):.2f}%)")
print(f"Wrong predictions: {len(wrongs)} ({100*len(wrongs)/len(df):.2f}%)")

if ANALYSIS_MODE == "binary":
    print("\n" + "=" * 80)
    print("BINARY CLASSIFICATION METRICS")
    print("=" * 80)
    fn_count = (wrongs['predicted_label'] == 'NOT_IPV').sum()
    fp_count = (wrongs['predicted_label'] == 'IPV').sum()
    print(f"False Negatives (IPV → NOT_IPV): {fn_count} ({100*fn_count/len(wrongs) if len(wrongs) > 0 else 0:.1f}% of errors)")
    print(f"False Positives (NOT_IPV → IPV): {fp_count} ({100*fp_count/len(wrongs) if len(wrongs) > 0 else 0:.1f}% of errors)")

elif ANALYSIS_MODE == "multilabel":
    print("\n" + "=" * 80)
    print("MULTILABEL CLASSIFICATION METRICS")
    print("=" * 80)
    for label in ["emotional", "physical", "sexual"]:
        correct = (df[f"{label}_true"] == df[f"{label}_pred"]).sum()
        accuracy = 100 * correct / len(df)
        errors = (wrongs[f"{label}_true"] != wrongs[f"{label}_pred"]).sum()
        print(f"{label.capitalize()}: {accuracy:.2f}% accuracy, {errors} errors")

    print(f"\nMost common error pattern: {wrongs['error_pattern'].mode().iloc[0] if len(wrongs) > 0 else 'N/A'}")
    print("\nError pattern distribution:")
    print(wrongs['error_pattern'].value_counts().head(10))

if "type" in truth.columns and len(wrongs) > 0:
    print("\n" + "=" * 80)
    print("ERRORS BY STATEMENT TYPE")
    print("=" * 80)
    if "type" in wrongs.columns:
        # Binary mode: wrongs already carries "type" from the truth frame.
        # Merging it again would create type_x/type_y and make the groupby
        # below raise KeyError.
        wrongs_with_type = wrongs.copy()
    else:
        wrongs_with_type = wrongs.merge(truth[["id", "type"]], on="id", how="left")
    type_error_counts = wrongs_with_type.groupby("type").size()
    print(type_error_counts)

print("\n" + "=" * 80)
print("TEXT CHARACTERISTICS")
print("=" * 80)
if len(wrongs) > 0:
    print(f"Avg text length (characters) - Correct: {df[~df['wrong']]['text_length'].mean():.0f}")
    print(f"Avg text length (characters) - Wrong: {df[df['wrong']]['text_length'].mean():.0f}")
    print(f"Avg word count - Correct: {df[~df['wrong']]['word_count'].mean():.0f}")
    print(f"Avg word count - Wrong: {df[df['wrong']]['word_count'].mean():.0f}")

print("\n" + "=" * 80)
print("FILES SAVED")
print("=" * 80)
print(f"Wrong predictions CSV: error_analysis_output/wrong_predictions_{ANALYSIS_MODE}_*.csv")
print(f"Full analysis CSV: error_analysis_output/full_analysis_{ANALYSIS_MODE}_*.csv")
print("\n" + "=" * 80)
