In [None]:
import os
import sys
import json
import re
from pathlib import Path
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import cohen_kappa_score, confusion_matrix

# Add src to path: prepend the `src` directory that sits next to the current
# working directory, so the `config` and `utils` imports below can be resolved
# from it. NOTE(review): this relies on the kernel's working directory being one
# level below the folder that contains `src` — confirm.
sys.path.insert(0, str(Path('.').resolve().parent / 'src'))

# Project-local data locations and annotation helpers
# (presumably provided by the src/ directory added above — confirm).
from config import RAW_DATA_DIR, PROCESSED_DATA_DIR, TEST_FOLDER
from utils import load_json_annotations, parse_label_studio_export, clean_text

# Apply one matplotlib style sheet to every figure drawn after this point.
plt.style.use('seaborn-v0_8-whitegrid')

## 1. Load Raw Consensus Data

In [None]:
# Find consensus directory: the first directory named TEST_FOLDER below RAW_DATA_DIR.
# rglob()/glob() yield entries in filesystem order, which is not guaranteed to be
# the same on every machine, so both results are sorted. That keeps the selected
# directory and the order of the annotator files identical from run to run.
consensus_dir = next(
    (candidate for candidate in sorted(RAW_DATA_DIR.rglob(TEST_FOLDER)) if candidate.is_dir()),
    None,
)

if consensus_dir is not None:
    print(f'Found consensus directory: {consensus_dir}')
    json_files = sorted(consensus_dir.glob('*.json'))
    print(f'Found {len(json_files)} JSON files')
else:
    # Downstream cells iterate over json_files, so keep it defined (and empty).
    print('Consensus directory not found')
    json_files = []

In [None]:
# Read every annotator export; the file stem is used as the annotator's name.
# A file that fails to load or parse is reported and skipped (best effort).
annotator_data = {}

for export_path in json_files:
    annotator_name = export_path.stem
    try:
        parsed = parse_label_studio_export(load_json_annotations(export_path))
        annotator_data[annotator_name] = parsed
        print(f'{annotator_name}: {len(parsed)} annotations')
    except Exception as e:
        print(f'Error loading {annotator_name}: {e}')

## 2. Build Annotation Matrix

In [None]:
# Group ratings by cleaned text: {cleaned text -> {annotator -> rating}}.
# A later rating by the same annotator for the same cleaned text replaces the earlier one.
text_annotations = defaultdict(dict)

for annotator_name, labelled_pairs in annotator_data.items():
    for raw_text, rating in labelled_pairs:
        text_annotations[clean_text(raw_text)][annotator_name] = rating

print(f'Total unique texts: {len(text_annotations)}')

# One row per text, one column per annotator; NaN where an annotator did not rate the text.
rows = [
    {
        'text': cleaned,
        **{name: per_annotator.get(name, np.nan) for name in annotator_data.keys()},
    }
    for cleaned, per_annotator in text_annotations.items()
]

annotation_df = pd.DataFrame(rows)
print(f'\nDataFrame shape: {annotation_df.shape}')
display(annotation_df.head())

## 3. Inter-Annotator Agreement

In [None]:
# Get annotator columns.
# They are taken from the loaded annotators rather than "every column except
# text": later cells add consensus_label / agreement / std to annotation_df, and
# re-running this cell must not start treating those as annotators.
annotator_cols = [name for name in annotator_data if name in annotation_df.columns]

if len(annotator_cols) >= 2:
    n_annotators = len(annotator_cols)
    # Start from NaN so that pairs without shared texts, or pairs whose score
    # could not be computed, simply stay blank in the heatmap.
    kappa_matrix = np.full((n_annotators, n_annotators), np.nan)

    for i, ann1 in enumerate(annotator_cols):
        # Cohen's kappa is symmetric, so each pair is scored once and mirrored.
        for j in range(i, n_annotators):
            ann2 = annotator_cols[j]
            # Only texts rated by both annotators can be compared.
            mask = annotation_df[[ann1, ann2]].notna().all(axis=1)
            if not mask.any():
                continue
            ratings1 = annotation_df.loc[mask, ann1].values
            ratings2 = annotation_df.loc[mask, ann2].values
            try:
                kappa = cohen_kappa_score(ratings1, ratings2)
            except (ValueError, TypeError) as err:
                # scikit-learn rejected the label arrays: report it and leave the
                # cell as NaN instead of hiding every possible error.
                print(f'Could not compute kappa for {ann1} vs {ann2}: {err}')
                continue
            kappa_matrix[i, j] = kappa_matrix[j, i] = kappa

    # Plot heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(kappa_matrix, annot=True, fmt='.2f',
                xticklabels=annotator_cols, yticklabels=annotator_cols,
                cmap='RdYlGn', vmin=-1, vmax=1, center=0)
    plt.title("Pairwise Cohen's Kappa Between Annotators")
    plt.tight_layout()
    plt.show()

    # Average Kappa over off-diagonal cells only (the diagonal compares an
    # annotator with themselves), ignoring cells that could not be scored.
    off_diagonal = ~np.eye(n_annotators, dtype=bool)
    valid_kappas = kappa_matrix[off_diagonal & ~np.isnan(kappa_matrix)]
    if valid_kappas.size:
        print(f'\nAverage pairwise Kappa: {np.mean(valid_kappas):.3f}')
    else:
        print('\nAverage pairwise Kappa: n/a (no valid annotator pairs)')
else:
    print('Not enough annotators for inter-annotator agreement analysis')

In [None]:
# Calculate agreement for each text
def calculate_agreement(row):
    """Summarise how strongly the annotators agree on a single text.

    Reads the notebook-level ``annotator_cols`` list to pick the rating
    columns out of ``row``; missing ratings (NaN) are ignored.

    Args:
        row: One row of the annotation table (a pandas Series) containing
            the annotator columns.

    Returns:
        A ``(consensus_label, agreement, std)`` tuple:
          * consensus_label -- the most frequent rating; when several ratings
            share the highest count, the lowest of them is returned so the
            result does not depend on the order of the annotator columns.
          * agreement -- fraction of the available ratings equal to the
            consensus label.
          * std -- population standard deviation of the available ratings.
        All three are NaN when fewer than two ratings are available.
    """
    ratings = row[annotator_cols].dropna().values
    if len(ratings) < 2:
        return np.nan, np.nan, np.nan

    # Majority vote with a deterministic tie-break
    counts = Counter(ratings)
    top_count = max(counts.values())
    consensus = min(label for label, count in counts.items() if count == top_count)
    agreement = top_count / len(ratings)

    # A row that also holds text is object-typed; convert so the standard
    # deviation is computed on a plain float array.
    std = float(np.std(np.asarray(ratings, dtype=float)))

    return consensus, agreement, std

if len(annotator_cols) >= 2:
    # calculate_agreement returns one (label, agreement, std) triple per text;
    # spread the three positions over three new columns.
    per_text = annotation_df.apply(calculate_agreement, axis=1)
    for position, column in enumerate(('consensus_label', 'agreement', 'std')):
        annotation_df[column] = [triple[position] for triple in per_text]

    print('Agreement Statistics:')
    print(annotation_df[['agreement', 'std']].describe())

## 4. Label Distribution by Annotator

In [None]:
if len(annotator_cols) > 0:
    # One bar chart per annotator, capped at the first four annotators.
    shown_annotators = annotator_cols[:4]
    fig, axes = plt.subplots(1, min(len(annotator_cols), 4), figsize=(16, 4))
    if len(annotator_cols) == 1:
        # With a single panel, subplots returns a bare Axes rather than an array.
        axes = [axes]

    for ax, annotator in zip(axes, shown_annotators):
        counts = annotation_df[annotator].dropna().value_counts().sort_index()

        ax.bar(counts.index, counts.values, color='steelblue', alpha=0.7)
        ax.set_xlabel('Rating')
        ax.set_ylabel('Count')
        ax.set_title(f'{annotator}')
        ax.set_xticks([1, 2, 3, 4, 5])

    plt.tight_layout()
    plt.show()

In [None]:
# Per-annotator summary table: rating count, mean, spread and range.
if len(annotator_cols) > 0:
    rated_by = ((name, annotation_df[name].dropna()) for name in annotator_cols)
    annotator_stats = [
        {
            'Annotator': name,
            'Count': len(given),
            'Mean': given.mean(),
            'Std': given.std(),
            'Min': given.min(),
            'Max': given.max(),
        }
        for name, given in rated_by
    ]

    stats_df = pd.DataFrame(annotator_stats)
    display(stats_df.round(2))

## 5. Disagreement Analysis

In [None]:
# List the ten texts with the lowest agreement, with each annotator's rating.
if 'agreement' in annotation_df.columns:
    report_cols = ['text', 'consensus_label', 'agreement', 'std'] + annotator_cols
    disagreements = annotation_df.nsmallest(10, 'agreement')[report_cols]

    print('Top 10 Most Disputed Texts:')
    print('=' * 60)

    for _row_label, disputed in disagreements.iterrows():
        # Only the first 100 characters of the text are shown.
        snippet = disputed["text"][:100]
        print(f'\nText: {snippet}...')
        print(f'Consensus: {disputed["consensus_label"]}, Agreement: {disputed["agreement"]:.2%}')
        given = [f'{ann}: {disputed[ann]}' for ann in annotator_cols if pd.notna(disputed[ann])]
        print(f'Ratings: {given}')

In [None]:
# Mean agreement (with spread and sample size) for each consensus rating.
if {'consensus_label', 'agreement'}.issubset(annotation_df.columns):
    grouped_agreement = annotation_df.groupby('consensus_label')['agreement']
    agreement_by_label = grouped_agreement.agg(['mean', 'std', 'count'])

    print('\nAgreement by Consensus Label:')
    display(agreement_by_label.round(3))

    label_fig, label_ax = plt.subplots(figsize=(8, 5))
    # Error bars show the standard deviation of agreement within each rating.
    label_ax.bar(agreement_by_label.index, agreement_by_label['mean'],
                 yerr=agreement_by_label['std'], capsize=5, color='teal', alpha=0.7)
    label_ax.set_xlabel('Consensus Rating')
    label_ax.set_ylabel('Average Agreement')
    label_ax.set_title('Annotator Agreement by Rating Category')
    label_ax.set_xticks([1, 2, 3, 4, 5])
    plt.tight_layout()
    plt.show()

## 6. Confusion Between Adjacent Ratings

In [None]:
# Accumulate one confusion matrix over every unordered pair of annotators.
if len(annotator_cols) >= 2:
    rating_scale = [1, 2, 3, 4, 5]
    all_confusions = np.zeros((len(rating_scale), len(rating_scale)))

    for first_pos, ann1 in enumerate(annotator_cols):
        # Pair ann1 only with annotators that come after it, so each pair counts once.
        for ann2 in annotator_cols[first_pos + 1:]:
            both_rated = annotation_df[[ann1, ann2]].notna().all(axis=1)
            if not both_rated.any():
                continue
            first_ratings = annotation_df.loc[both_rated, ann1].astype(int).values
            second_ratings = annotation_df.loc[both_rated, ann2].astype(int).values
            all_confusions += confusion_matrix(first_ratings, second_ratings, labels=rating_scale)

    # Plot
    plt.figure(figsize=(8, 6))
    sns.heatmap(all_confusions, annot=True, fmt='.0f', cmap='Blues',
                xticklabels=rating_scale, yticklabels=rating_scale)
    plt.xlabel('Annotator 2 Rating')
    plt.ylabel('Annotator 1 Rating')
    plt.title('Overall Confusion Matrix Between Annotators')
    plt.tight_layout()
    plt.show()

## 7. Save Processed Consensus Data

In [None]:
# Save the annotation analysis
if 'consensus_label' in annotation_df.columns:
    # Keep only texts with a complete consensus record (rows where fewer than two
    # annotators rated the text carry NaN and are dropped), then store the
    # consensus rating as an integer column named `label`.
    final_df = (
        annotation_df[['text', 'consensus_label', 'agreement', 'std']]
        .dropna()
        .rename(columns={'consensus_label': 'label'})
        .astype({'label': int})
    )

    output_path = PROCESSED_DATA_DIR / 'consensus_analysis.csv'
    # to_csv does not create missing directories; make sure the target exists
    # so a fresh checkout can run the notebook end to end.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    final_df.to_csv(output_path, index=False)
    print(f'Saved consensus analysis to {output_path}')
    print(f'Total samples: {len(final_df)}')

In [None]:
# Closing recap: annotator count, number of texts and headline agreement figures.
banner = '=' * 60
print('\n' + banner)
print('LABEL ANALYSIS SUMMARY')
print(banner)

if len(annotator_cols) > 0:
    print(f'\nNumber of annotators: {len(annotator_cols)}')
    print(f'Total unique texts: {len(annotation_df)}')

    if 'agreement' in annotation_df.columns:
        agreement_scores = annotation_df["agreement"]
        n_high = (agreement_scores > 0.75).sum()
        n_low = (agreement_scores < 0.5).sum()
        print('\nAgreement Statistics:')
        print(f'  Mean agreement: {agreement_scores.mean():.2%}')
        print(f'  High agreement (>75%): {n_high} texts')
        print(f'  Low agreement (<50%): {n_low} texts')