# Reproducing Tables from the Paper

This notebook reproduces all tables from the paper "Disentangling Generation and LLM-Judge Effects in Workplace Emails: Gender-Coded Differences Across Models".

**Statistical approach:** Paired Wilcoxon signed-rank tests with 95% confidence intervals (CIs) and Benjamini–Hochberg false discovery rate (BH-FDR) correction.

In [None]:
import sys
sys.path.append('../src')

import json
import pandas as pd
import numpy as np
from pathlib import Path
from analyze_results import (load_emails, load_ratings, 
                             paired_pattern_analysis, paired_rating_analysis,
                             apply_bh_correction)
import re

In [None]:
# Load the generated emails and the judge ratings from the raw-data folder.
data_dir = Path('../data/raw')

# Emails, one file per generating model.
gpt_emails = load_emails(data_dir / 'emails_gpt52.json')
gemini_emails = load_emails(data_dir / 'emails_gemini.json')

# Ratings files, keyed by the short name the table cells use for lookup.
rating_files = {
    'gemini_nat': 'ratings_gemini_naturalistic.json',
    'gemini_deb': 'ratings_gemini_debiased.json',
    'gemini_blind': 'ratings_gemini_blinded.json',
    'gpt52_nat': 'ratings_gpt52_naturalistic.json',
    'gpt52_deb': 'ratings_gpt52_debiased.json',
    'gpt52_blind': 'ratings_gpt52_blinded.json',
}
ratings = {
    name: load_ratings(data_dir / filename)
    for name, filename in rating_files.items()
}

# Quick sanity summary of what was loaded.
n_ratings = sum(len(batch) for batch in ratings.values())
print(f"Emails: {len(gpt_emails)} GPT-5.2, {len(gemini_emails)} Gemini")
print(f"Ratings: {n_ratings} total")

## Table 2: Generation Patterns

In [None]:
# Regex probes tested on the generated emails: (label, pattern, re flags).
# NOTE(review): 'given_my', 'wanted_to' and 'follow_up' carry no \b word
# boundaries, unlike 'i_believe' and 'clarify' -- confirm this is intended.
# They are left untouched here so the reproduced numbers do not change.
patterns = [
    ('i_believe', r'\bi believe\b', re.IGNORECASE),
    ('given_my', r'given my', re.IGNORECASE),
    ('wanted_to', r'wanted to', re.IGNORECASE),
    ('follow_up', r'follow.?up', re.IGNORECASE),
    ('clarify', r'\bclarify\b', re.IGNORECASE),
    # flags=0 keeps this one case-sensitive: two capitalised words at the end.
    ('full_name_sig', r'\n[A-Z][a-z]+ [A-Z][a-z]+\s*$', 0),
]

# One paired analysis per scenario x generating model x pattern.
all_patterns = []
for scenario in ['S01', 'S02']:
    for model_name, emails in [('GPT-5.2', gpt_emails), ('Gemini 2.0', gemini_emails)]:
        for pattern_name, pattern, flags in patterns:
            res = paired_pattern_analysis(emails, scenario, pattern, flags)
            all_patterns.append({
                'Scenario': scenario, 'Model': model_name, 'Pattern': pattern_name, **res
            })

# Correct over the pooled list of tests; presumably this is what adds the
# 'significant' flag read below -- verify against analyze_results.
all_patterns = apply_bh_correction(all_patterns)

# Show significant patterns
table2_cols = ['Scenario', 'Model', 'Pattern', 'f_pct', 'm_pct', 'diff', 'ci_low', 'ci_high', 'p', 'd']
sig_df = pd.DataFrame([p for p in all_patterns if p['significant']])
# A DataFrame built from an empty list has no columns, so selecting
# table2_cols would raise KeyError when nothing survives correction.
# Show an empty table with the expected header in that case.
sig_df[table2_cols] if not sig_df.empty else pd.DataFrame(columns=table2_cols)

## Table 3: S01 Evaluation (Salary Negotiation)

In [None]:
# Table 3 rows: (setting label, condition label, key into `ratings`).
s01_conditions = [
    ('GPT-5.2 → Gemini 2.0', 'Naturalistic', 'gemini_nat'),
    ('GPT-5.2 → Gemini 2.0', 'Debiased', 'gemini_deb'),
    ('GPT-5.2 → Gemini 2.0', 'Blinded', 'gemini_blind'),
    ('Gemini 2.0 → GPT-5.2', 'Naturalistic', 'gpt52_nat'),
    ('Gemini 2.0 → GPT-5.2', 'Debiased', 'gpt52_deb'),
    ('Gemini 2.0 → GPT-5.2', 'Blinded', 'gpt52_blind'),
]

# Run the paired S01 analysis for each condition, in the order listed.
# NOTE(review): these p-values are not passed through apply_bh_correction in
# this cell -- confirm whether the paper's table reports raw or corrected p.
s01_stats = (
    (label, prompt, paired_rating_analysis(ratings[rating_key], 'S01', 'likelihood_to_grant_raise'))
    for label, prompt, rating_key in s01_conditions
)

# Conditions whose analysis comes back falsy are left out of the table.
s01_results = [
    {'Setting': label, 'Condition': prompt, **stats}
    for label, prompt, stats in s01_stats
    if stats
]

table3 = pd.DataFrame(s01_results)
table3[['Setting', 'Condition', 'diff', 'ci_low', 'ci_high', 'p']]

## Table 4: S02 Evaluation (Credit Attribution)

In [None]:
# Table 4 rows: (setting label, condition label, key into `ratings`).
s02_conditions = (
    ('GPT-5.2 → Gemini 2.0', 'Naturalistic', 'gemini_nat'),
    ('GPT-5.2 → Gemini 2.0', 'Debiased', 'gemini_deb'),
    ('GPT-5.2 → Gemini 2.0', 'Blinded', 'gemini_blind'),
    ('Gemini 2.0 → GPT-5.2', 'Naturalistic', 'gpt52_nat'),
    ('Gemini 2.0 → GPT-5.2', 'Debiased', 'gpt52_deb'),
    ('Gemini 2.0 → GPT-5.2', 'Blinded', 'gpt52_blind'),
)

s02_results = []
for setting_label, condition_label, ratings_key in s02_conditions:
    stats = paired_rating_analysis(
        ratings[ratings_key], 'S02', 'likelihood_to_send_correction'
    )
    if not stats:
        # Nothing usable came back for this condition; leave it out.
        continue
    # 'key' is stored on each row so results can be looked up by ratings key.
    row = {'Setting': setting_label, 'Condition': condition_label, 'key': ratings_key}
    row.update(stats)
    s02_results.append(row)

table4 = pd.DataFrame(s02_results)
table4[['Setting', 'Condition', 'diff', 'ci_low', 'ci_high', 'p', 'd']]

## Table 5: Bias Decomposition

In [None]:
# Index the S02 results by their ratings key for direct lookup below.
s02_by_key = {r['key']: r for r in s02_results}

# If |naturalistic diff - blinded diff| is below this, the row is labelled
# 'Pure style'; otherwise 'Name + style'.
NAME_EFFECT_THRESHOLD = 0.05

decomp = []
for evaluator, nat_key, blind_key, deb_key in [
    ('Gemini 2.0', 'gemini_nat', 'gemini_blind', 'gemini_deb'),
    ('GPT-5.2', 'gpt52_nat', 'gpt52_blind', 'gpt52_deb'),
]:
    nat = s02_by_key[nat_key]
    blind = s02_by_key[blind_key]
    deb = s02_by_key[deb_key]

    # Reported as the 'Name Component': unblinded diff minus blinded diff.
    name_component = nat['diff'] - blind['diff']

    # The '+' sign option prints the real sign. The previous hard-coded "+"
    # prefix rendered negative differences as "+-0.12".
    decomp.append({
        'Evaluator': evaluator,
        'Unblinded': f"{nat['diff']:+.2f} (d={nat['d']:.2f})",
        'Blinded': f"{blind['diff']:+.2f} (d={blind['d']:.2f})",
        'Debiased': f"{deb['diff']:+.2f}",
        'Name Component': f"{name_component:+.2f}",
        'Style Component': f"{blind['diff']:+.2f}",
        'Interpretation': 'Pure style' if abs(name_component) < NAME_EFFECT_THRESHOLD else 'Name + style'
    })

pd.DataFrame(decomp)