# Reproducing Tables from "Disentangling Generation and LLM-Judge Effects in Workplace Emails: Gender-Coded Differences Across Models"

This notebook reproduces all tables from the paper using the raw data.

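**Key methodological note:** The unit of analysis is the *persona* (n=30 per gender), not the individual email. Each persona has 3 emails per scenario; the values for those emails (pattern indicators or ratings) are averaged to a single number per persona before statistical testing (two-sided Mann–Whitney U). This respects the clustering of emails within personas.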

In [None]:
import json
import re
import numpy as np
import pandas as pd
from scipy import stats
from collections import defaultdict
from pathlib import Path

## Load Data

In [None]:
# Folder holding the raw JSON inputs (emails and ratings) read by the loaders below.
# The path is relative, so it is resolved against the kernel's current working directory.
DATA_DIR = Path('../data/raw')

def load_emails(filepath):
    """Load a JSON file of generated emails and return its parsed content.

    Args:
        filepath: Path (``str`` or path-like) of the JSON file to read.

    Returns:
        Whatever ``json.load`` yields for the file. The rest of the notebook
        treats it as a list of email dicts, but nothing is validated here.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin the encoding: without it open() falls back to the platform locale,
    # which mis-decodes non-ASCII email text on systems whose default is not UTF-8.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_ratings(filepath):
    """Load a JSON file of ratings and recover scores from unparsed responses.

    For every record flagged with a truthy ``parse_error`` that also carries a
    non-empty ``raw_response``, each known rating field that is absent from the
    record is searched for in the raw text as ``"<field>": <digit>`` and, when
    found, stored on the record as an ``int``. Records are modified in place.

    Only a single digit is captured, so a raw value such as ``10`` would be
    recovered as ``1``.

    Args:
        filepath: Path (``str`` or path-like) of the JSON file to read.

    Returns:
        The parsed JSON content (iterated here as a sequence of dict records),
        with recovered fields filled in.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Rating fields that may be recovered from the raw response text.
    recoverable_fields = (
        'likelihood_to_grant_raise', 'professionalism',
        'perceived_confidence', 'perceived_competence',
        'likelihood_to_send_correction', 'perceived_reasonableness',
        'seems_entitled',
    )

    # Pin the encoding: without it open() falls back to the platform locale,
    # which mis-decodes non-ASCII response text on systems whose default is not UTF-8.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Fix parse errors
    for r in data:
        if not (r.get('parse_error') and r.get('raw_response')):
            continue
        raw = r['raw_response']
        for field in recoverable_fields:
            # Never overwrite a value that is already present on the record.
            if field in r:
                continue
            match = re.search(rf'"{field}":\s*(\d)', raw)
            if match:
                r[field] = int(match.group(1))
    return data

# Generated emails, one file per generator model.
gpt_emails = load_emails(DATA_DIR / 'emails_gpt52.json')
gemini_emails = load_emails(DATA_DIR / 'emails_gemini.json')

# Ratings, one file per key. Keys follow the pattern <evaluator>_<condition>.
rating_files = {
    'gemini_nat': 'ratings_gemini_naturalistic.json',
    'gemini_deb': 'ratings_gemini_debiased.json',
    'gemini_blind': 'ratings_gemini_blinded.json',
    'gpt52_nat': 'ratings_gpt52_naturalistic.json',
    'gpt52_deb': 'ratings_gpt52_debiased.json',
    'gpt52_blind': 'ratings_gpt52_blinded.json',
}
ratings = {
    key: load_ratings(DATA_DIR / file_name)
    for key, file_name in rating_files.items()
}

# Quick sanity check of what was loaded.
total_ratings = sum(map(len, ratings.values()))
print(f"GPT-5.2 emails: {len(gpt_emails)}")
print(f"Gemini emails: {len(gemini_emails)}")
print(f"Total ratings: {total_ratings}")

## Helper Functions

In [None]:
def calc_persona_pattern(emails, scenario, pattern, flags=re.IGNORECASE):
    """Compare how often a regex occurs in female vs. male personas' emails.

    Each email of ``scenario`` with non-empty ``email_text`` scores 1 when
    ``re.search(pattern, text, flags)`` finds a match and 0 otherwise. Scores
    are averaged per ``persona_id``, and those per-persona rates are the
    observations used for every statistic below.

    Args:
        emails: Iterable of dicts read via ``scenario_id``, ``email_text``,
            ``gender`` and ``persona_id``.
        scenario: Value of ``scenario_id`` to keep.
        pattern: Regular expression to search for.
        flags: ``re`` flags passed to ``re.search`` (case-insensitive by default).

    Returns:
        dict with ``f_pct`` / ``m_pct`` (mean per-persona rate times 100),
        ``p`` (two-sided Mann-Whitney U p-value, female vs. male rates) and
        ``d`` (mean difference divided by the square root of the average of the
        two sample variances; 0 when that denominator is not positive).
    """
    female_hits = defaultdict(list)
    male_hits = defaultdict(list)

    for email in emails:
        # Skip other scenarios and emails whose text is missing or empty.
        if email['scenario_id'] != scenario or not email.get('email_text'):
            continue
        matched = int(re.search(pattern, email['email_text'], flags) is not None)
        # Any gender other than 'F' is counted in the male group.
        bucket = female_hits if email['gender'] == 'F' else male_hits
        bucket[email['persona_id']].append(matched)

    # One rate per persona: the share of that persona's emails containing the pattern.
    female_rates = [np.mean(hits) for hits in female_hits.values()]
    male_rates = [np.mean(hits) for hits in male_hits.values()]

    female_avg = np.mean(female_rates)
    male_avg = np.mean(male_rates)

    p_value = stats.mannwhitneyu(female_rates, male_rates, alternative='two-sided').pvalue

    female_var = np.std(female_rates, ddof=1)**2
    male_var = np.std(male_rates, ddof=1)**2
    pooled_std = np.sqrt((female_var + male_var) / 2)
    if pooled_std > 0:
        effect_size = (female_avg - male_avg) / pooled_std
    else:
        effect_size = 0

    return {
        'f_pct': female_avg * 100,
        'm_pct': male_avg * 100,
        'p': p_value,
        'd': effect_size,
    }


def calc_persona_rating(data, scenario, measure):
    """Compare a rating measure between female and male personas.

    Ratings of ``scenario`` whose ``measure`` value is not ``None`` are
    averaged per ``persona_id``; those per-persona means are the observations
    used for every statistic below.

    Args:
        data: Iterable of rating dicts read via ``scenario_id``, ``gender``,
            ``persona_id`` and ``measure``.
        scenario: Value of ``scenario_id`` to keep.
        measure: Key of the rating to analyse.

    Returns:
        dict with ``f_mean`` / ``m_mean`` (mean of per-persona means),
        ``diff`` (female minus male), ``p`` (two-sided Mann-Whitney U p-value)
        and ``d`` (``diff`` divided by the square root of the average of the two
        sample variances; 0 when that denominator is not positive).
    """
    female_scores = defaultdict(list)
    male_scores = defaultdict(list)

    for rating in data:
        if rating['scenario_id'] != scenario:
            continue
        score = rating.get(measure)
        # Ratings without a value for this measure are left out entirely.
        if score is None:
            continue
        # Any gender other than 'F' is counted in the male group.
        bucket = female_scores if rating['gender'] == 'F' else male_scores
        bucket[rating['persona_id']].append(score)

    # Collapse each persona's ratings to a single mean.
    female_avgs = [np.mean(scores) for scores in female_scores.values()]
    male_avgs = [np.mean(scores) for scores in male_scores.values()]

    female_mean = np.mean(female_avgs)
    male_mean = np.mean(male_avgs)
    gap = female_mean - male_mean

    p_value = stats.mannwhitneyu(female_avgs, male_avgs, alternative='two-sided').pvalue

    female_var = np.std(female_avgs, ddof=1)**2
    male_var = np.std(male_avgs, ddof=1)**2
    pooled_std = np.sqrt((female_var + male_var) / 2)
    if pooled_std > 0:
        effect_size = gap / pooled_std
    else:
        effect_size = 0

    return {
        'f_mean': female_mean,
        'm_mean': male_mean,
        'diff': gap,
        'p': p_value,
        'd': effect_size,
    }

## Table 2: Generation Patterns

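Shows linguistic patterns that differ significantly by gender after FDR correction. The p-values printed below are the raw two-sided Mann–Whitney p-values for these patterns; the FDR correction itself is not recomputed in the cell below.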

In [None]:
# One entry per reported pattern: (generator model, scenario, label, regex, regex flags).
# The signature pattern uses flags=0 because it relies on letter case.
patterns = [
    ('Gemini 2.0', 'S01', 'I believe', r'\bi believe\b', re.IGNORECASE),
    ('GPT-5.2', 'S02', 'clarify', r'\bclarify\b', re.IGNORECASE),
    ('Gemini 2.0', 'S02', 'follow-up', r'follow.?up', re.IGNORECASE),
    ('GPT-5.2', 'S02', 'wanted to', r'wanted to', re.IGNORECASE),
    ('GPT-5.2', 'S02', 'full name sig', r'\n[A-Z][a-z]+ [A-Z][a-z]+\s*$', 0),
]

results = []
for model, scenario, name, pattern, flags in patterns:
    # Every model label other than 'GPT-5.2' is analysed on the Gemini emails.
    if model == 'GPT-5.2':
        source_emails = gpt_emails
    else:
        source_emails = gemini_emails
    outcome = calc_persona_pattern(source_emails, scenario, pattern, flags)

    p_value = outcome['p']
    if p_value >= 0.001:
        p_text = f"{p_value:.3f}"
    else:
        p_text = '<.001'

    results.append({
        'Model': model,
        'Pattern': name,
        'F%': f"{outcome['f_pct']:.1f}%",
        'M%': f"{outcome['m_pct']:.1f}%",
        'p': p_text,
        'd': f"{outcome['d']:+.2f}",
    })

table2 = pd.DataFrame(results)
print("Table 2: Gender differences in generated email style (FDR-corrected, n=30 per group)")
print(table2.to_string(index=False))

## Table 3: S01 Evaluation (Salary Negotiation)

No significant gender differences in evaluation.

In [None]:
# One entry per row: (generator → evaluator label, prompt condition, key into `ratings`).
s01_settings = [
    ('GPT-5.2 → Gemini 2.0', 'Naturalistic', 'gemini_nat'),
    ('GPT-5.2 → Gemini 2.0', 'Debiased', 'gemini_deb'),
    ('Gemini 2.0 → GPT-5.2', 'Naturalistic', 'gpt52_nat'),
    ('Gemini 2.0 → GPT-5.2', 'Debiased', 'gpt52_deb'),
]

results = []
for setting, condition, key in s01_settings:
    res = calc_persona_rating(ratings[key], 'S01', 'likelihood_to_grant_raise')
    results.append({
        'Setting': setting,
        'Condition': condition,
        'F': f"{res['f_mean']:.2f}",
        'M': f"{res['m_mean']:.2f}",
        'Diff': f"{res['diff']:+.2f}",
        # Same convention as the Table 2 and Table 4 cells: a very small p-value
        # is shown as '<.001' rather than the misleading '0.000'.
        'p': f"{res['p']:.3f}" if res['p'] >= 0.001 else '<.001'
    })

table3 = pd.DataFrame(results)
print("Table 3: S01 Evaluation - Likelihood to grant raise (1-5 scale, n=30 per group)")
print(table3.to_string(index=False))

## Table 4: S02 Evaluation (Credit Attribution)

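Significant pro-female bias in most conditions. Differences are reported as female minus male, so a positive value means emails from female personas received the higher rating.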

In [None]:
# One entry per row: (generator → evaluator label, prompt condition, key into `ratings`).
s02_settings = [
    ('GPT-5.2 → Gemini 2.0', 'Naturalistic', 'gemini_nat'),
    ('GPT-5.2 → Gemini 2.0', 'Debiased', 'gemini_deb'),
    ('GPT-5.2 → Gemini 2.0', 'Blinded', 'gemini_blind'),
    ('Gemini 2.0 → GPT-5.2', 'Naturalistic', 'gpt52_nat'),
    ('Gemini 2.0 → GPT-5.2', 'Debiased', 'gpt52_deb'),
    ('Gemini 2.0 → GPT-5.2', 'Blinded', 'gpt52_blind'),
]

results = []
for setting, condition, key in s02_settings:
    outcome = calc_persona_rating(ratings[key], 'S02', 'likelihood_to_send_correction')

    p_value = outcome['p']
    if p_value >= 0.001:
        p_text = f"{p_value:.3f}"
    else:
        p_text = '<.001'

    results.append({
        'Setting': setting,
        'Condition': condition,
        'F-M Diff': f"{outcome['diff']:+.2f}",
        'p': p_text,
        'd': f"{outcome['d']:+.2f}",
    })

table4 = pd.DataFrame(results)
print("Table 4: S02 Evaluation - Likelihood to send correction (n=30 per group)")
print(table4.to_string(index=False))

## Table 5: Blinded Decomposition

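Decomposes the evaluation gap into name-based and style-based components: the style component is the F−M difference that remains when the evaluator is blinded, and the name component is the unblinded (naturalistic) difference minus the blinded one.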

In [None]:
# Gemini evaluator (rating GPT-5.2 emails): naturalistic, blinded, debiased.
gem_nat, gem_blind, gem_deb = (
    calc_persona_rating(ratings[key], 'S02', 'likelihood_to_send_correction')
    for key in ('gemini_nat', 'gemini_blind', 'gemini_deb')
)

# GPT-5.2 evaluator (rating Gemini emails): naturalistic, blinded, debiased.
gpt_nat, gpt_blind, gpt_deb = (
    calc_persona_rating(ratings[key], 'S02', 'likelihood_to_send_correction')
    for key in ('gpt52_nat', 'gpt52_blind', 'gpt52_deb')
)

print("Table 5: S02 Bias Decomposition")
print("=" * 70)

# (evaluator label, unblinded, blinded, debiased, interpretation).
# The interpretation is fixed text; it is not derived from the numbers printed above it.
evaluator_reports = [
    ("Gemini 2.0", gem_nat, gem_blind, gem_deb, "Pure style (blinding has no effect)"),
    ("GPT-5.2", gpt_nat, gpt_blind, gpt_deb, "Name + Style (both contribute)"),
]

for label, unblinded, blinded, debiased, interpretation in evaluator_reports:
    # Name component = what blinding removes; style component = what remains under blinding.
    name_component = unblinded['diff'] - blinded['diff']
    style_component = blinded['diff']

    print(f"\n{label} evaluator:")
    print(f"  Unblinded: {unblinded['diff']:+.2f} (d={unblinded['d']:.2f})")
    print(f"  Blinded:   {blinded['diff']:+.2f} (d={blinded['d']:.2f})")
    print(f"  Debiased:  {debiased['diff']:+.2f}")
    print(f"  → Name component: {name_component:+.2f}")
    print(f"  → Style component: {style_component:+.2f}")
    print(f"  → Interpretation: {interpretation}")

## Summary Statistics

In [None]:
# Totals taken from the loaded data.
total_emails = len(gpt_emails) + len(gemini_emails)
total_ratings = sum(map(len, ratings.values()))

# The design figures (pairs, personas, scenarios, ...) are fixed text describing the
# study design; only the two totals are computed from the loaded data.
summary_lines = [
    "Dataset Summary",
    "=" * 50,
    "Persona pairs: 30",
    "Total personas: 60",
    "Scenarios: 2",
    "Emails per persona-scenario: 3",
    "Generator models: 2 (GPT-5.2, Gemini 2.0)",
    f"Total emails: {total_emails}",
    "Evaluation conditions: 3 (naturalistic, debiased, blinded)",
    f"Total ratings: {total_ratings}",
]
for line in summary_lines:
    print(line)