In [327]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.stats import norm

def fishers_z_test(r1, r2, n1, n2):
    """
    Perform Fisher's Z-test to compare two correlation coefficients.

    Parameters
    ----------
    r1, r2 : float
        Correlation coefficients to compare. Each must lie strictly inside
        (-1, 1); the Fisher transformation is infinite at +/-1.
    n1, n2 : int
        Sample sizes of the respective groups. Each must be greater than 3,
        because the standard error uses 1 / (n - 3).

    Returns
    -------
    tuple
        ``(z, p_value)``: the z-statistic for the difference ``r1 - r2`` on
        the Fisher scale, and its two-tailed p-value.

    Raises
    ------
    ValueError
        If a correlation is outside (-1, 1) or a sample size is 3 or smaller.
    """
    for r in (r1, r2):
        if np.any(np.abs(r) >= 1):
            raise ValueError("Correlation coefficients must lie strictly between -1 and 1.")
    for n in (n1, n2):
        if np.any(np.asarray(n) <= 3):
            raise ValueError("Sample sizes must be greater than 3.")

    # Fisher's Z transformation: arctanh(r) == 0.5 * log((1 + r) / (1 - r))
    z1 = np.arctanh(r1)
    z2 = np.arctanh(r2)

    # Standard error of the difference between the two transformed values
    se = np.sqrt(1 / (n1 - 3) + 1 / (n2 - 3))

    # Z-test statistic
    z = (z1 - z2) / se

    # Two-tailed p-value. The survival function is used instead of
    # 1 - cdf because the latter rounds to exactly 0.0 once |z| exceeds ~8.3,
    # which would turn very small p-values into hard zeros.
    p_value = 2 * norm.sf(np.abs(z))

    return z, p_value

# Sample sizes passed to Fisher's z-test for the human and synthetic groups.
n_human = 373
n_synthetic = 373

# --- Baseline (Humans) ---
# --- Correlations taken from analysis/human-participants/human_participants_analysis.ipynb ---
correlations_human = {
    "Cabin Luggage": {"Extraversion": 0.10, "Agreeableness": 0.26, "Conscientiousness": 0.02, "Neuroticism": 0.03, "Openness": -0.03},
    "Packing Cubes": {"Extraversion": 0.17, "Agreeableness": 0.39, "Conscientiousness": -0.03, "Neuroticism": 0.00, "Openness": 0.04},
    "Water Bottle": {"Extraversion": 0.19, "Agreeableness": 0.26, "Conscientiousness": 0.05, "Neuroticism": -0.17, "Openness": -0.03}
}

# --- Model-specific correlations ---

# --- Correlations taken from analysis/synthetic-participants/analysis_gpt_4o.ipynb ---
correlations_gpt4o = {
    "Cabin Luggage": {"Extraversion": 0.27, "Agreeableness": 0.21, "Conscientiousness": 0.26, "Neuroticism": 0.00, "Openness": 0.55},
    "Packing Cubes": {"Extraversion": 0.34, "Agreeableness": 0.23, "Conscientiousness": 0.23, "Neuroticism": 0.00, "Openness": 0.52},
    "Water Bottle": {"Extraversion": 0.64, "Agreeableness": 0.17, "Conscientiousness": 0.27, "Neuroticism": 0.08, "Openness": 0.57}
}

# --- Correlations taken from analysis/synthetic-participants/analysis_gpt_5_latest.ipynb ---
correlations_gpt5_latest = {
    "Cabin Luggage": {"Extraversion": 0.41, "Agreeableness": 0.31, "Conscientiousness": 0.30, "Neuroticism": 0.24, "Openness": 0.75},
    "Packing Cubes": {"Extraversion": 0.41, "Agreeableness": 0.30, "Conscientiousness": 0.27, "Neuroticism": 0.22, "Openness": 0.71},
    "Water Bottle": {"Extraversion": 0.72, "Agreeableness": 0.22, "Conscientiousness": -0.09, "Neuroticism": 0.24, "Openness": 0.61}
}

# --- Correlations taken from analysis/synthetic-participants/analysis_gemini_1.5_flash.ipynb ---
correlations_gemini_1_5 = {
    "Cabin Luggage": {"Extraversion": 0.18, "Agreeableness": 0.24, "Conscientiousness": 0.28, "Neuroticism": 0.08, "Openness": 0.29},
    "Packing Cubes": {"Extraversion": 0.26, "Agreeableness": 0.22, "Conscientiousness": 0.19, "Neuroticism": 0.14, "Openness": 0.35},
    "Water Bottle": {"Extraversion": 0.37, "Agreeableness": 0.18, "Conscientiousness": 0.09, "Neuroticism": -0.21, "Openness": 0.42}
}

# --- Correlations taken from analysis/synthetic-participants/analysis_gemini_2.0_flash.ipynb ---
correlations_gemini_2_0 = {
    "Cabin Luggage": {"Extraversion": 0.28, "Agreeableness": 0.28, "Conscientiousness": 0.25, "Neuroticism": 0.16, "Openness": 0.20},
    "Packing Cubes": {"Extraversion": 0.15, "Agreeableness": 0.25, "Conscientiousness": 0.17, "Neuroticism": 0.16, "Openness": 0.25},
    "Water Bottle": {"Extraversion": 0.23, "Agreeableness": 0.07, "Conscientiousness": 0.16, "Neuroticism": 0.11, "Openness": 0.34}
}

# Trait labels, spelled exactly like the keys of the correlation dictionaries
# above so the list can index them directly (lowercase labels would raise
# KeyError on lookup).
traits = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']

# Compute Fisher's Z-tests
def fisher_tests(human_data, synthetic_data):
    """Run Fisher's z-test for every product/trait pair found in ``human_data``.

    Both arguments are nested dicts ``{product: {trait: r}}``. The result has
    the same nesting, with each leaf replaced by
    ``{"z-score": z, "p-value": p}``. Sample sizes are read from the
    module-level ``n_human`` and ``n_synthetic``.
    """
    def _compare(product, trait):
        # Human correlation is the first argument, so z carries the sign of
        # (human - synthetic) on the Fisher scale.
        z_stat, p_val = fishers_z_test(
            human_data[product][trait],
            synthetic_data[product][trait],
            n_human,
            n_synthetic,
        )
        return {"z-score": z_stat, "p-value": p_val}

    return {
        product: {trait: _compare(product, trait) for trait in human_traits}
        for product, human_traits in human_data.items()
    }

# Compare every model's correlations against the human baseline.
(
    results_gpt4o,
    results_gpt5_latest,
    results_gemini_1_5,
    results_gemini_2_0,
) = (
    fisher_tests(correlations_human, model_correlations)
    for model_correlations in (
        correlations_gpt4o,
        correlations_gpt5_latest,
        correlations_gemini_1_5,
        correlations_gemini_2_0,
    )
)



<h1>Assessing Alignment Between Humans and Models Using Fisher’s z-Test</h1>

In [328]:
import numpy as np
import scipy.stats as stats
import pandas as pd
from statsmodels.stats.multitest import multipletests

# The correlation dictionaries (correlations_human, correlations_gpt4o,
# correlations_gpt5_latest, correlations_gemini_1_5, correlations_gemini_2_0)
# and the sample sizes (n_human, n_synthetic) are defined once in the preceding
# code cell. They are reused here instead of being redefined, so the values
# cannot drift apart between cells.

# Transpose so that each row is a product and each column is a trait.
df_human = pd.DataFrame(correlations_human).T
df_gpt4o = pd.DataFrame(correlations_gpt4o).T
df_gpt5_latest = pd.DataFrame(correlations_gpt5_latest).T
df_gemini_1_5 = pd.DataFrame(correlations_gemini_1_5).T
df_gemini_2_0 = pd.DataFrame(correlations_gemini_2_0).T

def compute_fishers_z_per_product(
    df_human,
    df_model,
    model_name,
    similarity_threshold=0.1,
    alpha=0.05,
    significance_threshold=0.020048,
):
    """Build a per-product, per-trait Fisher's z comparison table.

    Parameters
    ----------
    df_human, df_model : pandas.DataFrame
        Correlations indexed by product with one column per trait. The
        products are taken from ``df_human.index``.
    model_name : str
        Label used in the model's correlation column, ``"<model_name> ( r )"``.
    similarity_threshold : float, optional
        Largest absolute difference between the human and model correlation
        that still counts as "Aligned".
    alpha : float, optional
        A pair is "Aligned" only if its raw p-value is at least this large.
    significance_threshold : float, optional
        Raw p-values below this cut-off are starred in "Significance".
        NOTE(review): the default 0.020048 is presumably a cut-off obtained
        from a multiple-comparison correction done elsewhere - confirm.

    Returns
    -------
    tuple
        ``(results, pvals)``: the presentation table as a DataFrame, and the
        list of raw (unrounded) p-values in row order.
    """
    traits = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']

    # Slack for the threshold comparison: binary floats make a nominal
    # difference of exactly 0.1 come out as e.g. 0.4 - 0.3 == 0.10000000000000003,
    # which would otherwise be classified as exceeding the threshold.
    tolerance = 1e-9

    results = []
    pvals = []

    for product in df_human.index:
        for trait in traits:
            r_human = df_human.loc[product, trait]
            r_model = df_model.loc[product, trait]
            z, p = fishers_z_test(r_human, r_model, n_human, n_synthetic)

            # collect raw p-values for FDR adjustment
            pvals.append(p)

            # Round values for presentation
            r_human_rounded = round(r_human, 2)
            r_model_rounded = round(r_model, 2)

            close_enough = abs(r_human - r_model) <= similarity_threshold + tolerance
            alignment = "Aligned" if close_enough and (p >= alpha) else "Misaligned"
            significance = "*" if p < significance_threshold else ""

            results.append({
                "Product": product,
                "Trait-Matched Ad": trait,
                "Human ( r )": r_human_rounded,
                f"{model_name} ( r )": r_model_rounded,
                # Re-round: subtracting two rounded floats can reintroduce
                # noise (0.3 - 0.1 == 0.19999999999999998).
                "Δr": round(r_human_rounded - r_model_rounded, 2),
                "z-statistics": round(z, 2),
                "p-value": round(p, 4),
                "Significance": significance,
                "Alignment": alignment
            })

    results = pd.DataFrame(results)

    return results, pvals


# Build the comparison table and the raw p-value list for every model.
(
    (df_z_gpt4o, pvals_gpt4o),
    (df_z_gpt5_latest, pvals_gpt5_latest),
    (df_z_gemini_1_5, pvals_gemini_1_5),
    (df_z_gemini_2_0, pvals_gemini_2_0),
) = (
    compute_fishers_z_per_product(df_human, model_frame, model_label)
    for model_frame, model_label in (
        (df_gpt4o, "GPT-4o"),
        (df_gpt5_latest, "GPT-5-latest"),
        (df_gemini_1_5, "Gemini-1.5-flash"),
        (df_gemini_2_0, "Gemini-2.0-flash"),
    )
)


In [329]:
import json

# Raw p-values of every model, keyed by "<model>-z-test".
pvals = dict(
    zip(
        (
            "gpt-4o-z-test",
            "gpt-5-latest-z-test",
            "gemini-1.5-flash-z-test",
            "gemini-2.0-flash-z-test",
        ),
        (pvals_gpt4o, pvals_gpt5_latest, pvals_gemini_1_5, pvals_gemini_2_0),
    )
)

# Write the p-values to JSON (path is relative to the working directory).
with open("../p_value_correction/z_test_all_pvals.json", "w") as f:
    f.write(json.dumps(pvals, indent=4))

In [330]:
# Display the human vs. GPT-4o Fisher's z comparison table.
df_z_gpt4o

Unnamed: 0,Product,Trait-Matched Ad,Human ( r ),GPT-4o ( r ),Δr,z-statistics,p-value,Significance,Alignment
0,Cabin Luggage,Openness,-0.03,0.55,-0.58,-8.82,0.0,*,Misaligned
1,Cabin Luggage,Conscientiousness,0.02,0.26,-0.24,-3.35,0.0008,*,Misaligned
2,Cabin Luggage,Extraversion,0.1,0.27,-0.17,-2.4,0.0163,*,Misaligned
3,Cabin Luggage,Agreeableness,0.26,0.21,0.05,0.72,0.4715,,Aligned
4,Cabin Luggage,Neuroticism,0.03,0.0,0.03,0.41,0.6832,,Aligned
5,Packing Cubes,Openness,0.04,0.52,-0.48,-7.29,0.0,*,Misaligned
6,Packing Cubes,Conscientiousness,-0.03,0.23,-0.26,-3.59,0.0003,*,Misaligned
7,Packing Cubes,Extraversion,0.17,0.34,-0.17,-2.48,0.0131,*,Misaligned
8,Packing Cubes,Agreeableness,0.39,0.23,0.16,2.42,0.0157,*,Misaligned
9,Packing Cubes,Neuroticism,0.0,0.0,0.0,0.0,1.0,,Aligned


In [331]:
# Display the human vs. GPT-5-latest Fisher's z comparison table.
df_z_gpt5_latest

Unnamed: 0,Product,Trait-Matched Ad,Human ( r ),GPT-5-latest ( r ),Δr,z-statistics,p-value,Significance,Alignment
0,Cabin Luggage,Openness,-0.03,0.75,-0.78,-13.64,0.0,*,Misaligned
1,Cabin Luggage,Conscientiousness,0.02,0.3,-0.28,-3.94,0.0001,*,Misaligned
2,Cabin Luggage,Extraversion,0.1,0.41,-0.31,-4.56,0.0,*,Misaligned
3,Cabin Luggage,Agreeableness,0.26,0.31,-0.05,-0.74,0.459,,Aligned
4,Cabin Luggage,Neuroticism,0.03,0.24,-0.21,-2.92,0.0035,*,Misaligned
5,Packing Cubes,Openness,0.04,0.71,-0.67,-11.52,0.0,*,Misaligned
6,Packing Cubes,Conscientiousness,-0.03,0.27,-0.3,-4.17,0.0,*,Misaligned
7,Packing Cubes,Extraversion,0.17,0.41,-0.24,-3.59,0.0003,*,Misaligned
8,Packing Cubes,Agreeableness,0.39,0.3,0.09,1.39,0.1642,,Aligned
9,Packing Cubes,Neuroticism,0.0,0.22,-0.22,-3.04,0.0023,*,Misaligned


In [332]:
# Display the human vs. Gemini-1.5-flash Fisher's z comparison table.
df_z_gemini_1_5

Unnamed: 0,Product,Trait-Matched Ad,Human ( r ),Gemini-1.5-flash ( r ),Δr,z-statistics,p-value,Significance,Alignment
0,Cabin Luggage,Openness,-0.03,0.29,-0.32,-4.47,0.0,*,Misaligned
1,Cabin Luggage,Conscientiousness,0.02,0.28,-0.26,-3.64,0.0003,*,Misaligned
2,Cabin Luggage,Extraversion,0.1,0.18,-0.08,-1.11,0.2668,,Aligned
3,Cabin Luggage,Agreeableness,0.26,0.24,0.02,0.29,0.7717,,Aligned
4,Cabin Luggage,Neuroticism,0.03,0.08,-0.05,-0.68,0.4951,,Aligned
5,Packing Cubes,Openness,0.04,0.35,-0.31,-4.43,0.0,*,Misaligned
6,Packing Cubes,Conscientiousness,-0.03,0.19,-0.22,-3.02,0.0025,*,Misaligned
7,Packing Cubes,Extraversion,0.17,0.26,-0.09,-1.28,0.199,,Aligned
8,Packing Cubes,Agreeableness,0.39,0.22,0.17,2.56,0.0105,*,Misaligned
9,Packing Cubes,Neuroticism,0.0,0.14,-0.14,-1.92,0.0553,,Misaligned


In [333]:
# Display the human vs. Gemini-2.0-flash Fisher's z comparison table.
df_z_gemini_2_0

Unnamed: 0,Product,Trait-Matched Ad,Human ( r ),Gemini-2.0-flash ( r ),Δr,z-statistics,p-value,Significance,Alignment
0,Cabin Luggage,Openness,-0.03,0.2,-0.23,-3.17,0.0015,*,Misaligned
1,Cabin Luggage,Conscientiousness,0.02,0.25,-0.23,-3.2,0.0014,*,Misaligned
2,Cabin Luggage,Extraversion,0.1,0.28,-0.18,-2.55,0.0108,*,Misaligned
3,Cabin Luggage,Agreeableness,0.26,0.28,-0.02,-0.29,0.7692,,Aligned
4,Cabin Luggage,Neuroticism,0.03,0.16,-0.13,-1.79,0.0739,,Misaligned
5,Packing Cubes,Openness,0.04,0.25,-0.21,-2.93,0.0034,*,Misaligned
6,Packing Cubes,Conscientiousness,-0.03,0.17,-0.2,-2.74,0.0061,*,Misaligned
7,Packing Cubes,Extraversion,0.17,0.15,0.02,0.28,0.7801,,Aligned
8,Packing Cubes,Agreeableness,0.39,0.25,0.14,2.13,0.0334,,Misaligned
9,Packing Cubes,Neuroticism,0.0,0.16,-0.16,-2.2,0.0282,,Misaligned


In [334]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_alignment_with_humans(model="gpt-4o", similarity_threshold=0.1):
    """Plot human vs. model trait correlations, one subplot per product.

    For every product and trait, the human correlation (circle) and the
    model correlation (square) are drawn on a shared horizontal line. Both
    markers are green when the rounded correlations differ by at most
    ``similarity_threshold`` and red otherwise. Unlike the "Alignment" column
    of the Fisher's z tables, this colouring does not consider p-values.

    Parameters
    ----------
    model : str
        Which model to compare with humans: "gpt-4o", "gpt-5-latest",
        "gemini-1.5-flash" or "gemini-2.0-flash".
    similarity_threshold : float
        Largest absolute difference in correlation still shown as aligned.

    Raises
    ------
    ValueError
        If ``model`` is not one of the known keys.
    """
    # --- Baseline (Humans) ---
    correlations_human = {
        "Cabin Luggage": {"Extraversion": 0.10, "Agreeableness": 0.26, "Conscientiousness": 0.02, "Neuroticism": 0.03, "Openness": -0.03},
        "Packing Cubes": {"Extraversion": 0.17, "Agreeableness": 0.39, "Conscientiousness": -0.03, "Neuroticism": 0.00, "Openness": 0.04},
        "Water Bottle": {"Extraversion": 0.19, "Agreeableness": 0.26, "Conscientiousness": 0.05, "Neuroticism": -0.17, "Openness": -0.03}
    }

    # --- Model-specific correlations ---
    correlations_gpt4o = {
        "Cabin Luggage": {"Extraversion": 0.27, "Agreeableness": 0.21, "Conscientiousness": 0.26, "Neuroticism": 0.00, "Openness": 0.55},
        "Packing Cubes": {"Extraversion": 0.34, "Agreeableness": 0.23, "Conscientiousness": 0.23, "Neuroticism": 0.00, "Openness": 0.52},
        "Water Bottle": {"Extraversion": 0.64, "Agreeableness": 0.17, "Conscientiousness": 0.27, "Neuroticism": 0.08, "Openness": 0.57}
    }

    correlations_gpt5_latest = {
        "Cabin Luggage": {"Extraversion": 0.41, "Agreeableness": 0.31, "Conscientiousness": 0.30, "Neuroticism": 0.24, "Openness": 0.75},
        "Packing Cubes": {"Extraversion": 0.41, "Agreeableness": 0.30, "Conscientiousness": 0.27, "Neuroticism": 0.22, "Openness": 0.71},
        "Water Bottle": {"Extraversion": 0.72, "Agreeableness": 0.22, "Conscientiousness": -0.09, "Neuroticism": 0.24, "Openness": 0.61}
    }

    correlations_gemini_1_5 = {
        "Cabin Luggage": {"Extraversion": 0.18, "Agreeableness": 0.24, "Conscientiousness": 0.28, "Neuroticism": 0.08, "Openness": 0.29},
        "Packing Cubes": {"Extraversion": 0.26, "Agreeableness": 0.22, "Conscientiousness": 0.19, "Neuroticism": 0.14, "Openness": 0.35},
        "Water Bottle": {"Extraversion": 0.37, "Agreeableness": 0.18, "Conscientiousness": 0.09, "Neuroticism": -0.21, "Openness": 0.42}
    }

    correlations_gemini_2_0 = {
        "Cabin Luggage": {"Extraversion": 0.28, "Agreeableness": 0.28, "Conscientiousness": 0.25, "Neuroticism": 0.16, "Openness": 0.20},
        "Packing Cubes": {"Extraversion": 0.15, "Agreeableness": 0.25, "Conscientiousness": 0.17, "Neuroticism": 0.16, "Openness": 0.25},
        "Water Bottle": {"Extraversion": 0.23, "Agreeableness": 0.07, "Conscientiousness": 0.16, "Neuroticism": 0.11, "Openness": 0.34}
    }

    MODELS = {
        "gpt-4o": correlations_gpt4o,
        "gpt-5-latest": correlations_gpt5_latest,
        "gemini-1.5-flash": correlations_gemini_1_5,
        "gemini-2.0-flash": correlations_gemini_2_0,
    }

    DISPLAY_NAME = {
        "gpt-4o": "GPT-4o",
        "gpt-5-latest": "GPT-5-latest",
        "gemini-1.5-flash": "Gemini 1.5 Flash",
        "gemini-2.0-flash": "Gemini 2.0 Flash"
    }

    if model not in MODELS:
        raise ValueError(f"Unknown model '{model}'. Valid options are: {list(MODELS.keys())}")

    model_name = DISPLAY_NAME[model]
    correlations_model = MODELS[model]

    # --- DataFrames (rows = products, columns = traits) ---
    df_human = pd.DataFrame(correlations_human).T
    df_model = pd.DataFrame(correlations_model).T
    products = list(df_human.index)

    # Keep trait order fixed
    traits = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']

    # Slack for the threshold comparison: binary floats make a nominal
    # difference of exactly 0.1 come out as e.g. 0.4 - 0.3 == 0.10000000000000003,
    # which would otherwise be drawn as misaligned.
    tolerance = 1e-9

    # Per-product x-axis ranges using just Human + selected model
    x_ranges = {}
    for product in products:
        all_vals = pd.concat([df_human.loc[product], df_model.loc[product]])
        x_ranges[product] = (float(all_vals.min()) - 0.05, float(all_vals.max()) + 0.05)

    # --- Figure: one row per product (derived from the data, not hard-coded) ---
    fig = make_subplots(
        rows=len(products), cols=1,
        shared_xaxes=False,
        subplot_titles=products,
        vertical_spacing=0.1
    )

    # Draw per product
    for i, product in enumerate(products, start=1):
        for trait in traits:
            r_human = round(float(df_human.loc[product, trait]), 2)
            r_model = round(float(df_model.loc[product, trait]), 2)
            aligned = abs(r_human - r_model) <= similarity_threshold + tolerance
            color = "green" if aligned else "red"

            # Labels go on the outer side of each marker so they do not
            # overlap the connecting line.
            human_is_left = r_human < r_model

            # Connecting line
            fig.add_trace(go.Scatter(
                x=[r_human, r_model],
                y=[trait, trait],
                mode='lines',
                line=dict(color="black", width=1),
                opacity=0.7,
                showlegend=False
            ), row=i, col=1)

            # Human marker
            fig.add_trace(go.Scatter(
                x=[r_human],
                y=[trait],
                mode='markers+text',
                marker=dict(size=10, symbol="circle", color=color),
                text=[f"r={r_human}"],
                textposition="middle left" if human_is_left else "middle right",
                showlegend=False
            ), row=i, col=1)

            # Model marker
            fig.add_trace(go.Scatter(
                x=[r_model],
                y=[trait],
                mode='markers+text',
                marker=dict(size=10, symbol="square", color=color),
                text=[f"r={r_model}"],
                textposition="middle right" if human_is_left else "middle left",
                showlegend=False
            ), row=i, col=1)

        # Axis formatting; the category array is reversed so the first trait
        # in `traits` appears at the top of the y-axis.
        fig.update_yaxes(
            categoryorder='array',
            categoryarray=traits[::-1],
            row=i, col=1
        )
        fig.update_xaxes(
            title_text="Correlation",
            range=x_ranges[product],
            row=i, col=1
        )

    # Legend: data-less traces that exist only to create legend entries.
    legend_entries = [
        ("Humans", "circle", "gray"),
        ("Synthetic Twin", "square", "gray"),
        ("Aligned", "circle", "green"),
        ("Misaligned", "circle", "red"),
    ]
    for legend_name, legend_symbol, legend_color in legend_entries:
        fig.add_trace(go.Scatter(
            x=[None], y=[None],
            mode='markers',
            marker=dict(size=10, symbol=legend_symbol, color=legend_color),
            name=legend_name
        ))

    fig.update_layout(
        height=1400,
        width=1300,
        title=f"Alignment Between Humans and Synthetic Twins ({model_name})",
        margin=dict(t=100, b=80, l=100, r=220),
        legend=dict(
            x=1.02,
            y=1,
            traceorder='normal',
            font=dict(size=12),
            borderwidth=0
        )
    )

    fig.show()


In [335]:
# Human vs. GPT-4o alignment plot (default similarity threshold).
plot_alignment_with_humans("gpt-4o")

In [336]:
# Human vs. GPT-5-latest alignment plot (default similarity threshold).
plot_alignment_with_humans("gpt-5-latest")

In [337]:
# Human vs. Gemini 1.5 Flash alignment plot (default similarity threshold).
plot_alignment_with_humans("gemini-1.5-flash")

In [338]:
# Human vs. Gemini 2.0 Flash alignment plot (default similarity threshold).
plot_alignment_with_humans("gemini-2.0-flash")

In [339]:
import plotly.graph_objects as go

PRODUCTS = ["P1 — Cabin Luggage", "P2 — Packing Cubes", "P3 — Water Bottle"]

# Percentage of identical rating categories per product, one list per model.
gpt4o_same_cat = [60.0, 80.0, 80.0]
gpt5_latest_same_cat = [60.0, 80.0, 80.0]
gemini_1_5_flash_same_cat = [80.0, 80.0, 60.0]
gemini_2_0_flash_same_cat = [60.0, 80.0, 80.0]

# Reference line: 100% means every category matches the human one.
human_baseline_top = 100.0

fig = go.Figure()

# One grouped bar trace per model: (legend name, values, fill colour).
for series_name, series_values, series_color in (
    ("GPT-4o", gpt4o_same_cat, "#4C72B0"),
    ("GPT-5-latest", gpt5_latest_same_cat, "#1F77B4"),
    ("Gemini-1.5 Flash", gemini_1_5_flash_same_cat, "#D17ACD"),
    ("Gemini-2.0 Flash", gemini_2_0_flash_same_cat, "#A64CA6"),
):
    fig.add_trace(
        go.Bar(
            x=PRODUCTS,
            y=series_values,
            name=series_name,
            text=[f"{v:.0f}%" for v in series_values],
            textposition="outside",
            marker=dict(line=dict(color="black", width=1.5), color=series_color),
            opacity=0.8,
            textfont=dict(size=12),
        )
    )

# Let the value labels above the bars extend past the plotting area.
fig.update_traces(cliponaxis=False)

# Horizontal reference line for the human baseline.
fig.add_hline(
    y=human_baseline_top,
    line_dash="solid",
    line_color="black",
    annotation_text="Human baseline (perfect agreement) = 100%",
    annotation_position="top left",
)

# Black frame on both axes; the y-axis additionally gets light grid lines.
axis_frame = dict(showline=True, linewidth=1, linecolor="black", mirror=True)

fig.update_layout(
    xaxis_title="Product",
    yaxis_title="% Same Category",
    yaxis_range=[0, 105],
    bargap=0.35,
    barmode="group",
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis=dict(axis_frame),
    yaxis=dict(axis_frame, gridcolor="lightgrey"),
    height=550,
    width=950,
    # Extra top margin for the legend, extra bottom margin for the footnote.
    margin=dict(l=70, r=40, t=90, b=130),
    legend=dict(orientation="h", yanchor="bottom", y=1.04, xanchor="left", x=0),
)

# Footnote placed below the plot, inside the enlarged bottom margin.
footnote_text = (
    "Note: Bars show the proportion of identical rating categories between humans and synthetic twins. Categories are derived by mean ratings on a 5-point<br>"
    "Likert scale. For example, Human Mean = 3.45 is Category 3 and Synthetic Twins Mean (GPT-4o) = 3.40 is Category 3 (same category, agreement),<br>"
    "whereas Human Mean = 3.45 is Category 3 and Synthetic Twins Mean (Gemini-1.5 Flash) = 3.60 is Category 4 (different categories, no agreement)."
)
fig.add_annotation(
    text=footnote_text,
    xref="paper",
    yref="paper",
    x=0.48,
    y=-0.20,
    showarrow=False,
    font=dict(size=12, color="black"),
    align="left",
    xanchor="center",
    yanchor="top",
)

fig


In [340]:
import plotly.graph_objects as go

PRODUCTS = ["P1 — Cabin Luggage", "P2 — Packing Cubes", "P3 — Water Bottle"]

# Aggregate human-vs-model correlation per product, one list per model.
gpt4o_r = [0.87, 0.88, 0.83]
gpt5_latest_r = [0.86, 0.91, 0.87]
gemini_1_5_flash_r = [0.81, 0.84, 0.81]
gemini_2_0_flash_r = [0.88, 0.88, 0.87]

# Reference line: r = 1.0 is perfect agreement.
human_baseline_top = 1.0

fig = go.Figure()

# One grouped bar trace per model: (legend name, values, fill colour).
for series_name, series_values, series_color in (
    ("GPT-4o", gpt4o_r, "#4C72B0"),
    ("GPT-5-latest", gpt5_latest_r, "#1F77B4"),
    ("Gemini-1.5 Flash", gemini_1_5_flash_r, "#D17ACD"),
    ("Gemini-2.0 Flash", gemini_2_0_flash_r, "#A64CA6"),
):
    fig.add_trace(
        go.Bar(
            x=PRODUCTS,
            y=series_values,
            name=series_name,
            text=[f"r={v:.2f}" for v in series_values],
            textposition="outside",
            marker=dict(line=dict(color="black", width=1.5), color=series_color),
            opacity=0.8,
            textfont=dict(size=12),
        )
    )

# Let the value labels above the bars extend past the plotting area.
fig.update_traces(cliponaxis=False)

# Horizontal reference line for the human (perfect) baseline.
fig.add_hline(
    y=human_baseline_top,
    line_dash="solid",
    line_color="black",
    annotation_text="Human baseline (perfect agreement) = 1.00",
    annotation_position="top left",
)

# Black frame on both axes; the y-axis additionally gets light grid lines.
axis_frame = dict(showline=True, linewidth=1, linecolor="black", mirror=True)

fig.update_layout(
    xaxis_title="Product",
    yaxis_title="Aggregate correlations",
    yaxis_range=[0, 1.05],
    bargap=0.35,
    barmode="group",
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis=dict(axis_frame),
    yaxis=dict(axis_frame, gridcolor="lightgrey"),
    height=550,
    width=950,
    margin=dict(l=70, r=40, t=90, b=130),
    legend=dict(orientation="h", yanchor="bottom", y=1.04, xanchor="left", x=0),
)

# Footnote explaining how to read the correlations, placed below the plot.
footnote_text = (
    "Note: Bars show Pearson correlations between human and synthetic twins’ mean ratings across the five personality traits <br>"
    "for each product. A correlation of r = 1.00 indicates perfect alignment with human ratings (i.e., identical relative patterns across traits),<br>"
    "whereas r = 0 indicates no alignment."
)
fig.add_annotation(
    text=footnote_text,
    xref="paper",
    yref="paper",
    x=0.48,
    y=-0.20,
    showarrow=False,
    font=dict(size=12, color="black"),
    align="left",
    xanchor="center",
    yanchor="top",
)

fig.show()
