<h1>Gemini 2.0 Flash</h1>

<h1>Imports</h1>

In [292]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, norm, chi2
import pingouin as pg
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

<h1>Evaluate Internal Consistency of Six 5-Point Likert Scale Items on Advertisement Effectiveness Using Cronbach's Alpha</h1>

In [293]:
def calculate_aes_cronbach_alphas(df):
    """Compute Cronbach's alpha for each product/trait set of six AES items.

    Item columns are named ``p_<product>_<prefix>_item_<1..6>`` for products 1-3
    and the prefixes ``openness``, ``consc``, ``extr``, ``agree`` and ``neuro``.

    Args:
        df: DataFrame holding the item columns.

    Returns:
        dict mapping "Product <n> - <Trait>" to the alpha estimate (the first
        element of the ``pg.cronbach_alpha`` result), in product-then-trait order.

    Raises:
        KeyError: if any expected item column is missing from ``df``; the message
            names the affected scale and the absent columns.
    """
    trait_prefixes = {
        "Openness": "openness",
        "Conscientiousness": "consc",
        "Extraversion": "extr",
        "Agreeableness": "agree",
        "Neuroticism": "neuro",
    }
    n_items = 6

    alphas = {}
    for product_num in (1, 2, 3):
        for trait_label, prefix in trait_prefixes.items():
            scale = f"Product {product_num} - {trait_label}"
            item_columns = [
                f"p_{product_num}_{prefix}_item_{item}" for item in range(1, n_items + 1)
            ]

            # Validate BEFORE subsetting: indexing with absent labels already raises an
            # opaque pandas KeyError, so the check must come first to be of any use.
            missing = [col for col in item_columns if col not in df.columns]
            if missing:
                raise KeyError(f"Missing item columns for '{scale}': {missing}")

            # Calculate Cronbach's alpha using pingouin; keep only the point estimate
            alpha = pg.cronbach_alpha(data=df[item_columns])
            alphas[scale] = alpha[0]

    return alphas

# Load the synthetic participants and report the reliability of every AES scale
df_with_personality_scores = pd.read_csv('../../data/synthetic_participants_gemini_2.0_flash.csv')
alphas = calculate_aes_cronbach_alphas(df_with_personality_scores)

for product, alpha in alphas.items():
    print(
        f"Cronbach's Alpha for {product}: "
        + ("Data missing" if alpha is None else f"{alpha:.3f}")
    )


# Number of rows that contain at least one missing value
res = df_with_personality_scores.isna().any(axis="columns").sum()
res

Cronbach's Alpha for Product 1 - Openness: 0.879
Cronbach's Alpha for Product 1 - Conscientiousness: 0.712
Cronbach's Alpha for Product 1 - Extraversion: 0.889
Cronbach's Alpha for Product 1 - Agreeableness: 0.878
Cronbach's Alpha for Product 1 - Neuroticism: 0.647
Cronbach's Alpha for Product 2 - Openness: 0.823
Cronbach's Alpha for Product 2 - Conscientiousness: 0.706
Cronbach's Alpha for Product 2 - Extraversion: 0.827
Cronbach's Alpha for Product 2 - Agreeableness: 0.855
Cronbach's Alpha for Product 2 - Neuroticism: 0.660
Cronbach's Alpha for Product 3 - Openness: 0.877
Cronbach's Alpha for Product 3 - Conscientiousness: 0.745
Cronbach's Alpha for Product 3 - Extraversion: 0.838
Cronbach's Alpha for Product 3 - Agreeableness: 0.909
Cronbach's Alpha for Product 3 - Neuroticism: 0.699


np.int64(0)

<h1>Aggregate Advertisement Effectiveness Scores (AES) Responses to Derive Dependent Variables: AES by Trait and Product</h1>

In [294]:
from sklearn.linear_model import LinearRegression
import numpy as np

def calculate_raw_and_residualized_aes(dataframe):
    """Add raw and residualized AES columns for every product and trait.

    For products 1-3 and each trait, ``raw_aes_<product>_<trait>`` is the row-wise
    mean (NaNs skipped) of the columns starting with ``p_<product>_<prefix>_item_``.
    ``aes_<product>_<trait>`` is the residual of that raw score after a linear
    regression on the same product's raw scores for the other traits; when no other
    trait has a raw score, the raw score is stored unchanged.

    The columns are written onto ``dataframe`` itself, and that same object is returned.
    """
    trait_to_prefix = {
        "openness": "openness",
        "conscientiousness": "consc",
        "extraversion": "extr",
        "agreeableness": "agree",
        "neuroticism": "neuro"
    }

    for product_num in (1, 2, 3):
        # Raw AES series of the current product, keyed by trait name
        raw_by_trait = {}

        # Pass 1: raw AES = mean of the item columns of this product/trait
        for trait_name, trait_prefix in trait_to_prefix.items():
            stem = f"p_{product_num}_{trait_prefix}_item_"
            item_cols = [name for name in dataframe.columns if name.startswith(stem)]

            if not item_cols:
                print(f"No valid columns found for product {product_num}, trait {trait_name}!")
                continue

            raw_name = f"raw_aes_{product_num}_{trait_name}"
            dataframe[raw_name] = dataframe[item_cols].mean(axis=1, skipna=True)
            raw_by_trait[trait_name] = dataframe[raw_name]

        # Pass 2: residualize each raw score on the raw scores of the other traits
        for target_trait, target_scores in raw_by_trait.items():
            aes_name = f"aes_{product_num}_{target_trait}"
            other_scores = [
                scores for name, scores in raw_by_trait.items() if name != target_trait
            ]

            if not other_scores:
                # Nothing to partial out: keep the raw score
                dataframe[aes_name] = target_scores
                print(f"Could not residualize AES for product {product_num}, trait {target_trait} due to missing predictors.")
                continue

            design = np.column_stack(other_scores)
            fitted = LinearRegression().fit(design, target_scores)
            dataframe[aes_name] = target_scores - fitted.predict(design)

    return dataframe

# Add the raw_aes_* and aes_* columns. The function writes them onto the frame it
# receives and returns that same object, so df_aes_with_aes_residualized and
# df_with_personality_scores are two names for one DataFrame.
df_aes_with_aes_residualized = calculate_raw_and_residualized_aes(df_with_personality_scores)

df_aes_with_aes_residualized


Unnamed: 0,p_1_agree_item_1,p_1_agree_item_2,p_1_agree_item_3,p_1_agree_item_4,p_1_agree_item_5,p_1_agree_item_6,p_1_consc_item_1,p_1_consc_item_2,p_1_consc_item_3,p_1_consc_item_4,...,raw_aes_3_openness,raw_aes_3_conscientiousness,raw_aes_3_extraversion,raw_aes_3_agreeableness,raw_aes_3_neuroticism,aes_3_openness,aes_3_conscientiousness,aes_3_extraversion,aes_3_agreeableness,aes_3_neuroticism
0,4,4,3,4,4,4,4,5,4,5,...,3.833333,3.833333,3.833333,4.000000,4.000000,-0.029725,-0.084732,0.371846,-0.109461,0.031115
1,4,4,4,4,4,4,4,4,3,4,...,4.000000,3.666667,3.166667,4.000000,3.833333,0.488980,-0.229520,-0.325758,0.178913,0.000688
2,3,3,3,3,3,3,4,4,3,4,...,4.000000,3.833333,3.500000,4.000000,4.000000,0.282317,-0.117289,-0.034358,-0.053356,0.066791
3,4,4,4,4,4,4,4,4,3,4,...,4.000000,4.000000,3.000000,4.000000,4.000000,0.444678,0.026103,-0.508934,-0.041793,0.059284
4,3,4,3,3,3,3,4,4,4,4,...,3.833333,4.000000,3.333333,4.000000,3.833333,0.138223,0.128612,-0.035373,-0.099496,-0.143058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,4,4,4,4,4,4,4,4,4,4,...,3.000000,4.000000,2.833333,4.000000,4.000000,-0.482634,0.120585,-0.238377,0.259698,0.087180
369,3,4,3,4,4,4,5,5,4,4,...,3.000000,4.000000,3.000000,4.000000,4.000000,-0.555322,0.128343,-0.071711,0.210612,0.068568
370,4,4,4,4,4,4,4,4,4,4,...,3.833333,4.000000,3.833333,4.000000,4.000000,-0.085427,0.081934,0.397270,-0.245157,-0.032226
371,4,4,4,4,4,4,4,5,5,4,...,3.000000,4.000000,3.000000,4.000000,4.000000,-0.555322,0.128343,-0.071711,0.210612,0.068568


<h1>Regression Analysis: Scores on the Big Five Traits as Predictors of Respondents' Ratings of the Advertisements' Effectiveness</h1>

In [295]:
%%time

import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

def perform_regression_on_personality(df, aes_cols, traits_cols):
    """Regress each AES column on all trait columns and tabulate standardized betas.

    Predictors and outcome are z-scored, an intercept is added, and an OLS model is
    fitted per AES column. The returned DataFrame has one row per trait and one
    column per AES column; each cell is the string "<beta> (<p-value>)" with the
    beta at 2 and the p-value at 4 decimals. The intercept is not reported.
    """
    results = pd.DataFrame(index=traits_cols, columns=aes_cols)

    for outcome_name in aes_cols:
        # z-score predictors and outcome so the slopes are standardized betas
        scaler = StandardScaler()
        predictors_z = scaler.fit_transform(df[traits_cols])
        outcome_z = scaler.fit_transform(df[[outcome_name]]).flatten()

        fit = sm.OLS(outcome_z, sm.add_constant(predictors_z)).fit()

        # Position 0 is the intercept; everything after it follows traits_cols order
        cells = []
        for beta, p_val in zip(fit.params[1:], fit.pvalues[1:]):
            cells.append(f"{beta:.2f} ({p_val:.4f})")
        results[outcome_name] = cells

    return results

# AES outcome columns per product (aes_<product>_<trait>) and the trait predictors
aes_cols_product1, aes_cols_product2, aes_cols_product3 = (
    [
        f"aes_{product_num}_{trait}"
        for trait in ("extraversion", "agreeableness", "conscientiousness", "neuroticism", "openness")
    ]
    for product_num in (1, 2, 3)
)

personality_cols = [
    f"{trait}_score"
    for trait in ("extraversion", "agreeableness", "conscientiousness", "neuroticism", "openness")
]


# One coefficient table per product, keyed by product number
regression_results = {
    product_num: perform_regression_on_personality(
        df_aes_with_aes_residualized, product_aes_cols, personality_cols
    )
    for product_num, product_aes_cols in enumerate(
        (aes_cols_product1, aes_cols_product2, aes_cols_product3), start=1
    )
}

regression_results

CPU times: user 24.5 ms, sys: 3.02 ms, total: 27.6 ms
Wall time: 30.6 ms


{1:                         aes_1_extraversion aes_1_agreeableness  \
 extraversion_score           0.28 (0.0000)      -0.15 (0.0044)   
 agreeableness_score         -0.07 (0.1929)       0.28 (0.0000)   
 conscientiousness_score     -0.13 (0.0128)      -0.03 (0.5383)   
 neuroticism_score           -0.10 (0.0811)       0.04 (0.4446)   
 openness_score               0.02 (0.6320)       0.06 (0.2730)   
 
                         aes_1_conscientiousness aes_1_neuroticism  \
 extraversion_score               -0.07 (0.1887)     0.14 (0.0111)   
 agreeableness_score               0.04 (0.5042)     0.02 (0.6667)   
 conscientiousness_score           0.25 (0.0000)     0.09 (0.0899)   
 neuroticism_score                -0.08 (0.1388)     0.16 (0.0046)   
 openness_score                   -0.02 (0.6936)     0.07 (0.1883)   
 
                          aes_1_openness  
 extraversion_score       -0.05 (0.3544)  
 agreeableness_score      -0.17 (0.0019)  
 conscientiousness_score  -0.11 (0.0451)  

In [296]:
import pandas as pd

df_humans = pd.read_csv('../../data/filtered_participants_dataset.csv')
# Alias, not a copy: the 'Group' column added below also appears on
# df_aes_with_aes_residualized.
df_synths = df_aes_with_aes_residualized

# Tag every row with its group BEFORE combining so the label survives the concat
df_humans['Group'] = 'Humans'
df_synths['Group'] = 'Synthetic Twins'

# Stack humans on top of synthetic twins with a fresh 0..n-1 index
df_combined = pd.concat([df_humans, df_synths], axis=0).reset_index(drop=True)

df_combined[aes_cols_product1 + aes_cols_product2 + aes_cols_product3]

Unnamed: 0,aes_1_extraversion,aes_1_agreeableness,aes_1_conscientiousness,aes_1_neuroticism,aes_1_openness,aes_2_extraversion,aes_2_agreeableness,aes_2_conscientiousness,aes_2_neuroticism,aes_2_openness,aes_3_extraversion,aes_3_agreeableness,aes_3_conscientiousness,aes_3_neuroticism,aes_3_openness
0,-0.589320,0.547696,-0.590881,-0.071894,-0.721854,-0.377274,-0.311016,-0.612154,0.093235,-0.698012,-0.939937,-0.883497,0.071693,0.203355,-0.169196
1,-0.581986,-1.097038,0.395912,0.141924,0.416278,-0.386329,-0.935341,0.348401,0.070223,0.870100,-0.435835,-0.519799,0.325971,0.304744,-0.072157
2,0.153302,0.193889,0.422019,-0.694482,0.490547,0.758704,-0.461896,0.783933,-0.205457,0.103993,0.234217,0.235637,0.014028,0.051043,0.239061
3,1.345917,-0.450205,0.999941,-0.431397,-1.119932,0.473976,0.727811,0.133040,-0.083276,-0.916147,0.228827,-0.168965,0.778575,-0.242742,0.169427
4,0.661316,0.624082,-0.127629,-1.174486,-0.606445,-0.095666,-0.507207,0.156978,-1.216210,0.735899,0.938994,-0.155936,-0.600658,-0.230803,-0.617504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
741,0.581061,-0.326874,-0.224426,0.184951,0.103655,-0.580453,-0.368991,0.166949,-0.064609,0.704296,-0.238377,0.259698,0.120585,0.087180,-0.482634
742,0.206009,-0.200839,0.336576,-0.207461,-0.001755,0.112658,-0.134270,-0.085192,0.171381,-0.205053,-0.071711,0.210612,0.128343,0.068568,-0.555322
743,0.502882,-0.132784,-0.092849,0.019795,-0.042294,0.321175,-0.087205,-0.105147,0.134037,-0.284874,0.397270,-0.245157,0.081934,-0.032226,-0.085427
744,-0.533344,-0.144041,0.211191,0.351848,0.156534,-0.390964,-0.279760,-0.206151,0.033692,0.848043,-0.071711,0.210612,0.128343,0.068568,-0.555322


In [297]:
import numpy as np
from scipy.stats import linregress, t
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def create_scatter_plots_with_fit_grouped(df_combined, aes_cols, traits_cols, product_name,
                                          confidence=0.95, alpha=0.015837):
    """Draw a trait-by-group grid of scatter plots with a fitted line and confidence band.

    One row per Big Five trait and one column per group ("Humans", "Synthetic Twins").
    Each panel plots the trait score (x) against the AES column whose name ends with
    that trait (y), overlays the simple linear-regression line with a pointwise
    confidence band for the fitted mean, and annotates whether the slope's p-value
    is below ``alpha``.

    Args:
        df_combined: DataFrame with a "Group" column plus the trait and AES columns.
        aes_cols: Candidate AES columns; the first one ending with the trait name is used.
        traits_cols: Trait score columns ("<trait>_score") that may be plotted.
        product_name: Product label used in the figure title.
        confidence: Confidence level of the band around the fitted line.
        alpha: Decision cutoff for significance (our FDR-adjusted threshold).

    Returns:
        The plotly figure. A panel stays empty when its trait/AES column is not
        supplied, or when the group has fewer than 3 complete rows or no variation
        in the trait score (no line or band can be estimated in those cases).
    """
    traits_order = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
    groups = ["Humans", "Synthetic Twins"]
    fig = make_subplots(
        rows=len(traits_order), cols=len(groups),
        subplot_titles=groups * len(traits_order),
        vertical_spacing=0.12, horizontal_spacing=0.12
    )

    def format_p(p, threshold=alpha):
        # "< cutoff" below the threshold, the exact value otherwise; leading zero dropped
        shown = f"< {threshold:.6f}" if p < threshold else f"= {p:.6f}"
        return shown.replace("0.", ".")

    for i, trait in enumerate(traits_order):
        trait_col = f"{trait}_score"
        aes_col = next((c for c in aes_cols if c.endswith(trait)), None)
        if aes_col is None or trait_col not in traits_cols:
            continue

        for j, group in enumerate(groups):
            data = df_combined[df_combined["Group"] == group][[trait_col, aes_col]].dropna()
            if data.empty:
                continue

            x = data[trait_col].values
            y = data[aes_col].values
            n = len(x)

            # The residual variance divides by n - 2 and the band by the x sum of
            # squares, so both need n >= 3 and some spread in x.
            if n < 3 or np.ptp(x) == 0:
                continue

            # A single least-squares fit supplies both the line and the slope p-value
            slope, intercept, _, p_value, _ = linregress(x, y)
            y_pred = intercept + slope * x

            # Pointwise confidence band for the fitted mean
            mean_x = np.mean(x)
            t_val = t.ppf((1 + confidence) / 2., df=n - 2)
            residuals = y - y_pred
            se = np.sqrt(np.sum(residuals ** 2) / (n - 2))
            se_line = se * np.sqrt(1/n + ((x - mean_x)**2 / np.sum((x - mean_x)**2)))
            ci_upper = y_pred + t_val * se_line
            ci_lower = y_pred - t_val * se_line

            # Sort by x so the line and band are drawn left to right
            sorted_idx = np.argsort(x)
            x_sorted = x[sorted_idx]
            y_sorted = y_pred[sorted_idx]
            ci_upper_sorted = ci_upper[sorted_idx]
            ci_lower_sorted = ci_lower[sorted_idx]

            row, col = i + 1, j + 1

            # Scatter
            fig.add_trace(go.Scatter(
                x=x, y=y, mode='markers',
                marker=dict(color='black', opacity=0.3),
                showlegend=False
            ), row=row, col=col)

            # Regression line
            fig.add_trace(go.Scatter(
                x=x_sorted, y=y_sorted, mode='lines',
                line=dict(color='blue'),
                showlegend=False
            ), row=row, col=col)

            # Confidence band: upper edge left-to-right, lower edge back right-to-left
            fig.add_trace(go.Scatter(
                x=np.concatenate([x_sorted, x_sorted[::-1]]),
                y=np.concatenate([ci_upper_sorted, ci_lower_sorted[::-1]]),
                fill='toself', fillcolor='rgba(0,0,0,0.15)',
                line=dict(color='rgba(255,255,255,0)'),
                hoverinfo="skip", showlegend=False
            ), row=row, col=col)

            # Axes: x carries the trait score, y the AES of the trait-tailored ad
            fig.update_xaxes(title_text=f"{trait.capitalize()} Score", row=row, col=col)
            fig.update_yaxes(title_text=f"AES: {trait.capitalize()}-Tailored Ad", row=row, col=col)

            # Annotation: bold Significant / Non-Significant + p-value
            sig = p_value < alpha
            label = "<b>Significant</b>" if sig else "<b>Non-Significant</b>"
            color = "green" if sig else "red"
            text = f"{label} (p {format_p(p_value)})"

            fig.add_annotation(
                row=row, col=col, xref="x domain", yref="y domain",
                x=0.02, y=0.98, showarrow=False, align="left",
                text=text, font=dict(size=14, color=color),
                bgcolor="rgba(255,255,255,0.9)"
            )

    fig.update_layout(
        height=300 * len(traits_order),
        width=1400,
        title_text=f"{product_name} – AES Scores: Humans vs Synthetic Twins (Gemini 2.0 Flash)",
        template="simple_white",
        showlegend=False
    )
    return fig


In [298]:
# Raw (non-residualized) AES columns per product: raw_aes_<product>_<trait>
raw_aes_cols_product1, raw_aes_cols_product2, raw_aes_cols_product3 = (
    [
        f"raw_aes_{product_num}_{trait}"
        for trait in ("extraversion", "agreeableness", "conscientiousness", "neuroticism", "openness")
    ]
    for product_num in (1, 2, 3)
)

fig1 = create_scatter_plots_with_fit_grouped(
    df_combined=df_combined, aes_cols=raw_aes_cols_product1,
    traits_cols=personality_cols, product_name="Cabin Luggage",
)
fig1.show()

fig2 = create_scatter_plots_with_fit_grouped(
    df_combined=df_combined, aes_cols=raw_aes_cols_product2,
    traits_cols=personality_cols, product_name="Packing Cubes",
)
fig2.show()

fig3 = create_scatter_plots_with_fit_grouped(
    df_combined=df_combined, aes_cols=raw_aes_cols_product3,
    traits_cols=personality_cols, product_name="Water Bottle",
)
fig3.show()

<h1>Box Plot</h1>

In [299]:
import pandas as pd
import plotly.express as px
import scipy.stats as stats

def compute_95ci(data):
    """Return the (lower, upper) bounds of a two-sided 95% t-interval for the mean.

    Uses the standard error of the mean and the t quantile with ``len(data) - 1``
    degrees of freedom; both bounds are rounded to 2 decimals.
    """
    dof = len(data) - 1
    center = data.mean()
    half_width = stats.t.ppf(0.975, df=dof) * stats.sem(data)
    return round(center - half_width, 2), round(center + half_width, 2)
    
def plot_trait_boxplots_by_product(col_prefix, product_name):
    """Show one Humans-vs-Synthetic-Twins box plot per trait for a product.

    Reads the module-level ``df_combined``. For each trait, the column
    ``f"{col_prefix}{trait}"`` is plotted by "Group" with all points overlaid, and
    a 95% CI label (from ``compute_95ci``) is placed above every group that has at
    least two values. Each figure is shown immediately; nothing is returned.
    """
    group_order = ["Humans", "Synthetic Twins"]

    for trait in ('openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism'):
        score_col = f"{col_prefix}{trait}"
        trait_label = trait.capitalize()

        # Complete rows only, with the score column under a fixed name
        df_plot = (
            df_combined[["Group", score_col]]
            .dropna()
            .rename(columns={score_col: "Score"})
        )

        # Categorical x and numeric y give vertical boxes
        fig = px.box(
            df_plot,
            x="Group",
            y="Score",
            color="Group",
            points="all",
            title=f"{trait_label} Trait -> {product_name}",
            labels={"Score": f"{trait_label} AES Score", "Group": "Group"},
            category_orders={"Group": group_order},
        )

        # 95% CI label slightly above the highest point of each group
        for group in group_order:
            scores = df_plot.loc[df_plot["Group"] == group, "Score"]
            if len(scores) < 2:
                continue
            ci_low, ci_high = compute_95ci(scores)
            fig.add_annotation(
                x=group,
                y=scores.max() + 0.3,
                text=f"95% CI [{ci_low}, {ci_high}]",
                showarrow=False,
                font=dict(size=11),
                xanchor="center",
            )

        # Jitter and marker styling, then the final title
        fig.update_traces(jitter=0.4, marker=dict(opacity=0.5, size=6))
        fig.update_layout(
            showlegend=False,
            title_text=f"Human and Synthetic Evaluations of {trait_label}-Tailored Ads: AES Scores for {product_name}"
        )
        fig.show()


<h2> Cabin luggage: Box Plot</h2>

In [300]:
# Box plots of the raw AES columns of product 1
plot_trait_boxplots_by_product(col_prefix="raw_aes_1_", product_name="Cabin Luggage")

<h2> Packing Cubes: Box Plot</h2>

In [301]:
# Box plots of the raw AES columns of product 2
plot_trait_boxplots_by_product(col_prefix="raw_aes_2_", product_name="Packing Cubes")

<h2>Water Bottle: Box Plot</h2>

In [302]:
# Box plots of the raw AES columns of product 3
plot_trait_boxplots_by_product(col_prefix="raw_aes_3_", product_name="Water Bottle")

<h1>Confidence Intervals</h1>

In [303]:
import pandas as pd
import pingouin as pg
from scipy.stats import pearsonr

# Define products and traits
# Product display name -> prefix of its raw-AES columns, and the trait suffixes
products = dict(
    zip(
        ("Cabin Luggage", "Packing Cubes", "Water Bottle"),
        ("raw_aes_1_", "raw_aes_2_", "raw_aes_3_"),
    )
)
traits = "openness conscientiousness extraversion agreeableness neuroticism".split()

# Function to compute bootstrapped CI
def boot_ci(data, seed=42):
    """Return the mean of ``data`` and a bootstrapped 95% CI of that mean.

    Args:
        data: Series of scores; missing values are dropped first.
        seed: Seed handed to ``pg.compute_bootci`` so the interval is identical on
            every run. Pass ``None`` for an unseeded bootstrap.

    Returns:
        ``(mean, (ci_low, ci_high))`` from 1000 bootstrap resamples, or
        ``(None, None)`` when fewer than two values remain after dropping NaNs.
    """
    data = data.dropna()
    if len(data) < 2:
        return None, None
    ci = pg.compute_bootci(data, func='mean', n_boot=1000, confidence=0.95, seed=seed)
    return data.mean(), (ci[0], ci[1])

# Output rows
# One output row per product x trait; product-level aggregates are filled in afterwards
table_rows = []

for product_name, prefix in products.items():
    human_means = []
    synth_means = []
    same_cat_count = 0

    for trait in traits:
        col_name = f"{prefix}{trait}"
        trait_label = trait.capitalize()

        # Subset data
        humans = df_combined[df_combined["Group"] == "Humans"][col_name]
        synths = df_combined[df_combined["Group"] == "Synthetic Twins"][col_name]

        # Compute CIs and means (both are None when a group has < 2 values)
        h_mean, h_ci = boot_ci(humans)
        s_mean, s_ci = boot_ci(synths)

        # Store means for correlation later
        human_means.append(h_mean)
        synth_means.append(s_mean)

        # Category = mean rounded to the nearest integer
        h_cat = int(round(h_mean)) if h_mean is not None else None
        s_cat = int(round(s_mean)) if s_mean is not None else None

        # Two missing categories are not agreement: require a real value on both sides
        categories_match = h_cat is not None and h_cat == s_cat
        same_cat = "✅" if categories_match else "❌"
        if categories_match:
            same_cat_count += 1

        # Independent-samples t-test, only when both groups have more than one value
        humans_valid = humans.dropna()
        synths_valid = synths.dropna()
        if humans_valid.shape[0] > 1 and synths_valid.shape[0] > 1:
            pval = pg.ttest(humans_valid, synths_valid, paired=False)['p-val'].values[0]
            pval_str = f"{pval:.3f}"
        else:
            pval_str = "N/A"

        # Format means and CIs
        h_str = f"{h_mean:.2f} [{h_ci[0]:.2f}, {h_ci[1]:.2f}]" if h_mean is not None else "N/A"
        s_str = f"{s_mean:.2f} [{s_ci[0]:.2f}, {s_ci[1]:.2f}]" if s_mean is not None else "N/A"

        # Append row (trait level; product-level values will be added after)
        table_rows.append({
            "Product": product_name,
            "Trait": trait_label,
            "Human Mean [95% CI]": h_str,
            "Synthetic Mean [95% CI]": s_str,
            "Human Cat": h_cat,
            "Synthetic Cat": s_cat,
            "p-value": pval_str,
            "Same Category": same_cat,
            "Aggregate r": "", "Agg. p": "", "% Same Category": ""
        })

    # After 5 traits per product, calculate aggregate stats (skipped if any mean is missing)
    if None not in human_means and None not in synth_means:
        r, p = pearsonr(human_means, synth_means)
        r = round(r, 2)
        p = round(p, 3)
        same_percent = round((same_cat_count / len(traits)) * 100, 1)
        # The last len(traits) rows belong to the product that was just processed
        for i in range(len(traits)):
            table_rows[-(i+1)]["Aggregate r"] = r
            table_rows[-(i+1)]["Agg. p"] = p
            table_rows[-(i+1)]["% Same Category"] = same_percent

# Create DataFrame
final_df = pd.DataFrame(table_rows)

# Display
from IPython.display import display
display(final_df)


Unnamed: 0,Product,Trait,Human Mean [95% CI],Synthetic Mean [95% CI],Human Cat,Synthetic Cat,p-value,Same Category,Aggregate r,Agg. p,% Same Category
0,Cabin Luggage,Openness,"2.96 [2.83, 3.06]","3.54 [3.49, 3.59]",3,4,0.0,❌,0.88,0.048,60.0
1,Cabin Luggage,Conscientiousness,"3.60 [3.50, 3.69]","3.90 [3.88, 3.93]",4,4,0.0,✅,0.88,0.048,60.0
2,Cabin Luggage,Extraversion,"2.91 [2.79, 3.04]","3.23 [3.17, 3.29]",3,3,0.0,✅,0.88,0.048,60.0
3,Cabin Luggage,Agreeableness,"3.31 [3.20, 3.41]","3.75 [3.71, 3.80]",3,4,0.0,❌,0.88,0.048,60.0
4,Cabin Luggage,Neuroticism,"3.82 [3.74, 3.90]","3.87 [3.84, 3.89]",4,4,0.308,✅,0.88,0.048,60.0
5,Packing Cubes,Openness,"2.96 [2.84, 3.07]","3.13 [3.08, 3.18]",3,3,0.006,✅,0.88,0.048,80.0
6,Packing Cubes,Conscientiousness,"3.61 [3.52, 3.71]","3.87 [3.84, 3.89]",4,4,0.0,✅,0.88,0.048,80.0
7,Packing Cubes,Extraversion,"2.84 [2.72, 2.97]","3.25 [3.20, 3.29]",3,3,0.0,✅,0.88,0.048,80.0
8,Packing Cubes,Agreeableness,"3.42 [3.31, 3.53]","4.02 [3.97, 4.07]",3,4,0.0,❌,0.88,0.048,80.0
9,Packing Cubes,Neuroticism,"3.79 [3.70, 3.87]","3.88 [3.86, 3.90]",4,4,0.046,✅,0.88,0.048,80.0


In [304]:
from scipy.stats import binomtest

# One-sided exact binomial test: are `aligned` successes out of `n` more than the
# `chance_level` success probability would produce?
# NOTE(review): n, aligned and chance_level are entered by hand - confirm they match the tables above.
n = 15
chance_level = 0.05
aligned = 11

result = binomtest(aligned, n, p=chance_level, alternative="greater")

# result.statistic is the observed proportion (aligned / n), not a count, so show
# the raw count over n and the proportion as a percentage.
summary = f"Aligned: {aligned}/{n} ({result.statistic:.1%}), p-value: {result.pvalue:.5f}"
print(summary)

% aligned: 0.7333333333333333/15, p-value: 0.00000


<h1>Synth Twins(Gemini 2.0 Flash): Regression Coefficient Matrix for Product 1 (Cabin luggage)</h1>

In [305]:
# Product 1 table: rows are trait scores, columns are AES outcomes, cells are "beta (p-value)"
regression_results[1]

Unnamed: 0,aes_1_extraversion,aes_1_agreeableness,aes_1_conscientiousness,aes_1_neuroticism,aes_1_openness
extraversion_score,0.28 (0.0000),-0.15 (0.0044),-0.07 (0.1887),0.14 (0.0111),-0.05 (0.3544)
agreeableness_score,-0.07 (0.1929),0.28 (0.0000),0.04 (0.5042),0.02 (0.6667),-0.17 (0.0019)
conscientiousness_score,-0.13 (0.0128),-0.03 (0.5383),0.25 (0.0000),0.09 (0.0899),-0.11 (0.0451)
neuroticism_score,-0.10 (0.0811),0.04 (0.4446),-0.08 (0.1388),0.16 (0.0046),-0.04 (0.4985)
openness_score,0.02 (0.6320),0.06 (0.2730),-0.02 (0.6936),0.07 (0.1883),0.20 (0.0002)


<h1>Synth Twins(Gemini 2.0 Flash): Regression Coefficient Matrix for Product 2 (Compressible storage bag set)</h1>

In [306]:
# Product 2 table: rows are trait scores, columns are AES outcomes, cells are "beta (p-value)"
regression_results[2]

Unnamed: 0,aes_2_extraversion,aes_2_agreeableness,aes_2_conscientiousness,aes_2_neuroticism,aes_2_openness
extraversion_score,0.15 (0.0069),-0.15 (0.0047),-0.01 (0.8073),0.10 (0.0811),0.04 (0.5119)
agreeableness_score,0.03 (0.5407),0.25 (0.0000),-0.01 (0.8539),0.09 (0.1265),-0.13 (0.0194)
conscientiousness_score,-0.14 (0.0125),0.04 (0.4187),0.17 (0.0014),0.13 (0.0236),-0.13 (0.0158)
neuroticism_score,-0.20 (0.0004),0.23 (0.0000),-0.20 (0.0003),0.16 (0.0037),-0.03 (0.5518)
openness_score,-0.00 (0.9949),0.06 (0.2074),0.07 (0.1877),-0.05 (0.3608),0.25 (0.0000)


<h1>Synth Twins(Gemini 2.0 Flash): Regression Coefficient Matrix for Product 3 (A water bottle)</h1>

In [307]:
# Product 3 table: rows are trait scores, columns are AES outcomes, cells are "beta (p-value)"
regression_results[3]

Unnamed: 0,aes_3_extraversion,aes_3_agreeableness,aes_3_conscientiousness,aes_3_neuroticism,aes_3_openness
extraversion_score,0.23 (0.0000),-0.01 (0.7957),0.04 (0.4732),-0.02 (0.6799),-0.12 (0.0264)
agreeableness_score,0.15 (0.0066),0.07 (0.2088),0.07 (0.1851),0.08 (0.1821),-0.13 (0.0134)
conscientiousness_score,-0.01 (0.7983),-0.05 (0.3290),0.18 (0.0012),0.12 (0.0329),-0.16 (0.0023)
neuroticism_score,-0.01 (0.8361),0.08 (0.1421),-0.02 (0.6750),0.11 (0.0643),-0.13 (0.0156)
openness_score,-0.07 (0.2060),-0.02 (0.6563),0.02 (0.7687),0.02 (0.7060),0.34 (0.0000)


In [308]:
import pandas as pd
import pingouin as pg  # Using pingouin instead of scipy

def compute_correlations_with_pvalues(df, aes_cols, traits_cols):
    """Tabulate the correlation of every trait column with every AES column.

    Each pair is passed to ``pg.corr``; the result is a DataFrame indexed by
    ``traits_cols`` with ``aes_cols`` as columns, whose cells are strings of the
    form "<r> (p=<p-value>)" with r at 2 and the p-value at 6 decimals.
    """
    table = pd.DataFrame(index=traits_cols, columns=aes_cols)

    # Cells are independent of each other, so the traversal order does not matter
    for aes in aes_cols:
        for trait in traits_cols:
            stats_row = pg.corr(df[trait], df[aes])
            r = stats_row["r"].values[0]
            p = stats_row["p-val"].values[0]
            table.loc[trait, aes] = f"{r:.2f} (p={p:.6f})"

    return table

# AES outcome columns per product (aes_<product>_<trait>)
aes_cols_product1, aes_cols_product2, aes_cols_product3 = (
    [
        f"aes_{product_num}_{trait}"
        for trait in ("extraversion", "agreeableness", "conscientiousness", "neuroticism", "openness")
    ]
    for product_num in (1, 2, 3)
)


# Trait predictors; this order sets the row order of the correlation tables
personality_cols = [
    f"{trait}_score"
    for trait in ("openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism")
]

# Compute Pearson correlations with p-values for each product
correlations_pvalues_product1 = compute_correlations_with_pvalues(
    df_aes_with_aes_residualized, aes_cols_product1, personality_cols
)
correlations_pvalues_product2 = compute_correlations_with_pvalues(
    df_aes_with_aes_residualized, aes_cols_product2, personality_cols
)
correlations_pvalues_product3 = compute_correlations_with_pvalues(
    df_aes_with_aes_residualized, aes_cols_product3, personality_cols
)

# Display results
print("Pearson Correlations with p-values - Product 1")
display(correlations_pvalues_product1)

print("Pearson Correlations with p-values - Product 2")
display(correlations_pvalues_product2)

print("Pearson Correlations with p-values - Product 3")
display(correlations_pvalues_product3)

Pearson Correlations with p-values - Product 1


Unnamed: 0,aes_1_extraversion,aes_1_agreeableness,aes_1_conscientiousness,aes_1_neuroticism,aes_1_openness
openness_score,0.05 (p=0.289998),0.10 (p=0.044114),-0.02 (p=0.756640),0.08 (p=0.117501),0.15 (p=0.003372)
conscientiousness_score,-0.07 (p=0.188790),-0.05 (p=0.353969),0.27 (p=0.000000),0.05 (p=0.300086),-0.12 (p=0.024490)
extraversion_score,0.26 (p=0.000000),-0.06 (p=0.215623),-0.02 (p=0.757136),0.14 (p=0.005675),-0.09 (p=0.090742)
agreeableness_score,0.02 (p=0.748871),0.25 (p=0.000002),0.02 (p=0.638575),0.10 (p=0.056165),-0.15 (p=0.004176)
neuroticism_score,-0.10 (p=0.060385),0.08 (p=0.110690),-0.16 (p=0.001964),0.10 (p=0.065022),-0.01 (p=0.862096)


Pearson Correlations with p-values - Product 2


Unnamed: 0,aes_2_extraversion,aes_2_agreeableness,aes_2_conscientiousness,aes_2_neuroticism,aes_2_openness
openness_score,0.05 (p=0.374071),0.09 (p=0.086509),0.08 (p=0.127632),-0.03 (p=0.586058),0.23 (p=0.000007)
conscientiousness_score,-0.04 (p=0.418532),-0.04 (p=0.418297),0.24 (p=0.000002),0.08 (p=0.106016),-0.13 (p=0.014769)
extraversion_score,0.17 (p=0.000697),-0.09 (p=0.083802),0.05 (p=0.348368),0.11 (p=0.040440),0.02 (p=0.747614)
agreeableness_score,0.07 (p=0.195505),0.23 (p=0.000007),0.01 (p=0.836933),0.12 (p=0.022387),-0.06 (p=0.225317)
neuroticism_score,-0.17 (p=0.000890),0.24 (p=0.000004),-0.27 (p=0.000000),0.11 (p=0.040018),-0.01 (p=0.783124)


Pearson Correlations with p-values - Product 3


Unnamed: 0,aes_3_extraversion,aes_3_agreeableness,aes_3_conscientiousness,aes_3_neuroticism,aes_3_openness
openness_score,0.00 (p=0.925564),-0.01 (p=0.780506),0.04 (p=0.437262),0.03 (p=0.609258),0.31 (p=0.000000)
conscientiousness_score,0.03 (p=0.543312),-0.08 (p=0.112014),0.20 (p=0.000105),0.08 (p=0.108721),-0.14 (p=0.007390)
extraversion_score,0.27 (p=0.000000),-0.02 (p=0.766117),0.09 (p=0.072651),0.00 (p=0.965486),-0.11 (p=0.033970)
agreeableness_score,0.21 (p=0.000056),0.06 (p=0.259002),0.10 (p=0.045345),0.08 (p=0.102530),-0.10 (p=0.063045)
neuroticism_score,-0.04 (p=0.488640),0.11 (p=0.033149),-0.10 (p=0.062074),0.06 (p=0.212896),-0.08 (p=0.118945)


<h1>Compute r_difference</h1>

In [309]:
import numpy as np
import pandas as pd
from scipy.stats import t

def compute_r_difference_synths(synths_results, product_num, N):
    """
    Compute the r-difference (matched minus mismatched coefficient) per trait for synth twins.

    For every personality trait, the coefficient in the trait's "matched" AES
    column is compared with the coefficient in a fixed "mismatched" AES column
    (see ``matched_mismatched_pairs`` below). Both coefficients are Fisher
    z-transformed and their difference is divided by sqrt(2 / (N - 3)).

    Args:
    - synths_results (pd.DataFrame): Coefficient table for one product, indexed by
      trait score name (e.g. "openness_score") with ``aes_{product_num}_<trait>``
      columns. Each cell is either a number or a string whose first
      whitespace-separated token is the coefficient, e.g. "0.27 (p=0.000000)".
    - product_num (int): Product number used to build the AES column names.
    - N (int): Sample size; must be greater than 3.

    Returns:
    - pd.DataFrame: One row per trait with columns "r_difference" and "t_value"
      (strings formatted to two decimals) and "p_value" (float, two-sided).
      Rows whose data is missing in ``synths_results`` are left as NaN.

    Raises:
    - ValueError: If N <= 3, because the standard error sqrt(2 / (N - 3)) is undefined.
    """
    if N <= 3:
        raise ValueError(f"N must be greater than 3 to compute the standard error, got {N}.")

    # Degrees of freedom of the reference t-distribution.
    # NOTE(review): a Fisher z-difference is usually referred to the standard
    # normal; the t reference with N - 2 df is kept from the original analysis.
    dof = N - 2

    def fisher_r_to_z(r):
        """Apply Fisher's transformation to r (arctanh); r = +/-1 maps to +/-inf."""
        # np.arctanh equals 0.5 * log((1 + r) / (1 - r)) but does not raise
        # ZeroDivisionError when a rounded coefficient is exactly 1.00.
        with np.errstate(divide="ignore"):
            return float(np.arctanh(r))

    def parse_coefficient(cell):
        """Return the leading numeric token of a cell, ignoring any '(p=...)' suffix."""
        return float(str(cell).split()[0])

    def compute_t_value(r_matched, r_mismatched, N):
        """Compute t-value and two-sided p-value for the r-difference."""
        z_matched = fisher_r_to_z(r_matched)
        z_mismatched = fisher_r_to_z(r_mismatched)

        # NOTE(review): this is the standard error for two independent
        # correlations; confirm that is intended, since both coefficients
        # appear to come from the same sample.
        SE = np.sqrt((1 / (N - 3)) + (1 / (N - 3)))
        t_value = (z_matched - z_mismatched) / SE
        # Survival function instead of 1 - cdf: avoids cancellation to exactly
        # 0.0 for large statistics.
        p_value = 2 * t.sf(abs(t_value), dof)

        return t_value, p_value

    # Dynamically construct the matched-mismatched AES columns for the given product
    matched_mismatched_pairs = {
        "openness_score": (f"aes_{product_num}_openness", f"aes_{product_num}_agreeableness"),
        "conscientiousness_score": (f"aes_{product_num}_conscientiousness", f"aes_{product_num}_extraversion"),
        "extraversion_score": (f"aes_{product_num}_extraversion", f"aes_{product_num}_neuroticism"),
        "agreeableness_score": (f"aes_{product_num}_agreeableness", f"aes_{product_num}_openness"),
        "neuroticism_score": (f"aes_{product_num}_neuroticism", f"aes_{product_num}_conscientiousness")
    }

    # Initialize results DataFrame
    results = pd.DataFrame(
        index=list(matched_mismatched_pairs.keys()),
        columns=["r_difference", "t_value", "p_value"],
    )

    # Compute r-difference for each trait
    for trait, (matched_col, mismatched_col) in matched_mismatched_pairs.items():
        has_data = (
            trait in synths_results.index
            and matched_col in synths_results.columns
            and mismatched_col in synths_results.columns
        )
        if not has_data:
            print(f"Missing data for {trait} in product {product_num}. Check if columns exist in synths_results.")
            continue

        # Extract coefficients (ignoring p-values in parentheses)
        r_matched = parse_coefficient(synths_results.loc[trait, matched_col])
        r_mismatched = parse_coefficient(synths_results.loc[trait, mismatched_col])

        # Compute r-difference on the raw (untransformed) coefficients
        r_diff = r_matched - r_mismatched

        # Compute t-test values
        t_val, p_val = compute_t_value(r_matched, r_mismatched, N)

        # Store results
        results.loc[trait, "r_difference"] = f'{r_diff:.2f}'
        results.loc[trait, "t_value"] = f'{t_val:.2f}'
        results.loc[trait, "p_value"] = p_val

    return results



# Build one r-difference table per product (1, 2, 3) from its regression results;
# the three public names are bound in product order.
(
    r_difference_synths_product1,
    r_difference_synths_product2,
    r_difference_synths_product3,
) = [
    compute_r_difference_synths(regression_results[product], product_num=product, N=373)
    for product in (1, 2, 3)
]

# Show each table under its own heading, in product order
for _rdiff_heading, _rdiff_table in (
    ("R-Difference Table for Synths Twins (Product 1)", r_difference_synths_product1),
    ("R-Difference Table for Synths Twins (Product 2)", r_difference_synths_product2),
    ("R-Difference Table for Synths Twins (Product 3)", r_difference_synths_product3),
):
    print(_rdiff_heading)
    display(_rdiff_table)


R-Difference Table for Synths Twins (Product 1)


Unnamed: 0,r_difference,t_value,p_value
openness_score,0.14,1.94,0.053089
conscientiousness_score,0.38,5.25,0.0
extraversion_score,0.14,2.0,0.046653
agreeableness_score,0.45,6.25,0.0
neuroticism_score,0.24,3.29,0.001115


R-Difference Table for Synths Twins (Product 2)


Unnamed: 0,r_difference,t_value,p_value
openness_score,0.19,2.66,0.008226
conscientiousness_score,0.31,4.25,2.7e-05
extraversion_score,0.05,0.69,0.489983
agreeableness_score,0.38,5.25,0.0
neuroticism_score,0.36,4.95,1e-06


R-Difference Table for Synths Twins (Product 3)


Unnamed: 0,r_difference,t_value,p_value
openness_score,0.36,5.09,1e-06
conscientiousness_score,0.19,2.61,0.009387
extraversion_score,0.25,3.46,0.000609
agreeableness_score,0.2,2.73,0.006598
neuroticism_score,0.13,1.77,0.076833
