<h2>Data cleaning & validation</h2>
<h3>1. Detect completion time outliers beyond 3 SDs from the mean</h3>


In [None]:
import pandas as pd

def detect_outlier_participants(data, n_std=3):
    """Split participants by whether their completion time is an outlier.

    Completion time is ``EndDate - StartDate`` in seconds. A participant is an
    outlier when that duration lies more than ``n_std`` standard deviations
    below or above the sample mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``StartDate`` and ``EndDate`` columns that
        ``pd.to_datetime`` can parse. The frame passed in is not modified.
    n_std : int or float, default 3
        Width of the accepted band, in standard deviations around the mean.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``(below, above, within)``: rows below the lower threshold, rows above
        the upper threshold, and rows inside the band (bounds inclusive).
        Each frame carries the parsed dates and a ``Duration`` column.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    data = data.copy()
    data['StartDate'] = pd.to_datetime(data['StartDate'])
    data['EndDate'] = pd.to_datetime(data['EndDate'])

    # Survey duration in seconds
    data['Duration'] = (data['EndDate'] - data['StartDate']).dt.total_seconds()

    mean_duration = data['Duration'].mean()
    std_duration = data['Duration'].std()

    lower_threshold = mean_duration - n_std * std_duration
    upper_threshold = mean_duration + n_std * std_duration

    below_3_std = data[data['Duration'] < lower_threshold]
    above_3_std = data[data['Duration'] > upper_threshold]
    # `between` is inclusive on both ends, i.e. lower <= Duration <= upper.
    participants_within_3_std = data[data['Duration'].between(lower_threshold, upper_threshold)]

    print(mean_duration)
    # Report counts for the frame that was passed in, not for a notebook global.
    print(f'before: {len(data)}, after: {len(participants_within_3_std)}')

    return below_3_std, above_3_std, participants_within_3_std


# Load the raw participant export; the path is relative to this notebook's directory.
df = pd.read_csv('../../data/prolific_participants_dataset.csv')

# Split into duration outliers (below / above the band) and the participants kept for later steps.
below_3_std, above_3_std, participants_within_3_std = detect_outlier_participants(df)

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


<h3>2. Detect participants failing multiple attention checks</h3>

In [None]:
def detect_participants_with_multiple_failed_attention_checks(df):
    """Return the participants who failed more than one attention check.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four check columns listed in ``checks`` below.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows of a copy of ``df`` with two or more failed checks, plus a
        ``failed_checks`` column holding the number of failed checks.
    """
    # Check column -> the response that counts as a pass.
    checks = {
        'pq_attention_check_1': 5, 
        'pq_attention_check_2': 2, 
        'p_3_agree_att_check': 2, 
        'seriousness_check': 2
    }

    df = df.copy()
    # Vectorised comparison of each check column against its expected answer
    # (the Series index is aligned with the column names). A missing response
    # is never equal to the expected value, so it counts as a failed check.
    expected = pd.Series(checks)
    df['failed_checks'] = df[list(checks)].ne(expected, axis='columns').sum(axis=1)
    failed_participants = df[df['failed_checks'] > 1]

    return failed_participants

# Participants inside the duration band who failed more than one attention check.
participants = detect_participants_with_multiple_failed_attention_checks(participants_within_3_std)

<h3>3. Exclude participants failing attention checks and suspected <a href="https://www.qualtrics.com/support/survey-platform/survey-module/survey-checker/fraud-detection/#BotDetection">bots</a> (keep only Q_RecaptchaScore >= 0.5)</h3>

In [None]:
def exclude_failed_attention_checks_and_bots(df):
    """Keep only rows that pass every attention check and the reCAPTCHA cut-off.

    A row is kept when all four check columns hold their expected answer and
    ``Q_RecaptchaScore`` is at least 0.5. Returns an independent copy of the
    kept rows.
    """
    # One boolean Series per criterion, in the order they must all hold.
    criteria = [
        df['pq_attention_check_1'] == 5,
        df['pq_attention_check_2'] == 2,
        df['p_3_agree_att_check'] == 2,
        df['seriousness_check'] == 2,
        df['Q_RecaptchaScore'] >= 0.5,
    ]

    keep = criteria[0]
    for criterion in criteria[1:]:
        keep = keep & criterion

    kept_rows = df[keep]
    return kept_rows.copy()

# Count participants (within the duration band) at or above the 0.5 reCAPTCHA cut-off.
num_passed_captcha = (participants_within_3_std['Q_RecaptchaScore'] >= 0.5).sum()
print(f"Number of participants who passed CAPTCHA: {num_passed_captcha}")

# Count participants below the cut-off. A missing score satisfies neither
# comparison, so such rows appear in neither count.
num_likely_bots = (participants_within_3_std['Q_RecaptchaScore'] < 0.5).sum()
print(f"Number of likely bots (reCAPTCHA score < 0.5): {num_likely_bots}")

# Analysis sample: passes all four attention checks and the reCAPTCHA cut-off.
df_filtered = exclude_failed_attention_checks_and_bots(participants_within_3_std)

<h1>Analysis</h1>
<h3>1. Participants Demographics</h3>

In [None]:
import sys
sys.path.append("../..")  

from utils.mappings import MAPPINGS


def summarize_us_participants(df):
    """Build a one-row demographic summary (sample size, age, gender counts).

    Reads the ``age`` and ``gender`` columns of ``df``. Gender codes are
    translated through ``MAPPINGS['gender']`` before counting. Returns a dict
    with keys ``Country``, ``N``, ``M``, ``SD`` and one count per gender label.
    """
    age = df['age']
    summary = {
        'Country': 'USA',
        'N': len(df),
        'M': round(age.mean(), 2),
        'SD': round(age.std(), 2),
    }

    gender_counts = df['gender'].map(MAPPINGS['gender']).value_counts()

    # Output key -> label looked up in the mapped gender counts. The last
    # output key differs from the label it is read from.
    label_for_key = {
        'Female': 'Female',
        'Male': 'Male',
        'Non-binary / third gender': 'Non-binary / third gender',
        'Prefer not to answer': 'Prefer not to say',
    }
    for key, label in label_for_key.items():
        # Labels absent from the data count as zero.
        summary[key] = gender_counts.get(label, 0)

    return summary

# Summarise demographics for the full raw sample and for the sample left after exclusions.
summary_all = summarize_us_participants(df)
summary_filtered = summarize_us_participants(df_filtered)

# One row per group; the summary dict keys become the remaining columns.
us_summary_table = pd.DataFrame([
    {'Group': 'All participants', **summary_all},
    {'Group': 'Participants eligible for analyses after exclusions', **summary_filtered}
])

# Last expression of the cell: rendered as the cell output.
us_summary_table


<h3>2. Calculate Personality Traits (Mini-IPIP) Scores</h3>

In [None]:
def calculate_mini_ipip_scores(dataframe):
    """Compute the five Mini-IPIP trait scores as the mean of four items each.

    Columns whose name contains ``reverse_score`` are replaced by
    ``6 - response`` in the returned frame before averaging. Any item value
    that is then not within 1-5 (including a missing value) triggers a
    warning and is set to NaN, so it is skipped by the row mean.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the 20 item columns listed in ``traits``. Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with reverse-scored items rewritten and one
        ``<trait>_score`` column added per trait.
    """
    traits = {
        "extraversion": ["pq_1", "pq_6_reverse_score", "pq_11", "pq_16_reverse_score"],
        "agreeableness": ["pq_2", "pq_7_reverse_score", "pq_12", "pq_17_reverse_score"],
        "conscientiousness": ["pq_3", "pq_8_reverse_score", "pq_13", "pq_18_reverse_score"],
        "neuroticism": ["pq_4", "pq_9_reverse_score", "pq_14", "pq_19_reverse_score"],
        "openness": ["pq_5", "pq_10_reverse_score", "pq_15_reverse_score", "pq_20_reverse_score"]
    }

    df = dataframe.copy()

    # Reverse scoring: apply 6 - response to the flagged items.
    for questions in traits.values():
        for q in questions:
            if "reverse_score" in q:
                df[q] = 6 - df[q]

    # Mean score per trait (1-5 scale)
    for trait, questions in traits.items():
        for q in questions:
            in_range = df[q].between(1, 5)
            if not in_range.all():
                print(f"Warning: Found values outside 1-5 range in {q}")
                # `where` keeps in-range answers and blanks the rest to NaN.
                # It needs no numpy import (numpy is only imported in a later
                # cell) and upcasts integer columns cleanly.
                df[q] = df[q].where(in_range)

        df[f"{trait}_score"] = df[questions].mean(axis=1)

    return df

def drop_invalid_trait_scores(df):
    """Return ``df`` without the listed pre-existing trait score columns.

    Columns from the list that are not present are ignored rather than
    raising. The input frame is left untouched.
    """
    stale_score_columns = (
        "intellectImagination_score",
        "extraversion_score",
        "agreeableness_score",
        "conscientiousness_score",
        "neuroticism_score",
    )
    return df.drop(columns=list(stale_score_columns), errors='ignore')

def validate_scores(df):
    """Check that all five ``<trait>_score`` columns are complete and within 1-5.

    Prints one message per problem found and returns ``True`` only when no
    trait has an out-of-range or missing score. A missing score fails the
    range test as well, so it produces both messages.
    """
    all_valid = True

    for trait in ("extraversion", "agreeableness", "conscientiousness",
                  "neuroticism", "openness"):
        scores = df[f"{trait}_score"]

        in_range = scores.between(1, 5).all()
        if not in_range:
            print(f"Error: {trait} scores outside valid range (1-5)")

        has_missing = scores.isna().any()
        if has_missing:
            print(f"Warning: Found missing values in {trait} scores")

        if has_missing or not in_range:
            all_valid = False

    return all_valid

# Calculate new scores: drop any pre-existing score columns, then recompute them.
# NOTE: calculate_mini_ipip_scores rewrites the `*_reverse_score` items as 6 - x in
# the frame it returns, and that frame is bound back to `df_filtered`. Re-running
# this cell without re-running the cells above reverses those items a second time.
df_filtered = calculate_mini_ipip_scores(drop_invalid_trait_scores(df_filtered.copy()))

# Validate scores: stop the notebook if any trait score is missing or outside 1-5.
if not validate_scores(df_filtered):
    raise ValueError("Score validation failed! Please check the input data for inconsistencies.")

# Last expression of the cell: renders the scored frame as the cell output.
df_filtered

<h3>3. Evaluate internal consistency (Cronbach's Alpha) of Mini-IPIP responses</h3>

In [None]:
import pingouin as pg

def calculate_mini_ipip_cronbach_alphas(df):
    """Compute Cronbach's alpha for each Mini-IPIP trait from its four items.

    Returns a dict mapping the capitalised trait name to the first element of
    the ``pg.cronbach_alpha`` result for that trait's item columns.
    """
    # Item numbers whose column name carries the `_reverse_score` suffix.
    suffixed_items = {6, 7, 8, 9, 10, 15, 16, 17, 18, 19, 20}

    def item_column(number):
        # Column name for a given item number.
        return f"pq_{number}_reverse_score" if number in suffixed_items else f"pq_{number}"

    item_numbers_by_trait = {
        "Extraversion": (1, 6, 11, 16),
        "Agreeableness": (2, 7, 12, 17),
        "Conscientiousness": (3, 8, 13, 18),
        "Neuroticism": (4, 9, 14, 19),
        "Openness": (5, 10, 15, 20),
    }

    return {
        trait: pg.cronbach_alpha(data=df[[item_column(n) for n in numbers]])[0]
        for trait, numbers in item_numbers_by_trait.items()
    }

# Calculate Cronbach's alpha for all traits on the analysis sample.
alphas = calculate_mini_ipip_cronbach_alphas(df_filtered)

# Report each trait's alpha to three decimals.
for trait, alpha in alphas.items():
    print(f"Cronbach's alpha for {trait}: {alpha:.3f}")

<h3>4. Evaluate internal consistency (Cronbach's Alpha) for Advertisement Effectiveness Scores</h3>

In [None]:
import pingouin as pg

def calculate_aes_cronbach_alphas(df):
    """Compute Cronbach's alpha for every product x trait AES item set.

    For each of the 3 products and 5 traits, the six item columns
    ``p_<product>_<abbr>_item_1`` .. ``_item_6`` are passed to
    ``pg.cronbach_alpha``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the AES item columns.

    Returns
    -------
    dict
        ``"Product <n> - <Trait>"`` -> the first element of the
        ``pg.cronbach_alpha`` result, or ``None`` when any of the six item
        columns is missing from ``df``.
    """
    # Trait label -> abbreviation used inside the item column names.
    trait_prefixes = {
        "Openness": "openness",
        "Conscientiousness": "consc",
        "Extraversion": "extr",
        "Agreeableness": "agree",
        "Neuroticism": "neuro",
    }

    # Same keys, order and column names as 15 hand-written lists would give.
    trait_items = {
        f"Product {product_num} - {trait_label}": [
            f"p_{product_num}_{prefix}_item_{item_num}" for item_num in range(1, 7)
        ]
        for product_num in (1, 2, 3)
        for trait_label, prefix in trait_prefixes.items()
    }

    alphas = {}
    for trait, item_columns in trait_items.items():
        # Check for missing columns BEFORE indexing: df[item_columns] raises
        # KeyError on a missing column, which would bypass this fallback.
        if not all(col in df.columns for col in item_columns):
            print("ALERT!!!")
            alphas[trait] = None  # Assign None if any column is missing
            continue

        # Calculate Cronbach's alpha using pingouin
        alpha = pg.cronbach_alpha(data=df[item_columns])
        alphas[trait] = alpha[0]

    return alphas



# Cronbach's alpha for every product x trait item set (rebinds `alphas`).
alphas = calculate_aes_cronbach_alphas(df_filtered)

# A value of None marks an item set with at least one missing column.
for product, alpha in alphas.items():
    if alpha is not None:
        print(f"Cronbach's Alpha for {product}: {alpha:.3f}")
    else:
        print(f"Cronbach's Alpha for {product}: Data missing")


<h3>5. Aggregate Advertisement Effectiveness Scores (AES) to derive Dependent Variables: AES by trait and product</h3>

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

def calculate_raw_and_residualized_aes(dataframe):
    """Add raw and residualized AES columns for products 1-3 and return the frame.

    Raw AES (``aes_<product>_<trait>``) is the row mean of every column whose
    name starts with ``p_<product>_<abbr>_item_``. Residualized AES
    (``aes_resd_<product>_<trait>``) is the residual of a linear regression
    of that raw score on the raw scores of the other traits available for
    the same product. Columns are written onto ``dataframe`` itself, and that
    same object is returned.
    """
    # Full trait name -> abbreviation used inside the item column names.
    prefix_by_trait = {
        "openness": "openness",
        "conscientiousness": "consc",
        "extraversion": "extr",
        "agreeableness": "agree",
        "neuroticism": "neuro",
    }

    for product_num in (1, 2, 3):
        # Raw AES Series for this product, keyed by trait, in trait order.
        raw_by_trait = {}

        # Pass 1: raw AES = mean over the matching item columns.
        for trait_name, trait_prefix in prefix_by_trait.items():
            item_cols = [
                name for name in dataframe.columns
                if name.startswith(f"p_{product_num}_{trait_prefix}_item_")
            ]
            if not item_cols:
                print(f"No valid columns found for product {product_num}, trait {trait_name}!")
                continue

            dataframe[f"aes_{product_num}_{trait_name}"] = dataframe[item_cols].mean(axis=1, skipna=True)
            raw_by_trait[trait_name] = dataframe[f"aes_{product_num}_{trait_name}"]

        # Pass 2: regress each raw score on the other traits' raw scores.
        for target_trait, outcome in raw_by_trait.items():
            covariates = [
                scores
                for other_trait, scores in raw_by_trait.items()
                if other_trait != target_trait
            ]

            if not covariates:
                # Nothing to partial out: keep the raw score as it is.
                dataframe[f"aes_{product_num}_{target_trait}"] = outcome
                print(f"Could not residualize AES for product {product_num}, trait {target_trait} due to missing predictors.")
                continue

            design = np.column_stack(covariates)
            fitted = LinearRegression().fit(design, outcome)
            dataframe[f"aes_resd_{product_num}_{target_trait}"] = outcome - fitted.predict(design)

    return dataframe

# Add raw and residualized AES columns; a copy is passed because the function writes onto its argument.
df_filtered = calculate_raw_and_residualized_aes(df_filtered.copy())


In [None]:
# Persist the analysis sample (with trait scores and AES columns) without the index column.
df_filtered.to_csv('../../data/filtered_participants_dataset.csv', index=False)

<h3>6. Regression Analysis: using Big Five personality traits to predict respondents’ AES scores</h3>

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

def perform_regression_on_personality(df, aes_type='raw'):
    """Regress each AES outcome on the five trait scores, per product.

    Predictors and outcome are z-standardised before fitting an OLS model
    with an intercept, so the reported coefficients are standardised betas.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five ``<trait>_score`` columns and, for products
        1-3, the ``aes_<n>_<trait>`` or ``aes_resd_<n>_<trait>`` columns.
    aes_type : {'raw', 'resd'}, default 'raw'
        Which family of AES columns to use as outcomes.

    Returns
    -------
    tuple of three pandas.DataFrame
        Results for products 1, 2 and 3. Rows are the trait score columns,
        columns are the AES outcomes, and cells are ``"beta (p-value)"``.

    Raises
    ------
    ValueError
        If ``aes_type`` is neither ``'raw'`` nor ``'resd'``.
    """
    if aes_type == 'raw':
        aes_suffix = 'aes'
    elif aes_type == 'resd':
        aes_suffix = 'aes_resd'
    else:
        # Fail with a clear message instead of an unbound-variable error later.
        raise ValueError("aes_type must be 'raw' or 'resd'.")

    trait_names = ['extraversion', 'agreeableness', 'conscientiousness', 'neuroticism', 'openness']

    # Personality trait columns (predictors)
    traits_cols = [f'{trait}_score' for trait in trait_names]

    # AES columns (outcomes) grouped by product
    product_aes_cols = {
        f"p{product_num}": [f'{aes_suffix}_{product_num}_{trait}' for trait in trait_names]
        for product_num in (1, 2, 3)
    }

    # The predictor matrix is the same for every outcome, so build it once.
    X = StandardScaler().fit_transform(df[traits_cols])
    # has_constant='add' always prepends the intercept column, so params[1:]
    # lines up with traits_cols even if a standardised predictor is constant.
    X = sm.add_constant(X, has_constant='add')

    results_dict = {}

    for product, aes_cols in product_aes_cols.items():
        results = pd.DataFrame(index=traits_cols, columns=aes_cols)

        for aes_col in aes_cols:
            y = StandardScaler().fit_transform(df[[aes_col]]).flatten()

            model = sm.OLS(y, X).fit()

            # Skip the intercept (first entry).
            coefficients = model.params[1:]
            p_values = model.pvalues[1:]

            # A coefficient that rounds to zero is printed as 0.00, not -0.00.
            results[aes_col] = [
                f"{0.00 if round(coeff, 2) == 0 else coeff:.2f} ({pval:.4f})"
                for coeff, pval in zip(coefficients, p_values)
            ]

        results_dict[product] = results

    return results_dict["p1"], results_dict["p2"], results_dict["p3"]


# Raw AES: one coefficient table per product.
raw_results_p1, raw_results_p2, raw_results_p3 = perform_regression_on_personality(df_filtered, aes_type='raw')

print("Human Participants: Regression Coefficient Matrix – P1: Cabin luggage")
display(raw_results_p1)
print("Human Participants: Regression Coefficient – P2: Packing Cubes")
display(raw_results_p2)
print("Human Participants: Regression Coefficient – P3:Water Bottle")
display(raw_results_p3)

# Residualized AES: same regressions on the `aes_resd_*` outcomes.
resd_results_p1, resd_results_p2, resd_results_p3 = perform_regression_on_personality(df_filtered, aes_type='resd')

print("Human Participants: Regression Coefficient Matrix – P1: Cabin luggage (Residualized AES)")
display(resd_results_p1)
print("Human Participants: Regression Coefficient – P2: Packing Cubes (Residualized AES)")
display(resd_results_p2)
print("Human Participants: Regression Coefficient – P3:Water Bottle (Residualized AES)")
display(resd_results_p3)


<h3>7. Store filtered_participants_dataset_with_aes_scores</h3>

In [None]:
import json
# Kept from the original cell: rebinds `df` to a fresh copy of the raw export.
# This raw frame is NOT what gets saved below.
df = pd.read_csv('../../data/prolific_participants_dataset.csv')

# Save the filtered participants with their trait and AES scores. Writing the
# raw `df` here would overwrite the filtered dataset file with unfiltered data.
df_filtered.to_csv('../../data/filtered_participants_dataset.csv', index=False)

<h3>7. Compute Pearson Correlations for Fisher's z test</h3>

In [None]:
import pandas as pd
import pingouin as pg

def compute_correlations_with_pvalues(df, aes_type='raw'):
    """Correlate every trait score with every AES outcome, per product.

    Uses ``pg.corr`` for each (trait score, AES column) pair. Returns
    ``(results, p_vals)``. ``results`` maps ``"p1"``/``"p2"``/``"p3"`` to a
    DataFrame of ``"r (p=...)"`` strings (rows = trait scores, columns = AES
    outcomes). ``p_vals`` maps the same keys to the flat list of p-values in
    row-major order. Raises ``ValueError`` for an unknown ``aes_type``.
    """
    # Column prefix of the requested AES family.
    if aes_type == 'resd':
        aes_suffix = 'aes_resd'
    elif aes_type == 'raw':
        aes_suffix = 'aes'
    else:
        raise ValueError("aes_type must be 'raw' or 'resd'.")

    trait_names = ('extraversion', 'agreeableness', 'conscientiousness',
                   'neuroticism', 'openness')
    traits_cols = [f'{name}_score' for name in trait_names]

    results, p_vals = {}, {}

    for product_num in (1, 2, 3):
        product = f"p{product_num}"
        aes_cols = [f'{aes_suffix}_{product_num}_{name}' for name in trait_names]

        table = pd.DataFrame(index=traits_cols, columns=aes_cols)
        collected_p_values = []

        # Row-major walk: every AES column for the first trait, then the next trait.
        pairs = ((trait_col, aes_col) for trait_col in traits_cols for aes_col in aes_cols)
        for trait_col, aes_col in pairs:
            stats = pg.corr(df[trait_col], df[aes_col])
            correlation = stats["r"].values[0]
            p_value = stats["p-val"].values[0]
            table.loc[trait_col, aes_col] = f"{correlation:.2f} (p={p_value:.6f})"
            collected_p_values.append(p_value)

        results[product] = table
        p_vals[product] = collected_p_values

    return results, p_vals

# Correlations on the raw AES outcomes; `p_vals` holds the p-values as flat lists per product.
results, p_vals = compute_correlations_with_pvalues(df_filtered, aes_type='raw')


# Display results
print("Pearson Correlations with p-values - Product 1")
display(results["p1"])
print("Pearson Correlations with p-values - Product 2")
display(results["p2"])
print("Pearson Correlations with p-values - Product 3")
display(results["p3"])

<h3>8. Store p-values for FDR correction</h3>

In [None]:
import json

# Collect the per-product p-value lists under the key names written to the JSON file.
# NOTE(review): the keys are spelled "correlatins"; they are output data, so confirm
# what any reader of this file expects before renaming them.
pvals = {
    "p1_human_correlatins_analysis": p_vals["p1"],
    "p2_human_correlatins_analysis": p_vals["p2"],
    "p3_human_correlatins_analysis": p_vals["p3"],
}

# Save pvals as indented JSON; the path is relative to this notebook's directory.
with open("../p_value_correction/human_correlations_pvals.json", "w") as f:
    json.dump(pvals, f, indent=4)