In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import  kruskal, mannwhitneyu, wilcoxon, skew, rankdata
from mapper import continent_mapper, politicians_db
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any raised by pandas/scipy while the analyses below run.
warnings.filterwarnings('ignore')

# Set style for plots: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

In [61]:
# Rating columns analysed by the Phase 1 methods, in reporting order.
key_traits = [
    "competence", "likable", "trustworthy", "approachable", "charismatic",
    "aggressive", "friendly", "professional",
    "visually_appealing", "professionally_designed",
    "aggressive_poster", "inspirational", "manipulative", "boring", "misleading",
    "match_pol_comm", "credible_typo_color", "background", "typo_attention",
    "support", "interested_in",
]

# Traits whose reported direction is taken from the rank-biserial sign
# instead of from a comparison of group means.
skew_traits = [
    'misleading',
    'aggressive_poster',
    'aggressive',
    'manipulative',
    'inspirational',
    'professional',
]

In [62]:
def epsilon_interpretation(epsilon_sq):
    """
    Print a verbal interpretation of an effect size value.

    Parameters:
    epsilon_sq: float, the effect size computed by the caller

    Cut-offs: below 0.01 negligible, below 0.06 small, below 0.14 medium,
    otherwise large. A NaN value cannot be classified and is reported as such.

    Returns:
    None (the interpretation is printed)
    """
    # NaN compares False against every threshold, so without this guard it
    # would fall through to the "Large" branch. NaN is the only value that is
    # not equal to itself.
    if epsilon_sq != epsilon_sq:
        print("Effect size could not be computed (NaN).")
        return

    if epsilon_sq < 0.01:
        print("Negligible effect size (very small practical impact).")
    elif epsilon_sq < 0.06:
        print("Small effect size (small practical impact).")
    elif epsilon_sq < 0.14:
        print("Medium effect size (moderate practical impact).")
    else:
        print("Large effect size (substantial practical impact).")
        
def rank_biserial_matched_pairs(before, after):
    """
    Calculate matched-pairs rank-biserial correlation (Wilcoxon signed-rank test effect size)

    Parameters:
    before, after: array-like of equal length, paired observations.
        Missing values may be given as NaN or None.

    Returns:
    r_rb: float, matched-pairs rank-biserial correlation (-1 to 1).
        Positive when `after` tends to exceed `before`; 0.0 when no pair differs.

    Raises:
    ValueError: if the two inputs do not have the same shape.
    """
    # Coerce to float so that None becomes NaN and integer/object input is
    # handled uniformly by np.isnan below.
    before = np.asarray(before, dtype=float)
    after = np.asarray(after, dtype=float)
    if before.shape != after.shape:
        raise ValueError("before and after must contain the same number of observations")

    # Keep only pairs where both values are present
    valid_pairs = ~(np.isnan(before) | np.isnan(after))
    differences = after[valid_pairs] - before[valid_pairs]

    # Zero differences carry no direction and are dropped before ranking
    non_zero_diff = differences[differences != 0]
    if non_zero_diff.size == 0:
        return 0.0

    # Rank the absolute differences, then sum the ranks by sign
    ranks = rankdata(np.abs(non_zero_diff))
    W_pos = ranks[non_zero_diff > 0].sum()
    W_neg = ranks[non_zero_diff < 0].sum()

    # Share of rank mass favouring an increase minus the share favouring a decrease
    return float((W_pos - W_neg) / (W_pos + W_neg))

In [None]:
class Phase1Analyser:
    def __init__(self, phase1_files):
        """
        Set up the analyser with the lookup tables and the Phase 1 data.

        phase1_files is handed unchanged to load_and_clean_data and the
        combined result is stored on self.phase1_df.
        """
        # The lookup tables are attached first because load_and_clean_data
        # reads both of them from self.
        self.continent_mapping, self.politician_info = continent_mapper, politicians_db
        self.phase1_df = self.load_and_clean_data(phase1_files)
    
    
    def load_and_clean_data(self, filepath):
        """
        Load and clean the data, with politician identification
        """
        dfs = []  # list to collect all DataFrames

        for path in filepath:
            df = pd.read_csv(path)

            # Extract politician short name from file path, e.g., "phase1_data/albanese_phase1_data.csv"
            politician_name = path.split("/")[1].split('_')[0].lower()
            df['politician_id'] = politician_name
            
                   
            df = df.drop(columns=['msg_understand','other_msg_understand'])

            # Add continent based on country
            df['continent'] = df['country'].map(self.continent_mapping)

            # Add politician attributes
            for idx, row in df.iterrows():
                pid = row.get('politician_id', 'unknown')
                if pid in self.politician_info:
                    pdata = self.politician_info[pid]
                    df.loc[idx, 'politician_name'] = pdata['name']
                    df.loc[idx, 'politician_country'] = pdata['country']
                    df.loc[idx, 'politician_continent'] = pdata['continent']
                    df.loc[idx, 'ideology'] = pdata['ideology']
                    df.loc[idx, 'popularity_score'] = pdata['popularity_score']
                    df.loc[idx, 'incumbent_status'] = pdata['incumbent_status']
                    df.loc[idx, 'same_country'] = row['country'] == pdata['country']
                    df.loc[idx, 'same_continent'] = df.loc[idx, 'continent'] == pdata['continent']

            dfs.append(df)

        # Combine all individual DataFrames into one
        combined_df = pd.concat(dfs, ignore_index=True)
        return combined_df
    
    
    def analyze_phase1_baseline_perceptions(self):
            """
            Phase 1: Baseline perceptions of original posters (1-5 scale ratings)
            This establishes baseline perceptions before manipulation
            """
            print(f"\n{'='*80}")
            print("PHASE 1: BASELINE PERCEPTIONS ANALYSIS")
            print(f"{'='*80}")
            print("Phase 1 provides baseline ratings of ORIGINAL posters only")
            print("Used for: Controlling baseline differences, geographical effects, familiarity effects")
            
            if self.phase1_df.empty:
                print("No Phase 1 data available")
                return
            
            # Key traits for RQ1-RQ3
            available_traits = [col for col in key_traits if col in self.phase1_df.columns]
            
            if not available_traits:
                print("Key trait columns not found in Phase 1 data")
                print("Available columns:", list(self.phase1_df.columns))
                return
            
            print(f"Analyzing baseline traits: {available_traits}")
            
            # 1. Overall baseline statistics
            print(f"\n=== BASELINE DESCRIPTIVE STATISTICS ===")
            for trait in available_traits:
                trait_data = self.phase1_df[self.phase1_df[trait] != -99][trait].dropna()
                if len(trait_data) > 0:
                    trait_skew = skew(trait_data)
                    mean = trait_data.mean()
                    median = trait_data.median()
                    sd = trait_data.std()
                    min_val = trait_data.min()
                    max_val = trait_data.max()
                    print(f"{trait.capitalize()}: Mean = {mean:.2f}, Median = {median:.2f}, SD = {sd:.2f}, "
                        f"Range = {min_val:.1f}-{max_val:.1f}, n = {len(trait_data)}, Skewness = {trait_skew:.2f}")
            
            # 2. Differences in perceptions by politician
            if 'politician_name' in self.phase1_df.columns:
                politicians = self.phase1_df['politician_name'].unique()
                politicians = [p for p in politicians if not pd.isna(p)]
                
                if len(politicians) > 1:
                    print(f"\n=== BASELINE DIFFERENCES BY POLITICIAN ===")
                    for trait in available_traits:
                        print(f"\n{trait.capitalize()} by politician:")
                        politician_data = []
                        
                        for politician in politicians:
                            pol_data = self.phase1_df[(self.phase1_df['politician_name'] == politician)
                                                      & (self.phase1_df[trait] != -99)
                                                      ][trait].dropna()
                            if len(pol_data) >= 3:
                                politician_data.append(pol_data)
                                print(f"  {politician}: M = {pol_data.mean():.2f}, SD = {pol_data.std():.2f}, n = {len(pol_data)}")
                        
                        if len(politician_data) >= 2:
                            try:
                                stat, p_val = kruskal(*politician_data)
                                print(f"  Statistical test result: p = {p_val:.3f}")
                                if p_val < 0.05:
                                    print("  There are clear differences in how participants rate this trait for different politicians.")
                                    N = sum(len(p) for p in politician_data)
                                    k = len(politician_data)
                                    epsilon_sq = (stat - k + 1) / (N - k)
                                    print(f"  Effect size (epsilon squared): {epsilon_sq:.3f}")
                                    epsilon_interpretation(epsilon_sq)

                                else:
                                    print("  No meaningful differences in ratings between politicians for this trait.")
                            except Exception as e:
                                print(f"  Error in test: {e}")
            
            # 3. Geographical baseline differences (for RQ4)
            self.analyze_phase1_geographical_baselines(available_traits)
            
            # 4. Familiarity effects on baseline
            self.analyze_phase1_familiarity_effects(available_traits)
            
            # 5. Opinion shift after viewing posters
            self.analyze_phase1_opinion_shift()
            
            self.analyze_phase1_understanding_effects(available_traits)
            
            self.analyze_phase1_political_interest(available_traits)
            

    def analyze_phase1_geographical_baselines(self, traits):
        """
        Analyze geographical differences in baseline perceptions
        Important for RQ4 - do continents differ in baseline perceptions?
        """
        print(f"\n=== GEOGRAPHICAL EFFECT ON PERCEPTION (BASELINE) ===")
        
        if 'continent' not in self.phase1_df.columns:
            print("No continent data available")
            return
        
        continents = self.phase1_df['continent'].value_counts()
        continents = continents[continents >= 3]  # Minimum 3 per continent
        
        if len(continents) < 2:
            print("Insufficient data for continental comparison")
            return
        
        print(f"Comparing continents: {list(continents.index)}")
        
        for trait in traits:
            print(f"\n{trait.capitalize()} overall perception analysis - Participant vs Politician Origin:")
            
            # Get all participant-politician pairs with trait ratings
            trait_data = self.phase1_df[(self.phase1_df[trait].notna())
                                        & (self.phase1_df[trait] != -99)
                                        ].copy()
            
            # Separate same vs different continent participant-politician pairs
            same_continent_pairs = trait_data[trait_data['same_continent'] == True]
            diff_continent_pairs = trait_data[trait_data['same_continent'] == False]
            
            if len(same_continent_pairs) >= 5 and len(diff_continent_pairs) >= 5:
                # Calculate how participants rate politicians from same vs different continents
                same_continent_rating = same_continent_pairs[trait].mean()
                same_continent_std = same_continent_pairs[trait].std()
                same_continent_n = len(same_continent_pairs)
                same_continet_min = same_continent_pairs[trait].min()
                same_continet_max = same_continent_pairs[trait].max()
                
                diff_continent_rating = diff_continent_pairs[trait].mean()
                diff_continent_std = diff_continent_pairs[trait].std()
                diff_continent_n = len(diff_continent_pairs)
                diff_continent_min = diff_continent_pairs[trait].min()
                diff_continent_max = diff_continent_pairs[trait].max()
                
                print(f"  Same continent (participant-politician): M = {same_continent_rating:.2f}, SD = {same_continent_std:.2f}, n = {same_continent_n}, Range = {same_continet_min:.1f}-{same_continet_max:.1f}")
                print(f"  Different continent (participant-politician): M = {diff_continent_rating:.2f}, SD = {diff_continent_std:.2f}, n = {diff_continent_n}, Range = {diff_continent_min:.1f}-{diff_continent_max:.1f}")
        
                
                # Statistical test comparing same vs different continent ratings
                try:
                    stat, p_val = mannwhitneyu(same_continent_pairs[trait].dropna(), 
                                            diff_continent_pairs[trait].dropna(), 
                                            alternative='two-sided')
                    # H₀: There is no difference in ratings between same and different continents.
                    # H₁: There is a difference (but not specifying direction)
                    print(f"  Mann-Whitney U test: U = {stat:.1f}, p = {p_val:.3f}")
                    
                    
                    if p_val < 0.05:
                        if same_continent_rating > diff_continent_rating:
                            print(f"  **Reject H₀:**: Overall participants rate politicians from their own continent as MORE {trait} (M diff = {same_continent_rating - diff_continent_rating:.2f})**")
                        else:
                            print(f"  **Reject H₀: Overall participants rate politicians from their own continent as LESS {trait} (M diff = {abs(same_continent_rating - diff_continent_rating):.2f})**")
                    else:
                        print(f"  **Fail to reject H₀**: No significant overall cross-cultural bias detected for {trait} perception")
                        
                except Exception as e:
                    print(f"  Error in statistical test: {e}")
            
            else:
                print(f"  Insufficient data: Same continent pairs n={len(same_continent_pairs)}, Different continent pairs n={len(diff_continent_pairs)}")
                print(f"  (Minimum n=5 required for each group)")
            
            # Detailed breakdown by specific continent combinations
            print(f"  \nDetailed continent by continent analysis:")
            
            # Group by participant continent
            continents = trait_data['continent'].unique()
            
            for participant_continent in continents:
                    # เช่น ผู้เข้าร่วมจากยุโรป
                    participant_data = trait_data[trait_data['continent'] == participant_continent]
                    
                    if len(participant_data) >= 5:  # Ensure sufficient data
                        print(f"    \n    Participants from {participant_continent}:")
                        
                        for politician_continents in continents:
                            # เช่น นักการเมืองจากยุโรปในข้อมูลผู้เข้าร่วมจากยุโรป หรือ นักการเมืองจากเอเชียในข้อมูลผู้เข้าร่วมจากยุโรป
                            politician_data = participant_data[participant_data['politician_continent'] == politician_continents]
                            
                            if len(politician_data) >= 3:
                                rating = politician_data[trait].mean()
                                n = len(politician_data)
                                relationship = "SAME CONTINENT" if participant_continent == politician_continents else "DIFFERENT CONTINENT"
                                
                                print(f"      → Rate {politician_continents} politicians: M = {rating:.2f}, n = {n} ({relationship})")
                        
                        # Compare same vs different continent for this participant group 
                        same_ratings = participant_data[participant_data['continent'] == participant_data['politician_continent']]
                        diff_ratings = participant_data[participant_data['continent'] != participant_data['politician_continent']]

                        if len(same_ratings) >= 3 and len(diff_ratings) >= 3:
                            try:
                                data1 = same_ratings[trait].dropna() 
                                data2 = diff_ratings[trait].dropna()
                                
                                
                                U, p_val = mannwhitneyu(data1, data2, alternative='two-sided')
                                n1 = len(data1)
                                n2 = len(data2)

                                # Calculate rank-biserial correlation
                                r_rb = 1 - (2 * U) / (n1 * n2)

                                if p_val < 0.05:
                                    if trait in skew_traits:
                                        bias_direction = "higher" if r_rb > 0 else "lower"
                                    else:
                                        bias_direction = 'higher' if data1.mean() > data2.mean() else 'lower'
                                    print(f"      → **{participant_continent} participants gave {bias_direction} {trait} ratings to politicians from their continent (p = {p_val:.3f}, r_rb = {r_rb:.3f})**")        
                            except Exception as e:
                                pass
                        # Note when use man the result go other ways 

              
    def analyze_phase1_familiarity_effects(self, traits):
        """
        Analyze how familiarity affects baseline perceptions
        """
        print(f"\n=== FAMILIARITY EFFECTS ON PERCEPTION (BASELINE) ===")
        
        if 'familiarity' not in self.phase1_df.columns:
            print("No familiarity data available")
            return
        
        familiarity_levels = self.phase1_df['familiarity'].value_counts()
        print(f"Familiarity distribution: {dict(familiarity_levels)}")
        
        for trait in traits:
            print(f"\n{trait.capitalize()} by familiarity:")
            familiarity_data = []
            familiarity_names = []

            means_dict = {}

            for familiarity in familiarity_levels.index:
                if pd.isna(familiarity):
                    continue
                fam_data = self.phase1_df[(self.phase1_df['familiarity'] == familiarity)
                                          & (self.phase1_df[trait] != -99)
                                          ][trait].dropna()
                
                if len(fam_data) >= 3:
                    familiarity_data.append(fam_data)
                    familiarity_names.append(familiarity)
                    means_dict[familiarity] = fam_data.mean()
                    print(f"  {familiarity}: Average rating = {fam_data.mean():.2f} (n = {len(fam_data)})")

            if len(familiarity_data) >= 2:
                try:
                    stat, p_val = kruskal(*familiarity_data)
                    print(f"  Statistical test: p = {p_val:.3f}")
                    if p_val < 0.05:
                        print(f"  Familiarity DOES affect ratings for {trait}.")

                        # Effect size epsilon squared (ε²) for Kruskal-Wallis
                        N = sum(len(group) for group in familiarity_data)
                        k = len(familiarity_data)
                        epsilon_sq = (stat - k + 1) / (N - k)
                        print(f"  Effect size (epsilon squared): {epsilon_sq:.3f}")
                        epsilon_interpretation(epsilon_sq)

                        # Find highest and lowest mean groups (descriptive)
                        sorted_means = sorted(means_dict.items(), key=lambda x: x[1])
                        lowest_group, lowest_mean = sorted_means[0]
                        highest_group, highest_mean = sorted_means[-1]
                        print(f"    People who are '{highest_group}' gave the HIGHEST ratings ({highest_mean:.2f}).")
                        print(f"    People who are '{lowest_group}' gave the LOWEST ratings ({lowest_mean:.2f}).")
                except Exception as e:
                    print(f"  Error: {e}")
                    
    def analyze_phase1_opinion_shift(self):
        opinion_map = {
            "Extremely negative": 1,
            "Somewhat negative": 2,
            "Neutral": 3,
            "Somewhat positive": 4,
            "Extremely positive": 5
        }

        # Map opinions to ordinal values
        df = self.phase1_df.copy()
        df['pre_opinion_num'] = df['pre_opinion'].map(opinion_map)
        df['post_opinion_num'] = df['post_opinion'].map(opinion_map)
        

        # Drop rows with missing values
        paired_df = df.dropna(subset=['pre_opinion_num', 'post_opinion_num'])
        
        print(f"\n=== OPINION SHIFT AFTER VIEWING POSTERS ===")

        # Wilcoxon Signed-Rank test
        if len(paired_df) >= 10:  # Minimum sample size for Wilcoxon
            stat, p_val = wilcoxon(paired_df['pre_opinion_num'], paired_df['post_opinion_num'])
            
            # Calculate matched-pairs rank-biserial correlation
            r_rb = rank_biserial_matched_pairs(paired_df['pre_opinion_num'], paired_df['post_opinion_num'])
            
            print(f"Wilcoxon Signed-Rank test for opinion shift (pre vs post):")
            print(f"  n = {len(paired_df)}")
            print(f"  Statistic = {stat:.2f}, p-value = {p_val:.4f}")
            # These are value skew, so we can't use mean/std
            print(f"  Skewness pre-opinion = {skew(paired_df['pre_opinion_num']):.2f}, Skewness post-opinion = {skew(paired_df['post_opinion_num']):.2f}")
            
            median_pre = paired_df['pre_opinion_num'].median()
            median_post = paired_df['post_opinion_num'].median()
            print(f"  Median pre-opinion = {median_pre:.2f}, Median post-opinion = {median_post:.2f}")
            
            if p_val < 0.05:
                # H₁: There is a consistent shift in opinions after viewing the poster.
                print("  **Significant shift in opinion after viewing the poster**")
                if r_rb > 0:
                    print(f"  Participants viewed the politicians more positively after viewing (r_rb = {r_rb:.3f})")
                elif r_rb < 0:
                    print(f"  Participants viewed the politicians more negatively after viewing (r_rb = {r_rb:.3f})")
                else:
                    print("  Significant test but negligible effect size")
            else:
                # H₀: There is no consistent change in opinions before and after viewing the poster.
                print("  No significant shift in opinion detected.")
        else:
            print("Insufficient paired data for Wilcoxon test (need at least 10 pairs).")
            
    def analyze_phase1_understanding_effects(self, traits):
        """
        Analyze how language understanding and poster meaning affect trait perception.
        Includes post-hoc pairwise tests if significant differences are found.
        """
        print("\n=== UNDERSTANDING EFFECTS ON PERCEPTION ===")

        if 'understand_lang' not in self.phase1_df.columns or 'poster_meaning' not in self.phase1_df.columns:
            print("No understanding data available.")
            return

        binary_mapper = {'Yes': 1, 'No': 0}
        df = self.phase1_df.copy()
        df['understand_lang'] = df['understand_lang'].map(binary_mapper)
        df['poster_meaning'] = df['poster_meaning'].map(binary_mapper)

        group_labels = {
            (1, 1): "Understood BOTH language and poster meaning",
            (1, 0): "Understood ONLY language",
            (0, 1): "Understood ONLY poster meaning",
            (0, 0): "Did NOT understand language NOR poster meaning"
        }

        print("\nParticipant Group Distribution:")
        for (lang, mean), label in group_labels.items():
            count = df[(df['understand_lang'] == lang) & (df['poster_meaning'] == mean)].shape[0]
            print(f"  {label}: {count} participants")

        for trait in traits:
            if trait not in df.columns:
                print(f"\nTrait '{trait}' not found in data.")
                continue

            print(f"\n--- Trait: {trait.capitalize()} ---")
            group_stats = {}
            for (lang, mean), label in group_labels.items():
                data = df[(df['understand_lang'] == lang) & (df['poster_meaning'] == mean)][trait].dropna()
                if len(data) > 0:
                    group_stats[label] = data
                    print(f"  {label}: Mean={data.mean():.2f}, SD={data.std():.2f}, n={len(data)}")

            valid_groups = [data for data in group_stats.values() if len(data) >= 3]
            if len(valid_groups) >= 2:
                try:
                    stat, p_val = kruskal(*valid_groups)
                    print(f"  Kruskal-Wallis test: H={stat:.2f}, p={p_val:.3f}")
                    if p_val < 0.05:
                        print("  Significant differences found between understanding groups.")

                        # Calculate epsilon squared for overall effect size
                        N = sum(len(g) for g in valid_groups)
                        k = len(valid_groups)
                        epsilon_sq = (stat - k + 1) / (N - k)
                        print(f"  Effect size (epsilon squared): {epsilon_sq:.3f}")
                        epsilon_interpretation(epsilon_sq)

                        # Post-hoc pairwise Mann-Whitney U tests with rank-biserial effect size
                        labels = list(group_stats.keys())
                        for i in range(len(labels)):
                            for j in range(i+1, len(labels)):
                                data1 = group_stats[labels[i]]
                                data2 = group_stats[labels[j]]
                                n1, n2 = len(data1), len(data2)

                                if n1 >= 3 and n2 >= 3:
                                    stat_u, p_u = mannwhitneyu(data1, data2, alternative='two-sided')

                                    # Calculate rank-biserial correlation
                                    r_rb = 1 - (2 * stat_u) / (n1 * n2)

                                    if p_u < 0.05:
                                        if trait in skew_traits:
                                            direction = "higher" if r_rb > 0 else "lower"
                                        else:
                                            direction = 'higher' if data1.mean() > data2.mean() else 'lower'
                                        print(
                                            f"Participants who {labels[i]} rated {trait} {direction} than those who '{labels[j]}' "
                                            f"(rank-biserial r = {r_rb:.3f}, p = {p_u:.3f})"
                                        )
                    else:
                        print(" No significant differences between understanding groups.")
                except Exception as e:
                    print(f"  Error in Kruskal-Wallis test: {e}")
            else:
                print("  Not enough data for statistical test (need at least 2 groups with n>=3).")

                
    def analyze_phase1_political_interest(self, traits):
        """
        Analyze how different levels of political engagement affect trait ratings.
        For each trait, runs Kruskal-Wallis test and reports which engagement group rates higher/lower.
        """
        print(f"\n{'='*80}")
        print("=== POLITICAL ENGAGEMENT EFFECTS ON TRAIT PERCEPTION ===")
        print(f"{'='*80}")

        if 'political_engagement' not in self.phase1_df.columns:
            print("No political engagement data available.")
            return

        # Only use valid engagement responses
        engagement_levels = self.phase1_df['political_engagement'].value_counts().index.tolist()
        print(f"Political engagement groups: {engagement_levels}")

        for trait in traits:
            print(f"\nTrait: {trait.capitalize()}")
            trait_data = []
            group_names = []
            means_dict = {}

            for level in engagement_levels:
                group = self.phase1_df[
                    (self.phase1_df['political_engagement'] == level) &
                    (self.phase1_df[trait] != -99)
                ][trait].dropna()
                if len(group) >= 3:
                    trait_data.append(group)
                    group_names.append(level)
                    means_dict[level] = group.mean()
                    print(f"  {level}: Mean = {group.mean():.2f}, SD = {group.std():.2f}, n = {len(group)}")

            if len(trait_data) >= 2:
                try:
                    stat, p_val = kruskal(*trait_data)
                    print(f"  Kruskal-Wallis test: H = {stat:.2f}, p = {p_val:.3f}")
                    if p_val < 0.05:
                        print("  Significant differences between engagement groups.")
                        # Effect size
                        N = sum(len(g) for g in trait_data)
                        k = len(trait_data)
                        epsilon_sq = (stat - k + 1) / (N - k)
                        print(f"  Effect size (epsilon squared): {epsilon_sq:.3f}")
                        epsilon_interpretation(epsilon_sq)

                        # Direction: which group rates highest/lowest
                        sorted_means = sorted(means_dict.items(), key=lambda x: x[1])
                        lowest_group, lowest_mean = sorted_means[0]
                        highest_group, highest_mean = sorted_means[-1]
                        print(f"    '{highest_group}' engagement group gave the HIGHEST ratings ({highest_mean:.2f}).")
                        print(f"    '{lowest_group}' engagement group gave the LOWEST ratings ({lowest_mean:.2f}).")
                    else:
                        print("  No significant differences between engagement groups.")
                except Exception as e:
                    print(f"  Error in Kruskal-Wallis test: {e}")
            else:
                print("  Not enough data for statistical test (need at least 2 groups with n>=3).")
                
                
    def run_research_questions_analysis(self):
        """
        Entry point of the analysis pipeline.

        Runs analyze_phase1_baseline_perceptions, which prints the Phase 1
        baseline statistics and then calls the remaining Phase 1 sub-analyses.
        Nothing is returned.
        """
        # Phase 1: Establish baselines and control variables
        self.analyze_phase1_baseline_perceptions()

In [64]:
# One CSV per politician: 'phase1_data/<key>_phase1_data.csv'.
# The key (text before the first underscore) is what the analyser uses to
# look the politician up, so the order and spelling below are kept as they were.
phase1_data = [
    f'phase1_data/{key}_phase1_data.csv'
    for key in [
        'lula', 'masisi', 'lee', 'ruto', 'yoon', 'kenyatta',
        'jacinda', 'albanese', 'modi', 'harper', 'morrison', 'friedrich',
        'sunak', 'boko', 'prayut', 'trump', 'luxon', 'biden',
        'trudeau', 'starmer', 'paetongtarn', 'bolsonaro', 'olaf', 'singh',
    ]
]

# Load every file, then run the full Phase 1 pipeline (all results are printed)
analyzer = Phase1Analyser(phase1_data)
analyzer.run_research_questions_analysis()


PHASE 1: BASELINE PERCEPTIONS ANALYSIS
Phase 1 provides baseline ratings of ORIGINAL posters only
Used for: Controlling baseline differences, geographical effects, familiarity effects
Analyzing baseline traits: ['competence', 'likable', 'trustworthy', 'approachable', 'charismatic', 'aggressive', 'friendly', 'professional', 'visually_appealing', 'professionally_designed', 'aggressive_poster', 'inspirational', 'manipulative', 'boring', 'misleading', 'match_pol_comm', 'credible_typo_color', 'background', 'typo_attention', 'support', 'interested_in']

=== BASELINE DESCRIPTIVE STATISTICS ===
Competence: Mean = 3.17, Median = 3.00, SD = 1.14, Range = 1.0-5.0, n = 314, Skewness = -0.19
Likable: Mean = 3.08, Median = 3.00, SD = 1.21, Range = 1.0-5.0, n = 314, Skewness = -0.17
Trustworthy: Mean = 2.90, Median = 3.00, SD = 1.17, Range = 1.0-5.0, n = 315, Skewness = -0.04
Approachable: Mean = 3.05, Median = 3.00, SD = 1.27, Range = 1.0-5.0, n = 315, Skewness = -0.07
Charismatic: Mean = 2.99, Med