In [None]:
import ast
import json
import os
import pickle
import sys
import time

import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# Load environment variables (e.g. from a local .env file) into the process environment.
# One call is enough; the original cell repeated it.
load_dotenv()

# Base locations for input data and intermediate outputs. os.getenv returns None when
# the variable is unset; later cells build file paths from these values.
DATA_PATH = os.getenv('DATA_PATH')
TEMP_PATH = os.getenv('TEMP_PATH')


# Getting all synthetic human bios

I based my code on the code released with [this paper](https://arxiv.org/pdf/2209.06899) for creating the bio prompts (note that the code itself was, oddly, published as a [PDF](https://dataverse.harvard.edu/file.xhtml?fileId=6711665&version=1.0)). It takes its demographic data from [this study](https://faculty.wcas.northwestern.edu/jnd260/pub/Rothschild,%20Howat,%20Shafranek,%20Busby%202018.pdf).


In [None]:

def uniqvals(users, field):
    """Return the distinct values of ``field`` across all user records.

    Args:
        users: Mapping of user id -> per-user mapping; every record must contain ``field``.
        field: Key to look up in each record.

    Returns:
        list: The distinct values, in no particular order.
    """
    distinct = {record[field] for record in users.values()}
    return list(distinct)

# Lookup tables that translate raw survey answers into the phrase used in a bio.
# Outer key: survey column name. Inner mapping: raw answer -> phrase.
# A phrase of '' means "say nothing about this attribute".
fields_of_interest = {
    "Gender": {
        "Male": "male",
        "Female": "female",
        '': ''
    },
    "Hisp": {
        "Hispanic": "Hispanic",
        "Not Hispanic": '',
        '': ''
    },
    "WHITE": {
        "White": "white",
        "Non-white": '',
        '': ''
    },
    "Ideo": {
        '': '',
        'Liberal': 'liberal',
        'Slightly conservative': 'slightly conservative',
        'Conservative': 'conservative',
        'Slightly liberal': 'slightly liberal',
        "Moderate/Haven't thought about it": 'moderate',
        'Extremely Liberal': 'extremely liberal',
        'Extremely conservative': 'extremely conservative',
    },
    # These phrases are inserted after "Politically, I ", hence the leading verbs.
    "PID7": {
        '': '',
        'Ind': 'am an independent',
        'Strong D': 'am a strong Democrat',
        'Strong R': 'am a strong Republican',
        'Lean D': 'lean towards Democrats',
        # Spelling fixed (was "Rebublicans"); this text ends up verbatim in the bio.
        'Lean R': 'lean towards Republicans',
        'Weak D': 'am a weak Democrat',
        'Weak R': 'am a weak Republican',
    },
    # Income brackets are collapsed into four coarse labels.
    "Inc": {
        '': '',
        'Less than $15K': 'very poor',
        '$15K to $25K': 'poor',
        '$25K to $50K': 'poor',
        '$50K to $75K': 'middle-class',
        '$75K to $100K': 'middle-class',
        '$100K to $150K': 'middle-class',
        '$150K to $200K': 'upper-class',
        '$200K to $250K': 'upper-class',
        '$250K to $500K': 'upper-class',
        'Prefer not to answer': '',
        '-8': '',
    },
}

def _age_bucket(raw_age):
    """Map a raw age value to its bio label, or '' when it is missing, unparseable or out of range."""
    try:
        # float() first so that values such as 25.0 or '25.0' are accepted as well as '25'.
        age = int(float(raw_age))
    except (TypeError, ValueError, OverflowError):
        # None / '' / non-numeric text / NaN / infinity: nothing usable to say about age.
        return ''
    if 18 <= age < 25:
        return 'young'
    if 25 <= age < 40:
        return 'middle-aged'
    if 40 <= age < 60:
        return 'old'
    if 60 <= age < 100:
        return 'very old'
    return ''


def mapper(profile):
    """Translate one raw survey record into the phrases used to build a bio.

    Args:
        profile: Mapping of survey column name -> raw answer for one respondent.

    Returns:
        dict: For every key of ``profile`` that is also a key of ``fields_of_interest``,
        the mapped phrase ('' when the raw answer is not in the lookup table). The
        result always contains an 'Age' entry: one of 'young', 'middle-aged', 'old',
        'very old', or '' when the age is missing, not a number, or outside [18, 100).
    """
    results = {
        field: fields_of_interest[field].get(raw_value, '')
        for field, raw_value in profile.items()
        if field in fields_of_interest
    }
    # Previously 'Age' was only set when profile['Age'] != '', and a missing key or a
    # NaN value raised. Always emitting the key keeps the result shape uniform.
    results['Age'] = _age_bucket(profile.get('Age', ''))
    return results


In [None]:

# Read the raw survey export into a DataFrame
df = pd.read_csv("../data/ppfull.csv")

# Keep respondents with at least one race descriptor that maps to a non-empty phrase:
# WHITE == 'White' or Hisp == 'Hispanic'.
df = df[df['WHITE'].isin(['White']) | df['Hisp'].isin(['Hispanic'])]

# Keep ages in [18, 100). pd.to_numeric(..., errors='coerce') turns blanks and any
# non-numeric codes into NaN (which fail both comparisons), whereas astype(float)
# raised on the first such value because it converts the whole column up front.
age = pd.to_numeric(df['Age'], errors='coerce')
df = df[(age >= 18) & (age < 100)]

# For the categorical features keep only rows whose raw answer is a known, non-blank
# key of the lookup table. Note: answers such as 'Prefer not to answer' are known keys,
# so those rows are kept even though they map to an empty phrase (unchanged behavior).
for column in ('Ideo', 'PID7', 'Gender', 'Inc'):
    known_answers = [raw for raw in fields_of_interest[column] if raw != '']
    df = df[df[column].isin(known_answers)]

# Convert to {respondent id (first CSV column): {column name: value}}
dmap = df.set_index(df.columns[0]).T.to_dict()

In [None]:

# Build one record (mapped features + free-text bio) per respondent.
results = {}
ids = dmap.keys()
for respondent_id in tqdm(ids):  # named respondent_id so the builtin id() is not shadowed
    user_profile = mapper(dmap[respondent_id])

    # Mapped features; .get(..., '') tolerates survey columns absent from the profile
    record = {
        'id': respondent_id,
        'ideology': user_profile.get('Ideo', ''),
        'political_affiliation': user_profile.get('PID7', ''),
        'race_white': user_profile.get('WHITE', ''),
        'hispanic': user_profile.get('Hisp', ''),
        'gender': user_profile.get('Gender', ''),
        'income': user_profile.get('Inc', ''),
        'age': user_profile.get('Age', '')
    }

    # Construct the bio: one sentence per non-empty feature, each followed by a space
    prompt = ""
    if record['ideology'] != '':
        prompt += "Ideologically, I describe myself as " + record['ideology'] + ". "
    if record['political_affiliation'] != '':
        prompt += "Politically, I " + record['political_affiliation'] + ". "
    # mapper() emits the lowercase phrase 'white'. The original compared against the raw
    # survey value 'White', so this sentence was never added. NOTE: bios for white
    # respondents therefore change, and checkpoints keyed on the old bios will not match.
    if record['race_white'] == 'white':
        prompt += "Racially, I am white. "
    if record['hispanic'] == 'Hispanic':
        prompt += "Racially, I am Hispanic. "
    if record['gender'] != '':
        prompt += "I am " + record['gender'] + ". "
    if record['income'] != '':
        prompt += "Financially, I am " + record['income'] + ". "
    if record['age'] != '':
        prompt += "In terms of my age, I am " + record['age'] + ". "

    record['bio'] = prompt
    results[respondent_id] = record

# One row per respondent, indexed by respondent id (pandas is already imported at the top)
df = pd.DataFrame.from_dict(results, orient='index')


In [None]:
# Display the per-respondent feature/bio table built in the previous cell
df

There are a lot of duplicates, and we don't need them.

In [None]:
# Tally how many respondents share each identical profile; the id column is left out
# because it is unique per respondent
profiles_without_id = df.drop(columns=['id'])
duplicate_value_counts = profiles_without_id.value_counts()

# TODO: sample weighted by these counts

# Show the tallies
duplicate_value_counts

In [None]:
# One row per distinct profile plus a 'count' column saying how often it occurred
profile_counts = df.drop(columns=['id']).value_counts()
unique_with_counts_df = profile_counts.reset_index(name='count')

# Persist the counts so later runs can sample profiles proportionally
unique_with_counts_df.to_csv('../data/pigeonhole_human_data_counts.csv', index=False)


In [None]:
# Keep the first respondent for every distinct profile (all columns except id are compared)
non_id_columns = df.columns.difference(['id'])
unique_df = df.drop_duplicates(subset=non_id_columns)

# Write the de-duplicated profiles to disk
unique_df.to_csv('../data/pigeonhole_human_data.csv', index=False)


# Generating their responses on some Habermas questions using some models
For now:
1. Sample a subset of users (originally planned as 50; the cell below uses `N_human`).
2. Sample 5 Habermas questions.
3. Generate responses for llama-3.1-8b-instruct & gpt-4o-mini.

This bit of code will look a lot like `generating_llm_responses.ipynb`.

## Sampling N "humans"

In [None]:
# Number of synthetic "humans" to draw
N_human = 15

# Draw distinct profiles with probability proportional to how often each occurred;
# the fixed random_state makes the draw repeatable
sample_human = (
    unique_with_counts_df
    .sample(n=N_human, weights='count', random_state=42)
    .reset_index(drop=True)
)

## Questions + LLM responses data

In [None]:
# os.path.join works whether or not DATA_PATH ends with a path separator
# (plain string concatenation silently produced a wrong path without one).
questions_csv = os.path.join(DATA_PATH, 'questions_and_human_perspectives_with_responses.csv')
df_questions = pd.read_csv(questions_csv)

# Drop the stray index column left by an earlier to_csv, if present. errors='ignore'
# makes this a no-op when the column is absent, and no in-place mutation is needed.
df_questions = df_questions.drop(columns=['Unnamed: 0'], errors='ignore')
print("df_questions.shape: ", df_questions.shape)

In [None]:
# Number of questions to draw
N_qs = 5

# Repeatable random subset of the questions, re-indexed from 0
sampled_questions = df_questions.sample(n=N_qs, random_state=42)
sample_qs = sampled_questions.reset_index(drop=True)
sample_qs.head()

## Generating Human responses

In [None]:
# Earlier prototype of the generation loop, kept for reference. It saves a JSON file
# keyed by question, whose values map each bio to its response. The cells further down
# contain the pipeline code that is actually used.

def generate_responses(questions, bios, generation_function, output_path, start_from_checkpoint=True):
    """
    Generate a response for every (question, bio) pair and save them as JSON, with checkpointing.

    Args:
        questions: Iterable of question strings.
        bios: Sequence of bio strings (``len(bios)`` is used for progress messages).
        generation_function: Callable ``(question, bio) -> response``; the response must be
            JSON-serializable.
        output_path: JSON file to write. Its directory is created if needed; a bare file
            name (no directory part) is written to the current directory.
        start_from_checkpoint: If True and ``output_path`` exists, load it and only
            generate the pairs that are still missing.

    Returns:
        dict: ``{question: {bio: response}}`` including anything loaded from the checkpoint.
        Pairs whose generation raised are reported and left out.
    """
    print("Generating responses for:", output_path)

    # Load existing responses if any
    responses = {}
    if start_from_checkpoint and os.path.exists(output_path):
        with open(output_path, 'r') as f:
            responses = json.load(f)
        print('Loaded checkpoint file!')

    # Make sure the directory exists. dirname is '' for a bare file name, and
    # os.makedirs('') raises FileNotFoundError, so only create when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def save_checkpoint():
        with open(output_path, 'w') as f:
            json.dump(responses, f, indent=2)

    for question in tqdm(questions, desc='Processing questions'):
        answered = responses.setdefault(question, {})

        # Bios that haven't been answered yet for this question
        remaining_bios = [b for b in bios if b not in answered]

        if not remaining_bios:
            print("All bios already processed for question.")
            continue
        print(len(bios)-len(remaining_bios), "skipped! Beginning the remaining", len(remaining_bios), "now...")

        for bio in tqdm(remaining_bios, desc="Generating responses for bios"):
            try:
                answered[bio] = generation_function(question, bio)

                # Checkpoint save whenever this question's answer count hits a multiple of 10
                if len(answered) % 10 == 0:
                    save_checkpoint()

            except Exception as e:
                # Best effort: report and move on so one failure does not lose the run
                print(f"\nError processing bio for question '{question}': {str(e)}")
                continue

    # Final save
    save_checkpoint()

    return responses


In [None]:
from together import Together

client = Together(api_key=os.getenv('TOGETHER_API_KEY'))

def generate_together_response(question, bio, model):
    """
    Ask a Together-hosted chat model to answer a question in the voice of a given persona.

    The bio is sent as the system message (instructing the model to answer only from
    that person's perspective) and the question as the user message, using the
    module-level ``client``.

    Parameters:
    - question (str): The question to be answered.
    - bio (str): Demographic and belief description of the persona.
    - model (str): Identifier of the model to query.

    Returns:
    - The message content of the first returned choice.
    """
    persona_message = {
        "role": "system",
        "content": f"Answer only from the perspective of a person with the following demographics and beliefs:\n{bio}",
    }
    question_message = {"role": "user", "content": question}

    completion = client.chat.completions.create(
        model=model,
        messages=[persona_message, question_message],
        max_tokens=2048,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def generate_responses_to_df(questions_df, bios_df, output_path, start_from_checkpoint=True,
                             model='meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K'):
    """
    Generate one persona response per (bio, question) pair and return them as a DataFrame
    that also carries the persona's demographic columns. Progress is checkpointed to CSV.

    Args:
        questions_df: DataFrame with a 'question' column.
        bios_df: DataFrame with 'bio', 'ideology', 'political_affiliation', 'race_white',
            'hispanic', 'gender', 'income' and 'age' columns.
        output_path: CSV file used both as checkpoint and as final output. Its directory
            is created if needed; a bare file name is written to the current directory.
        start_from_checkpoint: If True and ``output_path`` exists, rows in it are kept and
            their (question, bio) pairs are not regenerated.
        model: Model identifier passed to ``generate_together_response`` (defaults to the
            value that used to be hard-coded).

    Returns:
        pd.DataFrame: All rows (loaded + newly generated). Pairs whose generation raised
        are reported and left out.
    """
    all_results = []
    existing_pairs = set()

    # Load existing results if checkpoint exists
    if start_from_checkpoint and os.path.exists(output_path):
        print('Loading from checkpoint file...')
        try:
            # keep_default_na=False keeps empty demographic cells as '' (as in a fresh
            # run) instead of turning them, or responses such as "N/A", into NaN.
            existing_df = pd.read_csv(output_path, keep_default_na=False)
        except pd.errors.EmptyDataError:
            # A previous run that produced no rows leaves a file with no columns.
            existing_df = pd.DataFrame()
        all_results = existing_df.to_dict('records')

        # (question, bio) pairs that are already done
        existing_pairs = {(row['question'], row['bio']) for row in all_results}

    # Make sure output directory exists; dirname is '' for a bare file name and
    # os.makedirs('') would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Loop through each bio
    for _, bio_row in tqdm(bios_df.iterrows(), desc='Processing bios', total=len(bios_df), position=0):
        for _, question_row in tqdm(questions_df.iterrows(), desc="Processing questions for bio", total=len(questions_df), leave=False, position=1):
            # Skip if this pair was already processed
            if (question_row['question'], bio_row['bio']) in existing_pairs:
                continue

            try:
                response = generate_together_response(question_row['question'], bio_row['bio'], model)

                # Combine question, response, and demographic info
                result = {
                    'question': question_row['question'],
                    'response': response,
                    'ideology': bio_row['ideology'],
                    'political_affiliation': bio_row['political_affiliation'],
                    'race_white': bio_row['race_white'],
                    'hispanic': bio_row['hispanic'],
                    'gender': bio_row['gender'],
                    'income': bio_row['income'],
                    'age': bio_row['age'],
                    'bio': bio_row['bio']
                }
                all_results.append(result)

                # Checkpoint save every 10 responses
                if len(all_results) % 10 == 0:
                    pd.DataFrame(all_results).to_csv(output_path, index=False)

            except Exception as e:
                # Best effort: report and continue with the next pair
                print(f"\nError processing question '{question_row['question']}' for bio: {str(e)}")
                continue

    # Final save
    results_df = pd.DataFrame(all_results)
    results_df.to_csv(output_path, index=False)
    return results_df

# Generate (or resume from checkpoint) the persona responses.
# os.path.join works whether or not TEMP_PATH ends with a path separator.
llama_responses_path = os.path.join(TEMP_PATH, 'llama_responses_with_demographics.csv')
final_df = generate_responses_to_df(
    questions_df=sample_qs,
    bios_df=sample_human,
    output_path=llama_responses_path,
    start_from_checkpoint=True
)
# No extra to_csv is needed here: generate_responses_to_df already writes its final
# result to output_path before returning.

In [None]:
# Preview the first generated persona responses
final_df.head()

In [None]:
from pydantic import BaseModel, Field
from typing import Literal

class AgreementRating(BaseModel):
    """Structured rating of how well an LLM response represents a persona's opinion.

    Instances are built from the JSON object the rating model is asked to return
    (keys: explanation, answer, quotes).
    """
    # Free-text justification for the rating
    explanation: str = Field(description="Brief rationale for the chosen rating")
    # Restricted to the integers 1-5 by the Literal annotation
    answer: Literal[1, 2, 3, 4, 5] = Field(description="Agreement rating on a 1-5 scale")
    # Passages of the rated response the persona identifies with
    quotes: list[str] = Field(description="List of substrings from the response that the user feels specifically represented by")

def generate_agreement_rating(question, llm_response, bio, human_answer, model):
    """
    Ask the model, speaking as the persona, to rate how well an LLM response matches its opinion.

    The system message carries the persona's bio and its own answer to the question; the
    user message carries the question, the response to rate, the 1-5 scale and the
    expected JSON layout. The request is made in JSON-object mode via the module-level
    ``client``.

    Returns:
        AgreementRating built from the returned JSON, or None (after printing the error)
        when the reply cannot be parsed or validated.
    """
    persona_message = f"""Answer only from the perspective of a person with the following demographics and beliefs:
{bio}

In response to this question: '{question},' your opinion is: '{human_answer}'"""
    rating_request = f"""Question: {question}

Response: {llm_response}

On a scale of 1-5, to what extent does the above response represent *your* opinion?
1 = Completely disagree
2 = Somewhat disagree
3 = Neutral
4 = Somewhat agree
5 = Completely agree

Provide your rating, a brief explanation for your rating, and a list of exact quotes from the response that you feel represent your opinion, if any.

Output should be in the following JSON format:
{{
    "explanation": "Brief rationale for your rating",
    "answer": <integer between 1-5>,
    "quotes": ["quote1", "quote2", ...]
}}"""

    conversation = [
        {"role": "system", "content": persona_message},
        {"role": "user", "content": rating_request},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=2048,
        response_format={"type": "json_object"},
    )

    # Reading the reply, decoding it and validating it can each fail; all are reported alike
    try:
        parsed_reply = json.loads(completion.choices[0].message.content)
        return AgreementRating(**parsed_reply)
    except Exception as e:
        print(f"Error parsing response: {e}")
        return None


In [None]:
# Helper that finds the best substring match for inexact quotes (usually formatting
# differences). It also returns the fraction of the quote that was matched, so callers
# can apply a precision threshold (validate_quotes defaults to 85%).
def find_best_substring_match(text: str, quote: str) -> tuple[str, float]:
    """
    Find the longest run of consecutive words of ``quote`` that occurs verbatim in ``text``.

    The quote is split on whitespace and candidate runs are re-joined with single
    spaces, so runs of whitespace inside the quote are normalized before matching.

    Args:
        text: The full text to search in
        quote: The quote to match

    Returns:
        tuple: (best matching substring, len(match) / len(quote)). ("", 0.0) when no
        word of the quote occurs in the text or the quote contains no words. On ties
        the run that starts earliest in the quote wins.
    """
    words = quote.split()
    best_match = ""
    best_ratio = 0.0

    for start in range(len(words)):
        candidate = ""
        for word in words[start:]:
            # Grow the run by one word at a time
            candidate = f"{candidate} {word}" if candidate else word
            if candidate not in text:
                # Every longer run from this start contains `candidate` as a prefix, so
                # it cannot match either. Stopping here gives the same result as trying
                # all runs, without the wasted searches.
                break
            ratio = len(candidate) / len(quote)
            if ratio > best_ratio:
                best_ratio = ratio
                best_match = candidate

    return best_match, best_ratio

def validate_quotes(llm_response: str, quotes: list[str], match_threshold: float = 0.85, verbose: bool = False) -> tuple[bool, list[str]]:
    """
    Check that every quote occurs in the response, accepting close partial matches.

    A quote is accepted as-is when it is an exact substring of the response. Otherwise
    the best partial match from ``find_best_substring_match`` is accepted in its place
    when its match ratio reaches ``match_threshold``.

    Args:
        llm_response: The full response from the LLM
        quotes: Quoted strings that should appear in the response
        match_threshold: Minimum ratio of match length to quote length (default: 0.85)
        verbose: If True, print the validation steps (default: False)

    Returns:
        tuple: (True only if every quote was accepted, list of accepted quotes, where
        inexact quotes are replaced by their best match)
    """
    if verbose:
        print("\nValidating quotes in response:")
        print(f"Full response: {llm_response}\n")

    accepted = []
    every_quote_found = True

    for quote in quotes:
        # Exact hit: keep the quote unchanged
        if quote in llm_response:
            if verbose:
                print(f"✓ Found exact quote: {quote}")
            accepted.append(quote)
            continue

        best_match, match_ratio = find_best_substring_match(llm_response, quote)

        # Close enough: substitute the text that really occurs in the response
        if match_ratio >= match_threshold:
            if verbose:
                print(f"~ Found close match for: {quote}")
                print(f"  Best match ({match_ratio:.1%}): {best_match}")
            accepted.append(best_match)
            continue

        # Not found: the overall validation fails, but keep checking the other quotes
        every_quote_found = False
        if verbose:
            print(f"✗ Missing quote: {quote}")
            if best_match:
                print(f"  Best partial match ({match_ratio:.1%}): {best_match}")

    if verbose:
        if every_quote_found:
            print("\nAll quotes validated successfully!")
        else:
            print("\nQuote validation failed!")

    return every_quote_found, accepted

In [None]:
def generate_agreement_rating_with_retry(question, llm_response, bio, human_answer, model, retry=1):
    """
    Request an agreement rating and re-request it while its quotes fail validation.

    A rating is returned as soon as one is produced whose quotes all pass
    ``validate_quotes``; its ``quotes`` are replaced by the validated versions. A missing
    rating, failed validation or a raised exception each use up one attempt.

    Args:
        question: The question being rated
        llm_response: The LLM's response to rate
        bio: The demographic/belief profile
        human_answer: The persona's own answer
        model: The model to use for generation
        retry: Extra attempts after the first one (default: 1)

    Returns:
        AgreementRating or None: the validated rating, or None once all attempts are spent
    """
    attempts = 0
    while attempts <= retry:
        try:
            rating = generate_agreement_rating(question, llm_response, bio, human_answer, model)

            quotes_ok = False
            if rating:
                quotes_ok, validated_quotes = validate_quotes(llm_response, rating.quotes)

            if quotes_ok:
                # Swap in the validated quotes (inexact ones may have been corrected)
                rating.quotes = validated_quotes
                return rating

            attempts += 1
            if attempts <= retry:
                print(f"\nQuote validation failed, attempt {attempts}/{retry}")

        except Exception as e:
            print(f"\nError in generation attempt {attempts}: {str(e)}")
            attempts += 1

    print("\nAll attempts failed to generate valid quotes")
    return None

# Main rating loop; uses generate_agreement_rating_with_retry for each pair
def generate_agreement_ratings_df(questions_df, bios_df, llm_to_eval, human_responses_df, checkpoint_path=None, retry=1,
                                  model='meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'):
    """
    Generate agreement ratings for each human-LLM response pair, with checkpoint functionality

    Args:
        questions_df: DataFrame with a 'question' column and one column per LLM holding
            that LLM's response
        bios_df: DataFrame containing demographic information and bios
        llm_to_eval: Name of the LLM column in ``questions_df`` to evaluate
        human_responses_df: DataFrame with 'question', 'bio' and 'response' columns
        checkpoint_path: Optional CSV path used as checkpoint and final output
        retry: Number of retries for quote validation (default: 1)
        model: Model identifier used to produce the ratings (defaults to the value that
            used to be hard-coded)

    Returns:
        pd.DataFrame: One row per successfully rated (question, bio) pair, including rows
        loaded from the checkpoint. Pairs with no human response, or whose rating failed,
        are reported and left out.
    """
    all_results = []
    existing_pairs = set()

    # Load existing results if checkpoint exists
    if checkpoint_path and os.path.exists(checkpoint_path):
        print('Loading from checkpoint file...')
        try:
            # keep_default_na=False keeps empty text cells as '' instead of NaN
            existing_df = pd.read_csv(checkpoint_path, keep_default_na=False)
        except pd.errors.EmptyDataError:
            # A previous run that produced no rows leaves a file with no columns
            existing_df = pd.DataFrame()
        all_results = existing_df.to_dict('records')

        # (question, bio, llm) combinations that are already rated
        existing_pairs = {(row['question'], row['bio'], row['llm']) for row in all_results}

    # Make sure checkpoint directory exists; dirname is '' for a bare file name and
    # os.makedirs('') would raise.
    if checkpoint_path:
        checkpoint_dir = os.path.dirname(checkpoint_path)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

    for bio_idx, bio_row in tqdm(bios_df.iterrows(),
                                desc='Processing bios',
                                total=len(bios_df),
                                position=0):
        for q_idx, question_row in tqdm(questions_df.iterrows(),
                                      desc='Processing questions',
                                      total=len(questions_df),
                                      leave=False,
                                      position=1):
            # Skip if this combination already exists in checkpoint
            if (question_row['question'], bio_row['bio'], llm_to_eval) in existing_pairs:
                continue

            llm_response = question_row[llm_to_eval]

            # The persona's own answer to this question. It can be missing when the
            # earlier generation step failed for this pair; skip instead of letting
            # .iloc[0] raise IndexError and abort the whole run.
            matching_responses = human_responses_df.loc[
                (human_responses_df['question'] == question_row['question']) &
                (human_responses_df['bio'] == bio_row['bio']),
                'response'
            ]
            if matching_responses.empty:
                print(f"\nNo human response found for question '{question_row['question']}'; skipping")
                continue
            human_response = matching_responses.iloc[0]

            try:
                rating = generate_agreement_rating_with_retry(
                    question_row['question'],
                    llm_response,
                    bio_row['bio'],
                    human_response,
                    model,
                    retry=retry
                )

                if rating: # only save to results if the above doesn't fail
                    result = {
                        'question': question_row['question'],
                        'llm': llm_to_eval,
                        'llm_response': llm_response,
                        'human_response': human_response,
                        'agreement_rating': rating.answer,
                        'rating_explanation': rating.explanation,
                        'quotes': rating.quotes,
                        'ideology': bio_row['ideology'],
                        'political_affiliation': bio_row['political_affiliation'],
                        'race_white': bio_row['race_white'],
                        'hispanic': bio_row['hispanic'],
                        'gender': bio_row['gender'],
                        'income': bio_row['income'],
                        'age': bio_row['age'],
                        'bio': bio_row['bio'],
                    }
                    all_results.append(result)

                    # Save checkpoint every 10 results
                    if checkpoint_path and len(all_results) % 10 == 0:
                        pd.DataFrame(all_results).to_csv(checkpoint_path, index=False)

            except Exception as e:
                # Best effort: report and continue with the next pair
                print(f"\nError processing rating: {str(e)}")
                continue

    results_df = pd.DataFrame(all_results)

    # Final save if checkpoint path provided
    if checkpoint_path:
        results_df.to_csv(checkpoint_path, index=False)

    return results_df

# os.path.join works whether or not TEMP_PATH ends with a path separator
checkpoint_path = os.path.join(TEMP_PATH, 'llama_agreement_ratings.csv')

# Rate how well each sampled persona feels represented by the evaluated LLM's answers
agreement_df = generate_agreement_ratings_df(
    questions_df=sample_qs,
    bios_df=sample_human,
    llm_to_eval='llama-3.1-8b-instruct',
    human_responses_df=final_df,
    checkpoint_path=checkpoint_path,
    retry=2  # Will try up to 3 times total (initial + 2 retries)
)


In [None]:
# Uncomment below if checkpointing is disabled above and you want to save the results anyway
# save_path = TEMP_PATH + 'llama_agreement_ratings.csv'
# agreement_df.to_csv(save_path)

# (rows, columns) of the ratings table
agreement_df.shape

# Plots! Statistics!

Let's calculate the average agreement by the various demographic categories and plot those.

## Average agreement rating by llm and demographics

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Tuple

def setup_plot_style(fig: plt.Figure, ax: plt.Axes) -> None:
    """Apply the shared look: white figure and axes background, no top or right spine."""
    # White background on both the figure and the plotting area
    for surface in (fig, ax):
        surface.set_facecolor('white')

    # Hide the frame lines that carry no axis
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

def calculate_statistics(df: pd.DataFrame, llm: str) -> pd.DataFrame:
    """Per-question mean and standard deviation of the agreement ratings of one LLM.

    Returns a DataFrame with columns 'question', 'mean' and 'std', one row per question
    rated for ``llm``.
    """
    ratings = df.loc[df['llm'] == llm, ['question', 'agreement_rating']]
    per_question = ratings.groupby('question')['agreement_rating']
    return per_question.agg(['mean', 'std']).reset_index()

def create_grouped_bar_plot(agreement_df: pd.DataFrame, 
                          bar_width: float = 0.25,
                          figsize: Tuple[int, int] = (15, 8)) -> None:
    """
    Create a grouped bar plot showing agreement ratings across questions and LLMs.

    One group of bars per question (in order of first appearance), one bar per LLM;
    bar height is the mean rating and the error bar its standard deviation.

    Args:
        agreement_df: DataFrame with 'llm', 'question' and 'agreement_rating' columns
        bar_width: Width of each bar in the plot
        figsize: Size of the figure (width, height)
    """
    # Setup
    fig, ax = plt.subplots(figsize=figsize)
    setup_plot_style(fig, ax)

    # Get unique values
    llms = agreement_df['llm'].unique()
    questions = agreement_df['question'].unique()
    r = np.arange(len(questions))

    # Plot bars for each LLM
    for idx, llm in enumerate(llms):
        # groupby returns questions in sorted order and only those this LLM was rated
        # on. Reindexing by `questions` lines every bar up with its tick label and
        # leaves a gap (NaN) where a question has no rating for this LLM.
        stats = calculate_statistics(agreement_df, llm).set_index('question').reindex(questions)
        position = r + bar_width * idx
        ax.bar(position, stats['mean'], bar_width, 
               yerr=stats['std'], label=llm, alpha=0.7, capsize=5)

    # Customize plot
    short_questions = [q[:30] + '...' for q in questions]
    # Put each tick at the middle of its group of bars, for any number of LLMs
    ax.set_xticks(r + bar_width * (len(llms) - 1) / 2)
    ax.set_xticklabels(short_questions, rotation=45, ha='right')
    ax.set_xlabel('Questions')
    ax.set_ylabel('Average Agreement Rating')
    ax.set_title('Average Agreement Ratings by Question and LLM')
    ax.legend()

    # Final adjustments
    plt.tight_layout()

    # Show plot
    plt.show()

# Create the plot
create_grouped_bar_plot(agreement_df)

In [None]:
def create_demographic_bar_plot(agreement_df: pd.DataFrame,
                              llm: str,
                              demographic_col: str,
                              figsize: Tuple[int, int] = (15, 8)) -> None:
    """
    Create a grouped bar plot showing agreement ratings across questions,
    grouped by a demographic category for a single LLM.

    One group of bars per question (in order of first appearance), one bar per
    demographic value; bar height is the mean rating and the error bar its standard
    deviation. Rows whose demographic value is missing (NaN) are not plotted.

    Args:
        agreement_df: DataFrame with 'llm', 'question', 'agreement_rating' and the
            demographic column
        llm: Name of the LLM to analyze
        demographic_col: Name of demographic column to group by
        figsize: Size of the figure (width, height)
    """
    # Filter for specific LLM
    llm_data = agreement_df[agreement_df['llm'] == llm]

    # Get unique values. NaN is dropped because `column == NaN` never matches any row.
    questions = llm_data['question'].unique()
    demographic_values = llm_data[demographic_col].dropna().unique()

    # Nothing to draw (and the bar width below would divide by zero)
    if len(questions) == 0 or len(demographic_values) == 0:
        print(f"No ratings to plot for llm={llm!r} and {demographic_col!r}")
        return

    # Setup
    fig, ax = plt.subplots(figsize=figsize)
    setup_plot_style(fig, ax)

    r = np.arange(len(questions))

    # Calculate appropriate bar width
    # Total width available per group = 0.8 (leaving 0.2 for spacing between groups)
    total_width_per_group = 0.8
    bar_width = total_width_per_group / len(demographic_values)

    # Calculate offset to center the group of bars
    group_center_offset = (total_width_per_group - bar_width) / 2

    # Plot bars for each demographic value
    for idx, demo_value in enumerate(demographic_values):
        # Calculate statistics for this demographic group
        demo_data = llm_data[llm_data[demographic_col] == demo_value]
        # groupby returns questions in sorted order and only those this group rated.
        # Reindexing by `questions` lines every bar up with its tick label and leaves
        # a gap (NaN) where the group has no rating for a question.
        stats = (
            demo_data.groupby('question')['agreement_rating']
            .agg(['mean', 'std'])
            .reindex(questions)
        )

        # Plot bars
        # Position each bar relative to the center of the group
        position = r + (idx * bar_width) - group_center_offset
        ax.bar(position, stats['mean'], bar_width,
               yerr=stats['std'], label=demo_value, alpha=0.7, capsize=5)

    # Customize plot
    short_questions = [q[:30] + '...' for q in questions]
    ax.set_xticks(r)  # Set ticks at the center of each group
    ax.set_xticklabels(short_questions, rotation=15, ha='right')
    ax.set_xlabel('Questions')
    ax.set_ylabel('Average Agreement Rating')
    ax.set_title(f'Average Agreement Ratings by Question and {demographic_col}\nfor {llm}')

    # Set y-axis limits to show full range of ratings (1-5)
    ax.set_ylim(0.5, 5.5)  # Slightly expanded range for visibility

    # Adjust subplot parameters to make room for legend at bottom
    plt.subplots_adjust(bottom=0.25)

    # Place legend horizontally at bottom
    ax.legend(title=demographic_col, 
             bbox_to_anchor=(0.5, -0.2),  # Position below plot
             loc='upper center',           # Center horizontally
             ncol=len(demographic_values), # All items in one row
             borderaxespad=0)

    # Show plot
    plt.show()

# Example usage: one plot per demographic attribute (ideology, age, gender), same LLM
for demographic_col in ('ideology', 'age', 'gender'):
    create_demographic_bar_plot(agreement_df,
                              llm='llama-3.1-8b-instruct',
                              demographic_col=demographic_col)


## Position


In [None]:
def find_quote_positions(row: pd.Series) -> Tuple[List[int], List[float], int]:
    """
    Find the starting indices of quotes within the LLM response, normalize them, and count them.

    Args:
        row: DataFrame row containing 'llm_response' and 'quotes' columns. 'quotes' may be
            a list of strings or the string form of such a list (as read back from CSV).

    Returns:
        Tuple of (raw positions, positions divided by the response length, number of
        quotes located). ([], [], 0) when there are no usable quotes or no response text.
    """
    response = row['llm_response']
    quotes = row['quotes']

    # Handle case where quotes is the string form of a list
    if isinstance(quotes, str):
        try:
            # ast.literal_eval only accepts Python literals. The previous eval() would
            # have executed arbitrary expressions found in model-generated text.
            quotes = ast.literal_eval(quotes)
        except (ValueError, SyntaxError):
            return [], [], 0

    # Handle empty or missing quotes
    if not quotes or not isinstance(quotes, list):
        return [], [], 0

    # Handle a missing or empty response (normalizing would divide by zero)
    if not isinstance(response, str) or not response:
        return [], [], 0
    response_length = len(response)

    positions = []
    for quote in quotes:
        if not isinstance(quote, str):
            continue  # a parsed list may contain non-string items; they cannot be located
        pos = response.find(quote)
        if pos == -1:
            # Exact quote not found: fall back to the best partial match, using the same
            # inclusive 0.85 threshold that validate_quotes applies by default
            best_match, match_ratio = find_best_substring_match(response, quote)
            if match_ratio >= 0.85:
                pos = response.find(best_match)
            else:
                print(f"Warning: Could not find quote: '{quote}' in response: '{response}'")
                continue
        positions.append(pos)

    # Calculate normalized positions and number of quotes
    normalized_positions = [pos / response_length for pos in positions]
    num_quotes = len(positions)

    return positions, normalized_positions, num_quotes

# Apply to DataFrame: find_quote_positions returns a 3-tuple per row, which is expanded
# into three new columns on agreement_df
agreement_df[['quote_positions', 'normalized_quote_positions', 'num_quotes']] = (
    agreement_df.apply(find_quote_positions, axis=1)
    .apply(pd.Series)  # Convert tuple output to separate columns
    .rename(columns={0: 'quote_positions', 1: 'normalized_quote_positions', 2: 'num_quotes'})
)

# Example usage: inspect the first row (requires agreement_df to have at least one row)
print("Example row:")
sample_row = agreement_df.iloc[0]
print(f"LLM Response: {sample_row['llm_response'][:100]}")  # first 100 characters only
print(f"Response length: {len(sample_row['llm_response'])}")
print(f"Quotes: {sample_row['quotes']}")
print(f"Raw quote positions: {sample_row['quote_positions']}")
print(f"Normalized quote positions: {sample_row['normalized_quote_positions']}")
print(f"Number of quotes: {sample_row['num_quotes']}")

In [None]:
def create_quote_position_plot(agreement_df: pd.DataFrame,
                             figsize: Tuple[int, int] = (15, 8)) -> None:
    """
    Create a grouped bar plot showing average quote positions across questions for each LLM.

    One group of bars is drawn per question, with one bar per LLM inside each
    group. Bar height is the mean of the per-row average normalized quote
    position; the error bar is the standard deviation of that value.

    Args:
        agreement_df: DataFrame with 'llm', 'question' and
            'normalized_quote_positions' columns (the latter holding a sized
            collection of numbers per row). NOTE: the column
            'avg_quote_position' is added to, or overwritten on, this frame
            in place.
        figsize: Figure size forwarded to plt.subplots.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Calculate average quote position for each row (NaN when a row has no positions)
    agreement_df['avg_quote_position'] = agreement_df['normalized_quote_positions'].apply(
        lambda x: np.mean(x) if len(x) > 0 else np.nan
    )

    # Setup (setup_plot_style is defined elsewhere in the notebook)
    fig, ax = plt.subplots(figsize=figsize)
    setup_plot_style(fig, ax)

    # Get unique values, in order of first appearance
    llms = agreement_df['llm'].unique()
    questions = agreement_df['question'].unique()
    r = np.arange(len(questions))

    # Calculate appropriate bar width; max(..., 1) avoids dividing by zero on an empty frame
    n_series = max(len(llms), 1)
    total_width_per_group = 0.8
    bar_width = total_width_per_group / n_series
    group_center_offset = (total_width_per_group - bar_width) / 2

    # Plot bars for each LLM
    for idx, llm in enumerate(llms):
        llm_data = agreement_df[agreement_df['llm'] == llm]
        # groupby returns its keys sorted, whereas `questions` (and therefore the
        # tick labels) is in order of first appearance. Reindexing lines the
        # statistics up with the labels and inserts NaN (no bar) for any question
        # this LLM has no rows for, instead of failing on a length mismatch.
        stats = (
            llm_data.groupby('question')['avg_quote_position']
            .agg(['mean', 'std'])
            .reindex(questions)
        )

        position = r + (idx * bar_width) - group_center_offset
        ax.bar(position, stats['mean'].to_numpy(), bar_width,
               yerr=stats['std'].to_numpy(), label=llm, alpha=0.7, capsize=5)

    # Customize plot; only add an ellipsis when the label was actually shortened
    short_questions = [q if len(q) <= 30 else q[:30] + '...' for q in questions]
    ax.set_xticks(r)
    ax.set_xticklabels(short_questions, rotation=15, ha='right')
    ax.set_xlabel('Questions')
    ax.set_ylabel('Average Quote Position (0=start, 1=end)')
    ax.set_title('Average Quote Positions by Question and LLM')

    # Set y-axis limits
    ax.set_ylim(0, 1)

    # Adjust subplot parameters to make room for legend at bottom
    plt.subplots_adjust(bottom=0.25)

    # Place legend horizontally at bottom
    ax.legend(bbox_to_anchor=(0.5, -0.2),
             loc='upper center',
             ncol=n_series,
             borderaxespad=0)

    plt.show()
    
# Draw the quote-position comparison for every LLM present in agreement_df
create_quote_position_plot(agreement_df=agreement_df)


In [None]:
def create_demographic_quote_position_plot(agreement_df: pd.DataFrame,
                                         llm: str,
                                         demographic_col: str,
                                         figsize: Tuple[int, int] = (15, 8)) -> None:
    """
    Create a grouped bar plot showing average quote positions across questions,
    grouped by a demographic category for a single LLM.

    One group of bars is drawn per question, with one bar per distinct value of
    `demographic_col` inside each group. Bar height is the mean of the per-row
    average normalized quote position; the error bar is its standard deviation.

    Args:
        agreement_df: DataFrame with 'llm', 'question',
            'normalized_quote_positions' and `demographic_col` columns.
            NOTE: the column 'avg_quote_position' is added to, or overwritten
            on, this frame in place.
        llm: Value of the 'llm' column to restrict the plot to.
        demographic_col: Name of the column whose distinct values define the bars.
        figsize: Figure size forwarded to plt.subplots.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Calculate average quote position for each row (NaN when a row has no positions)
    agreement_df['avg_quote_position'] = agreement_df['normalized_quote_positions'].apply(
        lambda x: np.mean(x) if len(x) > 0 else np.nan
    )

    # Filter for specific LLM
    llm_data = agreement_df[agreement_df['llm'] == llm]

    # Setup (setup_plot_style is defined elsewhere in the notebook)
    fig, ax = plt.subplots(figsize=figsize)
    setup_plot_style(fig, ax)

    # Get unique values, in order of first appearance
    questions = llm_data['question'].unique()
    demographic_values = llm_data[demographic_col].unique()
    r = np.arange(len(questions))

    # Calculate appropriate bar width; max(..., 1) avoids dividing by zero
    # when the requested LLM has no rows
    n_series = max(len(demographic_values), 1)
    total_width_per_group = 0.8
    bar_width = total_width_per_group / n_series
    group_center_offset = (total_width_per_group - bar_width) / 2

    # Plot bars for each demographic value
    for idx, demo_value in enumerate(demographic_values):
        demo_data = llm_data[llm_data[demographic_col] == demo_value]
        # groupby returns its keys sorted and only for questions present in this
        # subset, whereas `questions` (and the tick labels) is in order of first
        # appearance. Reindexing aligns the statistics with the labels and
        # inserts NaN (no bar) for questions this group never answered.
        stats = (
            demo_data.groupby('question')['avg_quote_position']
            .agg(['mean', 'std'])
            .reindex(questions)
        )

        position = r + (idx * bar_width) - group_center_offset
        ax.bar(position, stats['mean'].to_numpy(), bar_width,
               yerr=stats['std'].to_numpy(), label=demo_value, alpha=0.7, capsize=5)

    # Customize plot; only add an ellipsis when the label was actually shortened
    short_questions = [q if len(q) <= 30 else q[:30] + '...' for q in questions]
    ax.set_xticks(r)
    ax.set_xticklabels(short_questions, rotation=15, ha='right')
    ax.set_xlabel('Questions')
    ax.set_ylabel('Average Quote Position (0=start, 1=end)')
    ax.set_title(f'Average Quote Positions by Question and {demographic_col}\nfor {llm}')

    # Set y-axis limits
    ax.set_ylim(0, 1)

    # Adjust subplot parameters to make room for legend at bottom
    plt.subplots_adjust(bottom=0.25)

    # Place legend horizontally at bottom
    ax.legend(title=demographic_col,
             bbox_to_anchor=(0.5, -0.2),
             loc='upper center',
             ncol=n_series,
             borderaxespad=0)

    plt.show()
    
# One ideology-split quote-position figure per model
for _quote_position_llm in ('llama-3.1-8b-instruct', 'gpt-4o-mini', 'gpt-3.5-turbo'):
    create_demographic_quote_position_plot(agreement_df,
                                         llm=_quote_position_llm,
                                         demographic_col='ideology')

## Real Estate

In [None]:
def calculate_quote_real_estate(row: pd.Series) -> float:
    """
    Calculate the proportion of the response that is quoted.

    The result is the summed length of all quotes divided by the length of the
    response. Quotes are not checked against the response text, so overlapping
    or non-verbatim quotes can push the value above 1.

    Args:
        row: DataFrame row (or any mapping) containing 'llm_response' and
            'quotes'. 'quotes' may be a list of strings or the string
            representation of such a list (e.g. after a CSV round-trip).

    Returns:
        Float representing proportion of response that is quoted. 0.0 is
        returned when the response is empty or has no length (e.g. NaN), when
        'quotes' is missing, empty or not a list, or when a string 'quotes'
        value cannot be parsed as a Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    response = row['llm_response']
    quotes = row['quotes']

    # Handle case where quotes is a string. literal_eval only parses Python
    # literals, so unlike eval() it never executes code stored in the data.
    if isinstance(quotes, str):
        try:
            quotes = ast.literal_eval(quotes)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return 0.0

    # Handle empty or missing quotes. The type check comes first so that
    # array-like values are rejected without evaluating their truthiness.
    if not isinstance(quotes, list) or not quotes:
        return 0.0

    # A missing response (e.g. NaN) has no length; an empty one would divide by zero.
    try:
        response_length = len(response)
    except TypeError:
        return 0.0
    if response_length == 0:
        return 0.0

    # Calculate total length of quotes
    total_quote_length = sum(len(quote) for quote in quotes)

    return total_quote_length / response_length


In [None]:

def create_real_estate_plot(agreement_df: pd.DataFrame,
                          figsize: Tuple[int, int] = (15, 8)) -> None:
    """
    Create a grouped bar plot showing quote real estate across questions for each LLM.

    One group of bars is drawn per question, with one bar per LLM inside each
    group. Bar height is the mean of the per-row quote real estate (as computed
    by calculate_quote_real_estate); the error bar is its standard deviation.

    Args:
        agreement_df: DataFrame with 'llm' and 'question' columns plus the
            columns calculate_quote_real_estate reads from each row.
            NOTE: the column 'quote_real_estate' is added to, or overwritten
            on, this frame in place.
        figsize: Figure size forwarded to plt.subplots.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Calculate real estate for each row
    agreement_df['quote_real_estate'] = agreement_df.apply(calculate_quote_real_estate, axis=1)

    # Setup (setup_plot_style is defined elsewhere in the notebook)
    fig, ax = plt.subplots(figsize=figsize)
    setup_plot_style(fig, ax)

    # Get unique values, in order of first appearance
    llms = agreement_df['llm'].unique()
    questions = agreement_df['question'].unique()
    r = np.arange(len(questions))

    # Calculate appropriate bar width; max(..., 1) avoids dividing by zero on an empty frame
    n_series = max(len(llms), 1)
    total_width_per_group = 0.8
    bar_width = total_width_per_group / n_series
    group_center_offset = (total_width_per_group - bar_width) / 2

    # Plot bars for each LLM
    for idx, llm in enumerate(llms):
        llm_data = agreement_df[agreement_df['llm'] == llm]
        # groupby returns its keys sorted, whereas `questions` (and therefore the
        # tick labels) is in order of first appearance. Reindexing lines the
        # statistics up with the labels and inserts NaN (no bar) for any question
        # this LLM has no rows for, instead of failing on a length mismatch.
        stats = (
            llm_data.groupby('question')['quote_real_estate']
            .agg(['mean', 'std'])
            .reindex(questions)
        )

        position = r + (idx * bar_width) - group_center_offset
        ax.bar(position, stats['mean'].to_numpy(), bar_width,
               yerr=stats['std'].to_numpy(), label=llm, alpha=0.7, capsize=5)

    # Customize plot; only add an ellipsis when the label was actually shortened
    short_questions = [q if len(q) <= 30 else q[:30] + '...' for q in questions]
    ax.set_xticks(r)
    ax.set_xticklabels(short_questions, rotation=15, ha='right')
    ax.set_xlabel('Questions')
    ax.set_ylabel('Quote Real Estate (proportion of response quoted)')
    ax.set_title('Proportion of Response Quoted by Question and LLM')

    # Set y-axis limits
    ax.set_ylim(0, 1)

    # Adjust subplot parameters to make room for legend at bottom
    plt.subplots_adjust(bottom=0.25)

    # Place legend horizontally at bottom
    ax.legend(bbox_to_anchor=(0.5, -0.2),
             loc='upper center',
             ncol=n_series,
             borderaxespad=0)

    plt.show()
    
# Draw the quote real-estate comparison for every LLM present in agreement_df
create_real_estate_plot(agreement_df=agreement_df)



In [None]:

def create_demographic_real_estate_plot(agreement_df: pd.DataFrame,
                                      llm: str,
                                      demographic_col: str,
                                      figsize: Tuple[int, int] = (15, 8)) -> None:
    """
    Create a grouped bar plot showing quote real estate across questions,
    grouped by a demographic category for a single LLM.

    One group of bars is drawn per question, with one bar per distinct value of
    `demographic_col` inside each group. Bar height is the mean of the per-row
    quote real estate (as computed by calculate_quote_real_estate); the error
    bar is its standard deviation.

    Args:
        agreement_df: DataFrame with 'llm', 'question' and `demographic_col`
            columns plus the columns calculate_quote_real_estate reads from
            each row. NOTE: the column 'quote_real_estate' is added to, or
            overwritten on, this frame in place.
        llm: Value of the 'llm' column to restrict the plot to.
        demographic_col: Name of the column whose distinct values define the bars.
        figsize: Figure size forwarded to plt.subplots.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Calculate real estate for each row
    agreement_df['quote_real_estate'] = agreement_df.apply(calculate_quote_real_estate, axis=1)

    # Filter for specific LLM
    llm_data = agreement_df[agreement_df['llm'] == llm]

    # Setup (setup_plot_style is defined elsewhere in the notebook)
    fig, ax = plt.subplots(figsize=figsize)
    setup_plot_style(fig, ax)

    # Get unique values, in order of first appearance
    questions = llm_data['question'].unique()
    demographic_values = llm_data[demographic_col].unique()
    r = np.arange(len(questions))

    # Calculate appropriate bar width; max(..., 1) avoids dividing by zero
    # when the requested LLM has no rows
    n_series = max(len(demographic_values), 1)
    total_width_per_group = 0.8
    bar_width = total_width_per_group / n_series
    group_center_offset = (total_width_per_group - bar_width) / 2

    # Plot bars for each demographic value
    for idx, demo_value in enumerate(demographic_values):
        demo_data = llm_data[llm_data[demographic_col] == demo_value]
        # groupby returns its keys sorted and only for questions present in this
        # subset, whereas `questions` (and the tick labels) is in order of first
        # appearance. Reindexing aligns the statistics with the labels and
        # inserts NaN (no bar) for questions this group never answered.
        stats = (
            demo_data.groupby('question')['quote_real_estate']
            .agg(['mean', 'std'])
            .reindex(questions)
        )

        position = r + (idx * bar_width) - group_center_offset
        ax.bar(position, stats['mean'].to_numpy(), bar_width,
               yerr=stats['std'].to_numpy(), label=demo_value, alpha=0.7, capsize=5)

    # Customize plot; only add an ellipsis when the label was actually shortened
    short_questions = [q if len(q) <= 30 else q[:30] + '...' for q in questions]
    ax.set_xticks(r)
    ax.set_xticklabels(short_questions, rotation=15, ha='right')
    ax.set_xlabel('Questions')
    ax.set_ylabel('Quote Real Estate (proportion of response quoted)')
    ax.set_title(f'Proportion of Response Quoted by Question and {demographic_col}\nfor {llm}')

    # Set y-axis limits
    ax.set_ylim(0, 1)

    # Adjust subplot parameters to make room for legend at bottom
    plt.subplots_adjust(bottom=0.25)

    # Place legend horizontally at bottom
    ax.legend(title=demographic_col,
             bbox_to_anchor=(0.5, -0.2),
             loc='upper center',
             ncol=n_series,
             borderaxespad=0)

    plt.show()

# Example usage: one ideology-split quote real-estate figure per model
for _real_estate_llm in ('llama-3.1-8b-instruct', 'gpt-4o-mini', 'gpt-3.5-turbo'):
    create_demographic_real_estate_plot(agreement_df,
                                      llm=_real_estate_llm,
                                      demographic_col='ideology')