In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import glob
import krippendorff
from scipy.stats import chi2_contingency
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score, precision_score, recall_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

survey_data = pd.read_csv("../data/duch_et_al_2023_vaccine_financial_vaccine_intention_training.csv", header=1)
NUM_SUBJECTS = len(survey_data)
EXPERIMENT_ROUND = 'round9'

demographic_questions = [
    "Start Date",
    "What is your current age?",
    "What is your gender?",
    "What is the highest educational qualification you have completed?",
    "Which region do you live in?",
    "Which distric do you live in?",
    "What is the name of the community you live in?",
    "How many people live in your village?",
    "What is the distance in km of the nearest health clinic from where you live?",
    "How many people live in the house together with you (NOT including you) at this moment?",
    "How many children below 18 years old are currently living in your home?",
    "What is your current working situation?",
    "How much (in Ghanaian Cedis) on average does your household spend in a typical week on food?",
    "How much (in Ghanaian Cedis) on average does your household spend in a typical week on non-food items (electricity, water, rent, school fees)?",
    "How would you rate the overall economic or financial condition of your household today?",
    "Do you have a registered mobile number?",
    "How many family members do you have in another village?",
    "How many friends and acquaintances who are not part of your family do you have in another village?",
    "How many individuals can you identify in your social network? Think of friends and relatives that live close to you",
    "How often do you use social media?",
]

survey_questions = [
    "Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?"
]

survey_data = survey_data[['ID'] + demographic_questions + survey_questions]

In [2]:
# Cramer's V function
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2_corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    r_corr = r - ((r-1)**2)/(n-1)
    k_corr = k - ((k-1)**2)/(n-1)
    epsilon = 1e-10  # Small value to prevent division by zero
    return np.sqrt(phi2_corr / max(min((k_corr-1), (r_corr-1)), epsilon))


# Define the custom mapping function
def map_response(response, question):
    if question == "Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?":
        if response == "Yes":
            return 1
        elif response == "No":
            return 0
        else:
            return -1
    else:
        raise ValueError(f"{question} is not considered!")


def evaluate_responses(prompts_with_responses: pd.DataFrame) -> dict:
    """
    Evaluate the LLM's ability to predict the ground truth responses.

    Parameters:
        prompts_with_responses (pd.DataFrame): A DataFrame containing the prompts and the corresponding responses.

    Returns:
        dict: A dictionary containing the evaluation metrics for each question.
    """
    evaluation_results = {}

    for question in prompts_with_responses["question"].unique():
        question_responses = prompts_with_responses[
            prompts_with_responses["question"] == question
        ]

        # Categorical evaluation
        response_type = "Categorical"
        evaluation_result = evaluate_categorical_response(
            question_responses["user_response"], question_responses["llm_response"], question
        )

        evaluation_results[question] = {
            "response_type": response_type,
            "evaluation_result": evaluation_result,
        }

    return evaluation_results


def format_response(response: pd.Series) -> pd.Series:
    """
    Formats a pandas Series containing strings by removing all special characters,
    leading and trailing whitespaces, and converting all characters to lower case.

    Parameters:
        response (pd.Series): A pandas Series containing the strings to be formatted.

    Returns:
        pd.Series: A pandas Series containing the formatted strings.
    """
    formatted_response = response.str.replace(r'[^a-zA-Z0-9\s]', '', regex=True).str.strip()
    return formatted_response


def evaluate_categorical_response(
    user_response: pd.Series, llm_response: pd.Series, question: str
) -> dict:
    """
    Evaluate the LLM's ability to predict the user's categorical response in terms of
    cramer's V correlation, accuracy, F1 score, and Matthews correlation coefficient.

    Parameters:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.

    Returns:
        dict: A dictionary containing the evaluation metrics.
    """
    # Remove all special characters and convert text to lower case
    formatted_user_response = format_response(user_response)
    formatted_llm_response = format_response(llm_response)

    formatted_user_response = formatted_user_response.apply(lambda response: map_response(response, question))
    formatted_llm_response = formatted_llm_response.apply(lambda response: map_response(response, question))

    # Mask the invalid responses from LLM
    invalid_indices = formatted_llm_response[formatted_llm_response == -1].index
    print(len(invalid_indices))
    user_response_cleaned = formatted_user_response.drop(index=invalid_indices)
    llm_response_cleaned = formatted_llm_response.drop(index=invalid_indices)

    # Calculate accuracy
    accuracy = accuracy_score(user_response_cleaned, llm_response_cleaned)

    # Calculate F1 score
    f1 = float(f1_score(user_response_cleaned, llm_response_cleaned, average="macro"))

    # Calculate precision
    precision = float(precision_score(user_response_cleaned, llm_response_cleaned, average="macro"))

    # Calculate recall
    recall = float(recall_score(user_response_cleaned, llm_response_cleaned, average="macro"))

    # Calculate Matthews correlation coefficient
    mcc = float(matthews_corrcoef(user_response_cleaned, llm_response_cleaned))

    # Calculate Cramer's V correlation
    cramer_v = cramers_v(user_response_cleaned, llm_response_cleaned)

    # Calculate AUC
    auc = roc_auc_score(user_response_cleaned, llm_response_cleaned)

    # Calculate number of invalid entries
    invalid_percent = len(invalid_indices) * 100 / len(formatted_llm_response)

    return {
        "accuracy": accuracy,
        "macro_f1_score": f1,
        "precision": precision,
        "recall": recall,
        "matthews_corrcoef": mcc,
        "cramer_v_correlation": cramer_v,
        "auc":auc,
        "invalid_percent": invalid_percent
    }


def clean_llm_response(response: str) -> str:
    try:
        response = response.lower()
    except AttributeError:
        return ""
    
    # Extract the first 10 characters and the last 10 characters of the response
    first_10 = response[:10]
    last_10 = response[-10:]

    # Combine the first 10 and last 10 characters
    combined = first_10 + " " + last_10

    # Use regular expression to find 'yes', 'no', 'do not know', or 'prefer not to say'
    match = re.search(r"\b(yes|no|do not know|prefer not to say)\b", combined)

    # # Use regular expression to find 'Yes', 'yes', 'No', 'no', 'do not know', or 'prefer not to say'
    # match = re.search(r"\b(yes|no|do not know|prefer not to say)", response)
    
    # If a match is found, return the matched word
    if match:
        if match.group(0) in ["yes"]:
            return "Yes"
        elif match.group(0) in ["no"]:
            return "No"
        elif match.group(0) in ["do not know"]:
            return "Do not know"
        elif match.group(0) in ["prefer not to say"]:
            return "Prefer not to say"
        else:
            raise ValueError(f"{match.group(0)} is not considered")

    else:
        return ""


# Define the custom mapping function
def map_response_string(response, question):
    if pd.isnull(response):
        return response
    
    if question == "Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?":
        if response in ["No", "Yes", "Do not know", "Prefer not to say"]:
            return response
        else:
            return "Invalid"
    else:
        raise ValueError(f"{question} is not considered!")
    
custom_labels = {
    "Start Date":"InterviewDate",
    "What is your current age?":"Age",
    "What is your gender?":"Gender",
    "What is the highest educational qualification you have completed?":"Education",
    "Which region do you live in?":"Region",
    "Which distric do you live in?":"District",
    "What is the name of the community you live in?":"Community",
    "How many people live in your village?":"VillageSize",
    "What is the distance in km of the nearest health clinic from where you live?":"NearestClinic",
    "How many people live in the house together with you (NOT including you) at this moment?":"HouseholdSize",
    "How many children below 18 years old are currently living in your home?":"NumChildren",
    "What is your current working situation?":"Employment",
    "How much (in Ghanaian Cedis) on average does your household spend in a typical week on food?":"FoodSpend",
    "How much (in Ghanaian Cedis) on average does your household spend in a typical week on non-food items (electricity, water, rent, school fees)?":"NonFoodSpend",
    "How would you rate the overall economic or financial condition of your household today?":"EconomicCondition",
    "Do you have a registered mobile number?":"RegisteredMobile",
    "How many family members do you have in another village?":"FamilyOtherVillage",
    "How many friends and acquaintances who are not part of your family do you have in another village?":"FriendsOtherVillage",
    "How many individuals can you identify in your social network? Think of friends and relatives that live close to you":"SocialNetwork",
    "How often do you use social media?":"SocialMediaUse",
    "Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?": "VaccineIntention",
}


def plot_response_distribution(experiment_version: str) -> None:
    """
    Plots the distribution of responses for each feature in the given experiment version.

    Parameters:
        experiment_version (str): The version of the experiment.

    Returns:
        None
    """
    results = pd.read_excel(f'../results/{EXPERIMENT_ROUND}/{experiment_version}.xlsx', header=1)

    # Drop rows with missing responses
    results.dropna(subset=['user_response'], inplace=True)
    results = results[~results["user_response"].isin(["Do not know", "Prefer not to say"])].reset_index(drop=True)
    
    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)

    user_response = {}
    llm_response = {}

    for question in results['question'].unique():
        user_resp = results[results['question'] == question]['user_response'].tolist()
        llm_resp = results[results['question'] == question]['llm_response'].tolist()
        
        # Ensure each list has same number of items, filling with NaN if necessary
        user_response[question] = user_resp + [np.nan] * (NUM_SUBJECTS - len(user_resp))
        llm_response[question] = llm_resp + [np.nan] * (NUM_SUBJECTS - len(llm_resp))

    user_response = pd.DataFrame(user_response)
    llm_response = pd.DataFrame(llm_response)

    # Convert columns to categorical
    for col in user_response.columns:
        user_response[col] = format_response(user_response[col].astype('category'))
        user_response[col] = user_response[col].apply(lambda response: map_response_string(response, col))
        llm_response[col] = format_response(llm_response[col].astype('category'))
        llm_response[col] = llm_response[col].apply(lambda response: map_response_string(response, col))
        
    # Calculate Cramer's V for each pair of features
    features = user_response.columns

    # Plotting
    fig, axes = plt.subplots(nrows=len(features) // 2 + len(features) % 2, ncols=2, figsize=(20, 5 * (len(features) // 2 + len(features) % 2)))
    axes = axes.flatten()  # Flatten the 2D array of axes to easily use a single index

    for i, feature in enumerate(features):
        ax = axes[i]
        user_counts = user_response[feature].value_counts().sort_index()
        llm_counts = llm_response[feature].value_counts().sort_index()

        # Ensure both user and llm responses have the same categories
        all_categories = user_counts.index.union(llm_counts.index)
        user_counts = user_counts.reindex(all_categories, fill_value=0)
        llm_counts = llm_counts.reindex(all_categories, fill_value=0)

        # Plotting
        x = np.arange(len(all_categories))  # the label locations
        width = 0.35  # the width of the bars

        ax.bar(x - width/2, user_counts, width, alpha=0.5, label='User Response')
        ax.bar(x + width/2, llm_counts, width, alpha=0.5, label='LLM Response')

        ax.set_ylabel('Counts')
        ax.set_title(custom_labels.get(feature, feature))
        ax.set_xticks(x)

        if custom_labels.get(feature, feature) == 'VaccineReason':
            ax.set_xticklabels(all_categories, rotation=90)
        else:
            ax.set_xticklabels(all_categories)
        
        # Relabel y ticks
        current_ytick_labels = ax.get_yticklabels()
        custom_ytick_labels = [custom_labels.get(label.get_text(), label.get_text()) for label in current_ytick_labels]
        ax.set_yticklabels(custom_ytick_labels)

        ax.legend()
    
    # If there's an odd number of features, hide the last subplot if unused
    if len(features) % 2 != 0:
        axes[-1].set_visible(False)
    plt.savefig(f'../results/{EXPERIMENT_ROUND}/response_distribution_{experiment_version}.png', dpi=600, bbox_inches='tight')
    plt.show()


In [3]:
# List to hold dataframes for each file's evaluation results
all_evaluation_results = []

# Loop through each .xlsx file in the results folder
for file_path in glob.glob(f'../results/{EXPERIMENT_ROUND}/vaccine_financial_incentive_vaccinationintention_*.xlsx'):

    if file_path.startswith("../results/round9/vaccine_financial_incentive_vaccinationintention_demographic"):
        continue

    print(file_path)
    # Extract experiment version from the file name
    start = file_path.find('vaccine_financial_incentive_vaccinationintention_')
    end = file_path.find('.xlsx')
    experiment_version = file_path[start:end]

    # Read the results from the file
    results = pd.read_excel(file_path, header=1)
    results = results[~results["user_response"].isin(["Do not know", "Prefer not to say"])].reset_index(drop=True)
    
    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)
    
    # Perform evaluation
    evaluation_results = evaluate_responses(results)
    evaluation_results['Experiment Version'] = experiment_version
    
    # Normalize the evaluation results and append to the list
    evaluation_results = pd.json_normalize(evaluation_results, sep=' ')
    evaluation_results = evaluation_results.rename(columns=lambda x: x.replace('evaluation_result', '\n'))
    all_evaluation_results.append(evaluation_results)

# Concatenate all dataframes into a single dataframe
all_evaluation_results = pd.concat(all_evaluation_results, ignore_index=True)

# Save evaluation results to a CSV file
all_evaluation_results.to_excel(f'../results/{EXPERIMENT_ROUND}/vaccinationintention_evaluation_results.xlsx', index=False)

# Display the final DataFrame
all_evaluation_results

../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_70b_instruct.xlsx
45
../results/round9/vaccine_financial_incentive_vaccinationintention_gpt4o_predicttreatmenteffects.xlsx
1584
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_8b_instructiontuned.xlsx


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


3478
../results/round9/vaccine_financial_incentive_vaccinationintention_mistral_7b_generalhealthcare_contexttuned.xlsx
448
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_8b_generalhealthcare_contexttuned.xlsx
190
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.3_70b_instruct.xlsx
0
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.3_70b_chainofthought.xlsx
6
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_70b_generalhealthcare_contexttuning.xlsx
232
../results/round9/vaccine_financial_incentive_vaccinationintention_claude3.5sonnet_chainofthought.xlsx
308
../results/round9/vaccine_financial_incentive_vaccinationintention_claude3.5sonnet_instruct.xlsx
175
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_8b_instruct.xlsx
291
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_70b_chainofthought.xlsx
2205
../results/round9/vaccine_fin

Unnamed: 0,Experiment Version,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? response_type,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? \n accuracy,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? \n macro_f1_score,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? \n precision,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? \n recall,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? \n matthews_corrcoef,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? \n cramer_v_correlation,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? \n auc,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? \n invalid_percent
0,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.676309,0.51332,0.522847,0.516738,0.039111,0.036193,0.516738,0.792254
1,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.75415,0.429923,0.377075,0.5,0.0,0.0,0.5,27.887324
2,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.630336,0.492359,0.492572,0.49313,-0.014287,0.0,0.49313,61.232394
3,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.616972,0.502938,0.503027,0.503141,0.006167,0.0,0.503141,7.887324
4,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.533698,0.480285,0.4969,0.496002,-0.007041,0.0,0.496002,3.34507
5,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.679225,0.520204,0.530197,0.52248,0.052109,0.049839,0.52248,0.0
6,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.730349,0.45027,0.502118,0.500347,0.001714,0.0,0.500347,0.105634
7,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.574523,0.495862,0.501399,0.501684,0.00307,0.0,0.501684,4.084507
8,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.608712,0.506584,0.507267,0.507965,0.015216,0.005589,0.507965,5.422535
9,vaccine_financial_incentive_vaccinationintenti...,Categorical,0.560218,0.507309,0.520351,0.526076,0.046073,0.043619,0.526076,3.080986


In [4]:
def calculate_cohen_kappa(user_response: pd.Series, llm_response: pd.Series) -> float:
    """
    Calculate Cohen's kappa coefficient between user_response and llm_response.
    
    Args:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.
    
    Returns:
        float: Cohen's kappa coefficient.
    """
    from sklearn.metrics import cohen_kappa_score
    if len(user_response) != len(llm_response):
        raise ValueError("Series must be of the same length")
    
    return cohen_kappa_score(user_response, llm_response)


def calculate_krippendorff_alpha(user_response: pd.Series, llm_response: pd.Series) -> float:
    """Calculate Krippendorff's alpha coefficient between user_response and llm_response.
    
    Args:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.
    
    Returns:
        float: Krippendorff's alpha coefficient.
    """
    if len(user_response) != len(llm_response):
        raise ValueError("Series must be of the same length")
    
    # Prepare the data in the format required by Krippendorff's alpha function
    data = [user_response.tolist(), llm_response.tolist()]

    try:
        return krippendorff.alpha(reliability_data=data, level_of_measurement='nominal')
    except ValueError:
        return 0.0


def calculate_proportion_agreement(user_response, llm_response) -> float:
    """
    Calculate the proportion agreement between two series containing responses.

    Parameters:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.

    Returns:
        float: The proportion agreement between the two series.
    """
    agreement_count = (user_response == llm_response).sum()
    total_count = len(user_response)
    proportion_agreement = agreement_count / total_count
    return proportion_agreement


def calculate_tetrachoric_correlation(user_response, llm_response) -> float:
    """
    Calculate the tetrachoric correlation between two series containing responses.

    Parameters:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.

    Returns:
        float: The tetrachoric correlation between the two series.
    """
    x = np.array(user_response)
    y = np.array(llm_response)
    
    # Calculate the contingency table
    n00 = np.sum((x == 0) & (y == 0))
    n01 = np.sum((x == 0) & (y == 1))
    n10 = np.sum((x == 1) & (y == 0))
    n11 = np.sum((x == 1) & (y == 1))
    
    # Total number of observations
    n = n00 + n01 + n10 + n11

    # Proportions
    p00 = n00 / n
    p01 = n01 / n
    p10 = n10 / n
    p11 = n11 / n
    
    # Marginal proportions
    p0_ = p00 + p01
    p1_ = p10 + p11
    p_0 = p00 + p10
    p_1 = p01 + p11
    
    # Calculate the tetrachoric correlation
    # Using the inverse of the cumulative distribution function (CDF) of the normal distribution
    phi = (p00 * p11 - p01 * p10) / np.sqrt(p0_ * p1_ * p_0 * p_1 + 1e-10)
    return phi

def evaluate_responses_correlation(prompts_with_responses: pd.DataFrame, question) -> dict:
    """
    Evaluate the LLM's ability to predict the ground truth responses.

    Parameters:
        prompts_with_responses (pd.DataFrame): A DataFrame containing the prompts and the corresponding responses.

    Returns:
        dict: A dictionary containing the evaluation metrics for each question.
    """

    proportion_agreement = {"question":question, "Metric":"Proportion Agreement"}
    tetrachoric_correlation = {"question":question, "Metric":"Tetrachoric Correlation"}
    cohen_kappa = {"question":question, "Metric":"Cohen's Kappa Coefficient"}
    krippendorff_alpha = {"question":question, "Metric":"Krippendorff's Alpha Coefficient"}

    # Define the list of filters
    filters = {
        "Whole Sample": prompts_with_responses["What is your gender?"].isin(["Male","Female"]),
        "CDC Health": prompts_with_responses["treatment"] == "CDC Health",
        "Placebo": prompts_with_responses["treatment"] == "Placebo",
        "Low Cash": prompts_with_responses["treatment"] == "Low Cash",
        "High Cash": prompts_with_responses["treatment"] == "High Cash",
        "Male": prompts_with_responses["What is your gender?"] == "Male",
        "Female": prompts_with_responses["What is your gender?"] == "Female",
        "18-30 Years Old": prompts_with_responses["What is your current age?"] <= 30,
        "31-45 Years Old": (prompts_with_responses["What is your current age?"] > 30) & (prompts_with_responses["What is your current age?"] <= 45),
        "46-60 Years Old": (prompts_with_responses["What is your current age?"] > 45) & (prompts_with_responses["What is your current age?"] <= 60),
        "Over 60 Years Old": prompts_with_responses["What is your current age?"] > 60,
        "Central Region": prompts_with_responses["Which region do you live in?"] == "Central",
        "Eastern Region": prompts_with_responses["Which region do you live in?"] == "Eastern",
    }

    # Loop through each filter
    for filter_name, filter in filters.items():
        filtered_data = prompts_with_responses[filter]
        filtered_data = filtered_data.dropna(subset=['user_response','llm_response']).reset_index(drop=True)

        # Map the user and llm responses using the custom mapping function
        user_mapped = filtered_data['user_response'].apply(lambda response: map_response(response, question))
        llm_mapped = filtered_data['llm_response'].apply(lambda response: map_response(response, question))
        proportion_agreement[filter_name] = calculate_proportion_agreement(user_mapped, llm_mapped)
        tetrachoric_correlation[filter_name] = calculate_tetrachoric_correlation(user_mapped, llm_mapped)
        cohen_kappa[filter_name] = calculate_cohen_kappa(user_mapped, llm_mapped)
        krippendorff_alpha[filter_name] = calculate_krippendorff_alpha(user_mapped, llm_mapped)

    return proportion_agreement, tetrachoric_correlation, cohen_kappa, krippendorff_alpha

# Loop through each .xlsx file in the results folder
for file_path in glob.glob(f'../results/{EXPERIMENT_ROUND}/vaccine_financial_incentive_vaccinationintention_*.xlsx'):
    if file_path.startswith("../results/round9/vaccine_financial_incentive_vaccinationintention_demographic"):
            continue
    
    print(file_path)

    # List to hold dataframes for each file's evaluation results
    all_evaluation_results = []

    # Extract experiment version from the file name
    start = file_path.find('vaccine_financial_incentive_vaccinationintention_')
    end = file_path.find('.xlsx')
    experiment_version = file_path[start:end]
    
    # Read the results from the file
    results = pd.read_excel(file_path, header=1)
    results = results[~results["user_response"].isin(["Do not know", "Prefer not to say"])].reset_index(drop=True)

    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)

    for question in survey_questions:
        question_results = results[results['question']==question].reset_index()
    
        # Perform evaluation
        result_proportion_agreement, result_tetrachoric_correlation, result_cohen_kappa, result_krippendorff_alpha = evaluate_responses_correlation(question_results, question)
        
        # Normalize the evaluation results and append to the list
        all_evaluation_results.append(result_proportion_agreement)
        all_evaluation_results.append(result_tetrachoric_correlation)
        all_evaluation_results.append(result_cohen_kappa)
        all_evaluation_results.append(result_krippendorff_alpha)

    # Convert results into a single dataframe
    all_evaluation_results = pd.DataFrame(all_evaluation_results).T
    all_evaluation_results.columns = all_evaluation_results.iloc[0]
    all_evaluation_results = all_evaluation_results[1:]

    # Save evaluation results to a CSV file
    all_evaluation_results.to_excel(f'../results/{EXPERIMENT_ROUND}/evaluation_results_correlation_{experiment_version}.xlsx', index=True)

all_evaluation_results

../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_70b_instruct.xlsx
../results/round9/vaccine_financial_incentive_vaccinationintention_gpt4o_predicttreatmenteffects.xlsx
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_8b_instructiontuned.xlsx
../results/round9/vaccine_financial_incentive_vaccinationintention_mistral_7b_generalhealthcare_contexttuned.xlsx
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_8b_generalhealthcare_contexttuned.xlsx
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.3_70b_instruct.xlsx
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.3_70b_chainofthought.xlsx
../results/round9/vaccine_financial_incentive_vaccinationintention_llama3.1_70b_generalhealthcare_contexttuning.xlsx
../results/round9/vaccine_financial_incentive_vaccinationintention_claude3.5sonnet_chainofthought.xlsx
../results/round9/vaccine_financial_incentive_vaccinationint

question,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?.1,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?.2,Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?.3
Metric,Proportion Agreement,Tetrachoric Correlation,Cohen's Kappa Coefficient,Krippendorff's Alpha Coefficient
Whole Sample,0.564812,-0.025611,-0.008804,-0.037747
CDC Health,0.586074,-0.035574,-0.027795,-0.065956
Placebo,0.458805,-0.054006,-0.026387,-0.069714
Low Cash,0.700674,-0.02965,-0.020771,-0.039115
High Cash,0.669216,-0.013295,0.00187,-0.027759
Male,0.575185,-0.045558,-0.024161,-0.050518
Female,0.557028,-0.010233,0.002009,-0.029367
18-30 Years Old,0.592225,-0.05017,-0.02979,-0.055708
31-45 Years Old,0.571429,-0.012522,-0.009414,-0.042889


In [5]:
# # Load the results
# experiment_version = 'vaccine_financial_incentive_vaccinationintention_claude3.5sonnet_instruct'  # TODO need to be updated
# plot_response_distribution(experiment_version=experiment_version)