In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import glob
import krippendorff
from scipy.stats import chi2_contingency
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score, precision_score, recall_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Configuration for Afrobarometer (START) ###
# survey_data = pd.read_excel("../data/afrobarometer_ghana_training.xlsx")
survey_data = pd.read_csv("../data/afrobarometer_r9_ghana_latlong_training.csv", header=1)
NUM_SUBJECTS = len(survey_data)
EXPERIMENT_ROUND = 'round9'

demographic_questions = [
    # "Do you come from a rural or urban area?",
    # "How old are you?",
    # "What is your gender?",
    # "What is your highest level of education?",
    # "What is your religion, if any?",
    # "Do you have a job that pays a cash income? If yes, is it full time or part time? If no, are you currently looking for a job?",
    # "What region do you come from?",
    # "Do you feel close to any particular political party?",
    # "When you get together with your friends or family, how often would you say you discuss political matters?",
    # "Latitude",
    # "Longitude",
    # "What is the distance to the nearest health clinic from your location in kilometers?",
    # "What district do you live in?",
    # "What percentage of the population in your district voted for the National Democratic Congress (NDC)?",
    # "What percentage of the population in your district voted for the New Patriotic Party (NPP)?"
    "Country", 
    "PSU/EA", 
    "Region/Province/State", 
    "Are the following services present in the primary sampling unit/enumeration area: Electricity grid that most houses can access? Answer with No; Yes; Can't determine",
    "Are the following services present in the primary sampling unit/enumeration area: Sewage system that most houses can access? Answer with No; Yes; Can't determine", 
    "Are the following services present in the primary sampling unit/enumeration area: Mobile phone service? Answer with No; Yes; Can't determine", 
    "Are the following facilities present in the primary sampling unit/enumeration area or in easy walking distance: School (private or public or both)? Answer with No; Yes; Can't determine", 
    "Are the following facilities present in the primary sampling unit/enumeration area or in easy walking distance: Health clinic (private or public or both)? Answer with No; Yes; Can't determine",
    "Date of interview", 
    "How old are you? Answer with an integer above 17; Refused; Don't know", 
    "What is the primary language you speak in your home? Answer with Achode; Akan; Atwede; Baasare; Banda; Basare; Bem; Bimoba; Bisa; Bowiri; Brefo; Bulisa; Busanga; Busi; Buuzu; Chamba; Chokosi; Dagaare/Waale; Dagbani; Dagomba; Ekpana; English; Ewe/Anlo; Frafra; Fulani; Ga/Dangbe; Gawo; Gonja; Gruma; Gruni; Grusi; Guan; Hausa; Kabre; Kassem; Konkonba; Kotokoli; Kusaal; Kusasi; Likpakpaln; Mampruli; Moar; Moli; Moshie; Nabt; Nankani; Safalba; Sissali; Taln; Tampulima; Tsala; Wala; Zamrama; Refused; Don't know",
    "Let's start with your general view about the current direction of our country. Some people might think the country is going in the wrong direction. Others may feel it is going in the right direction. So let me ask YOU about the overall direction of the country: Would you say that the country is going in the wrong direction or going in the right direction? Answer with Going in the wrong direction; Going in the right direction; Refused; Don't know", 
    "In general, how would you describe: the present economic condition of this country? Answer with Very bad; Fairly bad; Neither good nor bad; Fairly good; Very good; Refused; Don't know", 
    "In general, how would you describe: Your own present living conditions? Answer with Very bad; Fairly bad; Neither good nor bad; Fairly good; Very good; Refused; Don't know", 
    "Over the past year, how often, if ever, have you or anyone in your family gone without: Medicines or medical treatment? Answer with Never; Just once or twice; Several times; Many times; Always; Refused; Don't\nknow", 
    "When you get together with your friends or family, how often would you say you discuss political matters? Answer with Never; Occasionally; Frequently; Refused; Don't know", 
    "In this country, how free are you: to choose who to vote for without feeling pressured? Answer with Not at all free; Not very free; Somewhat free; Completely free; Refused; Don't\nknow",
    "Let's talk about the last national election held in 2020. People are not always able to vote in elections, for example, because they weren't registered, they were unable to go, or someone prevented them from voting. How about you? In the last national election held in 2020, did you vote, or not, or were you too young to vote? Or can’t you remember whether you voted? Answer with I did not vote; I was too young to vote; I can't remember whether I voted; I voted in the election; Refused; Don't know",
    "How much do you trust each of the following, or haven't you heard enough about them to say: the [president]? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough",
    "How much do you trust each of the following, or haven't you heard enough about them to say: [Parliament]? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough", 
    "How much do you trust each of the following, or haven't you heard enough about them to say: your [local government council]? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough", 
    "How much do you trust each of the following, or haven't you heard enough about them to say: the ruling party? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough", 
    "How much do you trust each of the following, or haven't you heard enough about them to say: traditional leaders? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough",
    "How much do you trust each of the following, or haven't you heard enough about them to say: religious leaders? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough", 
    "In the past 12 months have you had contact with a public clinic or hospital? Answer with No; Yes; Refused; Don't know", 
    "How easy or difficult was it to obtain the medical care or services you needed? Answer with Very easy; Easy; Difficult; Very difficult; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months", 
    "How often, if ever, did you have to pay a bribe, give a gift, or do a favour for a health worker or clinic or hospital staff in order to get the medical care or services you needed? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months", 
    "In general, when dealing with health workers and clinic or hospital staff, how much do you feel that they treat you with respect? Answer with Not at all; A little bit; Somewhat; A lot; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months",
    "And have you encountered any of these problems with a public clinic or hospital during the past 12 months: lack of medicines or other supplies? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months", 
    "And have you encountered any of these problems with a public clinic or hospital during the past 12 months: absence of doctors or other medical personnel? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months", 
    "And have you encountered any of these problems with a public clinic or hospital during the past 12 months: long waiting time? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months", 
    "And have you encountered any of these problems with a public clinic or hospital during the past 12 months: poor condition of facilities? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months", 
    "In your opinion, what are the most important problems facing this country that government should address?",
    "Please tell me whether you personally or any other or any other member of your household have been affected in any of the following ways by the COVID-19 pandemic: became ill with, or tested positive for, COVID-19? Answer with Yes; No; Refused; Don't know", 
    "Please tell me whether you personally or any other or any other member of your household have been affected in any of the following ways by the COVID-19 pandemic: temporarily or permanently lost a job, business, or primary source of income? Answer with Yes; No; Refused; Don't know", 
    "What is the main reason that you would be unlikely to get a COVID-19 vaccine? Answer with COVID doesn't exist/COVID is not real; Not worried about COVID/COVID is not serious or life-threatening/not deadly; I am at no risk or low risk for getting COVID/Small chance of contracting COVID;\n    I already had COVID and believe I am immune; God will protect me; Don't trust the vaccine/worried about getting fake or counterfeit vaccine;\n    Don't trust the government to ensure the vaccine is safe; Vaccine is not safe; Vaccine was developed too quickly;\n    Vaccine is not effective/Vaccinated people can still get COVID; Vaccine may cause COVID; Vaccine may cause infertility;\n    Vaccine may cause other bad side effects; Vaccines are being used to control or track people; People are being experimented on with vaccines;\n    Afraid of vaccines in general; Allergic to vaccines; Don't like needles;\n    Don't trust the vaccine source/will wait for other vaccines; Effective treatments for COVID are or will be available; It is too difficult to get the vaccine, e.g. have to travel far;\n    Vaccine will be too expensive; I don't know how to get the vaccine; I will wait until others have been vaccinated;\n    I will get the vaccine later; Religious objections to vaccines in general or to the COVID vaccine; Some other reason;\n    Don't know. Answer Not applicable if you've already been vaccinated or have answered you're likely to get vaccinated", 
    "How much do you trust the government to ensure that any vaccine for COVID-19 that is developed or offered to Ghanaian citizens is safe before it is used in this country? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know", 
    "How well or badly would you say the current government has managed the response to the COVID-19 pandemic? Answer with Very badly; Fairly badly; Fairly well; Very well; Refused; Don't know", 
    "When the country is facing a public health emergency like the COVID-19 pandemic, do you agree or disagree that it is justified for the government to temporarily limit democracy or democratic freedoms by taking the following measures: using the police and security forces to enforce public health mandates like restrictions on public gatherings or wearing face masks? Answer with Strongly disagree; Disagree; Neither agree nor disagree; Agree; Strongly agree; Refused; Don't know", 
    "Now let us talk about the media and how you get information about politics and other issues. How often do you get news from the following sources: radio? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know",
    "How often do you get news from the following sources: television? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know", 
    "How often do you get news from the following sources: print newspapers? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know", 
    "How often do you get news from the following sources: internet? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know", 
    "How often do you get news from the following sources: social media such as Facebook, Twitter, WhatsApp, or others? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know", 
    "Let's go back to talking about you. What is your ethnic community or cultural group? Answer with (National identity) only, or “doesn't think of self in those terms”; Akan; Banda; Basare; Bem; Bisa; Bole; Brefo; Brefo/wala; Bulisa; Busanga; Busi; Buuzu ( mali); Dagaati; Dagbani; Dagomba; Ekpana; Ewe/Anlo; Frafri; Fulani; Ga/Adangbe; Gangaca; Gawo; Gonja; Gruma; Gruni; Grusi; Guan; Gurma; Hausa; Kabre; Kasasi; Kassem; Konkonba; Kotokoli; Kulkulsi; Kusasi; Mamprusi; Mande; Mole-dagbani; Moshie; Nankani; Pampurisi; Safalba; Sissala; Talensi; Taln; Tampluma; Tampulinsi; Templeman; Tsalla; Tsamba; Wale; Wusasi; Zamrama; Zugu; Refused to answer; Don't know",
    "Please tell me whether you agree or disagree with the following statement: I feel strong ties with other Ghanaians. Answer with Strongly disagree; Disagree; Neither agree nor disagree; Agree; Strongly agree; Refused; Don't know", 
    "How much do you trust each of the following types of people: other Ghanaians? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know", 
    "Do you feel close to any particular political party? Answer with No (does NOT feel close to ANY party); Yes (feels close to a party); Refused to answer; Don't know", 
    "Which party is that? Answer with BOTH NPP AND NDC; Convention People's Party (CPP); Democratic People's Party (DPP); Don't know; National Democratic Congress (NDC); New Patriotic Party (NPP); Not Applicable; People's National Convention (PNC); Progressive People's Party (PPP); Refused; Refused; Don't know. Answer Not applicable if you don't feel close to any party",
    "What is your main occupation? [If unemployed, retired, or disabled, ask:] What was your last main occupation? Answer with Never had a job; Student; Housewife/Homemaker;\n    Agriculture/Farming/Fishing/Forestry; Trader/Hawker/Vendor; Retail/Shop;\n    Unskilled manual worker (e.g. cleaner, laborer, domestic help, unskilled manufacturing worker); Artisan or skilled manual worker (e.g. trades like electrician, mechanic, mechanic, machinist, or skilled manufacturing worker); Clerical or secretarial;\n    Supervisor/Foreman/Senior manager; Security services; Mid-level professional (e.g. teacher, nurse, mid-level government officer);\n    Upper-level professional (e.g. banker/finance, doctor, lawyer, engineer, accountant, professor, senior-level government officer); Other; Refused;\n    Don't know; Retired",
    "What is your highest level of education? Answer with No formal schooling; Informal schooling only (including Koranic schooling); Some primary schooling;\n    Primary school completed; Intermediate school or some secondary school/high school; Secondary school/high school completed;\n    Post-secondary qualifications other than university, e.g. a diploma or degree from a polytechnic or college; Some university; University completed;\n    Post-graduate; Refused; Don't know", 
    "What is your religion, if any? Answer with None; Christian only (i.e., without specific sub-group identification); Roman Catholic; Orthodox; Coptic; Anglican;\n    Lutheran; Methodist; Presbyterian; Baptist; Quaker/Friends; Mennonite;\n    Evangelical; Pentecostal (e.g. “born again” and/or “saved”); Independent (e.g. “African Independent Church”); Jehovah's Witness; Seventh-day Adventist; Mormon;\n    Muslim only (i.e., without specific sub-group identification); Sunni only (i.e., without specific sub-group identification); Ismaeli; Mouridiya Brotherhood; Tijaniya Brotherhood; Qadiriya Brotherhood;\n    Shia; Traditional/Ethnic religion; Hindu; Bahai; Agnostic (Do not know if there is a God); Atheist (Do not believe in a God);\n    Dutch Reformed; Calvinist; Church of Christ; Zionist Christian Church; Jewish; Eglise Du Christianisme Céleste;\n    Fifohazana; Ançardine; Morovian; Faith of Unity; United Church of Zambia or UCZ; New Apostolic Church;\n    Christian mission in many lands (CMML); Salvation Army; Other; Refused; Don't know",
    "Respondent's gender Answer with Man; Woman",
    "Respondent's race Answer with Black/African; White/European; Coloured/Mixed race; Arab/Lebanese/North African;\n    South Asian (Indian, Pakistani, etc.); East Asian (Chinese, Korean, Indonesian, etc.); Other; Don't know",
    "Latitude",
    "Longitude",
]

survey_questions = [
    # "Have you received a vaccination against COVID-19, either one or two doses?",
    "Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know"
]
### Configuration for Afrobarometer (END) ###

### Configuration for CANDOR (START) ###
# survey_data = pd.read_excel("../data/candor_5countries.xlsx")
# NUM_SUBJECTS = len(survey_data)
# EXPERIMENT_ROUND = 'round7'

# demographic_questions = [
#     'What is your gender?', 
#     'What is your age?',
#     'Thinking back to 12 months ago, has your household income increased or decreased since then?',
#     "The following is a scale from 0 to 10 that goes from left to right, where 0 means 'Left' and 10 means 'Right'. Today when talking about political trends, many people talk about those who are more sympathetic to the left or the right. According to the sense that the terms 'Left' and 'Right' have for you when you think about your political point of view, where would you find yourself on this scale?",
#     'Gross HOUSEHOLD income combines your gross income with that of your partner or any other household member with whom you share financial responsibilities BEFORE any taxes are paid and BEFORE any benefits are obtained. What is your gross annual household income?',
#     'We would like to know how good or bad your health is TODAY. How would you rate your health today on a scale numbered 0 to 100? 100 means the best health you can imagine. 0 means the worst health you can imagine.',
#     'What is the highest degree or level of education you have completed?',
#     'Do you have any dependent children who live with you? (By "dependent" children, we mean those who are not yet financially independent).',
#     'Are you currently married, in a civil partnership, or living with a partner?',
#     'Would you vote to re-elect this government in the next election?',
#     'Overall, how would you rate the current government on a scale of 0 (very low rating) to 100 (very high rating)?',
#     'Where in the country do you live?',
# ]

# survey_questions = [
#     "Have you received a COVID-19 vaccine?",
# ]
### Configuration for CANDOR (END) ###

survey_data = survey_data[['ID'] + demographic_questions + survey_questions]

In [2]:
# Cramer's V function
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2_corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    r_corr = r - ((r-1)**2)/(n-1)
    k_corr = k - ((k-1)**2)/(n-1)
    epsilon = 1e-10  # Small value to prevent division by zero
    return np.sqrt(phi2_corr / max(min((k_corr-1), (r_corr-1)), epsilon))


# Define the custom mapping function
def map_response(response, question):
    if question == "In the past 12 months, have you had contact with a public clinic or hospital?":
        if response == "No":
            return 0
        elif response == "Yes":
            return 1
        else:
            raise ValueError(f"{response} is not considered")
    elif question == "Have you received a vaccination against COVID-19, either one or two doses?":
        if response == "No":
            return 0
        elif response == "Yes":
            return 1
        elif response == "":
            return -1
        else:
            raise ValueError(f"{response} is not considered")
    elif question == "Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know":
        if response == "No":
            return 0
        elif response == "Yes":
            return 1
        elif response == "":
            return -1
        else:
            raise ValueError(f"{response} is not considered")
    elif question == "If a vaccine for COVID-19 is available , how likely are you to try to get vaccinated?":
        if response in ["Somewhat unlikely","Very unlikely","Don't know","Unlikely"]:
            return 0
        elif response in ["Very likely","Somewhat likely","Likely"]:
            return 1
        else:
            raise ValueError(f"{response} is not considered")
    elif question == "Have you received a COVID-19 vaccine?":
        if response == "No":
            return 0
        elif response == "Yes":
            return 1
        else:
            raise ValueError(f"{response} is not considered")
    else:
        raise ValueError(f"{question} is not considered!")


def evaluate_responses(prompts_with_responses: pd.DataFrame) -> dict:
    """
    Evaluate the LLM's ability to predict the ground truth responses.

    Parameters:
        prompts_with_responses (pd.DataFrame): A DataFrame containing the prompts and the corresponding responses.

    Returns:
        dict: A dictionary containing the evaluation metrics for each question.
    """
    evaluation_results = {}

    for question in prompts_with_responses["question"].unique():
        question_responses = prompts_with_responses[
            prompts_with_responses["question"] == question
        ]

        if (
            question_responses["user_response"].dtype == "int64"
            or question_responses["user_response"].dtype == "float64"
            or pd.to_numeric(question_responses["user_response"], errors="coerce")
            .notnull()
            .all()
        ):
            # Numerical evaluation
            response_type = "Numerical"
            evaluation_result = evaluate_numerical_response(
                question_responses["user_response"], question_responses["llm_response"]
            )

        else:
            # Categorical evaluation
            response_type = "Categorical"
            evaluation_result = evaluate_categorical_response(
                question_responses["user_response"], question_responses["llm_response"], question
            )

        evaluation_results[question] = {
            "response_type": response_type,
            "evaluation_result": evaluation_result,
        }

    return evaluation_results


def format_response(response: pd.Series) -> pd.Series:
    """
    Formats a pandas Series containing strings by removing all special characters,
    leading and trailing whitespaces, and converting all characters to lower case.

    Parameters:
        response (pd.Series): A pandas Series containing the strings to be formatted.

    Returns:
        pd.Series: A pandas Series containing the formatted strings.
    """
    formatted_response = response.str.replace(r'[^a-zA-Z0-9\s]', '', regex=True).str.strip()
    return formatted_response


def evaluate_categorical_response(
    user_response: pd.Series, llm_response: pd.Series, question: str
) -> dict:
    """
    Evaluate the LLM's ability to predict the user's categorical response in terms of
    cramer's V correlation, accuracy, F1 score, and Matthews correlation coefficient.

    Parameters:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.

    Returns:
        dict: A dictionary containing the evaluation metrics.
    """
    # Remove all special characters and convert text to lower case
    formatted_user_response = format_response(user_response)
    formatted_llm_response = format_response(llm_response)

    formatted_user_response = formatted_user_response.apply(lambda response: map_response(response, question))
    formatted_llm_response = formatted_llm_response.apply(lambda response: map_response(response, question))

    # Mask the invalid responses from LLM
    invalid_indices = formatted_llm_response[formatted_llm_response == -1].index
    print(len(invalid_indices))
    user_response_cleaned = formatted_user_response.drop(index=invalid_indices)
    llm_response_cleaned = formatted_llm_response.drop(index=invalid_indices)

    # Calculate accuracy
    accuracy = accuracy_score(user_response_cleaned, llm_response_cleaned)

    # Calculate F1 score
    f1 = float(f1_score(user_response_cleaned, llm_response_cleaned, average="macro"))

    # Calculate precision
    precision = float(precision_score(user_response_cleaned, llm_response_cleaned, average="macro"))

    # Calculate recall
    recall = float(recall_score(user_response_cleaned, llm_response_cleaned, average="macro"))

    # Calculate Matthews correlation coefficient
    mcc = float(matthews_corrcoef(user_response_cleaned, llm_response_cleaned))

    # Calculate Cramer's V correlation
    cramer_v = cramers_v(user_response_cleaned, llm_response_cleaned)

    # Calculate AUC
    auc = roc_auc_score(user_response_cleaned, llm_response_cleaned)

    # Calculate number of invalid entries
    invalid_percent = len(invalid_indices) * 100 / len(formatted_llm_response)

    return {
        "accuracy": accuracy,
        "macro_f1_score": f1,
        "precision": precision,
        "recall": recall,
        "matthews_corrcoef": mcc,
        "cramer_v_correlation": cramer_v,
        "auc":auc,
        "invalid_percent": invalid_percent
    }


def evaluate_numerical_response(
    user_response: pd.Series, llm_response: pd.Series
) -> dict:
    """
    Evaluate the LLM's ability to predict the user's numerical response in terms of
    mean absolute error, root mean square error, mean absolute percentage error and correlation.

    Parameters:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.

    Returns:
        dict: A dictionary containing the evaluation metrics.
    """
    # Convert user_response and llm_response to float type
    user_response = user_response.astype(float)
    llm_response = llm_response.astype(float)

    # Calculate mean absolute error
    mae = float(np.mean(np.abs(user_response - llm_response)))

    # Calculate root mean squared error
    rmse = float(np.sqrt(np.mean((user_response - llm_response) ** 2)))

    # Calculate mean absolute percentage error
    mape = float(np.mean(np.abs((user_response - llm_response) / user_response))) * 100

    # Calculate correlation
    correlation = float(user_response.corr(llm_response))

    return {
        "mean_absolute_error": mae,
        "root_mean_squared_error": rmse,
        "mean_absolute_percentage_error": mape,
        "correlation": correlation,
    }

def clean_llm_response(response: str) -> str:
    try:
        response = response.lower()
    except AttributeError:
        return ""

    # Use regular expression to find 'Yes', 'yes', 'No', or 'no'
    match = re.search(r"\b(yes|no|have received|has received|have not received|has not received|haven't received|have not yet received|i received|i've received|i've not received)", response)
    
    # If a match is found, return the matched word
    if match:
        if match.group(0) in ["yes","have received", "has received", "i received", "i've received"]:
            return "Yes"
        elif match.group(0) in ["no","have not received", "has not received", "haven't received", "have not yet received", "i've not received"]:
            return "No"
        else:
            raise ValueError(f"{match.group(0)} is not considered")

    else:
        return ""


# Define the custom mapping function
def map_response_string(response, question):
    if pd.isnull(response):
        return response
    
    if question == "In the past 12 months, have you had contact with a public clinic or hospital?":
        if response in ["No", "Yes"]:
            return response
        else:
            raise ValueError(f"{response} is not considered")
    elif question == "Have you received a vaccination against COVID-19, either one or two doses?":
        if response in ["No", "Yes"]:
            return response
        else:
            return "Invalid"
    elif question == "Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know":
        if response in ["No", "Yes"]:
            return response
        else:
            return "Invalid"
    elif question == "If a vaccine for COVID-19 is available , how likely are you to try to get vaccinated?":
        if response in ["Somewhat unlikely","Very unlikely","Don't know","Unlikely"]:
            return "Unlikely"
        elif response in ["Very likely","Somewhat likely","Likely"]:
            return "Likely"
        else:
            raise ValueError(f"{response} is not considered")
    elif question == "Have you received a COVID-19 vaccine?":
        if response in ["No", "Yes"]:
            return response
        else:
            raise ValueError(f"{response} is not considered")
    else:
        raise ValueError(f"{question} is not considered!")
    
custom_labels = {
    "Do you come from a rural or urban area?":"Area",
    "How old are you?":"Age",
    "What is your gender?":"Gender",
    "What is your race?":"Race",
    "What is the primary language you speak in your home?":"PrimaryLanguage",
    "What is your highest level of education?":"Education",
    "What is your religion, if any?":"Religion",
    "What is your ethnic community or cultural group?":"Ethnicity",
    "Do you have a job that pays a cash income? If yes, is it full time or part time? If no, are you currently looking for a job?":"Employment",
    "What is your main occupation? If unemployed, retired, or disabled, what was your last main occupation?":"MainOccupation",
    "Do you personally own a mobile phone? If not, does anyone else in your household own one?":"MobilePhone",
    "In general, how would you describe your own present living conditions?":"LivingConditions",

    "What region do you come from?":"Region",
    "Does the enumeration area have an electricity grid that most houses can access?":"ElectricGrid",
    "Does the enumeration area have a piped water system that most houses can access?":"WaterSystem",
    "Does the enumeration area have a sewage system that most houses can access?":"SewageSystem",
    "Does the enumeration area have a mobile phone service that most houses can access?":"MobileService",
    "Are health clinics (private or public or both) present in the enumeration area or in easy walking distance?":"ClinicPresent",
    "What is your main source of water for household use?":"WaterSource",
    "Do you have an electric connection to your home from the Electricity Company of Ghana, ECG, or the Northern Electricty Distribution Company Ltd, NEDCO?":"ElectricConnection",
    "Do you personally own a mobile phone? If yes, does your phone have access to the Internet?":"InternetOnMobilePhone",
    "Do you feel close to any particular political party?":"PoliticalParty",
    "In general, how would you describe the present economic condition of this country?":"EconomicCondition",
    "When you get together with your friends or family, how often would you say you discuss political matters?":"DiscussPolitics",
    "In this country, how free are you to say what you think?":"FreedomSpeech",
    "Over the past year, how often, if ever, have you or anyone in your family felt unsafe walking in your neighborhood?":"UnsafeNeighbourhood",
    "Over the past year, how often, if ever, have you or anyone in your family feared crime in your own home?":"FearCrime",
    "In this country, how free are you to join any political organization you want?":"JoinPolitics",
    "In this country, how free are you to choose who to vote for without feeling pressured?":"FreedomVote",
    "During the past year, how often have you contacted an assemby man or woman about some important problem or to give them your views?":"ContactAssembyMan",
    "During the past year, how often have you contacted a member of Parliament about some important problem or to give them your views?":"ContactParliament",
    "During the past year, how often have you contacted a political party official about some important problem or to give them your views?":"ContactPartyOfficial",
    "During the past year, how often have you contacted a traditional leader about some important problem or to give them your views?":"ContactTraditionalLeader",
    "Overall, how satisfied are you with the way democracy works in Ghana?":"Democracy",
    "In your opinion, how often, in this country do people have to be careful of what they say about politics?":"CarefulPolitics",
    "In your opinion, how often, in this country are people treated unequally under the law?":"UnequalTreatment",
    "How often, if ever, are people treated unfairly by the government based on their economic status, that is, how rich or poor they are?":"UnequalTreatmentEconomic",
    "To whom do you normally go to first for assistance, when you are concerned about your security and the security of your family?":"Assistance",
    "How much do you trust other Ghanaians?":"TrustGhanaians",
    "How much do you trust your relatives?":"TrustRelatives",
    "How much do you trust your neighbours?":"TrustNeighbours",
    "How much do you trust other people you know?":"TrustOtherPeople",
    "How much do you trust people from other religions?":"TrustOtherReligion",
    "How much do you trust people from other ethnic groups?":"TrustOtherEthnic",
    "How often do you use the Internet?":"UseInternet",
    "In your opinion, what are the most important problems facing this country that government should address?":"ImportantProblems",
    "Latitude":"Latitude",
    "Longitude":"Longitude",
    "What is the distance to the nearest health clinic from your location in kilometers?":"NearestClinic",
    "What district do you live in?":"District",
    "What percentage of the population in your district voted for the National Democratic Congress (NDC)?":"VoteNDC",
    "What percentage of the population in your district voted for the New Patriotic Party (NPP)?":"VoteNPP",

    'Over the past year, how often, if ever, have you or anyone in your family gone without Medicines or medical treatment?':'MedicalTreatment',
    'In the past 12 months, have you had contact with a public clinic or hospital?':'ContactClinic',
    'How easy or difficult was it to obtain the medical care or services you needed?':'ObtainMedicalCare',
    'Please tell me whether you personally or any other member of your household have became ill with, or tested positive for COVID-19 by the COVID-19 pandemic?':'ContractCovid',
    'Please tell me whether you personally or any other member of your household have temporarily or permanently lost a job, business, or primary source of income by the COVID-19 pandemic?':'LostJob',
    'Have you received a vaccination against COVID-19, either one or two doses?':'ReceivedVaccine',
    'If a vaccine for COVID-19 is available , how likely are you to try to get vaccinated?':'VaccineLikelihood',
    'What is the main reason that you would be unlikely to get a COVID-19 vaccine?':'VaccineReason',
    'How much do you trust the government to ensure that any vaccine for COVID-19 that is developed or offered to Nigerian citizens is safe before it is used in this country?':'TrustGovernment',

    "Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?":"VaccineIntention",

    'What is your gender?':'Gender', 
    'What is your age?':'Age',
    'Thinking back to 12 months ago, has your household income increased or decreased since then?':'HouseholdIncomeIncrement',
    "The following is a scale from 0 to 10 that goes from left to right, where 0 means 'Left' and 10 means 'Right'. Today when talking about political trends, many people talk about those who are more sympathetic to the left or the right. According to the sense that the terms 'Left' and 'Right' have for you when you think about your political point of view, where would you find yourself on this scale?":'PoliticalScale',
    'Gross HOUSEHOLD income combines your gross income with that of your partner or any other household member with whom you share financial responsibilities BEFORE any taxes are paid and BEFORE any benefits are obtained. What is your gross annual household income?':'HouseholdIncome',
    'We would like to know how good or bad your health is TODAY. How would you rate your health today on a scale numbered 0 to 100? 100 means the best health you can imagine. 0 means the worst health you can imagine.':'HealthRating',
    'What is the highest degree or level of education you have completed?':'Education',
    'Do you have any dependent children who live with you? (By "dependent" children, we mean those who are not yet financially independent).':'Dependents',
    'Are you currently married, in a civil partnership, or living with a partner?':'MaritalStatus',
    'Would you vote to re-elect this government in the next election?':'ReelectGovernment',
    'Overall, how would you rate the current government on a scale of 0 (very low rating) to 100 (very high rating)?':'GovernmentRating',
    'Where in the country do you live?':'Country',
    'Have you received a COVID-19 vaccine?':'ReceivedVaccine',

    'Since you watched this video six weeks ago, do you think you will get a first shot of a COVID-19 vaccine if the vaccine becomes available to you?':'VaccineIntention',
}

def plot_cramers_feature(experiment_version: str) -> None:
    """
    Plots Cramer's V correlation values for each pair of features in the given experiment version.

    Parameters:
        experiment_version (str): The version of the experiment.

    Returns:
        None
    """
    results = pd.read_excel(f'../results/{EXPERIMENT_ROUND}/{experiment_version}.xlsx')
    
    # Drop rows with missing responses
    results.dropna(subset=['user_response'], inplace=True)
    results = results[~results["user_response"].isin(["Don't know", "Refused to Answer"])].reset_index(drop=True)

    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)

    llm_response = {'ID': survey_data['ID'].tolist()}
    for question in results['question'].unique():
        llm_resp = results[results['question'] == question]['llm_response'].tolist()
        
        # Ensure each list has 200 items, filling with NaN if necessary
        llm_response[question] = llm_resp + [np.nan] * (NUM_SUBJECTS - len(llm_resp))

    # user_response = pd.DataFrame(user_response)
    llm_response = pd.DataFrame(llm_response)

    print(f'survey_data: {survey_data.shape}')
    survey_with_llm_response = pd.merge(
        left=survey_data,
        right=llm_response,
        on='ID',
        suffixes=('','_llm')
    )
    print(f'survey_with_llm_response: {survey_with_llm_response.shape}')

    for col in survey_questions:
        survey_with_llm_response = survey_with_llm_response[~survey_with_llm_response[col].isin(["Don't know", "Refused to Answer"])].reset_index(drop=True)

    # Convert columns to categorical
    for col in survey_with_llm_response.columns:
        if col in ["How old are you?","What is your age?"]:
            continue
        survey_with_llm_response[col] = survey_with_llm_response[col].astype('category')

    # Calculate Cramer's V for each pair of features
    correlations_user = pd.DataFrame(index=demographic_questions, columns=survey_questions, dtype=float)
    correlations_llm = pd.DataFrame(index=demographic_questions, columns=survey_questions, dtype=float)
    for col1 in demographic_questions:
        for col2 in survey_questions:
            user_mapped = survey_with_llm_response[col2].apply(lambda response: map_response(response, col2))
            llm_mapped = survey_with_llm_response[col2 + '_llm'].apply(lambda response: map_response(response, col2))
            correlations_user.loc[col1, col2] = cramers_v(survey_with_llm_response[col1], user_mapped)
            correlations_llm.loc[col1, col2] = cramers_v(survey_with_llm_response[col1], llm_mapped)

    # Plotting
    fig, axes = plt.subplots(nrows=len(survey_questions) // 2 + len(survey_questions) % 2, ncols=2, figsize=(20, 7 * (len(survey_questions) // 2 + len(survey_questions) % 2)))
    axes = axes.flatten()  # Flatten the 2D array of axes to easily use a single index

    for i, feature in enumerate(survey_questions):
        # Extract correlations for the current feature
        feature_correlations_user = correlations_user[feature].reset_index()
        feature_correlations_user.columns = ['Feature', 'Correlation']

        feature_correlations_llm = correlations_llm[feature].reset_index()
        feature_correlations_llm.columns = ['Feature', 'Correlation']
        
        # Plot scatter plot
        ax = axes[i]  # Use the flattened axes array
        ax.scatter(feature_correlations_user['Correlation'], feature_correlations_user['Feature'], label='Real', marker='^')
        ax.scatter(feature_correlations_llm['Correlation'], feature_correlations_llm['Feature'], label='GPT4.0T')
        ax.set_title(custom_labels.get(feature, feature))  # Fallback to feature if not in custom_labels
        ax.set_xlabel('Cramer\'s V Correlation Value')
        # Set custom y-ticks
        ax.set_yticks(range(len(feature_correlations_user['Feature'])))
        custom_ytick_labels = [custom_labels.get(feat, feat) for feat in feature_correlations_user['Feature']]  # Fallback to feat if not in custom_labels
        ax.grid(True)
        ax.set_yticklabels(custom_ytick_labels)

        ax.legend()

    # Adjust layout to prevent overlap
    plt.tight_layout()

    # If there's an odd number of features, hide the last subplot if unused
    if len(survey_questions) % 2 != 0:
        axes[-1].set_visible(False)
    plt.savefig(f'../results/{EXPERIMENT_ROUND}/cramer_v_correlation_{experiment_version}.png', dpi=600, bbox_inches='tight')
    plt.show()


def plot_response_distribution(experiment_version: str) -> None:
    """
    Plots the distribution of responses for each feature in the given experiment version.

    Parameters:
        experiment_version (str): The version of the experiment.

    Returns:
        None
    """
    results = pd.read_excel(f'../results/{EXPERIMENT_ROUND}/{experiment_version}.xlsx')

    # Drop rows with missing responses
    results.dropna(subset=['user_response'], inplace=True)
    results = results[~results["user_response"].isin(["Don't know", "Refused to Answer"])].reset_index(drop=True)
    
    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)

    user_response = {}
    llm_response = {}

    for question in results['question'].unique():
        user_resp = results[results['question'] == question]['user_response'].tolist()
        llm_resp = results[results['question'] == question]['llm_response'].tolist()
        
        # Ensure each list has same number of items, filling with NaN if necessary
        user_response[question] = user_resp + [np.nan] * (NUM_SUBJECTS - len(user_resp))
        llm_response[question] = llm_resp + [np.nan] * (NUM_SUBJECTS - len(llm_resp))

    user_response = pd.DataFrame(user_response)
    llm_response = pd.DataFrame(llm_response)

    # Convert columns to categorical
    for col in user_response.columns:
        user_response[col] = format_response(user_response[col].astype('category'))
        user_response[col] = user_response[col].apply(lambda response: map_response_string(response, col))
        llm_response[col] = format_response(llm_response[col].astype('category'))
        llm_response[col] = llm_response[col].apply(lambda response: map_response_string(response, col))
        
    # Calculate Cramer's V for each pair of features
    features = user_response.columns

    # Plotting
    fig, axes = plt.subplots(nrows=len(features) // 2 + len(features) % 2, ncols=2, figsize=(20, 5 * (len(features) // 2 + len(features) % 2)))
    axes = axes.flatten()  # Flatten the 2D array of axes to easily use a single index

    for i, feature in enumerate(features):
        ax = axes[i]
        user_counts = user_response[feature].value_counts().sort_index()
        llm_counts = llm_response[feature].value_counts().sort_index()

        # Ensure both user and llm responses have the same categories
        all_categories = user_counts.index.union(llm_counts.index)
        user_counts = user_counts.reindex(all_categories, fill_value=0)
        llm_counts = llm_counts.reindex(all_categories, fill_value=0)

        # Plotting
        x = np.arange(len(all_categories))  # the label locations
        width = 0.35  # the width of the bars

        ax.bar(x - width/2, user_counts, width, alpha=0.5, label='User Response')
        ax.bar(x + width/2, llm_counts, width, alpha=0.5, label='LLM Response')

        ax.set_ylabel('Counts')
        ax.set_title(custom_labels.get(feature, feature))
        ax.set_xticks(x)

        if custom_labels.get(feature, feature) == 'VaccineReason':
            ax.set_xticklabels(all_categories, rotation=90)
        else:
            ax.set_xticklabels(all_categories)
        
        # Relabel y ticks
        current_ytick_labels = ax.get_yticklabels()
        custom_ytick_labels = [custom_labels.get(label.get_text(), label.get_text()) for label in current_ytick_labels]
        ax.set_yticklabels(custom_ytick_labels)

        ax.legend()
    
    # If there's an odd number of features, hide the last subplot if unused
    if len(features) % 2 != 0:
        axes[-1].set_visible(False)
    plt.savefig(f'../results/{EXPERIMENT_ROUND}/response_distribution_{experiment_version}.png', dpi=600, bbox_inches='tight')
    plt.show()


In [3]:
# List to hold dataframes for each file's evaluation results
all_evaluation_results = []

# Loop through each .xlsx file in the results folder
# ### Configuration for Afrobarometer (START) ###
for file_path in glob.glob(f'../results/{EXPERIMENT_ROUND}/afrobarometer_replication_*.xlsx'):
    print(file_path)
    # Extract experiment version from the file name
    start = file_path.find('afrobarometer_replication_')
    end = file_path.find('.xlsx')
    experiment_version = file_path[start:end]
# ### Configuration for Afrobarometer (END) ###

### Configuration for CANDOR (START) ###
# for file_path in glob.glob(f'../results/{EXPERIMENT_ROUND}/candor_replication_*.xlsx'):
#     # Extract experiment version from the file name
#     start = file_path.find('candor_replication_')
#     end = file_path.find('.xlsx')
#     experiment_version = file_path[start:end]
### Configuration for CANDOR (END) ###

    # Read the results from the file
    results = pd.read_excel(file_path)
    results = results[~results["user_response"].isin(["Don't know", "Refused to Answer"])].reset_index(drop=True)

    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)
    
    # Perform evaluation
    evaluation_results = evaluate_responses(results)
    evaluation_results['Experiment Version'] = experiment_version
    
    # Normalize the evaluation results and append to the list
    evaluation_results = pd.json_normalize(evaluation_results, sep=' ')
    evaluation_results = evaluation_results.rename(columns=lambda x: x.replace('evaluation_result', '\n'))
    all_evaluation_results.append(evaluation_results)

# Concatenate all dataframes into a single dataframe
all_evaluation_results = pd.concat(all_evaluation_results, ignore_index=True)

# Save evaluation results to a CSV file
all_evaluation_results.to_excel(f'../results/{EXPERIMENT_ROUND}/all_evaluation_results.xlsx', index=False)

# Display the final DataFrame
all_evaluation_results

../results/round9/afrobarometer_replication_gpt4o_interviewqna+expertreflection.xlsx
3
../results/round9/afrobarometer_replication_gpt4o_interviewqna+vaccinecontext.xlsx
1
../results/round9/afrobarometer_replication_gpt4o_interviewqna+backstory.xlsx
3
../results/round9/afrobarometer_replication_gpt4o_interviewqna.xlsx
3
../results/round9/afrobarometer_replication_gpt4o_interviewsummary.xlsx
5


Unnamed: 0,Experiment Version,"Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know response_type","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know \n accuracy","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know \n macro_f1_score","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know \n precision","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know \n recall","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know \n matthews_corrcoef","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know \n cramer_v_correlation","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know \n auc","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know \n invalid_percent"
0,afrobarometer_replication_gpt4o_interviewqna+e...,Categorical,0.860888,0.845055,0.899826,0.83013,0.726622,0.725519,0.83013,0.126689
1,afrobarometer_replication_gpt4o_interviewqna+v...,Categorical,0.859316,0.8438,0.895528,0.829381,0.721885,0.720784,0.829381,0.04223
2,afrobarometer_replication_gpt4o_interviewqna+b...,Categorical,0.862156,0.846606,0.90057,0.831693,0.729017,0.727917,0.831693,0.126689
3,afrobarometer_replication_gpt4o_interviewqna,Categorical,0.861311,0.845386,0.901214,0.830321,0.728092,0.726989,0.830321,0.126689
4,afrobarometer_replication_gpt4o_interviewsummary,Categorical,0.751164,0.744735,0.74306,0.747798,0.490835,0.489637,0.747798,0.211149


In [4]:
def calculate_cohen_kappa(user_response: pd.Series, llm_response: pd.Series) -> float:
    """
    Calculate Cohen's kappa coefficient between user_response and llm_response.
    
    Args:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.
    
    Returns:
        float: Cohen's kappa coefficient.
    """
    from sklearn.metrics import cohen_kappa_score
    if len(user_response) != len(llm_response):
        raise ValueError("Series must be of the same length")
    
    return cohen_kappa_score(user_response, llm_response)


def calculate_krippendorff_alpha(user_response: pd.Series, llm_response: pd.Series) -> float:
    """Calculate Krippendorff's alpha coefficient between user_response and llm_response.
    
    Args:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.
    
    Returns:
        float: Krippendorff's alpha coefficient.
    """
    if len(user_response) != len(llm_response):
        raise ValueError("Series must be of the same length")
    
    # Prepare the data in the format required by Krippendorff's alpha function
    data = [user_response.tolist(), llm_response.tolist()]

    try:
        return krippendorff.alpha(reliability_data=data, level_of_measurement='nominal')
    except ValueError:
        return 0.0


def calculate_proportion_agreement(user_response, llm_response) -> float:
    """
    Calculate the proportion agreement between two series containing responses.

    Parameters:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.

    Returns:
        float: The proportion agreement between the two series.
    """
    agreement_count = (user_response == llm_response).sum()
    total_count = len(user_response)
    proportion_agreement = agreement_count / total_count
    return proportion_agreement


def calculate_tetrachoric_correlation(user_response, llm_response) -> float:
    """
    Calculate the tetrachoric correlation between two series containing responses.

    Parameters:
        user_response (pd.Series): A pandas Series containing the user's responses.
        llm_response (pd.Series): A pandas Series containing the LLM's responses.

    Returns:
        float: The tetrachoric correlation between the two series.
    """
    x = np.array(user_response)
    y = np.array(llm_response)
    
    # Calculate the contingency table
    n00 = np.sum((x == 0) & (y == 0))
    n01 = np.sum((x == 0) & (y == 1))
    n10 = np.sum((x == 1) & (y == 0))
    n11 = np.sum((x == 1) & (y == 1))
    
    # Total number of observations
    n = n00 + n01 + n10 + n11

    # Proportions
    p00 = n00 / n
    p01 = n01 / n
    p10 = n10 / n
    p11 = n11 / n
    
    # Marginal proportions
    p0_ = p00 + p01
    p1_ = p10 + p11
    p_0 = p00 + p10
    p_1 = p01 + p11
    
    # Calculate the tetrachoric correlation
    # Using the inverse of the cumulative distribution function (CDF) of the normal distribution
    phi = (p00 * p11 - p01 * p10) / np.sqrt(p0_ * p1_ * p_0 * p_1 + 1e-10)
    return phi

def evaluate_responses_correlation(prompts_with_responses: pd.DataFrame, question) -> dict:
    """
    Evaluate the LLM's ability to predict the ground truth responses.

    Parameters:
        prompts_with_responses (pd.DataFrame): A DataFrame containing the prompts and the corresponding responses.

    Returns:
        dict: A dictionary containing the evaluation metrics for each question.
    """

    proportion_agreement = {"question":question, "Metric":"Proportion Agreement"}
    tetrachoric_correlation = {"question":question, "Metric":"Tetrachoric Correlation"}
    cohen_kappa = {"question":question, "Metric":"Cohen's Kappa Coefficient"}
    krippendorff_alpha = {"question":question, "Metric":"Krippendorff's Alpha Coefficient"}

    # Define the list of filters
    # ### Configuration for Afrobarometer (START) ###
    # filters = {
    #     "Whole Sample": prompts_with_responses["What is your gender?"].isin(['Man','Woman']),
    #     "Men": prompts_with_responses["What is your gender?"] == "Man",
    #     "Woman": prompts_with_responses["What is your gender?"] == "Woman",
    #     "18-30 Years Old": prompts_with_responses["How old are you?"] <= 30,
    #     "31-45 Years Old": (prompts_with_responses["How old are you?"] > 30) & (prompts_with_responses["How old are you?"] <= 45),
    #     "46-60 Years Old": (prompts_with_responses["How old are you?"] > 45) & (prompts_with_responses["How old are you?"] <= 60),
    #     "Over 60 Years Old": prompts_with_responses["How old are you?"] > 60,
    #     "Don't Feel Close to Party": prompts_with_responses["Do you feel close to any particular political party?"] == "No (does NOT feel close to ANY party)",
    #     "Feel Close to Party": prompts_with_responses["Do you feel close to any particular political party?"] == "Yes (feels close to a party)",
    #     "Discusses Politics": prompts_with_responses["When you get together with your friends or family, how often would you say you discuss political matters?"].isin(["Frequently","Occasionally"]),
    #     "Dont' Discuss Politics": prompts_with_responses["When you get together with your friends or family, how often would you say you discuss political matters?"] == "Never",
    #     "Urban Area": prompts_with_responses["Do you come from a rural or urban area?"] == "Urban",
    #     "Ruran Area": prompts_with_responses["Do you come from a rural or urban area?"] == "Rural",
    # }
    filters = {
        "Whole Sample": prompts_with_responses["Respondent's gender Answer with Man; Woman"].isin(['Man','Woman']),
        "Men": prompts_with_responses["Respondent's gender Answer with Man; Woman"] == "Man",
        "Woman": prompts_with_responses["Respondent's gender Answer with Man; Woman"] == "Woman",
        "18-30 Years Old": prompts_with_responses["How old are you? Answer with an integer above 17; Refused; Don't know"] <= 30,
        "31-45 Years Old": (prompts_with_responses["How old are you? Answer with an integer above 17; Refused; Don't know"] > 30) & (prompts_with_responses["How old are you? Answer with an integer above 17; Refused; Don't know"] <= 45),
        "46-60 Years Old": (prompts_with_responses["How old are you? Answer with an integer above 17; Refused; Don't know"] > 45) & (prompts_with_responses["How old are you? Answer with an integer above 17; Refused; Don't know"] <= 60),
        "Over 60 Years Old": prompts_with_responses["How old are you? Answer with an integer above 17; Refused; Don't know"] > 60,
        "Don't Feel Close to Party": prompts_with_responses["Do you feel close to any particular political party? Answer with No (does NOT feel close to ANY party); Yes (feels close to a party); Refused to answer; Don't know"] == "No (does NOT feel close to ANY party)",
        "Feel Close to Party": prompts_with_responses["Do you feel close to any particular political party? Answer with No (does NOT feel close to ANY party); Yes (feels close to a party); Refused to answer; Don't know"] == "Yes (feels close to a party)",
        "Discusses Politics": prompts_with_responses["When you get together with your friends or family, how often would you say you discuss political matters? Answer with Never; Occasionally; Frequently; Refused; Don't know"].isin(["Frequently","Occasionally"]),
        "Dont' Discuss Politics": prompts_with_responses["When you get together with your friends or family, how often would you say you discuss political matters? Answer with Never; Occasionally; Frequently; Refused; Don't know"] == "Never",
        "Urban Area": prompts_with_responses["PSU/EA"] == "Urban/ Peri-Urban",
        "Ruran Area": prompts_with_responses["PSU/EA"] == "Rural",
    }
    # ### Configuration for Afrobarometer (END) ###

    ### Configuration for CANDOR (START) ###
    # filters = {
    #     "Whole Sample": prompts_with_responses["What is your gender?"].isin(['Male','Female']),
    #     "Male": prompts_with_responses["What is your gender?"] == "Male",
    #     "Female": prompts_with_responses["What is your gender?"] == "Female",
    #     "18-30 Years Old": prompts_with_responses["What is your age?"] <= 30,
    #     "31-45 Years Old": (prompts_with_responses["What is your age?"] > 30) & (prompts_with_responses["What is your age?"] <= 45),
    #     "46-60 Years Old": (prompts_with_responses["What is your age?"] > 45) & (prompts_with_responses["What is your age?"] <= 60),
    #     "Over 60 Years Old": prompts_with_responses["What is your age?"] > 60,
    #     "Don't Re-elect Incumbent Party": prompts_with_responses["Would you vote to re-elect this government in the next election?"] == "No",
    #     "Re-elect Incumbent Party": prompts_with_responses["Would you vote to re-elect this government in the next election?"] == "Yes",
    # }
    ### Configuration for CANDOR (END) ###

    # Loop through each filter
    for filter_name, filter in filters.items():
        filtered_data = prompts_with_responses[filter]
        filtered_data = filtered_data.dropna(subset=['user_response','llm_response']).reset_index(drop=True)

        # Map the user and llm responses using the custom mapping function
        user_mapped = filtered_data['user_response'].apply(lambda response: map_response(response, question))
        llm_mapped = filtered_data['llm_response'].apply(lambda response: map_response(response, question))
        proportion_agreement[filter_name] = calculate_proportion_agreement(user_mapped, llm_mapped)
        tetrachoric_correlation[filter_name] = calculate_tetrachoric_correlation(user_mapped, llm_mapped)
        cohen_kappa[filter_name] = calculate_cohen_kappa(user_mapped, llm_mapped)
        krippendorff_alpha[filter_name] = calculate_krippendorff_alpha(user_mapped, llm_mapped)

    return proportion_agreement, tetrachoric_correlation, cohen_kappa, krippendorff_alpha

# Loop through each .xlsx file in the results folder
# ### Configuration for Afrobarometer (START) ###
for file_path in glob.glob(f'../results/{EXPERIMENT_ROUND}/afrobarometer_replication_*.xlsx'):
    print(file_path)
    # List to hold dataframes for each file's evaluation results
    all_evaluation_results = []

    # Extract experiment version from the file name
    start = file_path.find('afrobarometer_replication_')
    end = file_path.find('.xlsx')
    experiment_version = file_path[start:end]
# ### Configuration for Afrobarometer (END) ###

### Configuration for CANDOR (START) ###
# for file_path in glob.glob(f'../results/{EXPERIMENT_ROUND}/candor_replication_*.xlsx'):
#     print(file_path)
#     # List to hold dataframes for each file's evaluation results
#     all_evaluation_results = []

#     # Extract experiment version from the file name
#     start = file_path.find('candor_replication_')
#     end = file_path.find('.xlsx')
#     experiment_version = file_path[start:end]
### Configuration for CANDOR (END) ###
    
    # Read the results from the file
    results = pd.read_excel(file_path)
    results = results[~results["user_response"].isin(["Don't know", "Refused to Answer"])].reset_index(drop=True)

    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)

    for question in survey_questions:
        question_results = results[results['question']==question].reset_index()
    
        # Perform evaluation
        result_proportion_agreement, result_tetrachoric_correlation, result_cohen_kappa, result_krippendorff_alpha = evaluate_responses_correlation(question_results, question)
        
        # Normalize the evaluation results and append to the list
        all_evaluation_results.append(result_proportion_agreement)
        all_evaluation_results.append(result_tetrachoric_correlation)
        all_evaluation_results.append(result_cohen_kappa)
        all_evaluation_results.append(result_krippendorff_alpha)

    # Convert results into a single dataframe
    all_evaluation_results = pd.DataFrame(all_evaluation_results).T
    all_evaluation_results.columns = all_evaluation_results.iloc[0]
    all_evaluation_results = all_evaluation_results[1:]

    # Save evaluation results to a CSV file
    all_evaluation_results.to_excel(f'../results/{EXPERIMENT_ROUND}/evaluation_results_correlation_{experiment_version}.xlsx', index=True)

all_evaluation_results

../results/round9/afrobarometer_replication_gpt4o_interviewqna+expertreflection.xlsx
../results/round9/afrobarometer_replication_gpt4o_interviewqna+vaccinecontext.xlsx
../results/round9/afrobarometer_replication_gpt4o_interviewqna+backstory.xlsx
../results/round9/afrobarometer_replication_gpt4o_interviewqna.xlsx
../results/round9/afrobarometer_replication_gpt4o_interviewsummary.xlsx


question,"Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know.1","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know.2","Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know.3"
Metric,Proportion Agreement,Tetrachoric Correlation,Cohen's Kappa Coefficient,Krippendorff's Alpha Coefficient
Whole Sample,0.749578,0.490835,0.487926,0.487594
Men,0.752525,0.498271,0.495464,0.495521
Woman,0.74661,0.483636,0.480329,0.479665
18-30 Years Old,0.730387,0.463555,0.460395,0.460601
31-45 Years Old,0.765269,0.522428,0.518984,0.518828
46-60 Years Old,0.765281,0.509429,0.499324,0.49529
Over 60 Years Old,0.739726,0.434096,0.427727,0.425226
Don't Feel Close to Party,0.733393,0.464667,0.461778,0.461317
Feel Close to Party,0.761736,0.503777,0.500627,0.500619


In [8]:
# Load the results
experiment_version = 'afrobarometer_replication_gpt4o_interviewqna'  # TODO need to be updated

# plot_cramers_feature(experiment_version=experiment_version)
plot_response_distribution(experiment_version=experiment_version)

In [None]:
from adjustText import adjust_text
import matplotlib.patches as mpatches

# Define the function to check for partial string matches
def extract_model_size(scenario, model_size):
    for key in model_size:
        if key in scenario:
            return model_size[key]
    return None  # Return None if no match is found

def extract_model_color(scenario, model_color):
    for key in model_color:
        if key in scenario:
            return model_color[key]
    return "black"  # Return None if no match is found

def extract_model_marker(scenario, model_marker):
    for key in model_marker:
        if key in scenario:
            return model_marker[key]
    return "x"  # Return None if no match is found

def extract_model_label(scenario, model_label, is_description=False):
    for key in model_label:
        if key in scenario:
            return model_label[key]
            # if is_description or "gpt" in scenario:
            #     return model_label[key]
            # else:
            #     return model_label[key] + " " + model_acronym[scenario[-2:]]
    return None  # Return None if no match is found

# Create a custom sorting function that checks for partial matches
def get_order(label):
    if label is None:
        return float('inf')
    desired_order = ["INS:","CEN:","UNC:","INS+CT:","CEN+CT:","UNC+CT:"]
    for i, desired_label in enumerate(desired_order):
        if desired_label in label:
            return i
    return float('inf')

model_size = {
    "gemma-2-2b":2,
    "llama-3.2-3b":3,
    "mistral-7b":7,
    "llama-3.1-8b":8,
    "gpt4omini":8,
    "gemma-2-9b":9,
    "gpt4o":1000,
    "gemma-2-27b":27,
    "llama-3.1-70b":70,
    "gpt4.0turbo":1000,
}

model_label = {
    "gemma-2-2b":"G-2-2B",
    "llama-3.2-3b":"L-3.2-3B",
    "mistral-7b":"M-7B",
    "llama-3.1-8b":"L-3.1-8B",
    "gpt4omini":"GPT4o-Mini",
    "gemma-2-9b":"G-2-9B",
    "gpt4o":"GPT4o",
    "gemma-2-27b":"G-2-27B",
    "llama-3.1-70b":"L-3.1-70B",
    "gpt4.0turbo":"GPT4.0-Turbo",
}

model_colour = {
    "s4":'blue',
    "s5":'blue',
    "s6":'blue',
}

model_marker = {
    "s1":'x',
    "s2":'o',
    "s3":'v',
    "s4":'^',
    "s5":'s',
    "s6":'D',
}

model_description = {
    "s1":'INS: Instruct Model',
    "s2":'CEN: Pretrained Model + Instruction-Tuning Censored',
    "s3":'UNC: Pretrained Model + Instruction-Tuning Uncensored',
    "s4":'INS+CT: INS + Context-Tuning',
    "s5":'CEN+CT: CEN + Context-Tuning',
    "s6":'UNC+CT: UNC + Context-Tuning',
}

model_acronym = {
    "s1":'INS',
    "s2":'CEN',
    "s3":'UNC',
    "s4":'INS+CT',
    "s5":'CEN+CT',
    "s6":'UNC+CT',
}

METRIC = "macro_f1_score"
METRIC_READABLE = "Macro-Average F1 Score"

all_results = pd.read_excel("../results/round8/all_evaluation_results.xlsx")
question = "Have you received a vaccination against COVID-19, either one or two doses?"
metric_col = question + f" \n {METRIC}"
all_results = all_results[["Experiment Version",metric_col]]
all_results.rename(columns={metric_col:METRIC}, inplace=True)
all_results["model_scenario"] = all_results["Experiment Version"].apply(lambda x: x.replace("afrobarometer_replication_",""))
all_results['model_size'] = all_results['model_scenario'].apply(lambda x: extract_model_size(x, model_size))
all_results["color"] = all_results['model_scenario'].apply(lambda x: extract_model_color(x, model_colour))
all_results["marker"] = all_results['model_scenario'].apply(lambda x: extract_model_marker(x, model_marker))
all_results["label"] = all_results['model_scenario'].apply(lambda x: extract_model_label(x, model_label))
all_results["description"] = all_results['model_scenario'].apply(lambda x: extract_model_label(x, model_description, is_description=True))

# Create subplots
model_categories = ["INS","CEN","UNC"]
fig, axes = plt.subplots(len(model_categories), 1, figsize=(10, 6 * len(model_categories)), sharey=True)

# Initialize lists to collect handles and labels for the legend
all_handles = []
all_labels = []

# Plot each subset in its own subplot
texts_list = []
for i, (ax, description) in enumerate(zip(axes, model_categories)):
    subset = all_results[all_results['description'].str.contains(description, na=False) | all_results['model_scenario'].str.contains("gpt")]
    for marker in subset["marker"].unique():
        marker_subset = subset[subset["marker"] == marker].reset_index(drop=True)
        subset_label = marker_subset.loc[0, "description"]
        # scatter = ax.scatter(marker_subset['model_size'], marker_subset[METRIC], c=marker_subset["color"], marker=marker, label=subset_label)
        scatter = ax.scatter(marker_subset['model_size'], marker_subset[METRIC], c=marker_subset["color"], marker="x")
        if (description in ["CEN", "UNC"] and marker != "x") or description == "INS":
            all_handles.append(scatter)
            all_labels.append(subset_label)

    # Annotate the scatter plot with model_scenario
    texts = []
    for j, row in subset.iterrows():
        texts.append(ax.text(row['model_size'], row[METRIC], row['label'], fontsize=8, ha='center', color=row["color"]))
    texts_list.append(texts)

    # Set the x-axis to a logarithmic scale
    ax.set_xscale('log')

    # Set the x-label only for the last subplot
    if i == len(axes) - 1:
        ax.set_xlabel('Model Parameter Size in Log Scale (Billions)')
    
    # Set the y-label for each subplot
    ax.set_ylabel(METRIC_READABLE)
    ax.set_title(f'Scatter Plot of {METRIC_READABLE} vs. Model Parameter Size in Log Scale ({description})')

    ax.grid(True)

# Set the y-axis limits to be the same for all plots
axes[0].set_ylim(all_results[METRIC].min() - 0.01, all_results[METRIC].max() + 0.01)

# Adjust text to prevent overlap
for i, texts in enumerate(texts_list):
    adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'), ax=axes[i])

# # Sort the handles and labels based on the custom order
# sorted_labels_handles = sorted(zip(all_labels, all_handles), key=lambda x: get_order(x[0]))
# sorted_labels, sorted_handles = zip(*sorted_labels_handles)

# # Set the legend with the sorted labels and handles
# fig.legend(sorted_handles, sorted_labels, loc='upper center', bbox_to_anchor=(0.5, 0), ncol=2)

context_tuning_patch = mpatches.Patch(color='blue', label='Includes Context-Tuning')

# Set the legend with the sorted labels and handles, including the custom legend entry
fig.legend(handles=[context_tuning_patch], loc='upper center', bbox_to_anchor=(0.5, 0), ncol=1)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def format_model_response(experiment, response_type="llm_response") -> pd.DataFrame:
    results = pd.read_excel(f'../results/{EXPERIMENT_ROUND}/{experiment}.xlsx')
    
    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)

    # Drop rows with missing responses
    results.dropna(subset=['user_response'], inplace=True)
    if response_type == "llm_response":
        results = results[~results["llm_response"].isin([""])].reset_index(drop=True)
    else:
        results = results[~results["user_response"].isin(["Don't know", "Refused to Answer"])].reset_index(drop=True)
    
    response = {}


    for question in results['question'].unique():
        resp = results[results['question'] == question][response_type].tolist()
        # Ensure each list has same number of items, filling with NaN if necessary
        response[question] = resp + [np.nan] * (NUM_SUBJECTS - len(resp))

    response = pd.DataFrame(response)

    # Convert columns to categorical
    for col in response.columns:
        response[col] = format_response(response[col].astype('category'))
        response[col] = response[col].apply(lambda response: map_response_string(response, col))

    response.rename(columns={question:"response"}, inplace=True)

    return response

def plot_llm_response_distribution(dataframes, labels, title):
    """
    Plot the distribution of "Yes" and "No" responses in the "llm_response" column for multiple DataFrames.

    Parameters:
        dataframes (list of pd.DataFrame): List of DataFrames to plot.
        labels (list of str): List of labels for the DataFrames.
    """
    # Initialize a dictionary to store the counts
    response_counts = {label: {'Yes': 0, 'No': 0} for label in labels}

    # Calculate the counts for each DataFrame
    for df, label in zip(dataframes, labels):
        response_counts[label]['Yes'] = df['response'].str.lower().value_counts(normalize=True).get('yes', 0) * 100
        response_counts[label]['No'] = df['response'].str.lower().value_counts(normalize=True).get('no', 0) * 100

    # Create a DataFrame from the counts dictionary
    counts_df = pd.DataFrame(response_counts).T

    # Plot the distribution
    counts_df.plot(kind='bar', stacked=False, figsize=(10, 6), alpha=0.7)
    plt.xlabel('LLM Models')
    plt.ylabel('Percentage (%)')
    plt.title(title)
    plt.legend(title='Response')
    plt.tight_layout()
    plt.show()


df1 = format_model_response(experiment="afrobarometer_replication_llama-3.2-3b_s6", response_type="user_response")
df2 = format_model_response(experiment="afrobarometer_replication_llama-3.2-3b_s6", response_type="llm_response")
df3 = format_model_response(experiment="afrobarometer_replication_mistral-7b_s6", response_type="llm_response")
df4 = format_model_response(experiment="afrobarometer_replication_llama-3.1-8b_s6", response_type="llm_response")
df5 = format_model_response(experiment="afrobarometer_replication_gemma-2-9b_s6", response_type="llm_response")
# df6 = format_model_response(experiment="afrobarometer_replication_gpt4omini", response_type="llm_response")
# df7 = format_model_response(experiment="afrobarometer_replication_gpt4o", response_type="llm_response")
# df8 = format_model_response(experiment="afrobarometer_replication_gpt4.0turbo", response_type="llm_response")

dataframes = [df1, df2, df3, df4, df5]#, df6, df7, df8]
labels = ["Human","Llama 3.2 3B","Mistral 7B","Llama 3.1 8B","Gemma 2 9B"]#,"GPT 4o Mini","GPT 4o","GPT 4.0 Turbo"]
plot_llm_response_distribution(dataframes, labels, title="Vaccination Response Distribution for S6 (Pretrained Model + Instruction-Tuning with Uncensored Data + Context-Tuning)")

In [8]:
def plot_synthetic_response_distribution(experiment_version: str) -> None:
    """
    Plots the distribution of responses for each feature in the given experiment version.

    Parameters:
        experiment_version (str): The version of the experiment.

    Returns:
        None
    """
    results = pd.read_excel(f'../results/{EXPERIMENT_ROUND}/{experiment_version}.xlsx')
    
    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)

    llm_response = {}

    for question in results['question'].unique():
        llm_response[question] = results[results['question'] == question]['llm_response'].tolist()

    llm_response = pd.DataFrame(llm_response)

    # Convert columns to categorical
    for col in llm_response.columns:
        llm_response[col] = format_response(llm_response[col].astype('category'))
        
    features = llm_response.columns

    # Plotting
    fig, axes = plt.subplots(nrows=len(features) // 2 + len(features) % 2, ncols=2, figsize=(20, 5 * (len(features) // 2 + len(features) % 2)))
    axes = axes.flatten()  # Flatten the 2D array of axes to easily use a single index

    for i, feature in enumerate(features):
        ax = axes[i]
        llm_counts = llm_response[feature].value_counts().sort_index()
        llm_counts = llm_counts.reindex(['Yes', 'No'], fill_value=0)

        # Ensure both user and llm responses have the same categories
        all_categories = llm_counts.index.union(llm_counts.index)

        # Plotting
        x = np.arange(len(all_categories))  # the label locations
        width = 0.35  # the width of the bars

        ax.bar(x, llm_counts, width, alpha=0.5, label='LLM Response')

        ax.set_ylabel('Counts')
        ax.set_title(custom_labels.get(feature, feature))
        ax.set_xticks(x)

        ax.set_xticklabels(all_categories)
        
        # Relabel y ticks
        current_ytick_labels = ax.get_yticklabels()
        custom_ytick_labels = [custom_labels.get(label.get_text(), label.get_text()) for label in current_ytick_labels]
        ax.set_yticklabels(custom_ytick_labels)

        ax.legend()
    
    # If there's an odd number of features, hide the last subplot if unused
    if len(features) % 2 != 0:
        axes[-1].set_visible(False)
    plt.savefig(f'../results/{EXPERIMENT_ROUND}/synthetic_response_distribution_{experiment_version}.png', dpi=600, bbox_inches='tight')
    plt.show()


def plot_synthetic_treatment_response_distribution(experiment_version: str, groupby_category: str) -> None:
    """
    Plots the distribution of responses for each treatment in the given experiment version.

    Parameters:
        experiment_version (str): The version of the experiment.
        groupby_category (str): How the data will be separately visualised

    Returns:
        None
    """
    results = pd.read_excel(f'../results/{EXPERIMENT_ROUND}/{experiment_version}.xlsx')
    
    # Clean LLM response
    results["llm_response"] = results["llm_response"].apply(clean_llm_response)

    # Group data by treatment type
    grouped = results.groupby(groupby_category)
    
    # Determine the number of unique treatments
    num_groups = len(grouped)
    
    # Create a 2x2 grid of subplots
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 13))
    axes = axes.flatten()
    
    # Plot each treatment type
    for ax, (category, group) in zip(axes, grouped):
        # Count the occurrences of each response
        response_counts = group['llm_response'].value_counts()
        response_counts = response_counts.reindex(['Yes', 'No'], fill_value=0)
        
        # Plot the bar plot
        response_counts.plot(kind='bar', ax=ax, alpha=0.5)
        
        # Set the title and labels
        ax.set_title(category)
        ax.set_xlabel('Response')
        ax.set_ylabel('Counts')
        
        # Set consistent x-tick labels
        ax.set_xticks([0, 1])
        # ax.set_xticklabels(['Yes', 'No'])
    
    # Hide any unused subplots
    for i in range(num_groups, len(axes)):
        fig.delaxes(axes[i])
    
    # Adjust layout
    plt.tight_layout()
    plt.savefig(f'../results/{EXPERIMENT_ROUND}/treatment_response_distribution_{experiment_version}.png', dpi=600, bbox_inches='tight')
    plt.show()


In [None]:
experiment_version = 'candor_synthetic_backstory+cotreasoning_placebo_sample_600'

# plot_synthetic_response_distribution(experiment_version=experiment_version)
plot_synthetic_treatment_response_distribution(experiment_version=experiment_version, groupby_category="country")