# Helper Functions

In [2]:
import pandas as pd
import os
import json
import random
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42
random.seed(RANDOM_STATE)

def generate_demographic_prompt(row, excluded_columns):
    """Render a respondent's answers as a numbered, shuffled interview transcript.

    Every index label of ``row`` that is not listed in ``excluded_columns`` is
    treated as an interviewer question. The questions are shuffled with the
    module-level ``random`` generator, and questions whose answer is null,
    "NA" or "N/A" are skipped without consuming a number.

    Args:
        row: One survey record whose index labels are the question texts.
        excluded_columns: Labels that must not appear in the transcript.

    Returns:
        A string of the form ``"1) Interviewer: <question> Me: <answer> 2) ..."``
        (every entry ends with a space), or an empty string if nothing qualifies.
    """
    questions = [label for label in row.index if label not in excluded_columns]
    random.shuffle(questions)

    # Keep only the questions that actually have an answer, in shuffled order.
    answered = []
    for label in questions:
        answer = row[label]
        if pd.isnull(answer) or answer == "NA" or answer == "N/A":
            continue
        answered.append((label, answer))

    return "".join(
        f"{number}) Interviewer: {label} Me: {answer} "
        for number, (label, answer) in enumerate(answered, start=1)
    )

def generate_demographic_prompt_with_labels(row, excluded_columns, question_label_dict):
    """Render a shuffled interview transcript using human-readable question labels.

    Works like ``generate_demographic_prompt`` except that the text shown after
    "Interviewer:" is looked up in ``question_label_dict`` instead of using the
    column label itself.

    Args:
        row: One survey record whose index labels identify the questions.
        excluded_columns: Labels that must not appear in the transcript.
        question_label_dict: Mapping from column label to the question text to
            display. A label is only looked up when its answer is not skipped.

    Returns:
        A string of the form ``"1) Interviewer: <label text> Me: <answer> ..."``
        (every entry ends with a space), or an empty string if nothing qualifies.
    """
    shuffled = [column for column in row.index if column not in excluded_columns]
    random.shuffle(shuffled)

    pieces = []
    for column in shuffled:
        value = row[column]
        is_missing = pd.isnull(value) or value == "NA" or value == "N/A"
        if not is_missing:
            # Numbering follows the count of entries already emitted.
            pieces.append(
                f"{len(pieces) + 1}) Interviewer: {question_label_dict[column]} Me: {value} "
            )

    return "".join(pieces)

def format_prompts(row: pd.Series, target_outcome) -> str:
    """Serialize one record as a three-turn chat (system, user, assistant).

    Args:
        row: Record providing the "system_prompt" and "user_prompt" entries and
            the answer stored under ``target_outcome``.
        target_outcome: Label of the entry holding the assistant's answer.

    Returns:
        JSON string encoding the list of ``{"role", "content"}`` messages.
    """
    turns = (
        ("system", row["system_prompt"]),
        ("user", row["user_prompt"]),
        ("assistant", row[target_outcome]),
    )
    messages = [{"role": role, "content": content} for role, content in turns]
    return json.dumps(messages)

def include_variable_names(
    data_with_responses: pd.DataFrame, data_file_path: str
) -> pd.DataFrame:
    """Restore the original variable names as column headers of a DataFrame.

    The original data file (CSV or XLSX) is read and its first data row is used
    as a mapping from original header to the label stored in that row. Each
    column of ``data_with_responses`` whose name equals one of those labels is
    renamed to the corresponding original header (first match wins); columns
    without a match keep their name. The current column names are preserved by
    inserting them as the first row of the result.

    Args:
        data_with_responses (pd.DataFrame): DataFrame containing the data with responses.
        data_file_path (str): Path to the original data file (CSV or XLSX).
    Returns:
        pd.DataFrame: Copy of the data with the original headers as columns and
        the previous headers as the first row.
    Raises:
        ValueError: If the file path ends with neither ".csv" nor ".xlsx".
    """
    # Pick the reader from the file extension before touching the file.
    if data_file_path.endswith(".csv"):
        reader = pd.read_csv
    elif data_file_path.endswith(".xlsx"):
        reader = pd.read_excel
    else:
        raise ValueError("Unsupported file format. Please provide a CSV or XLSX file.")
    original = reader(data_file_path)

    # original header -> label found in the first data row
    first_row = original.iloc[0].to_dict()

    current_headers = data_with_responses.columns
    restored_headers = [
        next((name for name, label in first_row.items() if label == header), header)
        for header in current_headers
    ]

    # Keep the current headers by pushing them into the data as row 0.
    header_row = pd.DataFrame([current_headers], columns=current_headers)
    combined = pd.concat([header_row, data_with_responses], ignore_index=True)

    combined.columns = restored_headers
    return combined


# Prepare General Healthcare Data for Context-tuning

## Candour Data Ghana (LMIC Survey)

In [None]:
# Base name of the CANDOUR Ghana training set: the cell reads data/<DATA>.csv
# and writes its output to data/<DATA>/train.csv.
DATA = "candour_2_ghana_training"

def format_candour_prompts(row: pd.Series) -> str:
    """Build the JSON chat transcript for one CANDOUR respondent.

    The conversation starts with the system prompt and the "offered a vaccine"
    question, then follows the survey's skip logic:

    * offered == "Yes": ask whether a vaccine was received, then the reasons
      for vaccinating (answer starts with "Yes") or for declining (answer is
      "No, I have declined the offer to be vaccinated").
    * offered == "No": ask the hypothetical intention, then the reasons for
      ("Definitely get it"), against ("Definitely not get it"), or both
      ("Probably get it" / "Probably not get it").
    * any other answer ends the conversation after the first question.

    Args:
        row: Record providing "system_prompt" and the answer columns named by
            the question texts used below.

    Returns:
        JSON string encoding the list of ``{"role", "content"}`` messages.
    """
    q_offered = "Have you already been offered, or had an opportunity to receive, a COVID-19 vaccine?"
    q_received = "Have you received a COVID-19 vaccine?"
    q_why_vaccinated = "Why did you decide to get vaccinated against COVID-19, what were the reasons?"
    q_why_declined = "Why did you decide NOT to get vaccinated against COVID-19, what are / were reasons?"
    q_intention = "If a COVID-19 vaccine was available to you, would you definitely get it, probably get it, probably not get it or definitely not get it?"
    q_reasons_for = "What are your reasons for getting vaccinated for COVID-19?"
    q_reasons_against = "What are your reasons for NOT getting vaccinated for COVID-19?"

    prompt = [{"role": "system", "content": row["system_prompt"]}]

    def ask(question):
        """Append the question and the recorded answer; return the answer."""
        answer = row[question]
        prompt.append({"role": "user", "content": question})
        prompt.append({"role": "assistant", "content": answer})
        return answer

    offered = ask(q_offered)

    if offered == "Yes":
        received = ask(q_received)
        # The answer may be missing (NaN is a float), so only call
        # str.startswith on real strings instead of raising AttributeError.
        if isinstance(received, str) and received.startswith("Yes"):
            ask(q_why_vaccinated)
        elif received == "No, I have declined the offer to be vaccinated":
            ask(q_why_declined)
        # else: Not yet but I am waiting for my appointment, Prefer not to say

    elif offered == "No":
        intention = ask(q_intention)
        if intention == "Definitely get it":
            ask(q_reasons_for)
        elif intention == "Definitely not get it":
            ask(q_reasons_against)
        elif intention in ("Probably not get it", "Probably get it"):
            ask(q_reasons_for)
            ask(q_reasons_against)
        # else: Prefer not to say, Do not know

    # else: Don't Know, Prefer not to Say

    # Convert the prompt list to a JSON-formatted string
    return json.dumps(prompt)

# CANDOUR outcome questions. These column labels are excluded from the
# demographic profile; target_outcomes[0] is the question every retained
# respondent must have answered.
target_outcomes = [
    "Have you already been offered, or had an opportunity to receive, a COVID-19 vaccine?",
    "Have you received a COVID-19 vaccine?",
    "Why did you decide NOT to get vaccinated against COVID-19, what are / were reasons?",
    "Why did you decide to get vaccinated against COVID-19, what were the reasons?",
    "If a COVID-19 vaccine was available to you, would you definitely get it, probably get it, probably not get it or definitely not get it?",
    "What are your reasons for NOT getting vaccinated for COVID-19?",
    "What are your reasons for getting vaccinated for COVID-19?",
]

# System prompt template; {demographic_prompt} is filled per respondent via str.format.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana. You will be provided with a demographic profile that describes your age, gender, region/district where you live in, the highest education level you achieved, ideology, political point-of-view, ethnicity, religion, martial status, household size, economic situation and attitude, vaccination hesitancy, views on your country's health policies, health conditions, and EQ-5D health-related quality of life. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}
"""

# Load CANDOUR data (header=1: column labels are taken from the second CSV row)
data = pd.read_csv(f"data/{DATA}.csv", header=1)

# Create the output folder; exist_ok=True makes this idempotent and avoids the
# check-then-create race of os.path.exists + os.makedirs.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with a missing response to the first outcome question.
# subset is passed as a list because older pandas versions reject a bare label.
data = data.dropna(subset=[target_outcomes[0]]).reset_index(drop=True)

# Construct system and question prompts (outcome columns and the respondent ID
# are kept out of the demographic profile)
data["demographic_prompt"] = data.apply(generate_demographic_prompt, axis=1, args=(target_outcomes + ["ID"],))
data["system_prompt"] = data["demographic_prompt"].apply(lambda x: system_prompt.format(demographic_prompt=x))

# Format data for fine-tuning
data["text"] = data.apply(format_candour_prompts, axis=1)

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Afrobarometer Data Ghana and Sierra Leone (LMIC Survey)

In [None]:
RANDOM_STATE = 42
# Countries processed by the loop below; each maps to its own training CSV.
COUNTRIES = ["Ghana", "Sierra Leone"]

# Outcome questions (also the column labels holding the answers).
# target_outcomes[0] is always asked; target_outcomes[1] is only added to the
# conversation when the first answer is not "Yes" (see format_afrobarometer_prompts).
target_outcomes = [
    "Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know",
    "If a vaccine for COVID-19 is available, how likely are you to try to get vaccinated? Answer with Very unlikely; Somewhat unlikely; Somewhat likely; Very likely; Refused; Don't know. Answer Not applicable if you have never received a COVID-19 vaccination"
]

# System prompt template; {country} and {demographic_prompt} are filled via str.format.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in {country}. You will be provided with a demographic profile that describes the geographical area/region/district where you live, the facilities and services in your area, your age, view of the country, living conditions, voting preferences, trust in different authorities, experience when seeking healthcare, experience with COVID-19, views on vaccination, preferred sources of information, employment status, highest education level, religion, gender, and race. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}
"""

def format_afrobarometer_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialize one Afrobarometer respondent as a chat transcript.

    The first outcome question is always asked. The second outcome question is
    appended only when the answer to the first one is not "Yes".

    Args:
        row: Record providing "system_prompt" and the outcome answers.
        target_outcomes: Sequence whose first two items are the outcome
            questions (and the labels of the answer entries in ``row``).

    Returns:
        JSON string encoding the list of ``{"role", "content"}`` messages.
    """
    messages = [{"role": "system", "content": row["system_prompt"]}]

    vaccinated_question = target_outcomes[0]
    messages.append({"role": "user", "content": vaccinated_question})
    messages.append({"role": "assistant", "content": row[vaccinated_question]})

    already_vaccinated = row[vaccinated_question] == "Yes"
    if not already_vaccinated:
        intention_question = target_outcomes[1]
        messages.extend(
            [
                {"role": "user", "content": intention_question},
                {"role": "assistant", "content": row[intention_question]},
            ]
        )

    return json.dumps(messages)


# Build one fine-tuning file per Afrobarometer country.
for COUNTRY in COUNTRIES:
    if COUNTRY == "Ghana":
        DATA = "afrobarometer_r9_ghana_latlong_training"
    elif COUNTRY == "Sierra Leone":
        DATA = "afrobarometer_SL_training"
    else:
        # f-string so the offending country name is actually interpolated
        raise ValueError(f"Country {COUNTRY} is not supported. Choose either 'Ghana' or 'Sierra Leone'.")

    # Load Afrobarometer data (header=1: column labels are taken from the second CSV row)
    data = pd.read_csv(f"data/{DATA}.csv", header=1)

    # Create the output folder; exist_ok=True makes this idempotent.
    data_folder_dir = DATA
    os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

    # Remove entries with missing responses
    data = data.dropna(subset=target_outcomes).reset_index(drop=True)

    # Construct system, user, and response prompts.
    # The excluded columns must be one flat list of labels: nesting
    # target_outcomes inside another list never matches a column label, which
    # would leak the outcome answers into the demographic profile.
    excluded_columns = target_outcomes + ["ID", "Respondent Number"]
    data["demographic_prompt"] = data.apply(generate_demographic_prompt, axis=1, args=(excluded_columns,))
    data["system_prompt"] = data["demographic_prompt"].apply(lambda x: system_prompt.format(country=COUNTRY, demographic_prompt=x))

    # Format data for fine-tuning
    data["text"] = data.apply(format_afrobarometer_prompts, axis=1, args=(target_outcomes,))

    # Save files in CSV format
    data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Arce et al 2021 (LMIC and HIC Survey)

In [None]:
# Base name of the Arce et al. 2021 training set (read from data/<DATA>.csv).
DATA = "arce_et_al_2021_training"
# Populations for which a separate fine-tuning file is produced.
COUNTRIES = ["United States", "Sub-Saharan Africa"]

# Outcome question; also the label of the column holding the answer.
target_outcome = "Respondent would take the vaccine if available?"
# System prompt template; {country} and {demographic_prompt} are filled via
# str.format. The closing "Your demographic profile:" section is plain text
# (no leading "# " markers), matching the other survey prompts in this file.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in {country}. You will be provided with a demographic profile that describes your country, age, highest level of education, gender, reasons for taking the COVID-19 vaccine, reasons for not taking the COVID-19 vaccine, and people you would trust to help you decide to get vaccinated. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. After you receive your complete human subject profile, you will be asked whether you would take the COVID-19 vaccination, if available. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}
"""

# Build one fine-tuning file per population (LMIC: Sub-Saharan Africa, HIC: United States).
for COUNTRY in COUNTRIES:
    # Resolve the output folder and the study countries to retain up front, so
    # an unsupported value is rejected before any folder is created.
    if COUNTRY == "Sub-Saharan Africa":
        data_folder_dir = DATA + "_lmic"
        countries = [
            'Burkina Faso',
            'Mozambique',
            'Sierra Leone 2',
            'Uganda 2',
            'Uganda 1',
            'Rwanda',
            'Sierra Leone 1',
            'Nigeria',
        ]
    elif COUNTRY == "United States":
        data_folder_dir = DATA + "_hic"
        countries = [
            'USA',
        ]
    else:
        # f-string so the offending country name is actually interpolated
        raise ValueError(f"Country {COUNTRY} is not supported. Choose either 'United States' or 'Sub-Saharan Africa'.")

    # Load Arce et al. data (header=1: column labels are taken from the second CSV row)
    data = pd.read_csv(f"data/{DATA}.csv", header=1)

    # Create the output folder; exist_ok=True makes this idempotent.
    os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

    # Remove entries with missing responses
    data = data.dropna(subset=[target_outcome]).reset_index(drop=True)

    # Retain records from the selected study countries
    sampled_data = data[data["Country where the study took place"].isin(countries)].reset_index(drop=True)

    # Construct system and question prompts
    sampled_data["demographic_prompt"] = sampled_data.apply(generate_demographic_prompt, axis=1, args=([target_outcome],))
    sampled_data["system_prompt"] = sampled_data["demographic_prompt"].apply(lambda x: system_prompt.format(country=COUNTRY, demographic_prompt=x))
    sampled_data["user_prompt"] = target_outcome

    # Format data for fine-tuning
    sampled_data["text"] = sampled_data.apply(format_prompts, axis=1, args=(target_outcome,))

    # Save files in CSV format
    sampled_data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## HPS 2021 Phase US (HIC Survey)

In [None]:
RANDOM_STATE = 42
# Base name of the HPS 2021 training set (read from data/<DATA>.csv).
DATA = "HPS_2021_training"

# Outcome questions (also the column labels holding the answers). They are
# asked in this order; questions without an answer are skipped by
# format_hps_prompts.
target_outcomes = [
    "Have you received a COVID-19 vaccine? Answer with Yes; No",
    
    "Did you receive (or do you plan to receive) all required doses Answer with Yes; No. Answer Not applicable if you have not received a COVID-19 vaccination",
    
    "Once a vaccine to prevent COVID-19 is available to you, would you get vaccinated? Answer with Definitely get a vaccine; Probably get a vaccine; Probably NOT get a vaccine; Definitely NOT get a vaccine. Answer Not applicable if you have already received a COVID-19 vaccination",
    
    "Which of the following, if any, are reasons for your previous response? Answer with Concerned about possible side effects of a COVID-19 vaccine; Do not know if a COVID-19 vaccine will work; Do not believe you need a COVID-19 vaccine; Do not like vaccines; My doctor has not recommended it; Plan to wait and see if it is safe and may get it later; Think other people need it more than I do right now; Concerned about the cost of a COVID-19 vaccine; Do not trust COVID-19 vaccines; Do not trust the government; Other reasons; Not applicable",
    
    "Why do you believe that you don’t need a COVID-19 vaccine? Answer with Already had COVID-19; Not a member of a high-risk group; Plan to use masks or other precautions instead; Do not believe COVID-19 is a serious illness; Do not think vaccines are beneficial; Other reasons; Not applicable",
]

# System prompt template; {demographic_prompt} is filled per respondent via str.format.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in the United States in January-March 2021. You will be provided with a demographic profile that describes, among other things, the geographical area/region/district where you live, your gender, age, race, marital status, household size, employment status, social security status, household purchasing and spending habits, mental health state, household income, and the impact COVID-19 had on you and your household. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination or whether you would get vaccinated once a vaccine to prevent COVID-19 is available to you. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}
"""

def format_hps_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialize one HPS respondent as a chat transcript.

    After the system message, every outcome question with a non-null answer is
    added as a user/assistant pair, in the order given by ``target_outcomes``.
    Questions whose answer is null are left out.

    Args:
        row: Record providing "system_prompt" and the outcome answers.
        target_outcomes: Outcome questions (and labels of the answer entries).

    Returns:
        JSON string encoding the list of ``{"role", "content"}`` messages.
    """
    messages = [{"role": "system", "content": row["system_prompt"]}]

    for question in target_outcomes:
        if pd.isnull(row[question]):
            continue
        messages.append({"role": "user", "content": question})
        messages.append({"role": "assistant", "content": row[question]})

    return json.dumps(messages)


# Load HPS data (header=1: column labels are taken from the second CSV row)
data = pd.read_csv(f"data/{DATA}.csv", header=1)

# Create the output folder; exist_ok=True makes this idempotent.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with missing responses.
# The placeholders are replaced with an explicit NaN rather than None:
# replace(<scalar>, None) is version-dependent in pandas (releases before 1.4
# ignore the explicit None and pad-fill from the previous row instead).
data[target_outcomes] = data[target_outcomes].replace(["NA", "Not applicable"], float("nan"))
data = data.dropna(subset=target_outcomes, how="all").reset_index(drop=True)

# Construct system, user, and response prompts (outcome columns and the survey
# bookkeeping columns are kept out of the demographic profile)
data["demographic_prompt"] = data.apply(generate_demographic_prompt, axis=1, args=(target_outcomes + ["Week of interview", "Household level weight", "Person level weight"],))
data["system_prompt"] = data["demographic_prompt"].apply(lambda x: system_prompt.format(demographic_prompt=x))

# Format data for fine-tuning
data["text"] = data.apply(format_hps_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Brailovskaia et al. (HIC Survey)

In [None]:
RANDOM_STATE = 42
# Countries processed by the loop below; each value is substituted into DATA.
COUNTRIES = ["US", "Sweden"]
# Dataset name template; {country} is filled via str.format for each country.
DATA = "brailovskaia_et_al_2021_{country}_training"

# Outcome question; also the label of the column holding the answer.
target_outcome = "Have you already been vaccinated against COVID-19 at least once? Answer with Yes; No, but I would like to be vaccinated; No, and I do not want to be vaccinated"

# System prompt template; {country} and {demographic_prompt} are filled via str.format.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in the {country} in January-March 2021. You will be provided with a demographic profile that describes, among other things, the geographical area/region/district where you live, your gender, age, race, marital status, social class, mental health state, household income, and the impact COVID-19 had on you and your household. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination or whether you would get vaccinated once a vaccine to prevent COVID-19 is available to you. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}
"""

def format_brail_prompts(row: pd.Series, target_outcome) -> str:
    """Serialize one Brailovskaia respondent as a three-turn chat.

    Args:
        row: Record providing "system_prompt" and the outcome answer.
        target_outcome: Outcome question, used both as the user message and as
            the label of the answer entry in ``row``.

    Returns:
        JSON string encoding the list of ``{"role", "content"}`` messages.
    """
    messages = []
    messages.append({"role": "system", "content": row["system_prompt"]})
    messages.append({"role": "user", "content": target_outcome})
    messages.append({"role": "assistant", "content": row[target_outcome]})
    return json.dumps(messages)

# Build one fine-tuning file per Brailovskaia country.
for country in COUNTRIES:
    # The formatted dataset name doubles as the CSV base name and the output folder.
    data_folder_dir = DATA.format(country=country)

    # Load Brailovskaia data (header=1: column labels are taken from the second CSV row)
    data = pd.read_csv(f"data/{data_folder_dir}.csv", header=1)

    # Create the output folder; exist_ok=True makes this idempotent.
    os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

    # Remove entries with missing responses.
    # subset is passed as a list because older pandas versions reject a bare label.
    data = data.dropna(subset=[target_outcome]).reset_index(drop=True)

    # Construct system, user, and response prompts
    data["demographic_prompt"] = data.apply(generate_demographic_prompt, axis=1, args=([target_outcome],))
    data["system_prompt"] = data["demographic_prompt"].apply(lambda x: system_prompt.format(country=country, demographic_prompt=x))

    # Format data for fine-tuning
    data["text"] = data.apply(format_brail_prompts, axis=1, args=(target_outcome,))

    # Save files in CSV format
    data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## CANDOUR II US (HIC Survey)

In [None]:
# Base name of the CANDOUR wave 2 US training set: the cell reads
# data/<DATA>.csv and writes its output to data/<DATA>/train.csv.
DATA = "candour_wave_2_US_training"

def format_candour_prompts(row: pd.Series) -> str:
    """Build the JSON chat transcript for one CANDOUR respondent.

    The conversation starts with the system prompt and the "offered a vaccine"
    question, then follows the survey's skip logic:

    * offered == "Yes": ask whether a vaccine was received, then the reasons
      for vaccinating (answer starts with "Yes") or for declining (answer is
      "No, I have declined the offer to be vaccinated").
    * offered == "No": ask the hypothetical intention, then the reasons for
      ("Definitely get it"), against ("Definitely not get it"), or both
      ("Probably get it" / "Probably not get it").
    * any other answer ends the conversation after the first question.

    Args:
        row: Record providing "system_prompt" and the answer columns named by
            the question texts used below.

    Returns:
        JSON string encoding the list of ``{"role", "content"}`` messages.
    """
    q_offered = "Have you already been offered, or had an opportunity to receive, a COVID-19 vaccine?"
    q_received = "Have you received a COVID-19 vaccine?"
    q_why_vaccinated = "Why did you decide to get vaccinated against COVID-19, what were the reasons?"
    q_why_declined = "Why did you decide NOT to get vaccinated against COVID-19, what are / were reasons?"
    q_intention = "If a COVID-19 vaccine was available to you, would you definitely get it, probably get it, probably not get it or definitely not get it?"
    q_reasons_for = "What are your reasons for getting vaccinated for COVID-19?"
    q_reasons_against = "What are your reasons for NOT getting vaccinated for COVID-19?"

    prompt = [{"role": "system", "content": row["system_prompt"]}]

    def ask(question):
        """Append the question and the recorded answer; return the answer."""
        answer = row[question]
        prompt.append({"role": "user", "content": question})
        prompt.append({"role": "assistant", "content": answer})
        return answer

    offered = ask(q_offered)

    if offered == "Yes":
        received = ask(q_received)
        # The answer may be missing (NaN is a float), so only call
        # str.startswith on real strings instead of raising AttributeError.
        if isinstance(received, str) and received.startswith("Yes"):
            ask(q_why_vaccinated)
        elif received == "No, I have declined the offer to be vaccinated":
            ask(q_why_declined)
        # else: Not yet but I am waiting for my appointment, Prefer not to say

    elif offered == "No":
        intention = ask(q_intention)
        if intention == "Definitely get it":
            ask(q_reasons_for)
        elif intention == "Definitely not get it":
            ask(q_reasons_against)
        elif intention in ("Probably not get it", "Probably get it"):
            ask(q_reasons_for)
            ask(q_reasons_against)
        # else: Prefer not to say, Do not know

    # else: Don't Know, Prefer not to Say

    # Convert the prompt list to a JSON-formatted string
    return json.dumps(prompt)

# CANDOUR outcome questions. These column labels are excluded from the
# demographic profile; target_outcomes[0] is the question every retained
# respondent must have answered.
target_outcomes = [
    "Have you already been offered, or had an opportunity to receive, a COVID-19 vaccine?",
    "Have you received a COVID-19 vaccine?",
    "Why did you decide NOT to get vaccinated against COVID-19, what are / were reasons?",
    "Why did you decide to get vaccinated against COVID-19, what were the reasons?",
    "If a COVID-19 vaccine was available to you, would you definitely get it, probably get it, probably not get it or definitely not get it?",
    "What are your reasons for NOT getting vaccinated for COVID-19?",
    "What are your reasons for getting vaccinated for COVID-19?",
]

# System prompt template; {demographic_prompt} is filled per respondent via str.format.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in US. You will be provided with a demographic profile that describes your age, gender, region/district where you live in, the highest education level you achieved, ideology, political point-of-view, ethnicity, religion, martial status, household size, economic situation and attitude, vaccination hesitancy, views on your country's health policies, health conditions, and EQ-5D health-related quality of life. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}
"""

# Load CANDOUR data (header=1: column labels are taken from the second CSV row)
data = pd.read_csv(f"data/{DATA}.csv", header=1)

# Create the output folder; exist_ok=True makes this idempotent and avoids the
# check-then-create race of os.path.exists + os.makedirs.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with a missing response to the first outcome question.
# subset is passed as a list because older pandas versions reject a bare label.
data = data.dropna(subset=[target_outcomes[0]]).reset_index(drop=True)

# Construct system and question prompts (outcome columns and the respondent ID
# are kept out of the demographic profile)
data["demographic_prompt"] = data.apply(generate_demographic_prompt, axis=1, args=(target_outcomes + ["ID"],))
data["system_prompt"] = data["demographic_prompt"].apply(lambda x: system_prompt.format(demographic_prompt=x))

# Format data for fine-tuning
data["text"] = data.apply(format_candour_prompts, axis=1)

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Eurobarometer (HIC Survey)

In [None]:
RANDOM_STATE = 42
# Base name of the Eurobarometer 94.3 Sweden training set (read from data/<DATA>.csv).
DATA = "eurobarometer_94.3_Sweden_training"

# Outcome question; also the label of the column holding the answer.
target_outcome = "If a vaccine against COVID-19 (coronavirus) is authorised by public authorities and available for you, when would you like to get vaccinated? Answer with As soon as possible; Some time in 2021; Later; Never; Don't know; I have already been vaccinated"

# System prompt template; {demographic_prompt} is filled per respondent via str.format.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Sweden in February-March 2021. You will be provided with a demographic profile that describes, among other things, the geographical area/region/district where you live, your gender, age, race, marital status, household size, employment status, your social media usage, your health, and the people that you trust. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. After you receive your complete human subject profile, you will be asked when you would like to get vaccinated if a vaccine against COVID-19 (coronavirus) is authorised by public authorities and available for you. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}
"""

def format_eurobarometer_prompts(row: pd.Series, target_outcome) -> str:
    """Serialise one respondent as a three-message chat conversation.

    Args:
        row: Respondent record; must contain "system_prompt" and a value
            stored under the label given by ``target_outcome``.
        target_outcome: Outcome question. It is used both as the text of the
            user message and as the label of the answer inside ``row``.

    Returns:
        A JSON string holding the system, user and assistant messages, in
        that order.
    """
    roles = ("system", "user", "assistant")
    contents = (row["system_prompt"], target_outcome, row[target_outcome])

    # One {"role", "content"} mapping per message, keeping the chat order
    conversation = [
        {"role": role, "content": content}
        for role, content in zip(roles, contents)
    ]

    return json.dumps(conversation)


# Load Eurobarometer data (header=1: column labels are read from the second line of the CSV)
data = pd.read_csv(f"data/{DATA}.csv", header=1)

# Output folder for the fine-tuning file; exist_ok makes the call a no-op when
# the folder is already there, so no separate existence check is needed.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with missing responses (subset passed as a list of labels)
data = data.dropna(subset=[target_outcome]).reset_index(drop=True)

# Construct system, user, and response prompts.
# The outcome column is excluded so the answer never leaks into the profile.
data["demographic_prompt"] = data.apply(generate_demographic_prompt, axis=1, args=([target_outcome],))
data["system_prompt"] = data["demographic_prompt"].apply(lambda x: system_prompt.format(demographic_prompt=x))

# Format data for fine-tuning
data["text"] = data.apply(format_eurobarometer_prompts, axis=1, args=(target_outcome,))

# Save files in CSV format (only the serialised conversations are written)
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Duch 2023 Vaccination Status (LMIC RCT)

In [10]:
# Seed passed to train_test_split so the fine-tune/holdout split is reproducible.
RANDOM_STATE = 42
# Base name of the input CSV (data/<DATA>.csv) and prefix of every output path.
DATA = "duch_et_al_2023_vaccine_financial_vaccination_status_training"
# When True, the stated-intention questions are appended to the respondent profile.
WITH_INTENTION = False

# Profile questions shared by both variants. Each string is used as a column
# label when the training frame is sliced, so it must match the CSV header
# exactly (spelling included).
demographic_questions = [
    "Start Date",
    "What is your current age?",
    "What is your gender?",
    "What is the highest educational qualification you have completed?",
    "Which region do you live in?",
    "Which distric do you live in?",
    "What is the name of the community you live in?",
    "How many people live in your village?",
    "What is the distance in km of the nearest health clinic from where you live?",
    "How many people live in the house together with you (NOT including you) at this moment?",
    "How many children below 18 years old are currently living in your home?",
    "What is your current working situation?",
    "How much (in Ghanaian Cedis) on average does your household spend in a typical week on food?",
    "How much (in Ghanaian Cedis) on average does your household spend in a typical week on non-food items (electricity, water, rent, school fees)?",
    "How would you rate the overall economic or financial condition of your household today?",
    "Do you have a registered mobile number?",
    "How many family members do you have in another village?",
    "How many friends and acquaintances who are not part of your family do you have in another village?",
    "How many individuals can you identify in your social network? Think of friends and relatives that live close to you",
    "How often do you use social media?",
]

if WITH_INTENTION:
    # Stated vaccination intention, added after the shared questions.
    demographic_questions += [
        "Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?",
        "Why will you NOT get vaccinated against COVID-19?",
        "We understand that there is always some uncertainty regarding all decisions. From 0% to 100%, what do you think are the chances that you will choose to get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you? - 4",
    ]
    

# Outcome columns: [0] self-reported vaccination, [1] vaccination that can be
# verified in the District Health Office records. Both are asked together in a
# single user turn by format_duch_2023_user_prompt.
target_outcomes = [
    "Have you received a COVID-19 vaccine?",
    "Have you actually received a COVID-19 vaccine and can this be verified in the records of the Ghanaian District Health Offices?",
]

# System prompt template for the Duch 2023 examples. format_duch_2023_system_prompt
# fills {demographic_prompt} with the respondent profile and {treatment_prompt}
# with the transcript of the respondent's treatment arm.
# NOTE(review): the text lists "vaccination intention" as part of the profile,
# but the intention questions are only included when WITH_INTENTION is True —
# confirm the wording is intended for the variant without intention.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana about the COVID-19 vaccine. You will be provided with a demographic profile that describes your age, gender, highest education level you achieved, region/district you live in, size of your village, distance to nearest health clinic in km, household size, current employment situation, average household spending, household economic/financial condition, number of family members and friends in another village, social network, social media use, and vaccination intention. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will watch a video and receive further information on the vaccination intention of your human subject. Thereafter, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}

You are asked to watch a video at this point. Here you are provided with the transcript of the video. You have to read the full transcript in order to continue the survey:
{treatment_prompt}
"""

# Video transcripts keyed by the value of the "treatment" column; the lookup is
# done in format_duch_2023_system_prompt, so a value missing here raises KeyError.
# NOTE(review): "Low Cash" announces a follow-up "in 30 days" while the other
# arms say "in 6 weeks" — confirm against the original video scripts.
treatment_transcript = {
    "CDC Health": "Health authorities are working hard to distribute the COVID-19 vaccines free for everyone with no strings attached. COVID 19 vaccines are safe and effective. After you have been fully vaccinated you can resume activities that you did prior to the pandemic. Getting the COVID-19 vaccine will help prevent you from getting COVID-19 and reduce your risk of being hospitalized with COVID-19. COVID 19 vaccine help you to protect yourself your environment and your loved ones from COVID-19 exposure.\n\nWe indicated that we will follow up with you in 6 weeks. We will contact you in order to verify your vaccination status. If you can provide us with your COVID-19 vaccination carnet at the time, we will upload a copy of the vaccination carnet to our secure server for verification",
    "Placebo": "The Sun lights up our lives for business for education even for socializing but when the Sun sets many people use candles who are quality battery-operated torches and kerosene lamps as inefficient and expensive ways to create light. What if you can take some Sun with you at night?  You can with portable solar products there are different types, but each portable solar product is made up of three basic parts: a small solar panel, a modern rechargeable battery and an LED bulb. The solar panel catches the light from the Sun and stores this energy in the battery. This can now be used for much needed light when it's dark. Many can even charge phones portable solar products should be reliable affordable and warranted be sure to demand top quality solar products look for these products lighting Africa shining the way.\n\nWe indicated that we will follow up with you in 6 weeks. We will contact you in order to verify your vaccination status.  If you can provide us with your COVID-19 vaccination carnet at the time, we will upload a copy of the vaccination carnet to our secure server for verification.",
    "Low Cash": "Health authorities are working hard to distribute the COVID-19 vaccines free for everyone with no strings attached. COVID-19 vaccines are safe and effective. After you have been fully vaccinated you can resume activities that you did prior to the pandemic. If you have at least one COVID-19 vaccine shot you will receive 20 Cedi. If you get vaccinated, you will get rewarded.\n\nWe indicated that we will follow up with you in 30 days.  We will contact you in order to verify your vaccination status. If you can provide us with your COVID-19 vaccination carnet at the time, we will upload a copy of the vaccination carnet to our secure server for verification and you will be paid your 20 Cedi via cell phone money payment or by cash if you prefer.",
    "High Cash": "Health authorities are working hard to distribute the COVID-19 vaccines free for everyone with no strings attached. COVID-19 vaccines are safe and effective. After you have been fully vaccinated you can resume activities that you did prior to the pandemic. If you have at least one COVID-19 vaccine shot you will receive 60 Cedi. If you get vaccinated, you will get rewarded.\n\nWe indicated that we will follow up with you in 6 weeks.  We will contact you in order to verify your vaccination status. If you can provide us with your COVID-19 vaccination carnet at the time, we will upload a copy of the vaccination carnet to our secure server for verification and you will be paid your 60 Cedi via cell phone money payment or by cash if you prefer.",
}


def format_duch_2023_system_prompt(row: pd.Series) -> str:
    """Fill the module-level ``system_prompt`` template for one respondent.

    Args:
        row: Respondent record; must contain "demographic_prompt" and
            "treatment". The treatment value must be a key of
            ``treatment_transcript``.

    Returns:
        The system prompt with the profile and the video transcript inserted.
    """
    profile = row["demographic_prompt"]
    # The treatment arm selects which video transcript the respondent "watched"
    transcript = treatment_transcript[row["treatment"]]

    return system_prompt.format(demographic_prompt=profile, treatment_prompt=transcript)


def format_duch_2023_user_prompt(row: pd.Series, target_outcomes) -> str:
    """Serialise one respondent as a system/user/assistant conversation.

    Despite the name, the returned string holds the whole conversation, not
    only the user turn. Both outcome questions are asked in a single user
    message and answered in a single structured assistant message.

    Args:
        row: Respondent record; must contain "system_prompt" and a value
            under each of the first two labels in ``target_outcomes``.
        target_outcomes: Sequence whose first two items are the outcome
            questions (also used as labels into ``row``).

    Returns:
        A JSON string with the three chat messages.
    """
    first_question, second_question = target_outcomes[0], target_outcomes[1]

    # User turn: both questions followed by the required answer layout
    both_questions = " ".join((first_question, second_question))
    user_message = f"{both_questions} Please give your response to both questions in the structured format below:\nQuestion 1: [Yes/No]\nQuestion 2: [Yes/No]"

    # Assistant turn: recorded answers in that same layout
    assistant_message = f"Question 1: {row[first_question]}\nQuestion 2: {row[second_question]}"

    conversation = [
        {"role": "system", "content": row["system_prompt"]},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_message},
    ]

    return json.dumps(conversation)
    

# Load data (header=1: column labels are read from the second line of the CSV)
data = pd.read_csv(f"data/{DATA}.csv", header=1)
data[target_outcomes] = data[target_outcomes].replace("NA", None)
# NOTE(review): rows with a missing outcome are not dropped in this cell, so a
# missing answer would be rendered as text by format_duch_2023_user_prompt —
# confirm the CSV has none.

# Perform fine-tuning data (10%) and holdout data (90%) split
training_data, holdout_data = train_test_split(data, test_size=0.9, random_state=RANDOM_STATE)

# Construct system and question prompts.
# Outcomes and the treatment label are excluded from the profile text.
training_data_finetune = training_data[demographic_questions + target_outcomes + ["treatment"]].copy()
training_data_finetune["demographic_prompt"] = training_data_finetune.apply(generate_demographic_prompt, axis=1, args=(target_outcomes + ["treatment"],))
training_data_finetune["system_prompt"] = training_data_finetune.apply(format_duch_2023_system_prompt, axis=1)

holdout_data = include_variable_names(holdout_data, f"data/{DATA}.csv")
training_data = include_variable_names(training_data, f"data/{DATA}.csv")

# Format data for fine-tuning
training_data_finetune["text"] = training_data_finetune.apply(format_duch_2023_user_prompt, axis=1, args=(target_outcomes,))

# Save files in CSV format. The two variants differ only in the suffix added to
# DATA, so the paths are derived once instead of being repeated per branch.
variant_suffix = "_with_intention" if WITH_INTENTION else "_wo_intention"
data_folder_dir = DATA + variant_suffix
# exist_ok makes the call a no-op when the folder is already there
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

holdout_data.to_csv(f"data/{data_folder_dir}_holdout.csv", index=False)
training_data.to_csv(f"data/{data_folder_dir}_training.csv", index=False)
training_data_finetune[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Duch 2025 TB Screening (LMIC RCT)

In [None]:
# Seed passed to train_test_split for the fine-tune/holdout split.
RANDOM_STATE = 42
# Base name of the input CSV (data/<DATA>.csv), the output folder and the holdout file.
DATA = "duch_et_al_2025_ghana_tubercolosis_screening_training"

# Profile questions for the TB screening study. The strings are used as column
# labels when the loaded CSV is sliced, so they must match the file header
# exactly.
# NOTE(review): "GH‚Çµ" looks like a mis-encoded currency sign, and "your your"
# and the double space in "last  month" look like typos — verify against the
# CSV header before changing any of them.
demographic_questions = [
    "When did this survey start?",
    "What is the name of the district you live in?",
    "What is the name of the community you live in?",
    "How many people live in your community?",
    "What is your current age?",
    "What is your gender?",
    "Which ethnicity best describes you?",
    "What is your current working situation?",
    "How much on average does your household spend in a typical week on food?",
    "How much on average does your household spend in a typical week on non-food items (electricity, water, rent, school fees)?",
    "How would you rate the overall economic or financial condition of your household today?",
    "What is the highest educational qualification you have completed?",
    "Do you live with a spouse or partner?",
    "Imagine the following situation: Today you unexpectedly received GH‚Çµ 1,610. How much of this amount would you donate to a good cause?",
    "How many villages in the district do you think you have visited in the last  month?",
    "How many villages in the district do you think you have visited in the last year?",
    "How many family members do you have in another village?",
    "How many friends and acquaintances who are not part of your family do you have in another village?",
    "How many individuals can you identify in your social network? Think of friends and relatives that live close to you",
    "How often do you use WhatsApp?",
    "What social media have you used in the last year?",
    "How often do you use social media?",
    "Thinking now about health matters, how familiar are you with tuberculosis? Please indicate your familiarity by responding with one of these options: Very High, High, Average, Low, Very Low.",
    "Thinking now about health matters, how familiar are you with high blood pressure/hypertension? Please indicate your familiarity by responding with one of these options: Very High, High, Average, Low, Very Low.",
    "Thinking now about health matters, how familiar are you with diabetes? Please indicate your familiarity by responding with one of these options: Very High, High, Average, Low, Very Low.",
    "Thinking now about health matters, how familiar are you with asthma? Please indicate your familiarity by responding with one of these options: Very High, High, Average, Low, Very Low.",
    "Thinking now about health matters, how familiar are you with heart disease? Please indicate your familiarity by responding with one of these options: Very High, High, Average, Low, Very Low.",
    "Which underlying health conditions do you have?",
    "How is your health in general?",
    "Please indicate for the following statement which is closest to how you have been feeling over the last two weeks: I have felt cheerful and in good spirits. Please respond with one of these options: At no time, Some of the time, Less than half of the time, More than half of the time, Most of the time, All the time",
    "Please indicate for the following statement which is closest to how you have been feeling over the last two weeks: I have felt calm and relaxed. Please respond with one of these options: At no time, Some of the time, Less than half of the time, More than half of the time, Most of the time, All the time",
    "Please indicate for the following statement which is closest to how you have been feeling over the last two weeks: I have felt active and vigorous. Please respond with one of these options: At no time, Some of the time, Less than half of the time, More than half of the time, Most of the time, All the time",
    "Please indicate for the following statement which is closest to how you have been feeling over the last two weeks: I woke up feeling fresh and rested. Please respond with one of these options: At no time, Some of the time, Less than half of the time, More than half of the time, Most of the time, All the time",
    "Please indicate for the following statement which is closest to how you have been feeling over the last two weeks: My daily life has been filled with things that interest me. Please respond with one of these options: At no time, Some of the time, Less than half of the time, More than half of the time, Most of the time, All the time",
    "How much do you trust the following people? - Your relatives",
    "How much do you trust the following people? - Your neighbors",
    "How much do you trust the following people? - Someone in your own tribe",
    "How much do you trust the following people? - Ghanaians from other tribes",
    "How much do you trust the following institutions? - Chiefs",
    "How much do you trust the following institutions? - District assemblies",
    "How much do you trust the following institutions? - The police",
    "How much do you trust the following institutions? - Courts of law",
    "How much do you trust the following institutions? - Political parties",
    "How much do you trust the following institutions? - The army",
    "How much do you trust the following institutions? - Parliament",
    "How much do you trust the following institutions? - President",
    "How much do you trust the following institutions? - Ghana Broadcasting Corporation",
    "How much do you trust the following institutions? - Electoral Commission",
    "How much do you trust the following non-governmental organizations? - Churches",
    "How much do you trust the following non-governmental organizations? - Mosques",
    "How much do you trust the following non-governmental organizations? - Trade unions",
    "How much do you trust the following non-governmental organizations? - Banks",
    "How much do you trust the following non-governmental organizations? - Businesses",
    "How is your mobility TODAY?",
    "How is your self-care TODAY?",
    "How are your usual activities TODAY (e.g. work, study, housework, family or leisure activities)?",
    "How is your your pain / discomfort TODAY?",
    "How is your anxiety / depression TODAY?",
    "How is your health TODAY on a scale from 0 to 100?",
    "How many People live in your Household",
    "How many children below 18 years old are currently living in your household?",
]

# Outcome columns: [0] stated willingness to be screened (rows missing it are
# dropped after loading), [1] actual screening uptake (added to a conversation
# only when the respondent has a value for it).
# NOTE(review): "Heath" is spelled this way in both labels; the strings index
# CSV columns, so verify against the file header before correcting.
target_outcomes = [
    "The Health District Tuberculosis screening team will be in your village within the next two weeks. Would you be willing to get the tuberculosis screening, when the Heath District Tuberculosis screening team is in your village? Please respond with one of these options: Yes, No, Do not know, Prefer not to say.",
    "Did you get the tuberculosis screening, when the Heath District Tuberculosis screening team was in your village?",
]

# System prompt template for the TB screening examples. format_system_prompt
# fills {demographic_prompt} with the respondent profile and {treatment_prompt}
# with the transcript of the respondent's treatment arm.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in a remote rural community in Ghana about tuberculosis screening. You will be provided with a demographic profile that describes, among other things, your age, gender, the name and size of your community, your work, your social network, your social media usage, your health, and the people that you trust. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will be provided with the description of an initiative that was carried out in your area and you will watch a video. After you receive your complete human subject profile, you will be asked whether you are willing to get a screening for tuberculosis and whether you received a tuberculosis screening within two weeks from the day of the survey. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}

In your area, an initiative was implemented that consisted in bringing pop-up tuberculosis clinics in several villages - including yours - so that most villagers could walk to the clinics within minutes.

When you were informed of the initiative, you were also presented with a video message. Here is the transcript of the video:
{treatment_prompt}
"""

# Video transcripts keyed by the value of the "treatment" column; the lookup is
# done in format_system_prompt, so a value missing here raises KeyError.
# "tbhealthplustext" repeats the "tbhealth" text and adds a sentence about a
# reminder phone call; "tbhealthplus3" replaces the health explanation with a
# 20 Cedi reward message.
treatment_transcript = {
    "tbhealth": "Health authorities are working hard to test people for being at risk of getting ill with tuberculosis. If you are at risk you will be treated to stop you getting ill with tuberculosis. The tests and treatment are safe and effective and free for everyone with no strings attached. After you have been tested and treated you will be safe from tuberculosis. One in four people have sleeping tuberculosis. If you have sleeping tuberculosis, you will feel well, but there is a risk that the tuberculosis bacteria will wake up and give you active tuberculosis, a serious illness. Getting tested and treated for sleeping tuberculosis will prevent you from getting active tuberculosis and reduce your risk of being hospitalized with tuberculosis. Tuberculosis testing will help you to protect yourself and your environment and your loved ones from tuberculosis exposure.",
    "tbhealthplus3": "Health authorities are working hard to test people for being at risk of getting ill with tuberculosis. If you are at risk you will be treated to stop you getting ill with tuberculosis. The tests and treatment are safe and effective and free for everyone with no strings attached. After you have been tested and treated you will be safe from tuberculosis. If you show up for the scheduled tuberculosis testing in your village and get the tuberculosis test you will receive 20 Cedi. If you get tuberculosis tested, you will get rewarded.",
    "tbhealthplustext": "Health authorities are working hard to test people for being at risk of getting ill with tuberculosis. If you are at risk you will be treated to stop you getting ill with tuberculosis. The tests and treatment are safe and effective and free for everyone with no strings attached. After you have been tested and treated you will be safe from tuberculosis. One in four people have sleeping tuberculosis. If you have sleeping tuberculosis, you will feel well, but there is a risk that the tuberculosis bacteria will wake up and give you active tuberculosis, a serious illness. Getting tested and treated for sleeping tuberculosis will prevent you from getting active tuberculosis and reduce your risk of being hospitalized with tuberculosis. Tuberculosis testing will help you to protect yourself and your environment and your loved ones from tuberculosis exposure. You also received a telephone call reminding you to go for your tuberculosis screening appointment.",
}


def format_system_prompt(row: pd.Series) -> str:
    """Fill the module-level ``system_prompt`` template for one respondent.

    Args:
        row: Respondent record; must contain "demographic_prompt" and
            "treatment". The treatment value must be a key of
            ``treatment_transcript``.

    Returns:
        The system prompt with the profile and the video transcript inserted.
    """
    profile = row["demographic_prompt"]
    # The treatment arm selects which video transcript is shown
    transcript = treatment_transcript[row["treatment"]]

    return system_prompt.format(demographic_prompt=profile, treatment_prompt=transcript)


def format_duch_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialise one respondent as a chat conversation with one or two Q&A turns.

    The first outcome question is always asked. The second is appended only
    when the respondent has a non-null value for it.

    Args:
        row: Respondent record; must contain "system_prompt" and a value
            under each of the first two labels in ``target_outcomes``.
        target_outcomes: Sequence whose first two items are the outcome
            questions (also used as labels into ``row``).

    Returns:
        A JSON string with three messages, or five when the second outcome
        is present.
    """
    first_question = target_outcomes[0]
    conversation = [
        {"role": "system", "content": row["system_prompt"]},
        {"role": "user", "content": first_question},
        {"role": "assistant", "content": row[first_question]},
    ]

    # Follow-up turn only for respondents with a recorded second outcome
    second_question = target_outcomes[1]
    second_answer = row[second_question]
    if pd.notnull(second_answer):
        conversation.extend(
            (
                {"role": "user", "content": second_question},
                {"role": "assistant", "content": second_answer},
            )
        )

    return json.dumps(conversation)
    

# Load data (header=1: column labels are read from the second line of the CSV)
data = pd.read_csv(f"data/{DATA}.csv", header=1)
# Keep only the columns used below; .copy() gives an independent frame so the
# column assignment on the next line does not act on a derived selection.
data = data[demographic_questions + target_outcomes + ["treatment"]].copy()
data[target_outcomes] = data[target_outcomes].replace("NA", None)

# Output folder for the fine-tuning file; exist_ok makes the call a no-op when
# the folder is already there.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with missing target responses (only the first outcome is
# required; subset passed as a list of labels)
data = data.dropna(subset=[target_outcomes[0]]).reset_index(drop=True)

# Perform fine-tuning data (10%) and holdout data (90%) split
finetune_data, holdout_data = train_test_split(data, test_size=0.9, random_state=RANDOM_STATE)
# New columns are added to the fine-tuning part below, so detach it from `data`
# first (the Duch 2023 cell copies its fine-tuning frame the same way).
finetune_data = finetune_data.copy()
holdout_data = include_variable_names(holdout_data, f"data/{DATA}.csv")
holdout_data.to_csv(f"data/{DATA}_holdout.csv", index=False)

# Construct system and question prompts.
# Outcomes and the treatment label are excluded from the profile text.
finetune_data["demographic_prompt"] = finetune_data.apply(generate_demographic_prompt, axis=1, args=(target_outcomes + ["treatment"],))
finetune_data["system_prompt"] = finetune_data.apply(format_system_prompt, axis=1)

# Format data for fine-tuning
finetune_data["text"] = finetune_data.apply(format_duch_user_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format (only the serialised conversations are written)
finetune_data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Meriggi et al. 2024 (LMIC RCT)

In [None]:
# Cell-level settings for the Meriggi et al. 2024 data; the statements that
# consume them lie beyond the part of the cell shown here.
RANDOM_STATE = 42
DATA = "meriggi_et_al_2024_last_mile_training_data"

# Maps short variable names to the question wording shown to the model;
# format_meriggi_user_prompts uses it for the user turns.
# Each BSL_* entry that has an END_* counterpart carries identical wording
# (presumably baseline vs. endline measurements — confirm against the codebook).
question_mapping = {
    "villsize":"How many people live in your village?",
    "periphery":"Do you live on the periphery of your community? Please, reply with Yes or No",
    "vaccinated_team":"Have you been vaccinated against COVID-19 in a mobile vaccination clinic? Please, reply with Yes or No",
    "vaccinated_baseline":"Have you been vaccinated against COVID-19 and have vaccine card to prove it? Please, reply with Yes or No",
    "preg":"Are you pregnant? Please, reply with Yes or No",
    "breast":"Are you breastfeeding? Please, reply with Yes or No",
    "age":"What is your age in years?",
    "female":"What is your gender? Please, reply with Male or Female",
    "hh_gender":"What is the gender of the head of your household? Please, reply with Male or Female",
    "farmer":"Is farming the chief source of income for your household? Please, reply with Yes or No",
    "anyschooling":"Has the head of your household received any formal education? Please, reply with Yes or No",
    "BSL_owns_land":"Do you or your household members own any land? Please, reply with Yes or No",
    "BSL_covid_believe":"Do you believe that COVID-19 exists in the world? Please, reply with Yes or No",
    "BSL_covid_know":"Do you know about the COVID-19 vaccine/marklate? Please, reply with Yes or No",
    "BSL_covid_wouldtake":"Would you take a COVID-19 vaccine/marklate if it were offered to you? Please, reply with Yes or No",
    "BSL_reduced_portions":"Over the last 7 days (week), has your household had reduced portions/quantities served per meal for more than 1 day? Please, reply with Yes or No",
    "BSL_safe_stragree":"Do you strongly agree with this statement: COVID-19 vaccines are safe. Please, reply with Yes or No",
    "BSL_effect_stragree":"Do you strongly agree with this statement: COVID-19 vaccines are effective. Please, reply with Yes or No",
    "END_covid_believe":"Do you believe that COVID-19 exists in the world? Please, reply with Yes or No",
    "END_covid_know":"Do you know about the COVID-19 vaccine/marklate? Please, reply with Yes or No",
    "END_safe_stragree":"Do you strongly agree with this statement: COVID-19 vaccines are safe. Please, reply with Yes or No",
    "END_effect_stragree":"Do you strongly agree with this statement: COVID-19 vaccines are effective. Please, reply with Yes or No",
    "religion":"What is the religion of the head of the household?",
    "BSL_trust":"Who do you most trust getting information about COVID-19? Please, reply with CHC, Family and friends, Social media, Media (i.e. news/radio/tv), or Ministry of Health and Sanitation",
    "END_trust":"Who do you most trust getting information about COVID-19? Please, reply with CHC, Family and friends, Social media, Media (i.e. news/radio/tv), or Ministry of Health and Sanitation",
}

# Variables that make up the respondent profile. Every entry is a key of
# question_mapping; none of the END_* variables is included.
# NOTE(review): presumably rendered with generate_demographic_prompt_with_labels —
# the call site is outside this excerpt, confirm there.
demographic_questions = [
    "villsize",
    "periphery",
    "preg",
    "breast",
    "age",
    "female",
    "hh_gender",
    "farmer",
    "anyschooling",
    "BSL_owns_land",
    "BSL_covid_believe",
    "BSL_covid_know",
    "BSL_covid_wouldtake",
    "BSL_reduced_portions",
    "BSL_safe_stragree",
    "BSL_effect_stragree",
    "religion",
    "BSL_trust",
    "vaccinated_baseline",
]

# Outcome variables, in the order their questions are asked by
# format_meriggi_user_prompts (outcomes without a usable answer are skipped
# there). Every entry is a key of question_mapping.
target_outcomes = [
    "vaccinated_team",
    "END_covid_believe",
    "END_covid_know",
    "END_safe_stragree",
    "END_effect_stragree",
    "END_trust",
]

# Opening part of the Meriggi system prompt. format_meriggi_system_prompt builds
# the final text as system_prompt_1 + respondent profile + system_prompt_2, so
# this part ends with the "Your demographic profile:" header.
# NOTE(review): the text names fewer profile attributes than demographic_questions
# holds and mentions only the vaccination question, while target_outcomes lists
# six outcomes — confirm the wording is intended.
system_prompt_1 = """Please put yourself in the shoes of a human subject participating in a healthcare survey in a remote rural community in Sierra Leone about the COVID-19 vaccine. You will be provided with a demographic profile that describes your age, gender, whether the head of your household received any formal education, size of your village, whether you live on the periphery of your community, and whether you or your household own any land. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will be provided with the description of an initiative that was carried out in your area. Thereafter, you will be asked whether you received the COVID-19 vaccination from a mobile vaccination team. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# Closing part of the Meriggi system prompt: the description of the intervention
# (steps 1-4), placed after the respondent profile. It ends with a blank line
# because format_meriggi_system_prompt may append a treatment-specific "Step 5"
# sentence directly after it.
system_prompt_2 = """

In your area, an initiative was implemented in March–April 2022 in partnership with the Sierra Leone Ministry of Health and Sanitation (MoHS) and the international non-governmental organization (NGO) Concern Worldwide. The primary aim of this intervention was to take vaccine doses and nurses to administer vaccines to remote, rural communities. The initiative involved the following steps.

Step 1. On the first day of the intervention, a social mobilization team—trained and supervised by the MoHS—organized a conversation with all village leaders, including the town chief, mammy queen, town elders, the youth leaders and religious leaders, and any other important stakeholders including the paramount and section chiefs if they were available. The mobilization team explained the purpose of the visit, answered questions about the available vaccines and asked leaders for their cooperation in encouraging eligible community members to take the COVID-19 vaccine.

Step 2. Social mobilizers then asked leaders to convene a community meeting that same evening (when people return home from farms) to allow mobilizers to talk directly with all village residents about vaccine efficacy and safety, the importance of getting vaccinated, and to address villagers’ questions and concerns. This process ended with social mobilizers explaining the location and timing of the mobile vaccination site that they were about to set up.

Step 3. Vaccine doses, nurses to administer vaccines and MoHS staff who could register the vaccinated were brought into the community either the same evening or early the next morning. The vaccine doses and staff often travelled on motorbikes or on boats given the difficult terrain they had to traverse to reach these remote communities.

Step 4. Once the team was in place, the temporary vaccination site started operating in a central location in the village. The vaccination site remained operational from sunrise to sunset over the next 2–3 days.

"""


def format_meriggi_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialize one respondent into a JSON chat transcript.

    The transcript starts with the row's ``system_prompt`` and is followed by
    one user/assistant pair per answered outcome. The user turn is the question
    text looked up in the module-level ``question_mapping``; the assistant turn
    is the respondent's recorded answer.

    Args:
        row: One respondent; must contain ``system_prompt`` and every column
            named in ``target_outcomes``.
        target_outcomes: Outcome column names, in the order they are asked.

    Returns:
        The list of chat messages encoded with ``json.dumps``.
    """
    conversation = [{"role": "system", "content": row["system_prompt"]}]

    for outcome in target_outcomes:
        answer = row[outcome]
        # Unanswered outcomes (missing or the literal "NA") produce no turn.
        if not (pd.isnull(answer) or answer == "NA"):
            conversation.append({"role": "user", "content": question_mapping[outcome]})
            conversation.append({"role": "assistant", "content": answer})

    return json.dumps(conversation)


def format_meriggi_system_prompt(row: pd.Series) -> str:
    """Assemble the system prompt for one respondent.

    The prompt is ``system_prompt_1`` + the row's ``demographic_prompt`` +
    ``system_prompt_2``, followed by an arm-specific "Step 5" description.

    Args:
        row: One respondent; must contain ``demographic_prompt``,
            ``treatment`` and the arm-specific column (``attended`` for
            "Group Mobilization", ``dtd`` for "Individual Mobilization").

    Returns:
        The complete system prompt.

    Raises:
        ValueError: If ``treatment``, ``attended`` or ``dtd`` holds a value
            that is not handled below.
    """
    pieces = [system_prompt_1 + row["demographic_prompt"] + system_prompt_2]
    arm = row["treatment"]

    if arm == "Group Mobilization":
        pieces.append("Step 5. Social mobilizers targeted social groups who gathered at fixed spots in and around the village (for example, groups of farmers in fields, mosque attendees or women collecting water). Social mobilizers engaged the group to have joint conversations about the vaccines.")

        attended = row["attended"]
        attended_text = str(attended)
        if attended_text == "1":
            pieces.append(" You attended a group session organised by MoHS staff.")
        elif attended_text == "0":
            pieces.append(" You did not attend a group session organised by MoHS staff.")
        elif not (attended_text == "NA" or pd.isnull(attended)):
            # Unknown attendance adds nothing; anything else is unexpected.
            raise ValueError(f"{row['attended']} is not considered.")

    elif arm == "Individual Mobilization":
        visited = row["dtd"]
        visited_text = str(visited)
        if visited_text == "1":
            pieces.append("Step 5. Social mobilizers came to your household to privately discuss any concerns about that vaccine that the household residents had and to encourage them to visit the vaccination site.")
        elif not (visited_text in ["0", "NA"] or pd.isnull(visited)):
            # "0", "NA" and missing values add no extra step.
            raise ValueError(f"{row['dtd']} is not considered.")

    else:
        raise ValueError(f"{row['treatment']} is not considered.")

    return "".join(pieces)


def generate_meriggi_demographic_prompt(row, excluded_columns):
    """Render a respondent's answers as a numbered interview transcript.

    Every column of ``row`` that is not in ``excluded_columns`` is a candidate
    question. Candidates are shuffled with the global ``random`` state,
    unanswered ones (missing or the literal "NA") are dropped, and the rest
    are numbered from 1. Question wording comes from the module-level
    ``question_mapping``.

    Args:
        row: One respondent, indexed by column name.
        excluded_columns: Column names that must not appear in the profile.

    Returns:
        The concatenated "N) Interviewer: ... Me: ... " entries, or an empty
        string when nothing qualifies.
    """
    candidates = [name for name in row.index if name not in excluded_columns]
    random.shuffle(candidates)

    answered = [
        name for name in candidates
        if not (pd.isnull(row[name]) or row[name] == "NA")
    ]
    return "".join(
        f"{number}) Interviewer: {question_mapping[name]} Me: {row[name]} "
        for number, name in enumerate(answered, start=1)
    )


# Where the fine-tuning file for this dataset is written.
data_folder_dir = DATA
output_dir = f"data/{data_folder_dir}"

# Read the survey export and discard its first data row, which holds the
# survey question descriptions rather than a respondent.
data = pd.read_csv(f"data/{DATA}.csv")
data = data.drop(index=data.index[0]).reset_index(drop=True)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Identifier, treatment and outcome columns are kept out of the demographic profile.
non_demographic_columns = target_outcomes + [
    "master_person_id",
    "treatment",
    "dtd",
    "attended",
    "incomplete_observations",
]

# Build the prompts row by row: profile first, then the system prompt that embeds it.
data["demographic_prompt"] = data.apply(
    generate_meriggi_demographic_prompt, axis=1, args=(non_demographic_columns,)
)
data["system_prompt"] = data.apply(format_meriggi_system_prompt, axis=1)

# One JSON chat transcript per respondent.
data["text"] = data.apply(format_meriggi_user_prompts, axis=1, args=(target_outcomes,))

# Only the transcript column is exported.
data[["text"]].to_csv(f"{output_dir}/train.csv", index=False)

## Fink et al. 2018 (LMIC RCT)

In [None]:
# Same value as the seed set in the helper-functions cell; reassigning it here
# does not reseed `random`.
RANDOM_STATE = 42
# Dataset identifier: used for the input path data/{DATA}.csv and the output
# folder data/{DATA}/.
DATA = "fink_et_al_2018_training"

# Outcome questions. Each string is used both as the column name looked up in
# the data and, verbatim, as the user turn in format_fink_user_prompts, so the
# text must match the CSV header exactly.
target_outcomes = [
    "In 2014: if you own an insecticide-treated net, did you sleep under it last night?",
    "In 2016: if you own an insecticide-treated net, did you sleep under it last night?",
    "In 2014: if your household owns an insecticide-treated net, did your youngest child sleep under it last night?",
    "In 2016: if your household owns an insecticide-treated net, did your youngest child sleep under it last night?",
    "In 2014: were you assisted by someone (e.g., a doctor, nurse, midwife, SBA, or community health worker) during your last delivery?",
    "In 2016: were you assisted by someone (e.g., a doctor, nurse, midwife, SBA, or community health worker) during your last delivery?",
    "In 2014: did you exclusively breastfeed your youngest child when they were younger than six months?",
    "In 2016: did you exclusively breastfeed your youngest child when they were younger than six months?",
    "In 2014: did you give your child ORS or go to the clinic the last time they had diarrhea?",
    "In 2016: did you give your child ORS or go to the clinic the last time they had diarrhea?",
    "In 2014: if you have a handwashing station in your household, is there soap available to wash your hands?",
    "In 2016: if you have a handwashing station in your household, is there soap available to wash your hands?",
    "In 2014: did you exclusively breastfeed your youngest child when they were younger than two years?",
    "In 2016: did you exclusively breastfeed your youngest child when they were younger than two years?",
]

system_prompt_1 = """Please put yourself in the shoes of a human female subject participating in a healthcare survey. You will be provided with a demographic profile that describes your region and district, the level of urbanization of your area, whether you own a mobile phone and whose phone you use, your age, whether you were born in your current community, how long have you lived there, who heads your household, your religion, your marital status, your partner’s religion, the highest level of education attained by you and your partner, whether you have ever given birth, how many children you desire, your literacy level, and health-related topics such as your use of the healthcare system, sources of health information, the distance to the nearest clinic/hospital, and your knowledge and attitudes regarding hand-washing, exclusive breastfeeding, Oral Rehydration Solutions (ORS) for diarrhea, the use of insecticide treated nets, and skilled birth attendance during delivery. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will be provided with a description of an initiative that was carried out in your area. Thereafter, you will be asked whether you and your youngest child slept under an insecticide treated net last night, whether you received assistance during your last delivery, whether you exclusively breastfed your youngest child before they turned 6 months, whether you gave ORS tables to your child or visited a clinic the last time they had diarrhea, whether you washed your hands with soap, and whether you exclusively breastfed your youngest child before they turned 2 years. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. 
The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

system_prompt_2 = """

In 2012, an initiative was implemented in your area by the Ghana Health Services (GHS), with funding from the United Nations Children’s Fund (UNICEF). The primary aim of this intervention was to encourage families to adopt and consistently practice five health behaviours which are critical for preventing under-five mortality: sleeping under an insecticide treated mosquito net, utilization of oral rehydration solution (ORS) for the treatment of diarrhea, hand-washing with soap, exclusive breastfeeding, and delivery with a skilled birth attendant. 

"""


def format_fink_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialize one respondent into a JSON chat transcript.

    The transcript opens with the row's ``system_prompt``. Each answered
    outcome then contributes a user turn (the outcome column name itself,
    which is the question text) and an assistant turn (the recorded answer).

    Args:
        row: One respondent; must contain ``system_prompt`` and every column
            named in ``target_outcomes``.
        target_outcomes: Outcome column names, in the order they are asked.

    Returns:
        The list of chat messages encoded with ``json.dumps``.
    """
    conversation = [{"role": "system", "content": row["system_prompt"]}]

    for outcome in target_outcomes:
        answer = row[outcome]
        # Missing answers and the literal "NA" produce no turn.
        if not (pd.isnull(answer) or answer == "NA"):
            conversation.append({"role": "user", "content": outcome})
            conversation.append({"role": "assistant", "content": answer})

    return json.dumps(conversation)


def format_fink_system_prompt(row: pd.Series) -> str:
    """Assemble the system prompt for one respondent.

    Starts from ``system_prompt_1`` + the row's ``demographic_prompt`` +
    ``system_prompt_2``, then adds one paragraph per intervention component
    recorded for the respondent (CNC drama or video, community radio, voice
    messages) and a closing sentence about the evaluation years.

    Args:
        row: One respondent; must contain ``demographic_prompt`` and the three
            intervention columns read below.

    Returns:
        The complete system prompt.
    """
    # Sentence tail shared by every intervention paragraph.
    shared_goal = " aimed at informing on the five health behaviors that are critical for preventing under-five mortality (i.e., sleeping under an insecticide treated mosquito net, utilization of oral rehydration solution (ORS) for the treatment of diarrhea, hand-washing with soap, exclusive breastfeeding, and delivery with a skilled birth attendant)."

    sections = [system_prompt_1 + row["demographic_prompt"] + system_prompt_2]

    cnc_answer = row["In your community, did the Centre for National Culture showed a video or a live drama about health behaviors (e.g., washing hand, exclusive breastfeeding, ORS)?"]
    if cnc_answer == "Live drama shows by Center for National Culture":
        sections.append("The initiative consisted of a theatre (or live) drama that was played in your community with the support of the Centre for National Culture (CNC). These dramas" + shared_goal + "\n\n")
    elif cnc_answer == "Video by Center for National Culture":
        sections.append("The initiative consisted of a video screening of a recorded drama which was played in your community with the support of the CNC. These videos" + shared_goal + "\n\n")

    radio_answer = row["Did your community radio broadcast programs about health behaviors?"]
    if radio_answer == "Yes":
        sections.append("The initiative consisted of focus group discussions and jingles broadcasted by the Ghana Community Radio Network (GCRN). These focus group discussions and jingles" + shared_goal + "\n\n")

    phone_answer = row["Did you received phone calls from healthcare workers informing you about health behaviours? If so, was the person who called always the same?"]
    if phone_answer == "Always called by the same person":
        sections.append("Voice messages were sent by the same person to your mobile phone. This message" + shared_goal + "\n\n")
    elif phone_answer == "Called by different people":
        sections.append("Voice messages were sent by different people to your mobile phone. This message" + shared_goal + "\n\n")

    sections.append("The results of this initiative were evaluated in two different years: 2014 and 2016.")
    return "".join(sections)


# Load data (header=1: row 1 of the file, 0-indexed, supplies the column names)
data = pd.read_csv(f"data/{DATA}.csv", header=1)

data_folder_dir = DATA
if not os.path.exists(f"data/{data_folder_dir}"):
    # Create the folder
    os.makedirs(f"data/{data_folder_dir}")

# Columns that must stay out of the demographic profile: outcomes, identifiers
# and the three intervention columns that format_fink_system_prompt turns into
# the treatment description.
fink_excluded_columns = target_outcomes + [
    "Household ID",
    "Community ID",
    # Spelling read by format_fink_system_prompt. The list previously carried
    # only the "In you community" variant below, so this column was not
    # excluded and the treatment answer could leak into the demographic profile.
    "In your community, did the Centre for National Culture showed a video or a live drama about health behaviors (e.g., washing hand, exclusive breastfeeding, ORS)?",
    # Earlier spelling, kept so that either header variant is excluded.
    "In you community, did the Centre for National Culture showed a video or a live drama about health behaviors (e.g., washing hand, exclusive breastfeeding, ORS)?",
    "Did your community radio broadcast programs about health behaviors?",
    "Did you received phone calls from healthcare workers informing you about health behaviours? If so, was the person who called always the same?",
]

# Construct system and question prompts
data["demographic_prompt"] = data.apply(generate_demographic_prompt, axis=1, args=(fink_excluded_columns,))
data["system_prompt"] = data.apply(format_fink_system_prompt, axis=1)

# Format data for fine-tuning
data["text"] = data.apply(format_fink_user_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Chamie et al. 2021 (LMIC RCT)

In [None]:
# Same value as the seed set in the helper-functions cell; reassigning it here
# does not reseed `random`.
RANDOM_STATE = 42
# Dataset identifier: used for the input path data/{DATA}.csv and the output
# folder data/{DATA}/.
DATA = "chamie_et_al_2021_training"

# Outcome columns. format_chamie_user_prompts reads them by position:
# index 0 is asked in the 3-month turn, index 1 in the 6-month turn.
target_outcomes = [
    "After 3 months: Did you come back for Month 3 retesting?",
    "After 6 months: Did you come back for month 6 retesting?"
]

# Treatment-assignment column(s); excluded from the baseline demographic profile.
treatment_columns = [
    "studygroup",
]

system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare initiative on HIV testing in a peri-urban town in Ibanda District, southwestern Uganda, where adult HIV prevalence is 5.1%. You will be provided with a demographic profile that describes your age, gender, whether the head of your household received any formal education, size of your village, whether you live on the periphery of your community, and whether you or your household own any land.

The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will be provided with the description of an initiative that was carried out in your area. Thereafter, you will be asked whether you took a HIV test at 3 and 6 months from the beginning of the initiative. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt_1}

In your area, an initiative to promote frequent retesting for HIV among persons who face a high risk of HIV acquisition was implemented from October to December 2018. During this initiative, 1,482 persons were screened for study eligibility following community-based recruitment. Individuals who came for an evaluation were eligible for the study if they were aged 18 to 59 years, tested negative for HIV, and reported at least one of the following risk factors in the prior 12 months: (i) >1 partner; (ii) a known HIV–infected partner; (iii) a history of a sexually transmitted infection; or (iv) paid or received money or gifts in exchange for sex. We excluded participants who reported an intention to move away from the community for ≥4 of the 6 months following recruitment, were unwilling to retest for HIV in the future, or had tested for HIV ≥3 times in the past 12 months. In the context of the initiative, HIV–negative adults with self-reported risk were randomized to 1 of 3 strategies to promote HIV retesting: (1) no incentive; (2) cash incentives (US$7) for retesting at 3 and 6 months (total US$14); or (3) deposit contracts: participants could voluntarily deposit US$6 at baseline and at 3 months that would be returned with interest (total US$7) upon retesting at 3 and 6 months (total US$14) or lost if participants failed to retest. Measurement of retesting was conducted at the local government-run clinics where baseline enrollment occurred. This also means that during the initiative only clinic-based (rather than community-based) HIV retesting was offered.
You were recruited to participate to the initiative and were randomly assigned to the strategy {treatment_prompt}

Here are some additional interview responses collected after the treatment is assigned:
{demographic_prompt_2}
"""

def generate_demographic_prompt_inclusive(row, demographic_questions):
    """Render the listed questions of one respondent as a numbered transcript.

    Unlike ``generate_demographic_prompt``, the columns to include are given
    explicitly. They are presented in random order (global ``random`` state);
    questions whose answer is missing or the literal "N/A" are skipped and the
    remaining ones are numbered from 1.

    Args:
        row: One respondent, indexed by column name.
        demographic_questions: Column names to include. The iterable is not
            modified.

    Returns:
        The concatenated "N) Interviewer: ... Me: ... " entries, or an empty
        string when nothing qualifies.
    """
    # Shuffle a private copy: callers hand the same list object to every row
    # (via DataFrame.apply), and random.shuffle reorders its argument in place,
    # which used to scramble the caller's question list as a side effect.
    shuffled_questions = list(demographic_questions)
    random.shuffle(shuffled_questions)

    entries = []
    for question in shuffled_questions:
        answer = row[question]
        if pd.isnull(answer) or answer == "N/A":
            continue
        # Numbering counts only the questions that are actually emitted.
        entries.append(f"{len(entries) + 1}) Interviewer: {question} Me: {answer} ")

    return "".join(entries)


def format_chamie_user_prompts(row: pd.Series, target_outcomes) -> str:  # TODO might need to be updated
    """Serialize one respondent into a JSON chat transcript with two follow-ups.

    After the system message, the 3-month and 6-month follow-ups each add a
    user turn (the follow-up interview responses plus the outcome question)
    and an assistant turn (the recorded outcome).

    Args:
        row: One respondent; must contain ``system_prompt``,
            ``demographic_prompt_3``, ``demographic_prompt_4`` and both
            outcome columns.
        target_outcomes: Outcome column names; index 0 is the 3-month
            outcome and index 1 the 6-month outcome.

    Returns:
        The list of chat messages encoded with ``json.dumps``.
    """
    messages = [{"role": "system", "content": row["system_prompt"]}]

    # (months since start, column holding that wave's responses, outcome position)
    follow_ups = (
        ("3", "demographic_prompt_3", 0),
        ("6", "demographic_prompt_4", 1),
    )
    for months, responses_column, position in follow_ups:
        responses = row[responses_column]
        question = target_outcomes[position]
        # The whitespace-only line between the responses and "Question:" is
        # part of the prompt text and is kept as is.
        user_text = (
            "Here are some additional interview responses collected during a follow-up interview "
            f"conducted {months} months from the beginning of the initiative:\n"
            f"{responses}\n"
            "    \n"
            "Question:\n"
            f"{question}"
        )
        messages.append({"role": "user", "content": user_text})
        messages.append({"role": "assistant", "content": row[question]})

    return json.dumps(messages)


def format_chamie_system_prompt(row: pd.Series) -> str:
    """Fill the module-level ``system_prompt`` template for one respondent.

    Args:
        row: One respondent; must contain ``studygroup``,
            ``demographic_prompt_1`` and ``demographic_prompt_2``.

    Returns:
        The system prompt with both demographic sections and the description
        of the respondent's assigned strategy filled in.

    Raises:
        ValueError: If ``studygroup`` is not "Control", "Incentive" or
            "Deposit".
    """
    if row["studygroup"] == "Control":
        treatment_prompt = "(1) receiving no incentive for retesting."
    elif row["studygroup"] == "Incentive":
        treatment_prompt = "(2) receiving cash incentives (US$7) for retesting at 3 and 6 months (total US$14)."
    elif row["studygroup"] == "Deposit":
        treatment_prompt = "(3) deposit contracts: you can voluntarily deposit US$6 at baseline and at 3 months that would be returned with interest (total US$7) upon retesting at 3 and 6 months (total US$14) or lost if you failed to retest."
    else:
        # The exception used to be constructed but never raised, so an unknown
        # study group fell through and failed below on the unbound
        # `treatment_prompt` instead of reporting the offending value.
        raise ValueError(f"Treatment {row['studygroup']} is not considered.")

    final_system_prompt = system_prompt.format(
        demographic_prompt_1=row["demographic_prompt_1"],
        treatment_prompt=treatment_prompt,
        demographic_prompt_2=row["demographic_prompt_2"],
    )

    return final_system_prompt


# Read the training export (header=1: row 1 of the file, 0-indexed, supplies the column names).
data = pd.read_csv(f"data/{DATA}.csv", header=1)

# Where the fine-tuning file for this dataset is written.
data_folder_dir = DATA
output_dir = f"data/{data_folder_dir}"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Questions are split into four groups, one per place in the prompts.

# Group 2: responses shown in the system prompt after the treatment description.
demographic_questions_2 = [
    "Did the participant accept or decline the deposit?   ",
    "How much more would the participant be willing to deposit? ",
]
# Group 3: responses shown in the 3-month follow-up user turn.
demographic_questions_3 = [
    "After 3 months: Do you think your chances of having HIV today are high, moderate, low, or no risk at all?",
    "After 3 months: Why do you think that you have a low chance or no risk of having HIV today?",
    "After 3 months: Why do you think that you have a moderate or high chance of having HIV today?",
    "After 3 months: What is your main reason for coming for an HIV test today?",
    "After 3 months: What are other reasons you came for HIV testing today?",
    "After 3 months: Do you think you would have tested for HIV again at this time if you were not in this study?",
    "After 3 months: Did knowing that you would receive your deposit back with interest motivate you?",
    "After 3 months: Do you think you would have tested for HIV today if you had not deposited any money?",
    "After 3 months: What do you think about the amount of money you were asked to deposit when we first offered you HIV testing?",
    "After 3 months: You can voluntarily make another deposit now, just as you did 3 months ago, that will be repaid to you with interest if you come to retest for HIV again in 3 months. Would you like to make another deposit now?",
    "After 3 months: Do you think you would have tested for HIV again today if you were NOT offered an incentive for repeat testing?",
    "After 3 months: Did you make a deposit at 3 months",
]
# Group 4: responses shown in the 6-month follow-up user turn.
demographic_questions_4 = [
    "After 6 months: Do you think your chances of having HIV today are high, moderate, low, or no risk at all?",
    "After 6 months: Why do you think that you have a low chance or no risk of having HIV today?",
    "After 6 months: Why do you think that you have a moderate or high chance of having HIV today?",
    "After 6 months: What is your main reason for coming for an HIV test today?",
    "After 6 months: What are other reasons you came for HIV testing today?",
    "After 6 months: Do you think you would have tested for HIV again at this time if you were not in this study?",
    "After 6 months: Did knowing that you would receive your deposit back with interest motivate you?",
    "After 6 months: Do you think you would have tested for HIV today if you had not deposited any money?",
    "After 6 months: What do you think about the amount of money you were asked to deposit when we first offered you HIV testing?",
    "After 6 months: Would you be willing to make a deposit again now if you had another chance to receive a payment with interest for retesting for HIV in 3 months?",
    "After 6 months: Do you think you would have tested for HIV again today if you were NOT offered an incentive for repeat testing?",
]

# Group 1: every remaining column, i.e. everything that is not a later-wave
# question, an outcome, the treatment assignment or an identifier.
non_baseline_columns = set(
    demographic_questions_2
    + demographic_questions_3
    + demographic_questions_4
    + target_outcomes
    + treatment_columns
    + ["What is your study ID number?", "What is your screening ID?"]
)
demographic_questions_1 = [col for col in data.columns if col not in non_baseline_columns]

# Build demographic_prompt_1 .. demographic_prompt_4, one column per group.
question_groups = (
    demographic_questions_1,
    demographic_questions_2,
    demographic_questions_3,
    demographic_questions_4,
)
for group_number, group_questions in enumerate(question_groups, start=1):
    data[f"demographic_prompt_{group_number}"] = data.apply(
        generate_demographic_prompt_inclusive, axis=1, args=(group_questions,)
    )
data["system_prompt"] = data.apply(format_chamie_system_prompt, axis=1)

# One JSON chat transcript per respondent.
data["text"] = data.apply(format_chamie_user_prompts, axis=1, args=(target_outcomes,))

# Only the transcript column is exported.
data[["text"]].to_csv(f"{output_dir}/train.csv", index=False)

## Duflo et al. 2019 (LMIC RCT)

In [None]:
RANDOM_STATE = 42
DATA = "duflo_et_2019_HIV_prevention_among_youth_training"

target_outcomes = [
    "You have been tested for HSV-2. Is your test positive? Please, answer with one of the following options: Yes, No",
    "You have been administered an HIV test. What was the result of the test?",
    "Have you ever received condoms for free? Please, answer with one of the following options: Yes, No, Don't know",
    "I donât want to know your results, but have you ever received Voluntary Counseling and Testing for HIV (VCT)? Please, answer with one of the following options: Yes, No",
    "How many times have you received VCT?",
    "Are you married? Please, answer with one of the following options: Yes, No",
    "In the last 6 months, how many sexual partners have you had in total?",
    "Have you ever used a male condom?  Please, answer with one of the following options: Yes, No",
    "Did you use a condom the last time you had sex? Please, answer with one of the following options: Yes, No",
    "During the last 12 months, have you had a sexually transmitted infection (i.e. syphilis, gonorrhea, Chlamydia, herpes)? Please, answer with one of the following options: Yes, No, Don't know",
    "Can you tell me all the ways you know of that people can protect themselves from HIV? Please, answer with any of the following options: Abstinence, Being faithful, Using condoms, Using condoms correctly and consistently, Going for VCT before engaging in sex, Not sharing sharp objects, Avoiding drugs/alcohol/anything which hampers, judgment, Avoiding bad company, Avoiding prostitution, Avoiding walking alone at night, Avoiding contact between bloody wounds and skin, Ensuring safe blood transfusions, Ensuring clean medical equipment, Avoiding circumcision with unsafe tools, Avoiding wife-inheritance, Avoiding sugar daddies/mummies, Avoiding multiple sexual partners, Other, Donât know/ Donât remember"
]

treatment_column = "treatment"

system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in a remote rural community in Kenya about the HIV/AIDS pandemic. You will be provided with a demographic profile that describes your age, gender, education, household, work, and opinions on several topics. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will be provided with the description of an initiative that was carried out in your area. Thereafter, you will be asked some follow-up questions about your sexual behaviour and whether you received a HIV test. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

In your area, an initiative was implemented consisting in randomly assigning youth aged 17 to 24 to receive community-based VCT, 150 male condoms, both VCT and condoms, or neither program. All had access to standard HIV services available within their communities. 
Surveys and blood samples for HSV-2 testing were collected at baseline (2009–2010) and at follow up (2011–2013).

{treatment_prompt}

Your demographic profile based on information collected at baseline (2009–2010):
{demographic_prompt}

You are now taking part in the follow-up (2011–2013).
"""

def format_duflo_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialize one respondent into a JSON chat transcript.

    The system message is followed by one user/assistant pair for every
    outcome whose value is not missing. The user turn is the outcome column
    name itself (the question text); the assistant turn is the recorded answer.

    Args:
        row: One respondent; must contain ``system_prompt`` and every column
            named in ``target_outcomes``.
        target_outcomes: Outcome column names, in the order they are asked.

    Returns:
        The list of chat messages encoded with ``json.dumps``.
    """
    messages = [{"role": "system", "content": row["system_prompt"]}]

    for outcome in target_outcomes:
        answer = row[outcome]
        # Only missing values are skipped here; any other value is kept verbatim.
        if pd.isnull(answer):
            continue
        messages.extend(
            (
                {"role": "user", "content": outcome},
                {"role": "assistant", "content": answer},
            )
        )

    return json.dumps(messages)


def format_duflo_system_prompt(row: pd.Series) -> str:
    """Fill the module-level ``system_prompt`` template for one respondent.

    Args:
        row: One respondent; must contain ``treatment`` and
            ``demographic_prompt``.

    Returns:
        The system prompt with the treatment sentence and the demographic
        profile filled in.

    Raises:
        ValueError: If ``treatment`` is not "Control", "VCTonly", "Conly"
            or "CV".
    """
    # Treatment code -> sentence telling the respondent what they received.
    sentence_by_arm = {
        "Control": "You did not receive any of the programs.",
        "VCTonly": "You received the community-based VCT program.",
        "Conly": "You received 150 male condoms.",
        "CV": "You received both the community-based VCT program and 150 male condoms.",
    }

    treatment_prompt = sentence_by_arm.get(row["treatment"])
    if treatment_prompt is None:
        raise ValueError(f"Treatment {row['treatment']} is not considered.")

    return system_prompt.format(
        treatment_prompt=treatment_prompt,
        demographic_prompt=row["demographic_prompt"],
    )


# Load data (header=1: row 1 of the file, 0-indexed, supplies the column names)
data = pd.read_csv(f"data/{DATA}.csv", header=1)

# Turn literal "NA" outcome answers into missing values so that
# format_duflo_user_prompts (which only skips nulls) leaves them out.
# `replace('NA', None)` is avoided on purpose: its meaning depends on the
# pandas version (before 1.4 the explicit None was ignored and the call
# forward-filled the previous row's value instead). `mask` always yields NaN.
data[target_outcomes] = data[target_outcomes].mask(data[target_outcomes] == "NA")

data_folder_dir = DATA
if not os.path.exists(f"data/{data_folder_dir}"):
    # Create the folder
    os.makedirs(f"data/{data_folder_dir}")

# Construct system and question prompts; outcomes, the respondent id and the
# treatment column are kept out of the demographic profile.
data["demographic_prompt"] = data.apply(generate_demographic_prompt, axis=1, args=(target_outcomes + ["pupilid", treatment_column],))
data["system_prompt"] = data.apply(format_duflo_system_prompt, axis=1)

# Format data for fine-tuning
data["text"] = data.apply(format_duflo_user_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Bartos et al. 2022 (HIC RCT)

In [None]:
# Same value as the seed set in the helper-functions cell; reassigning it here
# does not reseed `random`.
RANDOM_STATE = 42
# Dataset identifier for this study's training data.
DATA = "bartos_et_al_2022_communicating_doctors_consensus_training"

# Outcome columns, in the order they are asked. The order is load-bearing:
# format_bartos_user_prompts treats positions 0-4 as the second wave and
# positions 5 onwards as the last wave.
target_outcomes = [
    "nQ275_r1_vlna26",
    "nQ277_r1_vlna26",
    "nQ300_1_1_vlna26",
    "nQ301_1_1_vlna26",
    "vaccine_doses_vlna26",
    "nQ275_r1_vlna36",
    "nQ277_r1_vlna36",
    "nQ300_1_1_vlna36",
    "nQ301_1_1_vlna36",
    "vaccine_doses_vlna36",
    "vaccinated_certcode"
]

# Name of the column holding the treatment assignment.
treatment_column = "treatment"

system_prompt = """Please put yourself in the shoes of a human subject participating in a multi-wave healthcare survey in Czech Republic about the COVID-19 vaccine conducted in 2021. You will be provided with a demographic profile that describes, among other things, your age, gender, education, work status, and the town you live in. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Thereafter, you will be asked whether you are willing to get a COVID-19 vaccination and whether you have received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile based on your responses in the first wave of the survey:
{demographic_prompt}

{treatment_prompt}
"""

def format_bartos_user_prompts(
    row: pd.Series,
    target_outcomes,
    question_label_dict,
    *,
    second_wave_count: int = 5,
) -> str:
    """Build the chat transcript for one respondent as a JSON string.

    The transcript starts with the system message taken from
    ``row["system_prompt"]``. Every outcome whose answer is not null then
    contributes a user turn (the question label) and an assistant turn (the
    recorded answer), in the order given by ``target_outcomes``. The first
    answered question of each survey wave is prefixed with a sentence that
    announces the wave.

    Args:
        row: One respondent; must contain ``"system_prompt"`` and every column
            named in ``target_outcomes``.
        target_outcomes: Outcome column names, second-wave columns first.
        question_label_dict: Maps each outcome column name to the question
            text shown in the user turn.
        second_wave_count: Number of leading entries of ``target_outcomes``
            that belong to the second wave; the remaining entries belong to
            the last wave. Defaults to 5, the split previously hard-coded.

    Returns:
        The list of ``{"role", "content"}`` messages serialised with
        ``json.dumps``.
    """
    second_wave_intro = "Two weeks after the first wave, you participate in the second wave.\n"
    last_wave_intro = "At the end of November 2021, you participate in the last wave.\n"

    messages = [{"role": "system", "content": row["system_prompt"]}]

    # Wave introductions that have already been emitted; each is shown once,
    # on the first answered question of its wave.
    announced = set()
    for position, outcome in enumerate(target_outcomes):
        answer = row[outcome]
        if pd.isnull(answer):
            # Unanswered questions are left out of the transcript entirely.
            continue

        intro = second_wave_intro if position < second_wave_count else last_wave_intro
        question = question_label_dict[outcome]
        if intro not in announced:
            question = intro + question
            announced.add(intro)

        messages.append({"role": "user", "content": question})
        messages.append({"role": "assistant", "content": answer})

    # Convert the message list to a JSON-formatted string
    return json.dumps(messages)


def format_bartos_system_prompt(row: pd.Series) -> str:
    """Fill the module-level ``system_prompt`` template for one respondent.

    Respondents whose ``"treatment"`` value is ``"Experimental group"`` get the
    four-slide description inserted as the treatment prompt; every other value
    yields an empty treatment prompt.

    Args:
        row: One respondent; must contain ``"treatment"`` and
            ``"demographic_prompt"``.

    Returns:
        The formatted system prompt.
    """
    slides_text = """During the first survey conducted on 15 March 2021, you are shown the following 4 slides.

Slide 1: In recent weeks, the Czech Medical Chamber conducted a survey among all medical doctors in the Czech Republic regarding vaccination. Almost 10,000 medical doctors from all parts of the country, from small and large municipalities, and from all age categories responded to the survey. We would like to share the results with you. The results do not differ across different groups of physicians.

Slide 2: The interest of Czech medical doctors in vaccination against Covid-19 is large. 90% of medical doctors are already vaccinated or are interested in getting vaccinated. Only 4% of doctors would not get vaccinated.

Slide 3: Most Czech medical doctors would recommend vaccination against Covid-19 to their healthy patients. 96% of physicians would recommend vaccination to their healthy patients either on their own initiative or if their patients ask for their opinion.

Slide 4: Czech medical doctors' trust in Covid-19 vaccines is strong. 89% of doctors trust vaccines approved by the European Medicines Agency. Only 2% of doctors do not trust them.
"""

    # Only the experimental arm is shown the slides.
    shown_text = slides_text if row["treatment"] == "Experimental group" else ""

    return system_prompt.format(
        demographic_prompt=row["demographic_prompt"],
        treatment_prompt=shown_text,
    )


# Load data
data = pd.read_csv(f"data/{DATA}.csv")
# The first data row holds the question label of every column: {column_name: label}
question_label_dict = data.iloc[0].to_dict()
data = data.drop(data.index[0]).reset_index(drop=True)
# Use the dict form of replace(): it maps "NA" to None on every pandas version.
# Before pandas 1.4, replace("NA", None) ignored the explicit None and
# forward-filled the previous row's answer instead.
data[target_outcomes] = data[target_outcomes].replace({"NA": None})

# Create the output folder; exist_ok avoids a separate existence check.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Construct system and question prompts
# Outcomes, identifiers and the treatment arm are kept out of the demographic profile.
data["demographic_prompt"] = data.apply(
    generate_demographic_prompt_with_labels,
    axis=1,
    args=(
        target_outcomes + ["respondentId", "vlna", treatment_column],
        question_label_dict,
    ),
)
data["system_prompt"] = data.apply(format_bartos_system_prompt, axis=1)

# Format data for fine-tuning
data["text"] = data.apply(
    format_bartos_user_prompts,
    axis=1,
    args=(target_outcomes, question_label_dict),
)

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Steinert et al. 2022 (HIC RCT)

In [None]:
# Declared per cell; nothing in this cell reads it.
RANDOM_STATE = 42
# Base name of the input CSV (data/<DATA>.csv); also reused as the output folder name.
DATA = "steinert_et_al_2022_training"

# Outcome columns. format_steinert_user_prompts uses each string both as the
# column to read and as the text of the user turn, so they must match the CSV
# header exactly.
target_outcomes = [
    "How would you decide if you had the opportunity to get vaccinated against COVID-19 next week?",
    "If you have certainly decided to get vaccinated next week, will the vaccine be Moderna?",
    "If you have certainly decided to get vaccinated next week, will the vaccine be Astra Zeneca?",
    "If you have certainly decided to get vaccinated next week, will the vaccine be Johnson&Johnson?",
    "If you have certainly decided to get vaccinated next week, will the vaccine be BioNTech/Pfizer?",
    "Please name reasons for which you are undecided.",
    "Please name reasons why you wouldn't get vaccinated under any circumstances.",
]

# Column holding the experimental arm; it is excluded from the demographic profile.
treatment_column = "Treatment"

# System-prompt template; format_steinert_system_prompt fills {treatment_prompt_1},
# {demographic_prompt} and {treatment_prompt_2} for each respondent.
# NOTE(review): "conducted from 15 and 24 June 2021" reads like a typo for
# "between 15 and 24 June 2021" — confirm before changing, since the text is
# part of the generated training prompts.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare study in Sweden about the COVID-19 vaccine conducted from 15 and 24 June 2021. You will be provided with a demographic profile that describes, among other things, your geographical region, age, gender, civil status, education, employment status, household size and income, religion, and responses to questions related to COVID-19. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”.{treatment_prompt_1} Thereafter, you will be asked some questions about when you think that you will get a first shot of a COVID-19 vaccine and, finally, if you got a first shot of a COVID-19 vaccine within the first 30 days after the vaccine became available to you. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}

{treatment_prompt_2}
"""

def format_steinert_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialise one respondent's question/answer turns as a JSON chat transcript.

    The transcript opens with ``row["system_prompt"]`` as the system message.
    Each outcome with a non-null answer adds a user turn (the outcome name
    itself) followed by an assistant turn (the recorded answer).

    Args:
        row: One respondent; must contain ``"system_prompt"`` and every column
            named in ``target_outcomes``.
        target_outcomes: Outcome column names, in the order they are asked.

    Returns:
        The message list encoded with ``json.dumps``.
    """
    messages = [{"role": "system", "content": row["system_prompt"]}]

    for question in target_outcomes:
        answer = row[question]
        if pd.isnull(answer):
            # Skip questions the respondent did not answer.
            continue
        messages.append({"role": "user", "content": question})
        messages.append({"role": "assistant", "content": answer})

    return json.dumps(messages)


def format_steinert_system_prompt(row: pd.Series) -> str:
    """Fill the module-level ``system_prompt`` template for one respondent.

    The ``"Treatment"`` value selects two inserts: a sentence announcing the
    encouragement text (empty for the control arm) and the encouragement text
    itself (also empty for the control arm).

    Args:
        row: One respondent; must contain ``"Treatment"`` and
            ``"demographic_prompt"``.

    Returns:
        The formatted system prompt.

    Raises:
        ValueError: If the ``"Treatment"`` value is not one of the known arms.
    """
    announcement = " Lastly, you will be presented with a text encouraging you to get a COVID-19 vaccine."

    # (arm label, announcement sentence, encouragement text), checked in order.
    arms = (
        ("Control", "", ""),
        (
            "Risk reduction",
            announcement,
            """The efficacy of the COVID-19 vaccines licensed in Sweden was determined in large-scale scientific studies. The results of these studies can be explained using a simplified example. Imagine three groups of 1000 people each:

Group 1 receives a placebo vaccine without effect: 80 people from group 1 will become infected with COVID-19 and develop symptoms during the observation period. One person of these will die from COVID-19.

Group 2 receives the AstraZeneca vaccine: In group 2, people will also contract COVID-19, but only 32, which is 60 percent less compared to group 1. No infected person dies from COVID-19 or has a severe disease progression.

Group 3 receives the BioNTech/Moderna vaccine: In group 3, people will also contract COVID-19, but only 4. That is 95 per cent fewer compared to group 1. No infected person dies from COVID-19 or has a severe disease progression.

Most importantly, no person in the AstraZeneca/J&J or BioNTech/Moderna groups becomes seriously ill or dies from COVID-19!""",
        ),
        (
            "Hedonistic benefits",
            announcement,
            """If everyone gets vaccinated against COVID-19, public life can be fully restored and we will be able to see our family and friends again.

Get vaccinated so that restaurants, cafés, pubs, cinemas, theatres, discos, gyms, concert halls, sports stadiums and the like can reopen and that everyone can go on vacation and travel to other countries.""",
        ),
        (
            "Vaccination certificate",
            announcement,
            """The government of Sweden has been discussing the introduction of Corona vaccination certificates. This vaccination passport is supposed to be issued to those people who have been vaccinated against the Coronavirus.

Once sufficient vaccine doses are available in Sweden and everyone can be vaccinated, everyone will receive a Corona vaccination certificate after their second COVID-19 vaccination.

Travel to other countries will only be allowed for people who can show a valid Corona vaccination passport.""",
        ),
    )

    for arm_label, treatment_prompt_1, treatment_prompt_2 in arms:
        if row["Treatment"] == arm_label:
            break
    else:
        # No arm matched the respondent's treatment value.
        raise ValueError(f"Treatment {row['Treatment']} is not considered.")

    return system_prompt.format(
        treatment_prompt_1=treatment_prompt_1,
        treatment_prompt_2=treatment_prompt_2,
        demographic_prompt=row["demographic_prompt"],
    )


# Load data
# header=1: the column names are taken from the second line of the file.
data = pd.read_csv(f"data/{DATA}.csv", header=1)
# Use the dict form of replace(): it maps "N/A" to None on every pandas version.
# Before pandas 1.4, replace("N/A", None) ignored the explicit None and
# forward-filled the previous row's answer instead.
data[target_outcomes] = data[target_outcomes].replace({"N/A": None})

# Create the output folder; exist_ok avoids a separate existence check.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Construct system and question prompts
# Outcomes and the treatment arm are kept out of the demographic profile.
data["demographic_prompt"] = data.apply(
    generate_demographic_prompt,
    axis=1,
    args=(target_outcomes + [treatment_column],),
)
data["system_prompt"] = data.apply(format_steinert_system_prompt, axis=1)

# Format data for fine-tuning
data["text"] = data.apply(format_steinert_user_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Schneider et al. 2023 (HIC RCT)

In [None]:
# Declared per cell; nothing in this cell reads it.
RANDOM_STATE = 42
# Base name of the input CSV (data/<DATA>.csv); also reused as the output folder name.
DATA = "schneider_et_al_2023_training"

# Outcome columns. format_schneider_user_prompts uses each string both as the
# column to read and as the text of the user turn, so they must match the CSV
# header exactly.
# NOTE(review): the fifth entry has no "Choose one of the following responses"
# suffix, unlike the other agreement questions — presumably it mirrors the CSV
# header; verify against the data file.
target_outcomes = [
    "Do you plan to take a COVID-19 vaccine shot (regardless of whether it is your first, second, third, or fourth shot) within the next 6 months? Choose one of the following responses: Yes, No.",
    "Suppose that there would be a new outbreak of the COVID-19 pandemic in 6 months and the Center for Disease Control would recommend people to take an additional COVID-19 vaccine shot (regardless of the number of shots they got in the past). Thinking of such a situation, would you take an additional shot? Choose one of the following responses: Yes, No.",
    "To what extent do you agree with the following statement? In general, COVID-19 vaccines are safe. Choose one of the following responses: completely agree, agree, neither agree nor disagree, disagree, completely disagree.",
    "To what extent do you agree with the following statement? I am worried about the side effects from COVID-19 vaccines. Choose one of the following responses: completely agree, agree, neither agree nor disagree, disagree, completely disagree.",
    "To what extent do you agree with the following statements? In general, COVID-19 vaccines are highly effective at protecting my health.",
    "Suppose that there would be a new outbreak of the COVID-19 pandemic in 6 months and the Center for Disease Control would recommend people to take an additional COVID-19 vaccine shot (regardless of the number of shots they got in the past). In this situation, to what extent do you agree with the following statement? I would be willing to take the personal costs of getting an additional COVID-19 vaccine shot (e.g., time, discomfort, mild side effects) for the greater good of society. Choose one of the following responses: completely agree, agree, neither agree nor disagree, disagree, completely disagree.",
    "Suppose that there would be a new outbreak of the COVID-19 pandemic in 6 months and the Center for Disease Control would recommend people to take an additional COVID-19 vaccine shot (regardless of the number of shots they got in the past). In this situation, to what extent do you agree with the following statement? I think people would have a civic duty or a moral obligation to get an additional COVID19 vaccine shot. Choose one of the following responses: completely agree, agree, neither agree nor disagree, disagree, completely disagree.",
    "Suppose that there would be a new outbreak of the COVID-19 pandemic in 6 months and the Center for Disease Control would recommend people to take an additional COVID-19 vaccine shot (regardless of the number of shots they got in the past). Not taking the COVID-19 vaccine shot would be generally viewed as socially inappropriate in this situation. Choose one of the following responses: completely agree, agree, neither agree nor disagree, disagree, completely disagree.",
    "Do you plan to take a flu vaccine in the 2022-2023 winter? Choose one of the following responses: Yes, No.",
    "Do you plan to donate blood in the next 6 months? Choose one of the following responses: Yes, No.",
    "How much trust and confidence do you have in the government of the state where you live when it comes to handling state problems – a great deal, a fair amount, not very much or none at all?  Choose one of the following responses: a great deal, a fair amount, not very much, not at all.",
    "Suppose that there would be a new outbreak of the COVID-19 pandemic in 6 months, the Center for Disease Control would recommend people to take an additional COVID-19 vaccine shot (regardless of the number of shots they got in the past) and that every person getting an additional shot would receive $20. Thinking of such a situation, would you take an additional shot? Choose one of the following responses: Yes, No.",
]

# Column holding the experimental arm; format_schneider_system_prompt accepts
# the values "Yes" and "No". It is excluded from the demographic profile.
treatment_column = "Did you receive detailed information about your state's COVID-19 incentive program?"

# System-prompt template; format_schneider_system_prompt fills {treatment_prompt_1},
# {demographic_prompt} and {treatment_prompt_2} for each respondent.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare study in the USA about the COVID-19 vaccine conducted in June and July 2022. You will be provided with a demographic profile that describes, among other things, your geographical region, age, gender, education, employment status, household income, and responses to questions related to COVID-19. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”.{treatment_prompt_1} Thereafter, you will be asked some questions about whether you plan to take COVID-19 vaccine shot and your opinions on the COVID-19 vaccine. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}

{treatment_prompt_2}
"""

def format_schneider_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Return one respondent's chat transcript as a JSON string.

    The first message is the system prompt stored in ``row["system_prompt"]``.
    It is followed by a user/assistant pair for every outcome whose answer is
    not null: the user turn is the outcome name, the assistant turn is the
    recorded answer.

    Args:
        row: One respondent; must contain ``"system_prompt"`` and every column
            named in ``target_outcomes``.
        target_outcomes: Outcome column names, in the order they are asked.

    Returns:
        The message list encoded with ``json.dumps``.
    """
    # Keep only the questions that were actually answered, preserving order.
    answered = [
        (question, row[question])
        for question in target_outcomes
        if not pd.isnull(row[question])
    ]

    transcript = [{"role": "system", "content": row["system_prompt"]}]
    for question, answer in answered:
        transcript.extend(
            (
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            )
        )

    return json.dumps(transcript)


# Survey column naming the respondent's 2021 state of residence; it selects
# which incentive-program description a treated respondent is shown.
_SCHNEIDER_STATE_COLUMN = "What was your state of residence in 2021?"

# Sentence inserted into the study description for treated respondents.
_SCHNEIDER_TREATMENT_ANNOUNCEMENT = " Lastly, you will be presented with a description of a governmental program implemented in your state of residence offering monetary compensation to people who got a COVID-19 shot."

# Header that precedes every incentive-program description.
_SCHNEIDER_PROGRAM_HEADER = "Here you receive detailed information about your state’s (or counties’) COVID-19 vaccine incentive program.\n\n"

# Incentive-program description per state of residence.
# NOTE(review): the texts are kept verbatim so the generated prompts do not
# change; several contain what look like extraction artifacts ("New York
# million", "betweenJune", "Draw-ing", "paid out,including") — confirm against
# the study materials before editing them.
_SCHNEIDER_INCENTIVE_PROGRAMS = {
    "California": """California’s “Vax for the Win” program.
The “Vax for the Win” Program was a Californian $116.5 million COVID-19 vaccine incentive program implemented by the government of California which took place between May 2021 and January 2022. The purpose of this program was to increase COVID-19 vaccination rates.

All Californians aged 12 and over who were at least partially vaccinated against COVID were automatically eligible for the cash prize drawings. In total $16.5 million in cash prizes were paid out.

Moreover, two million people who completed their COVID-19 vaccination were automatically eligible to receive a $50 present or grocery card, worth a total of $100 million.""",
    "Illinois": """Illinois “All in for the win” program.
The “All in for the win” program was an Illinois $10 million COVID-19 vaccine incentive program implemented by the government of Illinois which took place between July 2021 and August 2021. The purpose of this program was to increase COVID-19 vaccination rates.

Illinois residents aged 18 and over who had received at least one dose of a COVID-19 vaccine automatically participated in three drawings to win a $1 million cash prize and forty $100k cash prizes.

Youth between the ages of 12 and 17 who had received at least one dose of a COVID-19 vaccine automatically participated in twenty drawings to win $150,000 scholarships.""",
    "Kentucky": """Kentucky’s “Shot At A Million” program.
The “Shot At A Million” program was a Kentucky COVID-19 vaccine incentive program implemented by the government of Kentucky which took place between June 2021 and August 2021. The purpose of this program was to increase COVID-19 vaccination rates.

Kentuckians aged 18 and over who were at least partially vaccinated against COVID were eligible for cash prize drawings. Three individuals each received a cash prize of $1 million.

Youth between the ages of 12 and 17 who were at least partially vaccinated against COVID participated in fifteen drawings of a full scholarships (including room-and-board and tuition) to any Kentucky public college, university, technical or trade school.""",
    "Louisiana": """Louisiana’s “Shot For $100” program.
The “Shot For $100” program was a Louisiana COVID-19 vaccine incentive program implemented by the Louisiana Department of Health which took place between October 2021 and December 2021. The purpose of this program was to increase COVID-19 vaccination rates.

Anyone who took a first dose of a COVID-19 vaccine was eligible to receive a $100 incentive card. Minors were eligible for the cash incentive but required parental consent to get the vaccine.""",
    "Michigan": """Michigan’s “Shot To Win Sweepstakes” program.
The “Shot To Win Sweepstakes” program was a Michigan $5.5 million COVID-19 vaccine incentive program implemented by the government of Michigan which ended in July 2021. The purpose of this program was to increase COVID-19 vaccination rates.

Michigan residents aged 18 and over who were at least partially vaccinated against COVID were eligible for cash prize drawings. In total $5 million in cash prizes were paid out,including 30 daily prizes of $50,000, a $1 million prize and a $2 million grand prize.

Youth between the ages of 12 and 17 who were at least partially vaccinated against COVID automatically participated in nine drawings to win $55,000 scholarships.""",
    "Missouri": """Missouri’s “MO VIP” program.
The “MO VIP” program was a Missourian $9 million COVID-19 vaccine incentive program implemented by the government of Missouri which took place between July 2021 and October 2021. The purpose of this program was to increase COVID-19 vaccination rates.

Missourian residents aged 18 and over who were at least partially vaccinated against COVID were eligible for cash prize drawings. 800 individuals each received a cash prize of $10,000.

Youth between the ages of 12 and 17 who were at least partially vaccinated against COVID participated in 100 drawings to win $10,000 scholarships.""",
    "New York": """New York’s “Vax Scratch” program.
The “Vax Scratch” Program was a New York million COVID-19 vaccine incentive program implemented by the government of New York which took place between May 2021 and June 2021. The purpose of this program was to increase COVID-19 vaccination rates.

New Yorkers aged 18 and over who took a first dose of a COVID-19 vaccine at select state-run vaccination sites received a lottery scratch ticket with prizes from $20 up to $5 million. The lottery scratch tickets, which are also sold by retailers across the state, were worth $20.""",
    "North Carolina": """North Carolina’s “Summer Cash Draw-ing” program.
The “Summer Cash Drawing” program was a North Carolinian $4.5 million COVID-19 vaccine incentive program implemented by the government of North Carolina which took place betweenJune 2021 and August 2021. The purpose of this program was to increase COVID-19 vaccination rates.

North Carolinians aged 18 and over who had received at least one dose of a COVID-19 vaccine automatically participated in four drawings for a chance to win a $1 million cash prize.

Youth between the ages of 12 and 17 who had received at least one dose of a COVID-19 vaccine automatically participated in four drawings to win $125,000 to pay for their post-secondary education.""",
    "Ohio": """Ohio’s “Vax-A-Million” program.
The “Vax-A-Million” program was an Ohioan $5 million COVID-19 vaccine incentive program implemented by the government of Ohio which took place between May 2021 and June 2021. The purpose of this program was to increase COVID-19 vaccination rates.

Ohioans aged 18 and older who had received at least one dose of a COVID-19 vaccine could enter a lottery to win one of five $1 million prizes.
Ohioans ages 12-17 who had received at least one dose of a COVID-19 vaccine could enter a lottery to win one of five scholarships which would pay for their college degree at any Ohio state college or university.""",
    "Pennsylvania": """Philadelphia’s vaccine incentive program.
The Philadelphia vaccine incentive program was a Pennsylvania COVID-19 vaccine incentive program implemented by the Philadelphia Health Department which ended in March 2022. The purpose of this program was to increase COVID-19 vaccination rates.

Philadelphia residents who completed their COVID vaccine series at a participating Health Department vaccine clinic received a payment of $100. In total, $20,000 in incentives were paid out.""",
    "Texas": """Houston’s vaccine incentive program.
The Houston vaccine incentive program was a Texan COVID-19 vaccine incentive program implemented by the Houston Health Department which ended in March 2022. The purpose of this program was to increase COVID-19 vaccination rates.

The program provided a total of fourteen $1,000 weekly prizes to people who had gotten their first dose or booster shot. People of all ages eligible for vaccination were allowed to participate in the incentive program.""",
    "Florida": """Alachua County and Flagler County vaccine incentive programs
The Alachua County vaccine incentive program and the Flagler County vaccine incentive program were Florida COVID-19 vaccine incentive programs implemented by Florida counties. The purpose of these programs was to increase COVID-19 vaccination rates.

Alachua County vaccine incentive program: Residents of Alachua County who had received a dose of a COVID-19 vaccine (including booster shots) in September 2021 received $25 Visa gift cards.

Flagler County vaccine incentive program: Residents of Flagler County were paid a $10 food coupon as incentive to receive a dose of a COVID-19 by July 2021. The incentive was only offered at specific locations and days of the week.""",
}


def format_schneider_system_prompt(row: pd.Series) -> str:
    """Fill the module-level ``system_prompt`` template for one respondent.

    Respondents whose treatment value is ``"No"`` get empty treatment inserts.
    Respondents whose treatment value is ``"Yes"`` get a sentence announcing
    the programme description plus the description of the incentive programme
    for their 2021 state of residence.

    Args:
        row: One respondent; must contain the column named by the module-level
            ``treatment_column``, ``"demographic_prompt"`` and, for treated
            respondents, ``"What was your state of residence in 2021?"``.

    Returns:
        The formatted system prompt.

    Raises:
        ValueError: If the treatment value is neither ``"Yes"`` nor ``"No"``,
            or a treated respondent lives in a state without a programme
            description.
    """
    treatment = row[treatment_column]

    if treatment == "No":
        treatment_prompt_1 = ""
        treatment_prompt_2 = ""

    elif treatment == "Yes":
        state = row[_SCHNEIDER_STATE_COLUMN]
        program_description = _SCHNEIDER_INCENTIVE_PROGRAMS.get(state)
        if program_description is None:
            # The value is a US state, so the message now says "State"
            # (it previously said "Country").
            raise ValueError(f"State {state} is not considered.")

        treatment_prompt_1 = _SCHNEIDER_TREATMENT_ANNOUNCEMENT
        treatment_prompt_2 = _SCHNEIDER_PROGRAM_HEADER + program_description

    else:
        raise ValueError(f"Treatment {treatment} is not considered.")

    return system_prompt.format(
        treatment_prompt_1=treatment_prompt_1,
        treatment_prompt_2=treatment_prompt_2,
        demographic_prompt=row["demographic_prompt"],
    )


# Load data
# header=1: the column names are taken from the second line of the file.
data = pd.read_csv(f"data/{DATA}.csv", header=1)
# Use the dict form of replace(): it maps "N/A" to None on every pandas version.
# Before pandas 1.4, replace("N/A", None) ignored the explicit None and
# forward-filled the previous row's answer instead.
data[target_outcomes] = data[target_outcomes].replace({"N/A": None})

# Create the output folder; exist_ok avoids a separate existence check.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Construct system and question prompts
# Outcomes and the treatment arm are kept out of the demographic profile.
data["demographic_prompt"] = data.apply(
    generate_demographic_prompt,
    axis=1,
    args=(target_outcomes + [treatment_column],),
)
data["system_prompt"] = data.apply(format_schneider_system_prompt, axis=1)

# Format data for fine-tuning
data["text"] = data.apply(format_schneider_user_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Galasso et al. 2023 (HIC RCT)

In [None]:
# Declared per cell; not read by the functions defined directly below.
RANDOM_STATE = 42
# Dataset identifier for this cell.
DATA = "galasso_et_al_2023_training"

# Outcome columns. format_galasso_user_prompts uses each string both as the
# column to read and as the text of the user turn, so they must match the data
# columns exactly (including the existing spelling "affilitation").
target_outcomes = [
    "Assuming the vaccine becomes available, on a scale from 0 to 10 - where 0 means 'not at all likely' and 10 means 'completely certain' - how likely are you to get vaccinated?",
    "After 6 months: Do you trust scientists?",
    "After 6 months: Do you generally trust others?",
    "After 6 months: On a scale from 0 to 10 - where 0 means 'not at all likely' and 10 means 'completely certain' - how likely do you think it is that China is responsible for the outbreak of the virus?",
    "After 6 months: How often do you experience low interest or pleasure in general?",
    "After 6 months: How often do you feel down or low in mood?",
    "After 6 months: On a scale from 0 to 10 - where 0 means 'not at all likely' and 10 means 'completely certain' - how likely do you think it is that you will get infected if you resume the usual daily activities?",
    "After 6 months: On a scale from 0 to 10 - where 0 means 'not at all likely' and 10 means 'completely certain' - how likely do you think it is that you will get seriously ill if you get infected?",
    "After 6 months: Do you think Covid-19 pandemic is having serious consequences on health in your country?",
    "After 6 months: On a scale from 0 to 10 - where 0 means 'not at all likely' and 10 means 'completely certain' - how likely do you think it is that the virus was created by 'big pharma companies' primarily to generate profit?",
    "After 6 months: On a scale from 0 to 10 - where 0 means 'not at all likely' and 10 means 'completely certain' - how likely do you think it is that the vaccine is the solution?",
    "After 6 months: Due to the expedition of clinical trials for the Covid-19 vaccines, the possible side effects of the vaccine are unknown. On a scale from 0 to 10 - where 0 means 'not at all likely' and 10 means 'completely certain' - how likely do you think it is that the vaccine has side effects?",
    "After 6 months: On a scale from 0 to 10 - where 0 means 'not at all likely' and 10 means 'completely certain' - how likely do you think it is that strictly complying with social distancing and other health measures reduces the risk of being infected?",
    "After 6 months: Have you ever had covid?",
    "After 6 months: Has anyone in your household had covid?",
    "After 6 months: Has anyone in your family had covid?",
    "After 6 months: Has any of your friends had covid?",
    "After 6 months: On a scale from 0 to 100 - where 0 means 'not at all compliant' and 100 'fully compliant' - how compliant do you think others are with restrictions",
    "After 6 months: On a scale from 0 to 10 - where 0 means 'not at all risk-averse' and 10 means 'very risk-averse' - How risk-averse are you?",
    "After 6 months: Do you live alone, with your family, or with other people who are not your family?",
    "After 6 months: What is your political affilitation? Choose one of the following options: liberal, centrist, conservative, or I don't know.",
    "Did you get vaccinated against COVID-19 by July 2021?",
    "After 6 months: Have you received all mandatory vaccinations?",
    "After 6 months: Assuming the vaccine becomes available, on a scale from 0 to 10 - where 0 means 'not at all likely' and 10 means 'completely certain' - how likely are you to get vaccinated?"
]

# Column holding the experimental arm; format_galasso_system_prompt branches on it.
treatment_column = "Treatment"

# System-prompt template with the placeholders {country}, {demographic_prompt}
# and {treatment_prompt}.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in {country} about the COVID-19 vaccine conducted in December 2020. You will be provided with a demographic profile that describes, among other things, your geographical region, age, gender, civil status, education, employment status, household income, parent’s birthplace, social preferences, and responses to questions related to COVID-19. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will be presented with a text encouraging you to get a COVID-19 vaccine. Thereafter, you will be asked some questions about whether you think that you will get a COVID-19 vaccine and, finally, if you got at least one dose of COVID-19 vaccine by July 2021. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
{demographic_prompt}

{treatment_prompt}
"""

def format_galasso_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialize one respondent as a JSON chat transcript.

    The transcript opens with the respondent's system prompt. Every outcome
    column that has an answer then contributes a user turn (the column name,
    used verbatim as the question) followed by an assistant turn (the answer).
    Outcomes whose value is null are left out entirely.

    Args:
        row: One respondent; must contain "system_prompt" and every column
            listed in ``target_outcomes``.
        target_outcomes: Outcome column names, in the order they are asked.

    Returns:
        The list of chat messages encoded with ``json.dumps``.
    """
    messages = [{"role": "system", "content": row["system_prompt"]}]

    for question in target_outcomes:
        answer = row[question]
        if pd.isnull(answer):
            # Unanswered questions produce no turns at all.
            continue
        messages.append({"role": "user", "content": question})
        messages.append({"role": "assistant", "content": answer})

    return json.dumps(messages)


def format_galasso_system_prompt(row: pd.Series) -> str:
    """Fill the module-level ``system_prompt`` template for one respondent.

    The encouragement text is chosen from the respondent's value in the
    ``treatment_column`` column. Only the "Health risk" arm interpolates the
    respondent's country into the message; the other arms use fixed wording.

    Args:
        row: One respondent; must contain the treatment column, the country
            question column and "demographic_prompt".

    Returns:
        The template with country, demographic profile and treatment message
        filled in.

    Raises:
        ValueError: If the treatment value is not one of the five handled arms.
    """
    country_question = "What was your country of residence at the beginning of December 2020?"

    # Arms whose message does not depend on the respondent.
    fixed_messages = {
        "Control": "The only way to become immune to COVID-19 in the long run is by vaccination.",
        "Self-protection": "The only way to become immune to COVID-19 in the long run is by vaccination. If you were vaccinated, you could avoid getting infected with the virus.",
        "Protecting others": "The only way to become immune to COVID-19 in the long run is by vaccination. If you were vaccinated, you might be able to avoid passing the virus on to others.",
        "Economic protection": "The only way to become immune to COVID-19 in the long run is by vaccination. If a person was vaccinated, they could avoid getting infected with the virus. It would allow a return to normal economic activity and reduce unemployment.",
    }

    arm = row[treatment_column]
    if arm == "Health risk":
        treatment_prompt = f"The only way to become immune to COVID-19 in the long run is by vaccination. If a person was vaccinated, they could avoid getting infected with the virus. This would protect the health of people in {row[country_question]}."
    elif arm in fixed_messages:
        treatment_prompt = fixed_messages[arm]
    else:
        raise ValueError(f"Treatment {arm} is not considered.")

    return system_prompt.format(
        country=row[country_question],
        demographic_prompt=row["demographic_prompt"],
        treatment_prompt=treatment_prompt,
    )


# Load data
# header=1: the second line of the CSV supplies the column names.
data = pd.read_csv(f"data/{DATA}.csv", header=1)
# Use the dict form of replace. With a scalar and an explicit None, pandas
# versions before 1.4 treat the call as "no value given" and pad-fill from the
# previous row, which would copy another respondent's answer into this one.
data[target_outcomes] = data[target_outcomes].replace({'N/A': None})

data_folder_dir = DATA
# Create the output folder; exist_ok avoids the separate existence check.
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Construct system and question prompts
# Outcome and treatment columns are kept out of the demographic profile.
data["demographic_prompt"] = data.apply(generate_demographic_prompt, axis=1, args=(target_outcomes + [treatment_column],))
data["system_prompt"] = data.apply(format_galasso_system_prompt, axis=1)

# Format data for fine-tuning
data["text"] = data.apply(format_galasso_user_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format
# Only the serialized chat transcript is written out.
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Kluver et al. 2021 (HIC RCT)

In [None]:
# Configuration for the Kluver et al. 2021 cell.
RANDOM_STATE = 42
# Base name of the input file (data/<DATA>.csv) and of the output folder
# (data/<DATA>/) used further down in this cell.
DATA = "kluver_et_al_2021_vaccine_hesitancy_training"

# Answer columns used as fine-tuning targets. format_kluver_user_prompts pairs
# the first entry with the first vaccination-likelihood question and the
# second entry with the question asked after the second policy scenario.
target_outcomes = [
    "v_74",
    "v_77",
]

# Columns holding the scenario code (1-12) of the first and the second policy
# scenario shown to a respondent, in that order.
treatment_columns = ["c_0031","c_0032"]

# System-prompt template for this study. {demographic_prompt} and
# {treatment_prompt} are filled per respondent in format_kluver_system_prompt.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Germany about the COVID-19 vaccine conducted from 5 March to 25 March 2021. You will be provided with a demographic profile that describes, among other things, your geographical region, age, gender, civil status, education, employment status, household income, parent’s birthplace, social preferences, and responses to questions related to COVID-19. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”.  Lastly, you will be presented twice with a text describing a scenario of a policy instrument aiming to increase COVID-19 vaccination and you will be asked whether you are willing to get a COVID-19 vaccination under that scenario. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile based on your responses in the first wave of the survey:
{demographic_prompt}

{treatment_prompt}
"""

# Every scenario text is made of three paragraphs, one per policy attribute:
# freedoms for vaccinated people, where the vaccination is offered, and the
# allowance paid. The paragraphs are defined once here so both prompt builders
# use identical wording.
_KLUVER_NO_FREEDOMS = "There are no special regulations for vaccinated people when the Corona incidence is high. For example, they cannot travel again, visit cinemas, restaurants or concerts and are still subject to contact restrictions."
_KLUVER_FREEDOMS = "Special regulations apply to vaccinated people. For example, even when the Corona incidence is high, they can travel again, visit cinemas, restaurants or concerts and are not subject to any contact restrictions."
_KLUVER_CENTER_ONLY = "Eligible citizens can get vaccinated against Corona at the nearest vaccination center, but not from their family doctor."
_KLUVER_CENTER_OR_DOCTOR = "Eligible citizens can have themselves vaccinated against corona at the nearest vaccination center or their family doctor."
_KLUVER_NO_ALLOWANCE = "Citizens who are vaccinated will not receive any allowance after receiving the vaccination."
_KLUVER_ALLOWANCE_25 = "Citizens who get vaccinated receive an expense allowance of 25 euros after receiving the vaccination."
_KLUVER_ALLOWANCE_50 = "Citizens who get vaccinated receive an expense allowance of 50 euros after receiving the vaccination."

# Scenario code -> (freedoms, access, allowance) paragraphs.
_KLUVER_SCENARIOS = {
    1: (_KLUVER_NO_FREEDOMS, _KLUVER_CENTER_ONLY, _KLUVER_NO_ALLOWANCE),
    2: (_KLUVER_NO_FREEDOMS, _KLUVER_CENTER_OR_DOCTOR, _KLUVER_NO_ALLOWANCE),
    3: (_KLUVER_NO_FREEDOMS, _KLUVER_CENTER_ONLY, _KLUVER_ALLOWANCE_25),
    4: (_KLUVER_NO_FREEDOMS, _KLUVER_CENTER_ONLY, _KLUVER_ALLOWANCE_50),
    5: (_KLUVER_FREEDOMS, _KLUVER_CENTER_ONLY, _KLUVER_NO_ALLOWANCE),
    6: (_KLUVER_FREEDOMS, _KLUVER_CENTER_OR_DOCTOR, _KLUVER_NO_ALLOWANCE),
    7: (_KLUVER_FREEDOMS, _KLUVER_CENTER_OR_DOCTOR, _KLUVER_ALLOWANCE_25),
    8: (_KLUVER_FREEDOMS, _KLUVER_CENTER_OR_DOCTOR, _KLUVER_ALLOWANCE_50),
    9: (_KLUVER_NO_FREEDOMS, _KLUVER_CENTER_OR_DOCTOR, _KLUVER_ALLOWANCE_25),
    10: (_KLUVER_NO_FREEDOMS, _KLUVER_CENTER_OR_DOCTOR, _KLUVER_ALLOWANCE_50),
    11: (_KLUVER_FREEDOMS, _KLUVER_CENTER_ONLY, _KLUVER_ALLOWANCE_25),
    12: (_KLUVER_FREEDOMS, _KLUVER_CENTER_ONLY, _KLUVER_ALLOWANCE_50),
}

# Question asked after each scenario.
_KLUVER_LIKELIHOOD_QUESTION = "Please indicate how likely it is that you would be vaccinated against corona under that policy scenario on the scale from 0 to 10. A 0 means I will definitely not be vaccinated against corona and a 10 means I am sure to get vaccinated against corona."


def _kluver_scenario_text(treatment_code) -> str:
    """Return the three-paragraph scenario text for a scenario code (1-12).

    Raises:
        ValueError: If ``treatment_code`` is not a known scenario code.
    """
    try:
        paragraphs = _KLUVER_SCENARIOS[treatment_code]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which are equally invalid codes.
        raise ValueError(f"Treatment {treatment_code} is not considered.") from None
    return "\n\n".join(paragraphs)


def format_kluver_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialize one respondent as a JSON chat transcript.

    The transcript opens with the respondent's system prompt, which already
    contains the first scenario. If the first outcome is answered, the
    likelihood question and that answer follow. If the second outcome is
    answered, a user turn with the second scenario plus the likelihood
    question and the corresponding answer are appended.

    Args:
        row: One respondent; must contain "system_prompt", the second
            treatment column and both outcome columns.
        target_outcomes: Two outcome column names: the answer to the first
            scenario, then the answer to the second scenario.

    Returns:
        The list of chat messages encoded with ``json.dumps``.

    Raises:
        ValueError: If the second scenario code is unknown. This is raised
            even when the second outcome is missing.
    """
    first_outcome, second_outcome = target_outcomes[0], target_outcomes[1]
    messages = [{"role": "system", "content": row["system_prompt"]}]

    if not pd.isnull(row[first_outcome]):
        messages += [
            {"role": "user", "content": _KLUVER_LIKELIHOOD_QUESTION},
            {"role": "assistant", "content": row[first_outcome]},
        ]

    # Resolved unconditionally so an invalid code is rejected for every row.
    second_scenario = _kluver_scenario_text(row[treatment_columns[1]])

    if not pd.isnull(row[second_outcome]):
        messages += [
            {"role": "user", "content": second_scenario + "\n\n" + _KLUVER_LIKELIHOOD_QUESTION},
            {"role": "assistant", "content": row[second_outcome]},
        ]

    # Convert the message list to a JSON-formatted string
    return json.dumps(messages)


def format_kluver_system_prompt(row: pd.Series) -> str:
    """Fill the ``system_prompt`` template for one respondent.

    Args:
        row: One respondent; must contain the first treatment column and
            "demographic_prompt".

    Returns:
        The template with the demographic profile and the text of the first
        policy scenario filled in.

    Raises:
        ValueError: If the first scenario code is unknown.
    """
    treatment_prompt = _kluver_scenario_text(row[treatment_columns[0]])

    return system_prompt.format(
        demographic_prompt=row["demographic_prompt"],
        treatment_prompt=treatment_prompt,
    )


# Load data
data = pd.read_csv(f"data/{DATA}.csv")
question_label_dict = data.iloc[0].to_dict() # Read the first row as a dictionary: {column_name: value_in_first_row}
# The first data row holds the question labels, not a respondent; drop it.
data = data.drop(data.index[0]).reset_index(drop=True)
# Use the dict form of replace. With a scalar and an explicit None, pandas
# versions before 1.4 treat the call as "no value given" and pad-fill from the
# previous row, which would copy another respondent's answer into this one.
data[target_outcomes] = data[target_outcomes].replace({'NA': None})
# Scenario codes are compared against integers when the prompts are built.
data[treatment_columns[0]] = data[treatment_columns[0]].astype(int)
data[treatment_columns[1]] = data[treatment_columns[1]].astype(int)

data_folder_dir = DATA
# Create the output folder; exist_ok avoids the separate existence check.
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Construct system and question prompts
# Outcome and treatment columns are kept out of the demographic profile.
data["demographic_prompt"] = data.apply(generate_demographic_prompt_with_labels, axis=1, args=(target_outcomes + treatment_columns, question_label_dict))
data["system_prompt"] = data.apply(format_kluver_system_prompt, axis=1)

# Format data for fine-tuning
data["text"] = data.apply(format_kluver_user_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format
# Only the serialized chat transcript is written out.
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

# Combine Datasets for LMIC Survey data

In [None]:
RANDOM_STATE = 42
DATA = "lmic_survey_data"

# The combined file is written to data/<DATA>/; create that folder on first run.
data_folder_dir = DATA
output_dir = f"data/{data_folder_dir}"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Per-study training files to be combined.
afrobarometer_ghana = pd.read_csv("data/afrobarometer_r9_ghana_latlong_training/train.csv")
afrobarometer_sl = pd.read_csv("data/afrobarometer_SL_training/train.csv")
candor_ghana = pd.read_csv("data/candour_2_ghana_training/train.csv")
arce_lmic = pd.read_csv("data/arce_et_al_2021_training_lmic/train.csv")

# Report the size of every source, one shape per line.
source_frames = [afrobarometer_ghana, afrobarometer_sl, candor_ghana, arce_lmic]
print(*(source.shape for source in source_frames), sep="\n")

# Stack the sources, shuffle all rows reproducibly, then renumber the index.
merged_data = pd.concat(source_frames)
merged_data = merged_data.sample(frac=1, random_state=RANDOM_STATE)
merged_data = merged_data.reset_index(drop=True)
print(merged_data.shape)
merged_data.to_csv(f"{output_dir}/train.csv", index=False)

# Combine Datasets for LMIC RCT data

In [None]:
RANDOM_STATE = 42
DATA = "lmic_rct_data"

# The combined file is written to data/<DATA>/; create that folder on first run.
data_folder_dir = DATA
output_dir = f"data/{data_folder_dir}"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Per-study training files to be combined.
fink = pd.read_csv("data/fink_et_al_2018_training/train.csv")
merggi = pd.read_csv("data/meriggi_et_al_2024_last_mile_training_data/train.csv")
chamie = pd.read_csv("data/chamie_et_al_2021_training/train.csv")
duflo = pd.read_csv("data/duflo_et_2019_HIV_prevention_among_youth_training/train.csv")

# Report the size of every source, one shape per line.
source_frames = [fink, merggi, chamie, duflo]
print(*(source.shape for source in source_frames), sep="\n")

# Stack the sources, shuffle all rows reproducibly, then renumber the index.
merged_data = pd.concat(source_frames)
merged_data = merged_data.sample(frac=1, random_state=RANDOM_STATE)
merged_data = merged_data.reset_index(drop=True)
print(merged_data.shape)
merged_data.to_csv(f"{output_dir}/train.csv", index=False)

# Combine Datasets for HIC Survey data

In [None]:
RANDOM_STATE = 42
DATA = "HIC Survey Data"

# The combined file is written to data/<DATA>/; create that folder on first run.
data_folder_dir = DATA
output_dir = f"data/{data_folder_dir}"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Per-study training files to be combined.
brailovskaia_sweden = pd.read_csv("data/brailovskaia_et_al_2021_Sweden_training/train.csv")
brailovskaia_us = pd.read_csv("data/brailovskaia_et_al_2021_US_training/train.csv")
eurobarometer = pd.read_csv("data/eurobarometer_94.3_Sweden_training/train.csv")
candour = pd.read_csv("data/candour_wave_2_US_training/train.csv")
hps = pd.read_csv("data/HPS_2021_training/train.csv")
arce_hic = pd.read_csv("data/arce_et_al_2021_training_hic/train.csv")

# Report the size of every source, one shape per line.
source_frames = [
    brailovskaia_sweden,
    brailovskaia_us,
    eurobarometer,
    candour,
    hps,
    arce_hic,
]
print(*(source.shape for source in source_frames), sep="\n")

# Stack the sources, shuffle all rows reproducibly, then renumber the index.
merged_data = pd.concat(source_frames)
merged_data = merged_data.sample(frac=1, random_state=RANDOM_STATE)
merged_data = merged_data.reset_index(drop=True)
print(merged_data.shape)
merged_data.to_csv(f"{output_dir}/train.csv", index=False)

# Combine Datasets for HIC RCT data

In [None]:
RANDOM_STATE = 42
DATA = "HIC RCT Data"

# The combined file is written to data/<DATA>/; create that folder on first run.
data_folder_dir = DATA
output_dir = f"data/{data_folder_dir}"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Per-study training files to be combined.
steinert = pd.read_csv("data/steinert_et_al_2022_training/train.csv")
schneider = pd.read_csv("data/schneider_et_al_2023_training/train.csv")
galasso = pd.read_csv("data/galasso_et_al_2023_training/train.csv")
bartos = pd.read_csv("data/bartos_et_al_2022_communicating_doctors_consensus_training/train.csv")
kluver = pd.read_csv("data/kluver_et_al_2021_vaccine_hesitancy_training/train.csv")

# Report the size of every source, one shape per line.
source_frames = [steinert, schneider, galasso, bartos, kluver]
print(*(source.shape for source in source_frames), sep="\n")

# Stack the sources, shuffle all rows reproducibly, then renumber the index.
merged_data = pd.concat(source_frames)
merged_data = merged_data.sample(frac=1, random_state=RANDOM_STATE)
merged_data = merged_data.reset_index(drop=True)
print(merged_data.shape)
merged_data.to_csv(f"{output_dir}/train.csv", index=False)