# Helper Functions

In [38]:
import pandas as pd
import os
import json
import random

# Seed the module-level `random` generator so that the question order produced
# by random.shuffle() in generate_demographic_prompt is repeatable.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)

def generate_demographic_prompt(row, excluded_columns):
    """Render one respondent's answers as a numbered interview transcript.

    Every label in ``row.index`` that is not listed in ``excluded_columns`` is
    treated as an interviewer question. The questions are shuffled with the
    module-level ``random`` generator, and questions whose answer is null or
    the literal string "NA" are left out without consuming a number.

    Args:
        row: Record whose index labels are the question texts and whose values
            are the answers (a ``pd.Series`` when called through
            ``DataFrame.apply(..., axis=1)``).
        excluded_columns: Labels that must not appear in the transcript.

    Returns:
        The concatenation of ``"{n}) Interviewer: {question} Me: {answer} "``
        segments numbered from 1, or the empty string when nothing qualifies.
    """
    questions = [label for label in row.index if label not in excluded_columns]
    random.shuffle(questions)

    answered = []
    for label in questions:
        answer = row[label]
        if pd.isnull(answer) or answer == "NA":
            continue
        answered.append((label, answer))

    # Numbers are assigned after filtering so the sequence has no gaps.
    return "".join(
        f"{number}) Interviewer: {label} Me: {answer} "
        for number, (label, answer) in enumerate(answered, start=1)
    )

def format_prompts(row: pd.Series, target_outcome) -> str:
    """Serialise one record as a three-turn chat transcript in JSON.

    Args:
        row: Record providing "system_prompt", "user_prompt" and the column
            named by ``target_outcome``.
        target_outcome: Label of the column holding the assistant's answer.

    Returns:
        A JSON string encoding a list of ``{"role", "content"}`` messages in
        the order system, user, assistant.
    """
    content_by_role = {
        "system": row["system_prompt"],
        "user": row["user_prompt"],
        "assistant": row[target_outcome],
    }
    messages = [
        {"role": role, "content": content}
        for role, content in content_by_role.items()
    ]
    return json.dumps(messages)


# Prepare General Healthcare Data for Context-tuning

## Prepare Candour Data for Context-tuning

In [24]:
# Dataset name: used below as the CSV stem under data/ and as the output folder name.
DATA = "candour_2_ghana_training"

def format_candour_prompts(row: pd.Series) -> str:
    """Build the branching Candour vaccination dialogue for one respondent.

    The transcript always opens with the system prompt and the "offered a
    vaccine?" question. Which follow-up questions are appended depends on the
    respondent's own answers:

    * offered == "Yes": ask whether the vaccine was received, then the reasons
      for getting it (answer starts with "Yes") or for declining it.
    * offered == "No": ask the hypothetical intent, then the reasons for
      and/or against, depending on how firm the intent is.
    * anything else: no follow-up.

    A follow-up turn whose answer is missing is omitted, in the same way that
    generate_demographic_prompt skips null answers. Previously a missing
    "received" answer raised AttributeError on ``.startswith`` and missing
    answers were serialised as ``NaN``, which is not valid JSON.

    Args:
        row: Record providing "system_prompt" plus the Candour question
            columns that are reachable for this respondent.

    Returns:
        A JSON string encoding the list of ``{"role", "content"}`` messages.
    """
    q_offered = "Have you already been offered, or had an opportunity to receive, a COVID-19 vaccine?"
    q_received = "Have you received a COVID-19 vaccine?"
    q_why_vaccinated = "Why did you decide to get vaccinated against COVID-19, what were the reasons?"
    q_why_declined = "Why did you decide NOT to get vaccinated against COVID-19, what are / were reasons?"
    q_intent = "If a COVID-19 vaccine was available to you, would you definitely get it, probably get it, probably not get it or definitely not get it?"
    q_reasons_for = "What are your reasons for getting vaccinated for COVID-19?"
    q_reasons_against = "What are your reasons for NOT getting vaccinated for COVID-19?"

    offered = row[q_offered]
    prompt = [
        {"role": "system", "content": row["system_prompt"]},
        {"role": "user", "content": q_offered},
        {"role": "assistant", "content": offered},
    ]

    def add_turn(question):
        """Append the Q/A pair for `question`; return the answer, or None if it is missing."""
        answer = row[question]
        if pd.isnull(answer):
            return None
        prompt.append({"role": "user", "content": question})
        prompt.append({"role": "assistant", "content": answer})
        return answer

    if offered == "Yes":
        received = add_turn(q_received)
        # Only a textual answer can select a follow-up; a missing answer
        # (None) simply ends the dialogue here.
        if isinstance(received, str):
            if received.startswith("Yes"):
                add_turn(q_why_vaccinated)
            elif received == "No, I have declined the offer to be vaccinated":
                add_turn(q_why_declined)
            # Otherwise: waiting for an appointment / prefer not to say.

    elif offered == "No":
        intent = add_turn(q_intent)
        if intent == "Definitely get it":
            add_turn(q_reasons_for)
        elif intent == "Definitely not get it":
            add_turn(q_reasons_against)
        elif intent in ("Probably not get it", "Probably get it"):
            # Undecided respondents were asked for both sets of reasons.
            add_turn(q_reasons_for)
            add_turn(q_reasons_against)
        # Otherwise: prefer not to say / do not know.

    # Any other value of `offered` (don't know, prefer not to say) gets no follow-up.

    # Convert the prompt list to a JSON-formatted string
    return json.dumps(prompt)

# Candour outcome columns. They are the answers the model is trained to give,
# so they are kept out of the demographic profile.
target_outcomes = [
    "Have you already been offered, or had an opportunity to receive, a COVID-19 vaccine?",
    "Have you received a COVID-19 vaccine?",
    "Why did you decide NOT to get vaccinated against COVID-19, what are / were reasons?",
    "Why did you decide to get vaccinated against COVID-19, what were the reasons?",
    "If a COVID-19 vaccine was available to you, would you definitely get it, probably get it, probably not get it or definitely not get it?",
    "What are your reasons for NOT getting vaccinated for COVID-19?",
    "What are your reasons for getting vaccinated for COVID-19?",
]

# NOTE(review): "martial status" in the text below looks like a typo for
# "marital status"; it is left untouched because the text is part of the prompt.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana. You will be provided with a demographic profile that describes your age, gender, region/district where you live in, the highest education level you achieved, ideology, political point-of-view, ethnicity, religion, martial status, household size, economic situation and attitude, vaccination hesitancy, views on your country's health policies, health conditions, and EQ-5D health-related quality of life. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Additionally, we will provide you with some general findings from past studies on Ghana’s COVID-19 vaccination efforts. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# Load the Candour data; header=1 takes the column labels from the second row of the file
candour_data = pd.read_csv(f"data/{DATA}.csv", header=1)

# Make sure the output folder exists
data_folder_dir = DATA
if not os.path.exists(f"data/{data_folder_dir}"):
    os.makedirs(f"data/{data_folder_dir}")

# Drop respondents who did not answer the opening outcome question
candour_data = candour_data.dropna(
    subset=["Have you already been offered, or had an opportunity to receive, a COVID-19 vaccine?"]
).reset_index(drop=True)

# Build the demographic transcript, then prefix it with the instructions
candour_data["demographic_prompt"] = candour_data.apply(
    generate_demographic_prompt,
    axis=1,
    excluded_columns=target_outcomes + ["ID"],
)
candour_data["system_prompt"] = candour_data["demographic_prompt"].map(
    lambda profile: system_prompt + profile
)

# One JSON chat transcript per respondent
candour_data["text"] = candour_data.apply(format_candour_prompts, axis=1)

# Write only the transcript column as the training file
candour_data.to_csv(f"data/{data_folder_dir}/train.csv", columns=["text"], index=False)

## Prepare Afrobarometer Data for Context-tuning

In [4]:
# Seed passed as random_state when rows are sampled further down.
RANDOM_STATE = 42
# Dataset name: used below as the CSV stem under data/ and as the output folder name.
DATA = "afrobarometer_r9_ghana_latlong_training"

# Afrobarometer
demographic_questions = [
    "Country",
    "PSU/EA",
    "Region/Province/State",
    "Are the following services present in the primary sampling unit/enumeration area: Electricity grid that most houses can access? Answer with No; Yes; Can't determine",
    "Are the following services present in the primary sampling unit/enumeration area: Sewage system that most houses can access? Answer with No; Yes; Can't determine",
    "Are the following services present in the primary sampling unit/enumeration area: Mobile phone service? Answer with No; Yes; Can't determine",
    "Are the following facilities present in the primary sampling unit/enumeration area or in easy walking distance: School (private or public or both)? Answer with No; Yes; Can't determine",
    "Are the following facilities present in the primary sampling unit/enumeration area or in easy walking distance: Health clinic (private or public or both)? Answer with No; Yes; Can't determine",
    "Date of interview",
    "How old are you? Answer with an integer above 17; Refused; Don't know",
    "What is the primary language you speak in your home? Answer with Achode; Akan; Atwede; Baasare; Banda; Basare; Bem; Bimoba; Bisa; Bowiri; Brefo; Bulisa; Busanga; Busi; Buuzu; Chamba; Chokosi; Dagaare/Waale; Dagbani; Dagomba; Ekpana; English; Ewe/Anlo; Frafra; Fulani; Ga/Dangbe; Gawo; Gonja; Gruma; Gruni; Grusi; Guan; Hausa; Kabre; Kassem; Konkonba; Kotokoli; Kusaal; Kusasi; Likpakpaln; Mampruli; Moar; Moli; Moshie; Nabt; Nankani; Safalba; Sissali; Taln; Tampulima; Tsala; Wala; Zamrama; Refused; Don't know",
    "Let's start with your general view about the current direction of our country. Some people might think the country is going in the wrong direction. Others may feel it is going in the right direction. So let me ask YOU about the overall direction of the country: Would you say that the country is going in the wrong direction or going in the right direction? Answer with Going in the wrong direction; Going in the right direction; Refused; Don't know",
    "In general, how would you describe: the present economic condition of this country? Answer with Very bad; Fairly bad; Neither good nor bad; Fairly good; Very good; Refused; Don't know",
    "In general, how would you describe: Your own present living conditions? Answer with Very bad; Fairly bad; Neither good nor bad; Fairly good; Very good; Refused; Don't know",
    "Over the past year, how often, if ever, have you or anyone in your family gone without: Medicines or medical treatment? Answer with Never; Just once or twice; Several times; Many times; Always; Refused; Don't\nknow",
    "When you get together with your friends or family, how often would you say you discuss political matters? Answer with Never; Occasionally; Frequently; Refused; Don't know",
    "In this country, how free are you: to choose who to vote for without feeling pressured? Answer with Not at all free; Not very free; Somewhat free; Completely free; Refused; Don't\nknow",
    "Let's talk about the last national election held in 2020. People are not always able to vote in elections, for example, because they weren't registered, they were unable to go, or someone prevented them from voting. How about you? In the last national election held in 2020, did you vote, or not, or were you too young to vote? Or can’t you remember whether you voted? Answer with I did not vote; I was too young to vote; I can't remember whether I voted; I voted in the election; Refused; Don't know",
    "How much do you trust each of the following, or haven't you heard enough about them to say: the [president]? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough",
    "How much do you trust each of the following, or haven't you heard enough about them to say: [Parliament]? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough",
    "How much do you trust each of the following, or haven't you heard enough about them to say: your [local government council]? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough",
    "How much do you trust each of the following, or haven't you heard enough about them to say: the ruling party? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough",
    "How much do you trust each of the following, or haven't you heard enough about them to say: traditional leaders? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough",
    "How much do you trust each of the following, or haven't you heard enough about them to say: religious leaders? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know/Haven't heard enough",
    "In the past 12 months have you had contact with a public clinic or hospital? Answer with No; Yes; Refused; Don't know",
    "How easy or difficult was it to obtain the medical care or services you needed? Answer with Very easy; Easy; Difficult; Very difficult; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months",
    "How often, if ever, did you have to pay a bribe, give a gift, or do a favour for a health worker or clinic or hospital staff in order to get the medical care or services you needed? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months",
    "In general, when dealing with health workers and clinic or hospital staff, how much do you feel that they treat you with respect? Answer with Not at all; A little bit; Somewhat; A lot; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months",
    "And have you encountered any of these problems with a public clinic or hospital during the past 12 months: lack of medicines or other supplies? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months",
    "And have you encountered any of these problems with a public clinic or hospital during the past 12 months: absence of doctors or other medical personnel? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months",
    "And have you encountered any of these problems with a public clinic or hospital during the past 12 months: long waiting time? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months",
    "And have you encountered any of these problems with a public clinic or hospital during the past 12 months: poor condition of facilities? Answer with Never; Once or twice; A few times; Often; Refused; Don't know. Answer No contact if you haven't had any contact with a public clinic or hospital in the past 12 months",
    "In your opinion, what are the most important problems facing this country that government should address?",
    "Please tell me whether you personally or any other or any other member of your household have been affected in any of the following ways by the COVID-19 pandemic: became ill with, or tested positive for, COVID-19? Answer with Yes; No; Refused; Don't know",
    "Please tell me whether you personally or any other or any other member of your household have been affected in any of the following ways by the COVID-19 pandemic: temporarily or permanently lost a job, business, or primary source of income? Answer with Yes; No; Refused; Don't know",
    "What is the main reason that you would be unlikely to get a COVID-19 vaccine? Answer with COVID doesn't exist/COVID is not real; Not worried about COVID/COVID is not serious or life-threatening/not deadly; I am at no risk or low risk for getting COVID/Small chance of contracting COVID;\n    I already had COVID and believe I am immune; God will protect me; Don't trust the vaccine/worried about getting fake or counterfeit vaccine;\n    Don't trust the government to ensure the vaccine is safe; Vaccine is not safe; Vaccine was developed too quickly;\n    Vaccine is not effective/Vaccinated people can still get COVID; Vaccine may cause COVID; Vaccine may cause infertility;\n    Vaccine may cause other bad side effects; Vaccines are being used to control or track people; People are being experimented on with vaccines;\n    Afraid of vaccines in general; Allergic to vaccines; Don't like needles;\n    Don't trust the vaccine source/will wait for other vaccines; Effective treatments for COVID are or will be available; It is too difficult to get the vaccine, e.g. have to travel far;\n    Vaccine will be too expensive; I don't know how to get the vaccine; I will wait until others have been vaccinated;\n    I will get the vaccine later; Religious objections to vaccines in general or to the COVID vaccine; Some other reason;\n    Don't know. Answer Not applicable if you've already been vaccinated or have answered you're likely to get vaccinated",
    "How much do you trust the government to ensure that any vaccine for COVID-19 that is developed or offered to Ghanaian citizens is safe before it is used in this country? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know",
    "How well or badly would you say the current government has managed the response to the COVID-19 pandemic? Answer with Very badly; Fairly badly; Fairly well; Very well; Refused; Don't know",
    "When the country is facing a public health emergency like the COVID-19 pandemic, do you agree or disagree that it is justified for the government to temporarily limit democracy or democratic freedoms by taking the following measures: using the police and security forces to enforce public health mandates like restrictions on public gatherings or wearing face masks? Answer with Strongly disagree; Disagree; Neither agree nor disagree; Agree; Strongly agree; Refused; Don't know",
    "Now let us talk about the media and how you get information about politics and other issues. How often do you get news from the following sources: radio? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know",
    "How often do you get news from the following sources: television? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know",
    "How often do you get news from the following sources: print newspapers? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know",
    "How often do you get news from the following sources: internet? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know",
    "How often do you get news from the following sources: social media such as Facebook, Twitter, WhatsApp, or others? Answer with Never; Less than once a month; A few times a month; A few times a week; Every day; Refused; Don't know",
    "Let's go back to talking about you. What is your ethnic community or cultural group? Answer with (National identity) only, or “doesn't think of self in those terms”; Akan; Banda; Basare; Bem; Bisa; Bole; Brefo; Brefo/wala; Bulisa; Busanga; Busi; Buuzu ( mali); Dagaati; Dagbani; Dagomba; Ekpana; Ewe/Anlo; Frafri; Fulani; Ga/Adangbe; Gangaca; Gawo; Gonja; Gruma; Gruni; Grusi; Guan; Gurma; Hausa; Kabre; Kasasi; Kassem; Konkonba; Kotokoli; Kulkulsi; Kusasi; Mamprusi; Mande; Mole-dagbani; Moshie; Nankani; Pampurisi; Safalba; Sissala; Talensi; Taln; Tampluma; Tampulinsi; Templeman; Tsalla; Tsamba; Wale; Wusasi; Zamrama; Zugu; Refused to answer; Don't know",
    "Please tell me whether you agree or disagree with the following statement: I feel strong ties with other Ghanaians. Answer with Strongly disagree; Disagree; Neither agree nor disagree; Agree; Strongly agree; Refused; Don't know",
    "How much do you trust each of the following types of people: other Ghanaians? Answer with Not at all; Just a little; Somewhat; A lot; Refused; Don't know",
    "Do you feel close to any particular political party? Answer with No (does NOT feel close to ANY party); Yes (feels close to a party); Refused to answer; Don't know",
    "Which party is that? Answer with BOTH NPP AND NDC; Convention People's Party (CPP); Democratic People's Party (DPP); Don't know; National Democratic Congress (NDC); New Patriotic Party (NPP); Not Applicable; People's National Convention (PNC); Progressive People's Party (PPP); Refused; Refused; Don't know. Answer Not applicable if you don't feel close to any party",
    "What is your main occupation? [If unemployed, retired, or disabled, ask:] What was your last main occupation? Answer with Never had a job; Student; Housewife/Homemaker;\n    Agriculture/Farming/Fishing/Forestry; Trader/Hawker/Vendor; Retail/Shop;\n    Unskilled manual worker (e.g. cleaner, laborer, domestic help, unskilled manufacturing worker); Artisan or skilled manual worker (e.g. trades like electrician, mechanic, mechanic, machinist, or skilled manufacturing worker); Clerical or secretarial;\n    Supervisor/Foreman/Senior manager; Security services; Mid-level professional (e.g. teacher, nurse, mid-level government officer);\n    Upper-level professional (e.g. banker/finance, doctor, lawyer, engineer, accountant, professor, senior-level government officer); Other; Refused;\n    Don't know; Retired",
    "What is your highest level of education? Answer with No formal schooling; Informal schooling only (including Koranic schooling); Some primary schooling;\n    Primary school completed; Intermediate school or some secondary school/high school; Secondary school/high school completed;\n    Post-secondary qualifications other than university, e.g. a diploma or degree from a polytechnic or college; Some university; University completed;\n    Post-graduate; Refused; Don't know",
    "What is your religion, if any? Answer with None; Christian only (i.e., without specific sub-group identification); Roman Catholic; Orthodox; Coptic; Anglican;\n    Lutheran; Methodist; Presbyterian; Baptist; Quaker/Friends; Mennonite;\n    Evangelical; Pentecostal (e.g. “born again” and/or “saved”); Independent (e.g. “African Independent Church”); Jehovah's Witness; Seventh-day Adventist; Mormon;\n    Muslim only (i.e., without specific sub-group identification); Sunni only (i.e., without specific sub-group identification); Ismaeli; Mouridiya Brotherhood; Tijaniya Brotherhood; Qadiriya Brotherhood;\n    Shia; Traditional/Ethnic religion; Hindu; Bahai; Agnostic (Do not know if there is a God); Atheist (Do not believe in a God);\n    Dutch Reformed; Calvinist; Church of Christ; Zionist Christian Church; Jewish; Eglise Du Christianisme Céleste;\n    Fifohazana; Ançardine; Morovian; Faith of Unity; United Church of Zambia or UCZ; New Apostolic Church;\n    Christian mission in many lands (CMML); Salvation Army; Other; Refused; Don't know",
    "Respondent's gender Answer with Man; Woman",
    "Respondent's race Answer with Black/African; White/European; Coloured/Mixed race; Arab/Lebanese/North African;\n    South Asian (Indian, Pakistani, etc.); East Asian (Chinese, Korean, Indonesian, etc.); Other; Don't know",
    "Latitude",
    "Longitude",
]

# The two Afrobarometer outcome columns: vaccination status first, then the
# follow-up likelihood question. The strings are used verbatim as column labels.
target_outcome = [
    "Have you received a vaccination against COVID-19, either one or two doses? Answer with No; Yes; Refused; Don't know",
    "If a vaccine for COVID-19 is available, how likely are you to try to get vaccinated? Answer with Very unlikely; Somewhat unlikely; Somewhat likely; Very likely; Refused; Don't know. Answer Not applicable if you have never received a COVID-19 vaccination"
]

# Single user turn that asks both outcomes; the second is conditional on the
# answer to the first.
question = f"{target_outcome[0]}. If your response to the previous question is No, Refused or Don't Know, then answer the following question: {target_outcome[1]}"

# Instruction text that is placed in front of each respondent's demographic transcript.
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana. You will be provided with a demographic profile that describes the geographical area/region/district where you live, the facilities and services in your area, your age, view of the country, living conditions, voting preferences, trust in different authorities, experience when seeking healthcare, experience with COVID-19, views on vaccination, preferred sources of information, employment status, highest education level, religion, gender, and race. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

def format_afrobarometer_prompts(row: pd.Series, target_outcome) -> str:
    """Serialise one Afrobarometer record as a three-turn chat transcript.

    The assistant turn is just "Yes" when the first outcome is "Yes";
    otherwise it is the first and second outcome joined by ", ".

    Args:
        row: Record providing "system_prompt", "user_prompt" and the outcome
            columns.
        target_outcome: Sequence of two column labels: the vaccination status
            and the follow-up question.

    Returns:
        A JSON string encoding the system, user and assistant messages.
    """
    vaccinated = row[target_outcome[0]]
    if vaccinated == "Yes":
        reply = "Yes"
    else:
        # The follow-up column is only read when it is actually needed.
        reply = f"{vaccinated}, {row[target_outcome[1]]}"

    messages = [
        {"role": "system", "content": row["system_prompt"]},
        {"role": "user", "content": row["user_prompt"]},
        {"role": "assistant", "content": reply},
    ]
    return json.dumps(messages)


# Load Afrobarometer data; header=1 takes the column labels from the second row of the file
afro_data = pd.read_csv(f"data/{DATA}.csv", header=1)
# Keep only the profile questions, the two outcomes and the respondent ID
afro_data = afro_data[demographic_questions + target_outcome + ["ID"]]

# Make sure the output folder exists
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with missing responses
afro_data = afro_data.dropna(subset=target_outcome).reset_index(drop=True)

# Construct system, user, and response prompts.
# The exclusion list must be flat: passing [target_outcome, "ID"] nests the
# outcome list, so neither outcome column matched and both answers leaked into
# the demographic profile shown to the model.
afro_data["demographic_prompt"] = afro_data.apply(
    generate_demographic_prompt, axis=1, args=(target_outcome + ["ID"],)
)
afro_data["system_prompt"] = afro_data["demographic_prompt"].apply(lambda x: system_prompt + x)
afro_data["user_prompt"] = question

# Format data for fine-tuning
afro_data["text"] = afro_data.apply(format_afrobarometer_prompts, axis=1, args=(target_outcome,))

# Save files in CSV format
afro_data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Prepare Arce et al. (2021) Data for Context-tuning

In [8]:
# Dataset name: CSV stem under data/ and name of the output folder
DATA = "arce_et_al_2021_training"

# Outcome column; its label doubles as the question put to the model
target_outcome = "Respondent would take the vaccine if available?"

question = target_outcome

# Instruction text that is placed in front of each respondent's demographic transcript
system_prompt = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Africa. You will be provided with a demographic profile that describes your country, age, highest level of education, gender, reasons for taking the COVID-19 vaccine, reasons for not taking the COVID-19 vaccine, and people you would trust to help you decide to get vaccinated. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. After you receive your complete human subject profile, you will be asked whether you would take the COVID-19 vaccination, if available. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# Load Arce et al. data; header=1 takes the column labels from the second row of the file
arce_data = pd.read_csv(f"data/{DATA}.csv", header=1)

# Make sure the output folder exists
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with missing responses
arce_data = arce_data.dropna(subset=[target_outcome]).reset_index(drop=True)

# Retain records from African countries and randomly sample 2000 entries
african_countries = [
    'Burkina Faso',
    'Mozambique',
    'Sierra Leone 2',
    'Uganda 2',
    'Uganda 1',
    'Rwanda',
    'Sierra Leone 1',
    'Nigeria',
]
arce_data = arce_data[arce_data["Country where the study took place"].isin(african_countries)]
# drop=True discards the old row labels. With drop=False they became an
# "index" column, which generate_demographic_prompt then rendered as an
# interview question in every profile.
sampled_arce_data = arce_data.sample(n=2000, random_state=RANDOM_STATE).reset_index(drop=True)

# Construct system and question prompts
# NOTE(review): unlike the other datasets, no "ID" column is excluded here.
# Confirm that the Arce file carries no identifier column.
sampled_arce_data["demographic_prompt"] = sampled_arce_data.apply(generate_demographic_prompt, axis=1, args=([target_outcome],))
sampled_arce_data["system_prompt"] = sampled_arce_data["demographic_prompt"].apply(lambda x: system_prompt + x)
sampled_arce_data["user_prompt"] = question

# Format data for fine-tuning
sampled_arce_data["text"] = sampled_arce_data.apply(format_prompts, axis=1, args=(target_outcome,))

# Save files in CSV format
sampled_arce_data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

## Combine General Healthcare Survey Data

In [None]:
# Seed for the row shuffle, and the name of the combined output folder
RANDOM_STATE = 42
DATA = "general_healthcare_context_training"

# Make sure the output folder exists
data_folder_dir = DATA
if not os.path.exists(f"data/{data_folder_dir}"):
    os.makedirs(f"data/{data_folder_dir}")

# Read back the three per-survey training files written by the cells above
afrobarometer_data = pd.read_csv("data/afrobarometer_r9_ghana_latlong_training/train.csv")
arce_data = pd.read_csv("data/arce_et_al_2021_training/train.csv")
candour_data = pd.read_csv("data/candour_2_ghana_training/train.csv")

# Report the size of each source, one per line
print(afrobarometer_data.shape, arce_data.shape, candour_data.shape, sep="\n")

# Stack the sources, shuffle every row, then renumber the index from zero
merged_data = (
    pd.concat([afrobarometer_data, arce_data, candour_data], ignore_index=False)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

merged_data.to_csv(f"data/{data_folder_dir}/train.csv", index=False)

print(merged_data.shape)
merged_data.head()

# Prepare Duch 2023 Data for Context-tuning

In [None]:
from sklearn.model_selection import train_test_split

def include_variable_names(
    data_with_responses: pd.DataFrame, data_file_path: str
) -> pd.DataFrame:
    """Relabel a response table with the original file's headers.

    The original file (CSV or XLSX) is read with its first line as header. Its
    first data row is used as a lookup: every column label of
    ``data_with_responses`` that equals a value in that row is replaced by the
    header standing above that value; labels without a match are kept. The
    labels the table had on entry are preserved as the first row of the result.

    Args:
        data_with_responses (pd.DataFrame): Table whose column labels are to be mapped back.
        data_file_path (str): Path to the original CSV or XLSX file.
    Returns:
        pd.DataFrame: The input rows preceded by one row holding the previous
        column labels, with columns renamed to the original headers.
    Raises:
        ValueError: If ``data_file_path`` ends with neither ".csv" nor ".xlsx".
    """
    # Pick the reader from the file suffix; anything else is rejected before any I/O
    if data_file_path.endswith(".csv"):
        reader = pd.read_csv
    elif data_file_path.endswith(".xlsx"):
        reader = pd.read_excel
    else:
        raise ValueError("Unsupported file format. Please provide a CSV or XLSX file.")
    source_table = reader(data_file_path)

    # original header -> value found in the first data row below it
    header_to_first_row = source_table.iloc[0].to_dict()

    current_labels = data_with_responses.columns

    # Reverse lookup: first header whose first-row value equals the label, else the label itself
    restored_labels = [
        next(
            (
                header
                for header, first_row_value in header_to_first_row.items()
                if first_row_value == label
            ),
            label,
        )
        for label in current_labels
    ]

    # Keep the labels the table currently has as its first row
    label_row = pd.DataFrame([current_labels], columns=current_labels)
    relabelled = pd.concat([label_row, data_with_responses], ignore_index=True)

    # Swap in the headers recovered from the original file
    relabelled.columns = restored_labels

    return relabelled


# Seed passed to train_test_split further down in this cell
RANDOM_STATE = 42
# Base name shared by the input file (data/<DATA>.csv), the holdout file
# (data/<DATA>_holdout.csv) and the output folder (data/<DATA>/)
DATA = "duch_et_al_2023_vaccine_financial_vaccine_intention_training"

# Columns kept as the respondent profile. The entries are used to select
# columns from the loaded CSV, so each string has to equal a column header.
demographic_questions = [
    "Start Date",
    "What is your current age?",
    "What is your gender?",
    "What is the highest educational qualification you have completed?",
    "Which region do you live in?",
    # NOTE(review): "distric" presumably mirrors the header in the data file — verify before correcting
    "Which distric do you live in?",
    "What is the name of the community you live in?",
    "How many people live in your village?",
    "What is the distance in km of the nearest health clinic from where you live?",
    "How many people live in the house together with you (NOT including you) at this moment?",
    "How many children below 18 years old are currently living in your home?",
    "What is your current working situation?",
    "How much (in Ghanaian Cedis) on average does your household spend in a typical week on food?",
    "How much (in Ghanaian Cedis) on average does your household spend in a typical week on non-food items (electricity, water, rent, school fees)?",
    "How would you rate the overall economic or financial condition of your household today?",
    "Do you have a registered mobile number?",
    "How many family members do you have in another village?",
    "How many friends and acquaintances who are not part of your family do you have in another village?",
    "How many individuals can you identify in your social network? Think of friends and relatives that live close to you",
    "How often do you use social media?",
]

# Column holding the answer the model is trained to reproduce
target_outcome = "Do you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you?"
# User turn of every training example: the target question plus the answer format.
# NOTE(review): "repond" is a typo for "respond"; it is part of the prompt text, so
# changing it alters the generated training data — confirm against any evaluation prompt first.
question = f"{target_outcome} Please only repond with 'Yes', 'No', 'Do not know', or 'Prefer not to say':"

# Opening part of the system prompt (role instructions). It ends with the
# "Your demographic profile:" header because format_system_prompt appends the
# respondent's demographic prompt directly after it.
system_prompt_1 = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana about the COVID-19 vaccine. You will be provided with a demographic profile that describes your age, gender, highest education level you achieved, region/district you live in, size of your village, distance to nearest health clinic in km, household size, current employment situation, average household spending, household economic/financial condition, number of family members and friends in another village, social network, and social media use. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will watch a video. Thereafter, you will be asked whether you think you will get a first shot of a COVID-19 vaccine within the first 6 weeks after the vaccine becomes available to you. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# Bridge between the demographic profile and the video transcript that
# format_system_prompt appends after it
system_prompt_2 = """

You are asked to watch a video at this point. Here you are provided with the transcript of the video. You have to read the full transcript in order to continue the survey:
"""

# Video transcript per treatment arm; looked up with the row's "treatment" value,
# so the keys have to equal the values stored in that column
treatment_transcript = {
    "CDC Health": "Health authorities are working hard to distribute the COVID-19 vaccines free for everyone with no strings attached. COVID 19 vaccines are safe and effective. After you have been fully vaccinated you can resume activities that you did prior to the pandemic. Getting the COVID-19 vaccine will help prevent you from getting COVID-19 and reduce your risk of being hospitalized with COVID-19. COVID 19 vaccine help you to protect yourself your environment and your loved ones from COVID-19 exposure.",
    "Placebo": "The Sun lights up our lives for business for education even for socializing but when the Sun sets many people use candles who are quality battery-operated torches and kerosene lamps as inefficient and expensive ways to create light. What if you can take some Sun with you at night?  You can with portable solar products there are different types, but each portable solar product is made up of three basic parts: a small solar panel, a modern rechargeable battery and an LED bulb. The solar panel catches the light from the Sun and stores this energy in the battery. This can now be used for much needed light when it's dark. Many can even charge phones portable solar products should be reliable affordable and warranted be sure to demand top quality solar products look for these products lighting Africa shining the way.",
    "Low Cash": "Health authorities are working hard to distribute the COVID-19 vaccines free for everyone with no strings attached. COVID-19 vaccines are safe and effective. After you have been fully vaccinated you can resume activities that you did prior to the pandemic. If you have at least one COVID-19 vaccine shot you will receive 20 Cedi. If you get vaccinated, you will get rewarded.",
    "High Cash": "Health authorities are working hard to distribute the COVID-19 vaccines free for everyone with no strings attached. COVID-19 vaccines are safe and effective. After you have been fully vaccinated you can resume activities that you did prior to the pandemic. If you have at least one COVID-19 vaccine shot you will receive 60 Cedi. If you get vaccinated, you will get rewarded.",
}

def format_system_prompt(row: pd.Series) -> str:
    """Assemble the complete system prompt for one respondent.

    The pieces are joined in this order: the role instructions, the row's
    ``demographic_prompt``, the video introduction, and the transcript that
    belongs to the row's ``treatment`` value.

    Args:
        row (pd.Series): Respondent record with "demographic_prompt" and "treatment".
    Returns:
        str: The concatenated system prompt.
    Raises:
        KeyError: If the treatment value has no entry in ``treatment_transcript``.
    """
    profile = row["demographic_prompt"]
    transcript = treatment_transcript[row["treatment"]]
    pieces = [system_prompt_1, profile, system_prompt_2, transcript]
    return "".join(pieces)


# Load data (header=1: the second line of the file is used for the column names)
data = pd.read_csv(f"data/{DATA}.csv", header=1)
data = data[demographic_questions + [target_outcome, "ID", "treatment"]]

data_folder_dir = DATA
# exist_ok=True replaces the check-then-create pattern and is a no-op if the folder exists
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with missing target responses
data = data.dropna(subset=[target_outcome]).reset_index(drop=True)

# Perform fine-tuning data (10%) and holdout data (90%) split
finetune_data, holdout_data = train_test_split(data, test_size=0.9, random_state=RANDOM_STATE)
# The split returns slices of `data`; take an independent copy before adding
# columns so pandas does not raise SettingWithCopyWarning
finetune_data = finetune_data.copy()
holdout_data = include_variable_names(holdout_data, f"data/{DATA}.csv")
holdout_data.to_csv(f"data/{DATA}_holdout.csv", index=False)

# The question order inside each profile is shuffled with the global `random`
# generator; seed it here so the output does not depend on which cells ran before
random.seed(RANDOM_STATE)

# Construct system and question prompts.
# "treatment" is excluded together with the target and the ID: it is conveyed
# through the video transcript in format_system_prompt and would otherwise show
# up in the profile as an "Interviewer: treatment" question.
finetune_data["demographic_prompt"] = finetune_data.apply(
    generate_demographic_prompt,
    axis=1,
    args=([target_outcome, "ID", "treatment"],),
)
finetune_data["system_prompt"] = finetune_data.apply(format_system_prompt, axis=1)
finetune_data["user_prompt"] = question

# Format data for fine-tuning
finetune_data["text"] = finetune_data.apply(format_prompts, axis=1, args=(target_outcome,))

# Save files in CSV format
finetune_data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

# Prepare Meriggi et al. 2024 for Context-Tuning

In [41]:
# NOTE(review): defined for this cell but not referenced by the code visible in it
RANDOM_STATE = 42
# Base name shared by the input file (data/<DATA>.csv) and the output folder (data/<DATA>/)
DATA = "meriggi_et_al_2024_last_mile_training_data"

# Dataset variable name -> interview question shown in the prompts. Used both
# for the demographic profile and for the user turns of the target outcomes.
# BSL_* / END_* pairs deliberately share the same wording.
question_mapping = {
    "villsize":"How many people live in your village?",
    "periphery":"Do you live on the periphery of your community? Please, reply with Yes or No",
    "vaccinated_team":"Have you been vaccinated against COVID-19 in a mobile vaccination clinic? Please, reply with Yes or No",
    "vaccinated_baseline":"Have you been vaccinated against COVID-19 and have vaccine card to prove it? Please, reply with Yes or No",
    "preg":"Are you pregnant? Please, reply with Yes or No",
    "breast":"Are you breastfeeding? Please, reply with Yes or No",
    "age":"What is your age in years?",
    "female":"What is your gender? Please, reply with Male or Female",
    "hh_gender":"What is the gender of the head of your household? Please, reply with Male or Female",
    "farmer":"Is farming the chief source of income for your household? Please, reply with Yes or No",
    "anyschooling":"Has the head of your household received any formal education? Please, reply with Yes or No",
    "BSL_owns_land":"Do you or your household members own any land? Please, reply with Yes or No",
    "BSL_covid_believe":"Do you believe that COVID-19 exists in the world? Please, reply with Yes or No",
    "BSL_covid_know":"Do you know about the COVID-19 vaccine/marklate? Please, reply with Yes or No",
    "BSL_covid_wouldtake":"Would you take a COVID-19 vaccine/marklate if it were offered to you? Please, reply with Yes or No",
    "BSL_reduced_portions":"Over the last 7 days (week), has your household had reduced portions/quantities served per meal for more than 1 day? Please, reply with Yes or No",
    "BSL_safe_stragree":"Do you strongly agree with this statement: COVID-19 vaccines are safe. Please, reply with Yes or No",
    "BSL_effect_stragree":"Do you strongly agree with this statement: COVID-19 vaccines are effective. Please, reply with Yes or No",
    "END_covid_believe":"Do you believe that COVID-19 exists in the world? Please, reply with Yes or No",
    "END_covid_know":"Do you know about the COVID-19 vaccine/marklate? Please, reply with Yes or No",
    "END_safe_stragree":"Do you strongly agree with this statement: COVID-19 vaccines are safe. Please, reply with Yes or No",
    "END_effect_stragree":"Do you strongly agree with this statement: COVID-19 vaccines are effective. Please, reply with Yes or No",
    "religion":"What is the religion of the head of the household?",
    "BSL_trust":"Who do you most trust getting information about COVID-19? Please, reply with CHC, Family and friends, Social media, Media (i.e. news/radio/tv), or Ministry of Health and Sanitation",
    "END_trust":"Who do you most trust getting information about COVID-19? Please, reply with CHC, Family and friends, Social media, Media (i.e. news/radio/tv), or Ministry of Health and Sanitation",
}

# Variables intended for the demographic profile.
# NOTE(review): this list is not referenced by the code visible in this cell; the
# profile columns are instead derived from the row's columns minus the exclusion
# list passed to generate_meriggi_demographic_prompt.
demographic_questions = [
    "villsize",
    "periphery",
    "preg",
    "breast",
    "age",
    "female",
    "hh_gender",
    "farmer",
    "anyschooling",
    "BSL_owns_land",
    "BSL_covid_believe",
    "BSL_covid_know",
    "BSL_covid_wouldtake",
    "BSL_reduced_portions",
    "BSL_safe_stragree",
    "BSL_effect_stragree",
    "religion",
    "BSL_trust",
    "vaccinated_baseline",
]

# Variables the model is trained to answer; the list order is the order of the
# user/assistant turns produced by format_meriggi_user_prompts
target_outcomes = [
    "vaccinated_team",
    "END_covid_believe",
    "END_covid_know",
    "END_safe_stragree",
    "END_effect_stragree",
    "END_trust",
]

# Opening part of the system prompt (role instructions). It ends with the
# "Your demographic profile:" header because format_meriggi_system_prompt
# appends the respondent's demographic prompt directly after it.
system_prompt_1 = """Please put yourself in the shoes of a human subject participating in a healthcare survey in a remote rural community in Sierra Leone about the COVID-19 vaccine. You will be provided with a demographic profile that describes your age, gender, whether the head of your household received any formal education, size of your village, whether you live on the periphery of your community, and whether you or your household own any land. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will be provided with the description of an initiative that was carried out in your area. Thereafter, you will be asked whether you received the COVID-19 vaccination from a mobile vaccination team. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# Description of the intervention (steps 1-4) placed after the demographic
# profile. It ends with a blank line because format_meriggi_system_prompt may
# append a treatment-specific "Step 5." paragraph right after it.
system_prompt_2 = """

In your area, an initiative was implemented in March–April 2022 in partnership with the Sierra Leone Ministry of Health and Sanitation (MoHS) and the international non-governmental organization (NGO) Concern Worldwide. The primary aim of this intervention was to take vaccine doses and nurses to administer vaccines to remote, rural communities. The initiative involved the following steps.

Step 1. On the first day of the intervention, a social mobilization team—trained and supervised by the MoHS—organized a conversation with all village leaders, including the town chief, mammy queen, town elders, the youth leaders and religious leaders, and any other important stakeholders including the paramount and section chiefs if they were available. The mobilization team explained the purpose of the visit, answered questions about the available vaccines and asked leaders for their cooperation in encouraging eligible community members to take the COVID-19 vaccine.

Step 2. Social mobilizers then asked leaders to convene a community meeting that same evening (when people return home from farms) to allow mobilizers to talk directly with all village residents about vaccine efficacy and safety, the importance of getting vaccinated, and to address villagers’ questions and concerns. This process ended with social mobilizers explaining the location and timing of the mobile vaccination site that they were about to set up.

Step 3. Vaccine doses, nurses to administer vaccines and MoHS staff who could register the vaccinated were brought into the community either the same evening or early the next morning. The vaccine doses and staff often travelled on motorbikes or on boats given the difficult terrain they had to traverse to reach these remote communities.

Step 4. Once the team was in place, the temporary vaccination site started operating in a central location in the village. The vaccination site remained operational from sunrise to sunset over the next 2–3 days.

"""


def format_meriggi_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialise one respondent's conversation as a JSON string.

    The conversation opens with the row's system prompt. For every outcome in
    ``target_outcomes`` (in the given order) whose value is neither null nor the
    string "NA", a user turn with the mapped question and an assistant turn
    with the recorded answer are added.

    Args:
        row (pd.Series): Respondent record with "system_prompt" and the outcome columns.
        target_outcomes: Outcome variable names; each must be a key of ``question_mapping``.
    Returns:
        str: JSON-encoded list of chat messages.
    """
    conversation = [{"role": "system", "content": row["system_prompt"]}]

    for outcome in target_outcomes:
        answer = row[outcome]
        # Unanswered outcomes produce no turn at all
        if pd.isnull(answer) or answer == "NA":
            continue
        conversation.append({"role": "user", "content": question_mapping[outcome]})
        conversation.append({"role": "assistant", "content": answer})

    return json.dumps(conversation)


def format_meriggi_system_prompt(row: pd.Series) -> str:
    """Assemble the system prompt for one respondent, including the treatment-specific step.

    The role instructions, the row's demographic prompt and the intervention
    description always come first. Depending on ``row["treatment"]``:

    * "Group Mobilization": the group "Step 5." paragraph is added, followed by
      a sentence on attendance when ``attended`` is "1" or "0".
    * "Individual Mobilization": the door-to-door "Step 5." paragraph is added
      only when ``dtd`` is "1".

    Values are compared after conversion with ``str``; "NA"/null (and "0" for
    ``dtd``) add nothing.

    Args:
        row (pd.Series): Respondent record with "demographic_prompt", "treatment"
            and the matching "attended" or "dtd" column.
    Returns:
        str: The concatenated system prompt.
    Raises:
        ValueError: For an unknown treatment, or an unexpected ``attended``/``dtd`` value.
    """
    sections = [system_prompt_1, row["demographic_prompt"], system_prompt_2]
    treatment = row["treatment"]

    if treatment == "Group Mobilization":
        sections.append(
            "Step 5. Social mobilizers targeted social groups who gathered at fixed spots in and around the village (for example, groups of farmers in fields, mosque attendees or women collecting water). Social mobilizers engaged the group to have joint conversations about the vaccines."
        )
        attended = row["attended"]
        attended_text = str(attended)
        if attended_text == "1":
            sections.append(" You attended a group session organised by MoHS staff.")
        elif attended_text == "0":
            sections.append(" You did not attend a group session organised by MoHS staff.")
        elif not (attended_text == "NA" or pd.isnull(attended)):
            # Anything other than 1 / 0 / missing is treated as a data error
            raise ValueError(f"{attended} is not considered.")

    elif treatment == "Individual Mobilization":
        dtd = row["dtd"]
        dtd_text = str(dtd)
        if dtd_text == "1":
            sections.append(
                "Step 5. Social mobilizers came to your household to privately discuss any concerns about that vaccine that the household residents had and to encourage them to visit the vaccination site."
            )
        elif not (dtd_text in ["0", "NA"] or pd.isnull(dtd)):
            # Anything other than 1 / 0 / missing is treated as a data error
            raise ValueError(f"{dtd} is not considered.")

    else:
        raise ValueError(f"{treatment} is not considered.")

    return "".join(sections)


def generate_meriggi_demographic_prompt(row, excluded_columns):
    """Render a respondent's answers as a numbered, randomly ordered interview.

    All columns of ``row`` that are not listed in ``excluded_columns`` are
    shuffled with the module-level ``random`` generator. Columns whose value is
    null or the string "NA" are skipped; the remaining ones are numbered
    consecutively from 1 and written as
    "<n>) Interviewer: <question> Me: <answer> ", where the question text comes
    from ``question_mapping``.

    Args:
        row: Respondent record (pandas Series).
        excluded_columns: Column names to leave out of the profile.
    Returns:
        str: The concatenated interview lines ("" when nothing is left).
    """
    candidate_columns = [column for column in row.index if column not in excluded_columns]
    random.shuffle(candidate_columns)

    interview_lines = []
    for column in candidate_columns:
        answer = row[column]
        if pd.isnull(answer) or answer == "NA":
            continue
        # Numbering counts only the answers that are actually emitted
        position = len(interview_lines) + 1
        interview_lines.append(
            f"{position}) Interviewer: {question_mapping[column]} Me: {answer} "
        )

    return "".join(interview_lines)


# Load data
data = pd.read_csv(f"data/{DATA}.csv")

# Drop row with survey question description
data = data.drop(data.index[0]).reset_index(drop=True)

data_folder_dir = DATA
# exist_ok=True replaces the check-then-create pattern and is a no-op if the folder exists
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# The question order inside each profile is shuffled with the global `random`
# generator; seed it here so the output does not depend on which cells ran before
random.seed(RANDOM_STATE)

# Columns kept out of the demographic profile: the outcomes to predict, the
# respondent identifier, and the treatment information that
# format_meriggi_system_prompt turns into text
profile_exclusions = target_outcomes + ["master_person_id", "treatment", "dtd", "attended", "incomplete_observations"]

# Construct system and question prompts
data["demographic_prompt"] = data.apply(generate_meriggi_demographic_prompt, axis=1, args=(profile_exclusions,))
data["system_prompt"] = data.apply(format_meriggi_system_prompt, axis=1)

# Format data for fine-tuning
data["text"] = data.apply(format_meriggi_user_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

# Prepare Fink et al. 2018 for Context-Tuning

In [44]:
# NOTE(review): defined for this cell but not referenced by the code visible in it
RANDOM_STATE = 42
# Base name shared by the input file (data/<DATA>.csv) and the output folder (data/<DATA>/)
DATA = "fink_et_al_2018_training"

# Outcome columns the model is trained to answer. Each string serves both as
# the column name in the data and as the text of the user turn; the list order
# is the order of the turns produced by format_fink_user_prompts.
target_outcomes = [
    "In 2014: if you own an insecticide-treated net, did you sleep under it last night?",
    "In 2016: if you own an insecticide-treated net, did you sleep under it last night?",
    "In 2014: if your household owns an insecticide-treated net, did your youngest child sleep under it last night?",
    "In 2016: if your household owns an insecticide-treated net, did your youngest child sleep under it last night?",
    "In 2014: were you assisted by someone (e.g., a doctor, nurse, midwife, SBA, or community health worker) during your last delivery?",
    "In 2016: were you assisted by someone (e.g., a doctor, nurse, midwife, SBA, or community health worker) during your last delivery?",
    "In 2014: did you exclusively breastfeed your youngest child when they were younger than six months?",
    "In 2016: did you exclusively breastfeed your youngest child when they were younger than six months?",
    "In 2014: did you give your child ORS or go to the clinic the last time they had diarrhea?",
    "In 2016: did you give your child ORS or go to the clinic the last time they had diarrhea?",
    "In 2014: if you have a handwashing station in your household, is there soap available to wash your hands?",
    "In 2016: if you have a handwashing station in your household, is there soap available to wash your hands?",
    "In 2014: did you exclusively breastfeed your youngest child when they were younger than two years?",
    "In 2016: did you exclusively breastfeed your youngest child when they were younger than two years?",
]

# Opening part of the system prompt (role instructions). It ends with the
# "Your demographic profile:" header because format_fink_system_prompt appends
# the respondent's demographic prompt directly after it.
# NOTE(review): "ORS tables" in the text below looks like a typo for "ORS tablets";
# it is part of the prompt, so changing it alters the generated training data.
system_prompt_1 = """Please put yourself in the shoes of a human female subject participating in a healthcare survey. You will be provided with a demographic profile that describes your region and district, the level of urbanization of your area, whether you own a mobile phone and whose phone you use, your age, whether you were born in your current community, how long have you lived there, who heads your household, your religion, your marital status, your partner’s religion, the highest level of education attained by you and your partner, whether you have ever given birth, how many children you desire, your literacy level, and health-related topics such as your use of the healthcare system, sources of health information, the distance to the nearest clinic/hospital, and your knowledge and attitudes regarding hand-washing, exclusive breastfeeding, Oral Rehydration Solutions (ORS) for diarrhea, the use of insecticide treated nets, and skilled birth attendance during delivery. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Lastly, you will be provided with a description of an initiative that was carried out in your area. Thereafter, you will be asked whether you and your youngest child slept under an insecticide treated net last night, whether you received assistance during your last delivery, whether you exclusively breastfed your youngest child before they turned 6 months, whether you gave ORS tables to your child or visited a clinic the last time they had diarrhea, whether you washed your hands with soap, and whether you exclusively breastfed your youngest child before they turned 2 years. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. 
The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# General description of the initiative placed after the demographic profile.
# It ends with a blank line because format_fink_system_prompt appends the
# treatment-specific paragraphs right after it.
system_prompt_2 = """

In 2012, an initiative was implemented in your area by the Ghana Health Services (GHS), with funding from the United Nations Children’s Fund (UNICEF). The primary aim of this intervention was to encourage families to adopt and consistently practice five health behaviours which are critical for preventing under-five mortality: sleeping under an insecticide treated mosquito net, utilization of oral rehydration solution (ORS) for the treatment of diarrhea, hand-washing with soap, exclusive breastfeeding, and delivery with a skilled birth attendant. 

"""


def format_fink_user_prompts(row: pd.Series, target_outcomes) -> str:
    """Serialise one respondent's conversation as a JSON string.

    The conversation opens with the row's system prompt. For every entry of
    ``target_outcomes`` (in the given order) whose value in ``row`` is neither
    null nor the string "NA", a user turn containing the outcome text itself
    and an assistant turn containing the recorded answer are added.

    Args:
        row (pd.Series): Respondent record with "system_prompt" and the outcome columns.
        target_outcomes: Outcome column names, which double as the question text.
    Returns:
        str: JSON-encoded list of chat messages.
    """
    conversation = [{"role": "system", "content": row["system_prompt"]}]

    for outcome_question in target_outcomes:
        answer = row[outcome_question]
        # Unanswered outcomes produce no turn at all
        if pd.isnull(answer) or answer == "NA":
            continue
        conversation.append({"role": "user", "content": outcome_question})
        conversation.append({"role": "assistant", "content": answer})

    return json.dumps(conversation)


def format_fink_system_prompt(row: pd.Series) -> str:
    """Assemble the system prompt for one respondent, including the treatments received.

    The role instructions, the row's demographic prompt and the general
    description of the initiative always come first. One paragraph is then
    added for each treatment recorded in the row, in this order: Centre for
    National Culture activity (live drama or video), community radio
    broadcasts, and voice messages (same caller or different callers). The
    prompt closes with the sentence about the 2014 and 2016 evaluations.

    Args:
        row (pd.Series): Respondent record with "demographic_prompt" and the
            three treatment columns.
    Returns:
        str: The concatenated system prompt.
    """
    cnc_column = "In your community, did the Centre for National Culture showed a video or a live drama about health behaviors (e.g., washing hand, exclusive breastfeeding, ORS)?"
    radio_column = "Did your community radio broadcast programs about health behaviors?"
    calls_column = "Did you received phone calls from healthcare workers informing you about health behaviours? If so, was the person who called always the same?"

    # Every treatment paragraph ends with the same description of the five behaviours
    shared_ending = " aimed at informing on the five health behaviors that are critical for preventing under-five mortality (i.e., sleeping under an insecticide treated mosquito net, utilization of oral rehydration solution (ORS) for the treatment of diarrhea, hand-washing with soap, exclusive breastfeeding, and delivery with a skilled birth attendant)." + "\n\n"

    sections = [system_prompt_1, row["demographic_prompt"], system_prompt_2]

    cnc_answer = row[cnc_column]
    radio_answer = row[radio_column]
    calls_answer = row[calls_column]

    if cnc_answer == "Live drama shows by Center for National Culture":
        sections.append(
            "The initiative consisted of a theatre (or live) drama that was played in your community with the support of the Centre for National Culture (CNC). These dramas"
            + shared_ending
        )

    if cnc_answer == "Video by Center for National Culture":
        sections.append(
            "The initiative consisted of a video screening of a recorded drama which was played in your community with the support of the CNC. These videos"
            + shared_ending
        )

    if radio_answer == "Yes":
        sections.append(
            "The initiative consisted of focus group discussions and jingles broadcasted by the Ghana Community Radio Network (GCRN). These focus group discussions and jingles"
            + shared_ending
        )

    if calls_answer == "Always called by the same person":
        sections.append(
            "Voice messages were sent by the same person to your mobile phone. This message"
            + shared_ending
        )

    if calls_answer == "Called by different people":
        sections.append(
            "Voice messages were sent by different people to your mobile phone. This message"
            + shared_ending
        )

    sections.append("The results of this initiative were evaluated in two different years: 2014 and 2016.")
    return "".join(sections)


# Load data (header=1: the second line of the file is used for the column names)
data = pd.read_csv(f"data/{DATA}.csv", header=1)

data_folder_dir = DATA
# exist_ok=True replaces the check-then-create pattern and is a no-op if the folder exists
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# The question order inside each profile is shuffled with the global `random`
# generator; seed it here so the output does not depend on which cells ran before
random.seed(RANDOM_STATE)

# Treatment columns are described in prose by format_fink_system_prompt and must
# therefore stay out of the demographic profile. The first name has to be spelled
# exactly as format_fink_system_prompt reads it ("In your community ..."); the
# previous "In you community ..." matched no column, so it excluded nothing.
treatment_columns = [
    "In your community, did the Centre for National Culture showed a video or a live drama about health behaviors (e.g., washing hand, exclusive breastfeeding, ORS)?",
    "Did your community radio broadcast programs about health behaviors?",
    "Did you received phone calls from healthcare workers informing you about health behaviours? If so, was the person who called always the same?",
]

# Construct system and question prompts
data["demographic_prompt"] = data.apply(
    generate_demographic_prompt,
    axis=1,
    args=(target_outcomes + ["Household ID", "Community ID"] + treatment_columns,),
)
data["system_prompt"] = data.apply(format_fink_system_prompt, axis=1)

# Format data for fine-tuning
data["text"] = data.apply(format_fink_user_prompts, axis=1, args=(target_outcomes,))

# Save files in CSV format
data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)