# Helper Functions

# In [16]:
import pandas as pd
import os
import json

def generate_demographic_prompt(row, excluded_columns):
    """Render one respondent's answered survey items as an interview transcript.

    Every index label of ``row`` that is not listed in ``excluded_columns`` is
    treated as a survey question. Questions whose answer is null are skipped;
    the remaining ones are numbered consecutively starting at 1 and written as
    ``"<n>) Interviewer: <question> Me: <answer> "``. The respondent's
    backstory is appended after a blank line.

    Args:
        row: A pandas Series holding one respondent. It must contain a
            ``'backstory'`` entry.
        excluded_columns: Index labels that must not appear in the transcript.

    Returns:
        The interview transcript, a blank line, then ``row['backstory']``.
    """
    # Keep only the (question, answer) pairs that were actually answered so
    # the numbering below has no gaps.
    answered = [
        (label, row[label])
        for label in row.index
        if label not in excluded_columns and not pd.isnull(row[label])
    ]

    interview = "".join(
        f"{number}) Interviewer: {label} Me: {answer} "
        for number, (label, answer) in enumerate(answered, start=1)
    )

    return f"{interview}\n\n{row['backstory']}"

def format_prompts(row: pd.Series, target_outcome) -> str:
    """Serialize one respondent as a three-message chat transcript in JSON.

    Args:
        row: A pandas Series containing ``"system_prompt"``, ``"user_prompt"``
            and an entry labelled ``target_outcome``.
        target_outcome: Label of the entry whose value becomes the assistant
            message.

    Returns:
        A JSON string encoding a list of ``{"role", "content"}`` dicts in the
        order system, user, assistant.
    """
    # (chat role, label of the row entry supplying its content)
    role_sources = (
        ("system", "system_prompt"),
        ("user", "user_prompt"),
        ("assistant", target_outcome),
    )
    messages = [
        {"role": role, "content": row[label]} for role, label in role_sources
    ]
    return json.dumps(messages)


# Prepare Candor Data for Context-tuning

# In [48]:
# Path of the Excel workbook holding one generated backstory per respondent ID.
BACKSTORY_DATA = "data/candor_wave2_ghana_backstory.xlsx"
# NOTE(review): not referenced anywhere in the visible part of this file.
RANDOM_STATE = 42
# Stem of the survey workbook; it is read from data/<DATA>.xlsx.
# NOTE(review): spelled "candour" here but "candor" in BACKSTORY_DATA — confirm the file name on disk.
DATA = "candour_wave2_ghana_context_training"

# CANDOR: column headers (full survey question wordings) that are selected
# from the survey workbook and rendered into each respondent's profile.
# The strings are lookup keys and must match the workbook headers exactly.
demographic_questions = [
    'country',
    'What is your gender?',
    'What is your current age?',
    'Thinking back to 12 months ago, has your household income increased or decreased since then?',
    'The following is a scale from 0 to 10 that goes from left to right, where 0 means "Left" and 10 means "Right". Today when talking about political trends, many people talk about those who are more sympathetic to the left or the right. According to the sense that the terms "Left" and "Right" have for you when you think about your political point of view, where would you find yourself on this scale?',
    'Gross HOUSEHOLD income combines your gross income with that of your partner or any other household member with whom you share financial responsibilities BEFORE any taxes are paid and BEFORE any benefits are obtained. What is your gross annual household income?',
    'What is the highest educational qualification you have completed?',
    'Do you have any dependent children who live with you? (By "dependent" children, we mean those who are not yet financially independent).',
    'Are you currently married, in a civil partnership, or living with a partner?',
    'Would you vote to re-elect this government in the next election?',
    'Overall, how would you rate the current Ghanaian government on a scale of 0 (very low rating) to 100 (very high rating)?  Please use the slider to indicate your rating from very low (0) to very high (100).',
    'Select the region you live in.',
    'Select the district you live in.',
    'We have some questions about your experience with the COVID-19 virus since the beginning of this year, January, 2021. Since the beginning of this year, do you believe you have been infected with the COVID-19 virus?',
    'We have some questions about your experience with the COVID-19 virus since the beginning of this year, January, 2021. Since the beginning of this year, have you had a COVID-19 test that showed that you were infected with the virus?',
    'We have some questions about your experience with the COVID-19 virus since the beginning of this year, January, 2021. Since the beginning of this year, has a relative of yours been infected with COVID-19?',
    'We have some questions about your experience with the COVID-19 virus since the beginning of this year, January, 2021. Since the beginning of this year, have any friends or colleagues of yours been infected with COVID-19?',
    'We have some questions about your experience with the COVID-19 virus since the beginning of this year, January, 2021. Do you know of anyone who has died from COVID-19?',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. All school children should be required by law to get a COVID-19 vaccine.',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. All health care workers who are in contact with patients should be required by law to get a COVID-19 vaccine.',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. Whether a person gets a COVID-19 vaccine or not should be a matter of personal choice.',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. Any individual over the age of 65 should be required by law to get a COVID-19 vaccine.',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. Employers should be allowed to require all employees to get a COVID-19 vaccine.',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. The government should make COVID-19 vaccination mandatory for everybody.',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. Health clinics should be required by law to give a suitable version of a COVID-19 vaccine to all newborns, infants and pre-school children.',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. People should not be allowed to travel to other countries unless they can demonstrate that they have been vaccinated against COVID-19.',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. Only people fully-vaccinated against COVID-19 should be allowed into large indoors events such as cinemas, night clubs and concerts.',
    'We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. Only people fully-vaccinated against COVID-19 should be allowed into cafes and restaurants.',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Canceled a doctor appointment',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Worn a face mask',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Visited a doctor or hospital',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Canceled or postponed work activities',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Canceled or postponed school activities',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Canceled outside housekeepers or caregivers',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Avoided some or all restaurants',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Worked from home',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Studied from home',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Canceled or postponed pleasure, social, or recreational activites',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Stockpiled food or water',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Avoided public or crowded places',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Prayed',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Avoided contact with high-risk people',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Kept six feet distance from those outside my household',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Stayed home because I felt unwell',
    'Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19? Wiped packages entering my home',
    'How likely do you think it is that you will get COVID-19 in the next year? Use the slider to indicate how likely: 0% means "will definitely not happen" and 100% means "will definitely happen".',
    'We are interested in your opinion about the COVID-19 booster shots. Please use the sliding scale to indicate how much you agree or disagree with the following statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. The priority for vaccines should be first doses for those who want them before making booster shots available',
    'We are interested in your opinion about the COVID-19 booster shots. Please use the sliding scale to indicate how much you agree or disagree with the following statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. If a booster shot was available to me today, I would get it.',
    'We are interested in your opinion about the COVID-19 booster shots. Please use the sliding scale to indicate how much you agree or disagree with the following statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree. Vaccine booster shots will be needed at least every year to maintain protection against COVID-19.',
    'How is your health in general? Is it...',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your mobility A YEAR AGO (walking)',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your mobility TODAY (walking)',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your self-care A YEAR AGO',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your self-care TODAY',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your regular activities A YEAR AGO (e.g. work, study, housework, family or leisure activities)',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your regular activities TODAY (e.g. work, study, housework, family or leisure activities)',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your pain / discomfort A YEAR AGO',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your pain / discomfort TODAY',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your worries (anxiety) / depression A YEAR AGO',
    'On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY. Your worries (anxiety) / depression TODAY',
    'We would like to know how good or bad your health was/is A YEAR AGO/TODAY.  On the next screen you will see a scale numbered 0 to 100. 100 means the best health you can imagine. 0 means the worst health you can imagine. Please tap on the scale how your health was A YEAR AGO.',
    'We would like to know how good or bad your health was/is A YEAR AGO/TODAY.  On the next screen you will see a scale numbered 0 to 100. 100 means the best health you can imagine. 0 means the worst health you can imagine. Please tap on the scale how your health is TODAY.',
]

# Workbook column holding the answer the model is trained to reproduce; its
# wording is also the question put to the model in the user turn.
target_outcome = "Have you received a COVID-19 vaccine?"

# User-turn text: the target question plus the required answer format.
question = f"{target_outcome} Please only respond with 'Yes' or 'No' and then clearly explain the reasoning steps you took that led to your response on a new line:"

# Opening part of the system prompt; the respondent's interview-style profile
# is concatenated directly after it. A plain (non-f) string is used because
# the text contains no placeholders.
system_prompt_1 = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana. You will be provided with a demographic profile that describes the area/region/district where you live, your gender, the highest education level you achieved, your political point-of-view, your experience with the COVID-19 virus, your opinions about COVID-19 vaccines, and your backstory. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Additionally, we will provide you with some general findings from past studies on Ghana’s COVID-19 vaccination efforts. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# Closing part of the system prompt, appended after the respondent's profile:
# background findings on COVID-19 vaccination in Ghana.
system_prompt_2 = """

You should note that the Health officials in Ghana have been communicating extensively to the population – both urban and rural about the COVID-19 virus. Most of the Ghana population know that the COVID-19 virus is dangerous for their health and they are aware of the benefits of getting the COVID-19 vaccination. However, vaccine hesitancy remain a notable challenge, influenced by misinformation and conspiracy theories circulating on social media. Despite efforts by health authorities to promote vaccination, some individuals remained cautious about the safety and efficacy of COVID-19 vaccines. Educational campaigns and outreach efforts are ongoing, but addressing deep-seated concerns and misinformation required continuous effort. Findings from past studies on COVID-19 vaccination efforts in Ghana reveal a complex interplay of factors influencing vaccine uptake and hesitancy. Positive perceptions of vaccines, belief in their efficacy, knowledge of COVID-19, and a generally favorable attitude toward vaccination significantly boost acceptance. Conversely, concerns about negative side effects, mistrust in vaccine safety, fear, and spiritual or religious beliefs contribute to hesitancy. Demographic factors such as educational attainment, gender, religious affiliation, age, and marital status play crucial roles in shaping attitudes towards vaccination. Higher levels of education, female gender, urban residence, Christian affiliation, and reliance on internet sources for COVID-19 information were associated with higher hesitancy rates. Notably, healthcare workers showed a varied acceptance rate influenced by their role, personal connections to COVID-19 cases, and trust in government measures. Despite efforts to increase coverage, only 40% of Ghanaians had received at least one vaccine dose."""


def format_response(response: str) -> str:
    """Collapse a detailed vaccination-status answer into "Yes" or "No".

    Args:
        response: The raw survey answer.

    Returns:
        "Yes" for the three answers reporting at least one shot, "No" for the
        two answers reporting a declined offer or a pending appointment, and
        None for any other value.
    """
    vaccinated_answers = (
        'Yes, received both the first and second vaccine shots',
        'Yes, received the first vaccine shot',
        'Yes, I am fully vaccinated and have also received a recent "booster" vaccine shot',
    )
    unvaccinated_answers = (
        'No, I have declined the offer to be vaccinated',
        'Not yet but I am waiting for my appointment',
    )

    if response in vaccinated_answers:
        return "Yes"
    if response in unvaccinated_answers:
        return "No"
    # Anything else is treated as "no usable answer".
    return None
    
def shorten_survey_questions(question: str) -> str:
    """Replace a known long survey preamble with a shorter equivalent.

    The question is compared against a fixed list of long opening passages.
    When it starts with one of them, that opening is swapped for its short
    form and the rest of the question is kept unchanged. Questions that match
    no known opening are returned as-is.

    Questions that carry only the general COVID-19 experience introduction
    (without the follow-up clause "Since the beginning of this year, ") are
    shortened as well, so every item of that question family ends up with the
    same "Since the beginning of 2021, " lead-in.

    Args:
        question: Full survey question text (or any other column label).

    Returns:
        The shortened question, or ``question`` itself when nothing matches.
    """
    covid_experience_intro = "We have some questions about your experience with the COVID-19 virus since the beginning of this year, January, 2021. "
    covid_experience_short = "Since the beginning of 2021, "

    # (long opening, short replacement), checked in order; first match wins.
    prefix_rewrites = (
        (
            covid_experience_intro + "Since the beginning of this year, ",
            covid_experience_short,
        ),
        (
            "We are interested in your opinion about the implementation of COVID-19 vaccines. Please use the sliding scale to indicate how much you agree or disagree with the statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree.",
            "Indicate how much you agree or disagree with the statement, where 0 means very much disagree and 100 means very much agree.",
        ),
        (
            "Since the beginning of the year, January, 2021, which of the following measures, if any, have you taken in response to COVID-19?",
            "Since the beginning of 2021, have you taken the following measure in response to COVID-19?",
        ),
        (
            'How likely do you think it is that you will get COVID-19 in the next year? Use the slider to indicate how likely: 0% means "will definitely not happen" and 100% means "will definitely happen".',
            'How likely do you think you will get COVID-19 in the next year? 0% means "will definitely not happen" and 100% means "will definitely happen".',
        ),
        (
            "We are interested in your opinion about the COVID-19 booster shots. Please use the sliding scale to indicate how much you agree or disagree with the following statements. You can move the pointer from 0 which means very much disagree to 100 which means very much agree.",
            "How much you agree or disagree with the following statement, where 0 means very much disagree and 100 means very much agree.",
        ),
        (
            "How is your health in general? Is it...",
            "How is your health in general?",
        ),
        (
            "On the following screens, please tap the statement that best describes your health A YEAR AGO and TODAY.",
            "Please select the statement that best describes your health A YEAR AGO and TODAY.",
        ),
        (
            "We would like to know how good or bad your health was/is A YEAR AGO/TODAY.  On the next screen you will see a scale numbered 0 to 100. 100 means the best health you can imagine. 0 means the worst health you can imagine. Please tap on the scale how your health",
            "We would like to know how good or bad your health was/is A YEAR AGO/TODAY. 100 means the best health you can imagine. 0 means the worst health you can imagine. Please indicate how your health",
        ),
    )

    for long_prefix, short_prefix in prefix_rewrites:
        if question.startswith(long_prefix):
            # The match is at position 0, so swapping the prefix is the same
            # as replacing its first occurrence.
            return short_prefix + question[len(long_prefix):]

    # Fallback for COVID-19 experience questions that follow the introduction
    # directly (e.g. "... January, 2021. Do you know of anyone who ...").
    if question.startswith(covid_experience_intro):
        remainder = question[len(covid_experience_intro):]
        # The remainder now continues a sentence, so lower-case an ordinary
        # capitalised first word; leave acronyms such as "COVID" untouched.
        if remainder[:1].isupper() and remainder[1:2].islower():
            remainder = remainder[:1].lower() + remainder[1:]
        return covid_experience_short + remainder

    # No known preamble: return the label unchanged.
    return question

# Load Candor data and keep only the profile columns, the target and the ID.
# .copy() makes the subset an independent frame so the column assignment
# below cannot act on (or warn about) a view of the original workbook data.
data = pd.read_excel(f"data/{DATA}.xlsx")
data = data[demographic_questions + [target_outcome, "ID"]].copy()

# Output folder for the training file; exist_ok makes this a no-op when the
# folder is already there and avoids a separate existence check.
data_folder_dir = "candor_ghana"
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Format target outcome response to Yes or No (unmapped answers become None)
data[target_outcome] = data[target_outcome].apply(format_response)

# Remove entries with missing responses
data = data.dropna(subset=[target_outcome]).reset_index(drop=True)

# Shorten survey questions (column headers are the question wordings)
data.columns = [shorten_survey_questions(col) for col in data.columns]

# Construct system and question prompts.
# The merge keeps only respondents whose ID appears in both tables.
backstories = pd.read_excel(BACKSTORY_DATA)
merged_data = pd.merge(left=data, right=backstories[["ID","backstory"]], on="ID")
# Target, ID and backstory are not interview questions, so they are excluded
# from the question/answer part of the profile.
merged_data["demographic_prompt"] = merged_data.apply(generate_demographic_prompt, axis=1, args=([target_outcome, "ID", "backstory"],))
merged_data["system_prompt"] = merged_data["demographic_prompt"].apply(lambda x: system_prompt_1 + x + system_prompt_2)
merged_data["user_prompt"] = question

# Format data for fine-tuning: one JSON chat transcript per respondent
merged_data["text"] = merged_data.apply(format_prompts, axis=1, args=(target_outcome,))

# Save files in CSV format (single "text" column)
merged_data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

# Prepare Afrobarometer Data for Context-tuning

# In [3]:
# Path of the Excel workbook holding one generated backstory per respondent ID.
BACKSTORY_DATA = "data/afrobarometer_backstory.xlsx"
# NOTE(review): not referenced anywhere in the visible part of this file.
RANDOM_STATE = 42
# Stem of the survey workbook (data/<DATA>.xlsx); also reused below as the
# name of the output folder.
DATA = "afrobarometer"

# Afrobarometer: column headers selected from the survey workbook and rendered
# into each respondent's profile. The strings are lookup keys and must match
# the workbook headers exactly.
demographic_questions = [
    "Do you come from a rural or urban area?",
    "How old are you?",
    "What is your gender?",
    "What is your highest level of education?",
    "What is your religion, if any?",
    "Do you have a job that pays a cash income? If yes, is it full time or part time? If no, are you currently looking for a job?",
    "What region do you come from?",
    "Do you feel close to any particular political party?",
    "When you get together with your friends or family, how often would you say you discuss political matters?",
    "Latitude",
    "Longitude",
    "What is the distance to the nearest health clinic from your location in kilometers?",
    "What district do you live in?",
    "What percentage of the population in your district voted for the National Democratic Congress (NDC)?",
    "What percentage of the population in your district voted for the New Patriotic Party (NPP)?",
    "In the past 12 months, have you had contact with a public clinic or hospital?",
]

# Workbook column holding the answer the model is trained to reproduce; its
# wording is also the question put to the model in the user turn.
target_outcome = "Have you received a vaccination against COVID-19, either one or two doses?"

# User-turn text: the target question plus the required answer format.
question = f"{target_outcome} Please only respond with 'No' or 'Yes' and then clearly explain the reasoning steps you took that led to your response on a new line:"

# Opening part of the system prompt; the respondent's interview-style profile
# is concatenated directly after it.
# NOTE(review): the text announces "Lastly, you will watch a video", but the
# Afrobarometer system prompt assembled in this section appends no video
# transcript — confirm whether that sentence is intended here.
system_prompt_1 = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana. You will be provided with a demographic profile that describes the area/region/district where you live, your gender, the highest education level you achieved, your religion, your employment status, the distance to your nearest health clinic, the political party you feel closest to, the percentage vote for the New Patriotic Party in your district, and your backstory. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Additionally, we will provide you with some general findings from past studies on Ghana’s COVID-19 vaccination efforts. Lastly, you will watch a video. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# Closing part of the system prompt, appended after the respondent's profile:
# background findings on COVID-19 vaccination in Ghana.
system_prompt_2 = """

You should note that the Health officials in Ghana have been communicating extensively to the population – both urban and rural about the COVID-19 virus. Most of the Ghana population know that the COVID-19 virus is dangerous for their health and they are aware of the benefits of getting the COVID-19 vaccination. However, vaccine hesitancy remain a notable challenge, influenced by misinformation and conspiracy theories circulating on social media. Despite efforts by health authorities to promote vaccination, some individuals remained cautious about the safety and efficacy of COVID-19 vaccines. Educational campaigns and outreach efforts are ongoing, but addressing deep-seated concerns and misinformation required continuous effort. Findings from past studies on COVID-19 vaccination efforts in Ghana reveal a complex interplay of factors influencing vaccine uptake and hesitancy. Positive perceptions of vaccines, belief in their efficacy, knowledge of COVID-19, and a generally favorable attitude toward vaccination significantly boost acceptance. Conversely, concerns about negative side effects, mistrust in vaccine safety, fear, and spiritual or religious beliefs contribute to hesitancy. Demographic factors such as educational attainment, gender, religious affiliation, age, and marital status play crucial roles in shaping attitudes towards vaccination. Higher levels of education, female gender, urban residence, Christian affiliation, and reliance on internet sources for COVID-19 information were associated with higher hesitancy rates. Notably, healthcare workers showed a varied acceptance rate influenced by their role, personal connections to COVID-19 cases, and trust in government measures. Despite efforts to increase coverage, only 40% of Ghanaians had received at least one vaccine dose."""


# Load Afrobarometer data and keep only the profile columns, the target and the ID
data = pd.read_excel(f"data/{DATA}.xlsx")
data = data[demographic_questions + [target_outcome, "ID"]]

# Output folder for the training file; exist_ok makes this a no-op when the
# folder is already there and avoids a separate existence check.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with missing responses
data = data.dropna(subset=[target_outcome]).reset_index(drop=True)

# Construct system and question prompts.
# The merge keeps only respondents whose ID appears in both tables.
backstories = pd.read_excel(BACKSTORY_DATA)
merged_data = pd.merge(left=data, right=backstories[["ID","backstory"]], on="ID")
# Target, ID and backstory are not interview questions, so they are excluded
# from the question/answer part of the profile.
merged_data["demographic_prompt"] = merged_data.apply(generate_demographic_prompt, axis=1, args=([target_outcome, "ID", "backstory"],))
merged_data["system_prompt"] = merged_data["demographic_prompt"].apply(lambda x: system_prompt_1 + x + system_prompt_2)
merged_data["user_prompt"] = question

# Format data for fine-tuning: one JSON chat transcript per respondent
merged_data["text"] = merged_data.apply(format_prompts, axis=1, args=(target_outcome,))

# Save files in CSV format (single "text" column)
merged_data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

# Prepare Healthcare RCT Data for Context-tuning

# Prepare Ghana Wave 1 Data for Context-tuning

# In [17]:
# Path of the Excel workbook with the Ghana Wave 1 backstories.
BACKSTORY_DATA = "data/ghana_wave_1_backstory.xlsx"
# NOTE(review): not referenced anywhere in the visible part of this file.
RANDOM_STATE = 42
# Dataset name for Ghana Wave 1.
# NOTE(review): presumably used as data/<DATA>.xlsx like the sections above — the loading code is not visible here.
DATA = "ghana_wave_1"

# Ghana Wave 1: survey items / columns that make up the respondent profile.
# NOTE(review): "Treatment" presumably names the video arm whose transcript is
# looked up in treatment_transcript — confirm how it is handled downstream.
demographic_questions = [
    "What is your current age?",
    "What is your gender?",
    "How many people live in the house together with you (NOT including you) at this moment?",
    "How many children below 18 years old are currently living in your home?",
    "What is your current working situation?",
    "How much on average does your household spend in a typical week on food?",
    "How much on average does your household spend in a typical week on non-food items (electricity, water, rent, school fees)?",
    "How would you rate the overall economic or financial condition of your household today?",
    "What is the highest educational qualification you have completed?",
    "Region",
    "District",
    "Subdistrict",
    "How many villages in the district do you think you have visited in the last month?",
    "How many villages in the district do you think you have visited in the last year?",
    "Do you have family in other villages in the district?",
    "Do you have WhatsApp?",
    "How often do you use WhatsApp?",
    "What social media have you used in the last year? - Facebook",
    "What social media have you used in the last year? - Twitter",
    "What social media have you used in the last year? - Instagram",
    "What social media have you used in the last year? - Reddit",
    "What social media have you used in the last year? - YouTube",
    "What social media have you used in the last year? - SnapChat",
    "What social media have you used in the last year? - TikTok",
    "What social media have you used in the last year? - Other",
    "What social media have you used in the last year? - I don't use social media",
    "How often do you use social media?",
    "Distance to clinic in km",
    "Treatment",
]

# Outcome the model is trained to reproduce; its wording is also the question
# put to the model in the user turn.
target_outcome = "Did you report being vaccinated?"

# User-turn text: the target question plus the required answer format.
question = f"{target_outcome} Please only respond with 'No' or 'Yes' and then clearly explain the reasoning steps you took that led to your response on a new line:"

# Opening part of the system prompt, ending with the "Your demographic
# profile:" heading under which the respondent's profile is expected.
system_prompt_1 = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana. You will be provided with a demographic profile that describes your gender, the highest education level you achieved, your household size, your employment status, your financial situation, the number of villages you visited, your usage of WhatsApp and other social media, the distance to your nearest health clinic, and your backstory. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Additionally, we will provide you with some general findings from past studies on Ghana’s COVID-19 vaccination efforts. Lastly, you will watch a video. After you receive your complete human subject profile, you will be asked whether you reported receiving the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# Closing part of the system prompt: background findings on COVID-19
# vaccination in Ghana, ending with the lead-in line for a video transcript
# (the transcript text itself is not part of this string).
system_prompt_2 = """

You should note that the Health officials in Ghana have been communicating extensively to the population – both urban and rural about the COVID-19 virus. Most of the Ghana population know that the COVID-19 virus is dangerous for their health and they are aware of the benefits of getting the COVID-19 vaccination. However, vaccine hesitancy remain a notable challenge, influenced by misinformation and conspiracy theories circulating on social media. Despite efforts by health authorities to promote vaccination, some individuals remained cautious about the safety and efficacy of COVID-19 vaccines. Educational campaigns and outreach efforts are ongoing, but addressing deep-seated concerns and misinformation required continuous effort. Findings from past studies on COVID-19 vaccination efforts in Ghana reveal a complex interplay of factors influencing vaccine uptake and hesitancy. Positive perceptions of vaccines, belief in their efficacy, knowledge of COVID-19, and a generally favorable attitude toward vaccination significantly boost acceptance. Conversely, concerns about negative side effects, mistrust in vaccine safety, fear, and spiritual or religious beliefs contribute to hesitancy. Demographic factors such as educational attainment, gender, religious affiliation, age, and marital status play crucial roles in shaping attitudes towards vaccination. Higher levels of education, female gender, urban residence, Christian affiliation, and reliance on internet sources for COVID-19 information were associated with higher hesitancy rates. Notably, healthcare workers showed a varied acceptance rate influenced by their role, personal connections to COVID-19 cases, and trust in government measures. Despite efforts to increase coverage, only 40% of Ghanaians had received at least one vaccine dose.

You are asked to watch a video at this point. Here is the transcript of the video:
"""

# Video transcript appended to the system prompt, keyed by the value of the
# row's "Treatment" column (looked up in format_system_prompt).
# NOTE(review): "CDC Health" writes "COVID 19" without a hyphen in two places,
# and the "Placebo" text reads like an unpunctuated auto-transcription
# (e.g. "candles who are quality"). Presumably these mirror the source
# videos -- confirm before normalising, since the text is emitted verbatim.
treatment_transcript = {
    "CDC Health": "Health authorities are working hard to distribute the COVID-19 vaccines free for everyone with no strings attached. COVID 19 vaccines are safe and effective. After you have been fully vaccinated you can resume activities that you did prior to the pandemic. Getting the COVID-19 vaccine will help prevent you from getting COVID-19 and reduce your risk of being hospitalized with COVID-19. COVID 19 vaccine help you to protect yourself your environment and your loved ones from COVID-19 exposure.",
    "Placebo": "The Sun lights up our lives for business for education even for socializing but when the Sun sets many people use candles who are quality battery-operated torches and kerosene lamps as inefficient and expensive ways to create light. What if you can take some Sun with you at night?  You can with portable solar products there are different types, but each portable solar product is made up of three basic parts: a small solar panel, a modern rechargeable battery and an LED bulb. The solar panel catches the light from the Sun and stores this energy in the battery. This can now be used for much needed light when it's dark. Many can even charge phones portable solar products should be reliable affordable and warranted be sure to demand top quality solar products look for these products lighting Africa shining the way.",
    "Low Cash": "Health authorities are working hard to distribute the COVID-19 vaccines free for everyone with no strings attached. COVID-19 vaccines are safe and effective. After you have been fully vaccinated you can resume activities that you did prior to the pandemic. If you have at least one COVID-19 vaccine shot you will receive 20 Cedi. If you get vaccinated, you will get rewarded.",
    "High Cash": "Health authorities are working hard to distribute the COVID-19 vaccines free for everyone with no strings attached. COVID-19 vaccines are safe and effective. After you have been fully vaccinated you can resume activities that you did prior to the pandemic. If you have at least one COVID-19 vaccine shot you will receive 60 Cedi. If you get vaccinated, you will get rewarded.",
}

def format_system_prompt(row: pd.Series) -> str:
    """Assemble the complete system prompt for one respondent.

    The result is the concatenation, in this order, of ``system_prompt_1``,
    the row's ``"demographic_prompt"``, ``system_prompt_2`` and the transcript
    stored in ``treatment_transcript`` under the row's ``"Treatment"`` value.

    Args:
        row: Record exposing the ``"demographic_prompt"`` and ``"Treatment"``
            fields.

    Returns:
        The assembled system prompt text.

    Raises:
        KeyError: If the row's ``"Treatment"`` value is not a key of
            ``treatment_transcript``.
    """
    # Sections are listed in the order they appear in the final prompt.
    sections = (
        system_prompt_1,
        row["demographic_prompt"],
        system_prompt_2,
        treatment_transcript[row["Treatment"]],
    )
    return "".join(sections)


# Load the Ghana wave 2 survey export named by DATA and keep only the
# demographic questions, the outcome column and the respondent ID.
# NOTE(review): the recorded notebook output shows a warning pointing at this
# read_csv call; the warning text is not visible here. If it is a DtypeWarning
# (mixed-type columns), consider passing an explicit dtype= for those columns.
data = pd.read_csv(f"data/{DATA}.csv")
data = data[demographic_questions + [target_outcome, "ID"]]

# Make sure the output folder exists. exist_ok=True replaces the separate
# os.path.exists() check, so an already existing folder (including one created
# between a check and the call) is not an error.
data_folder_dir = DATA
os.makedirs(f"data/{data_folder_dir}", exist_ok=True)

# Remove entries with missing responses
data = data.dropna(subset=[target_outcome]).reset_index(drop=True)

# Attach each respondent's backstory by ID. pd.merge defaults to an inner
# join, so respondents without a matching backstory row are dropped here.
backstories = pd.read_excel(BACKSTORY_DATA)
merged_data = pd.merge(left=data, right=backstories[["ID", "backstory"]], on="ID")

# Construct system and question prompts. The outcome, the ID and the backstory
# are excluded from the interview-style question list; the backstory is
# appended separately by generate_demographic_prompt.
merged_data["demographic_prompt"] = merged_data.apply(
    generate_demographic_prompt,
    axis=1,
    args=([target_outcome, "ID", "backstory"],),
)
merged_data["system_prompt"] = merged_data.apply(format_system_prompt, axis=1)
merged_data["user_prompt"] = question

# Format data for fine-tuning: one JSON-encoded system/user/assistant
# conversation per respondent, with the recorded outcome as the assistant turn.
merged_data["text"] = merged_data.apply(format_prompts, axis=1, args=(target_outcome,))

# Save files in CSV format (only the "text" column is written).
merged_data[["text"]].to_csv(f"data/{data_folder_dir}/train.csv", index=False)

  data = pd.read_csv(f"data/{DATA}.csv")


(5900, 31)
(4090, 31)
(4090, 32)


# Combine Context-tuning Data from Different Sources