# Load Libraries

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import json

# Dataset name; used to build the input path data/{DATA}.xlsx and the output folder data/{DATA}.
DATA = "afrobarometer"
# Excel workbook read by construct_prompts; its "ID" and "backstory" columns are merged into the survey data.
BACKSTORY_DATA = "data/afrobarometer_backstory.xlsx"
# Random seed; same value as the random_state passed to train_test_split below.
RANDOM_STATE = 42

# Load Data

In [2]:
# Read the survey responses from data/{DATA}.xlsx into a DataFrame.
data = pd.read_excel(f"data/{DATA}.xlsx")
# Show the first rows as a quick check of the loaded columns.
data.head()

Unnamed: 0,ID,Do you come from a rural or urban area?,What region do you come from?,How old are you?,What is your gender?,What is your race?,What is the primary language you speak in your home?,What is your highest level of education?,"What is your religion, if any?",What is your ethnic community or cultural group?,...,How much do you trust people from other religions?,How much do you trust people from other ethnic groups?,How often do you use the Internet?,Latitude,Longitude,What is the distance to the nearest health clinic from your location in kilometers?,treatment,What district do you live in?,What percentage of the population in your district voted for the National Democratic Congress (NDC)?,What percentage of the population in your district voted for the New Patriotic Party (NPP)?
0,1,Rural,UPPER EAST,49,Man,Black / African,Gruni,Some primary schooling,Roman Catholic,Grusi,...,Somewhat,Somewhat,Less than once a month,8.150358,-0.265617,12.515835,CDC,KrachiNchumuru,55.840611,37.766504
1,2,Rural,UPPER EAST,37,Woman,Black / African,Frafra,Intermediate school or Some secondary school /...,"Pentecostal (e.g., ""Born Again"" and/or ""Saved"")",Grusi,...,A lot,A lot,Never,8.150358,-0.265617,12.515835,Placebo,KrachiNchumuru,55.840611,37.766504
2,3,Urban,CENTRAL,62,Woman,Black / African,Akan,No formal schooling,Methodist,Akan,...,A lot,Not at all,Never,8.150358,-0.265617,12.515835,Placebo,KrachiNchumuru,55.840611,37.766504
3,4,Urban,CENTRAL,48,Woman,Black / African,Akan,Primary school completed,Jehovah's Witness,Akan,...,A lot,Somewhat,Never,8.150358,-0.265617,12.515835,CDC,KrachiNchumuru,55.840611,37.766504
4,5,Urban,CENTRAL,53,Man,Black / African,Akan,Secondary school / high school completed,"Christian only (i.e., respondents says only ""C...",Akan,...,Just a little,Just a little,A few times a week,5.619569,-0.274142,1.364027,HighCash,GaCentral,34.103792,63.948036


# Perform Train, Validation, Test Split

In [3]:
# Hold out half of the rows: train (50%) vs. temp (50%).
# RANDOM_STATE replaces the repeated literal so the seed is configured in one place.
train_data, temp_data = train_test_split(data, test_size=0.5, random_state=RANDOM_STATE)

# Halve the held-out rows into validation and test (25% of the full data each).
valid_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=RANDOM_STATE)

# Construct System and Question Prompts

In [4]:
# Ghana Wave 1
# Column names whose answers make up the interview-style profile;
# generate_demographic_prompt looks each one up on the row and emits them in this order.
demographic_questions = [
    "Do you come from a rural or urban area?",
    "How old are you?",
    "What is your gender?",
    "What is your highest level of education?",
    "What is your religion, if any?",
    "Do you have a job that pays a cash income? If yes, is it full time or part time? If no, are you currently looking for a job?",
    "What region do you come from?",
    "Do you feel close to any particular political party?",
    "When you get together with your friends or family, how often would you say you discuss political matters?",
    "Latitude",
    "Longitude",
    "What is the distance to the nearest health clinic from your location in kilometers?",
    "What district do you live in?",
    "What percentage of the population in your district voted for the National Democratic Congress (NDC)?",
    "What percentage of the population in your district voted for the New Patriotic Party (NPP)?",
    "In the past 12 months, have you had contact with a public clinic or hospital?",
]

# Column holding the outcome; format_prompts uses its value as the assistant answer.
target_outcome = "Have you received a vaccination against COVID-19, either one or two doses?"
# User prompt shared by every row (assigned to the user_prompt column in construct_prompts).
question = f"{target_outcome} Please only respond with 'No' or 'Yes':"

# Opening part of the system prompt: role-play instructions ending with the
# "Your demographic profile:" header; construct_prompts appends the per-row profile right after it.
system_prompt_1 = """Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana. You will be provided with a demographic profile that describes the area/region/district where you live, your gender, the highest education level you achieved, your religion, your employment status, the distance to your nearest health clinic, the political party you feel closest to, the percentage vote for the New Patriotic Party in your district, and your backstory. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Additionally, we will provide you with some general findings from past studies on Ghana’s COVID-19 vaccination efforts. Lastly, you will watch a video. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all the information provided. It is crucial for you to accurately replicate the response of a human subject that has the demographic profile you are provided. The human subject response will vary depending on their demographic profile. If you are unsure of an answer, provide a plausible response that is based on all of the information available to you. Respond to each question in the exact format specified and do not add any information beyond what is requested.

Your demographic profile:
"""

# Closing part of the system prompt, appended after the per-row profile by construct_prompts:
# general findings on COVID-19 vaccination in Ghana followed by a video transcript.
# NOTE(review): this text is identical for every row, whatever the row's `treatment` value — confirm that is intended.
system_prompt_2 = """

You should note that the Health officials in Ghana have been communicating extensively to the population – both urban and rural about the COVID-19 virus. Most of the Ghana population know that the COVID-19 virus is dangerous for their health and they are aware of the benefits of getting the COVID-19 vaccination. However, vaccine hesitancy remain a notable challenge, influenced by misinformation and conspiracy theories circulating on social media. Despite efforts by health authorities to promote vaccination, some individuals remained cautious about the safety and efficacy of COVID-19 vaccines. Educational campaigns and outreach efforts are ongoing, but addressing deep-seated concerns and misinformation required continuous effort. Findings from past studies on COVID-19 vaccination efforts in Ghana reveal a complex interplay of factors influencing vaccine uptake and hesitancy. Positive perceptions of vaccines, belief in their efficacy, knowledge of COVID-19, and a generally favorable attitude toward vaccination significantly boost acceptance. Conversely, concerns about negative side effects, mistrust in vaccine safety, fear, and spiritual or religious beliefs contribute to hesitancy. Demographic factors such as educational attainment, gender, religious affiliation, age, and marital status play crucial roles in shaping attitudes towards vaccination. Higher levels of education, female gender, urban residence, Christian affiliation, and reliance on internet sources for COVID-19 information were associated with higher hesitancy rates. Notably, healthcare workers showed a varied acceptance rate influenced by their role, personal connections to COVID-19 cases, and trust in government measures. Despite efforts to increase coverage, only 40% of Ghanaians had received at least one vaccine dose.

You are asked to watch a video at this point. Here is the transcript of the video:
The Sun lights up our lives for business for education even for socializing but when the Sun sets many people use candles who are quality battery-operated torches and kerosene lamps as inefficient and expensive ways to create light. What if you can take some Sun with you at night?  You can with portable solar products there are different types, but each portable solar product is made up of three basic parts: a small solar panel, a modern rechargeable battery and an LED bulb. The solar panel catches the light from the Sun and stores this energy in the battery. This can now be used for much needed light when it's dark. Many can even charge phones portable solar products should be reliable affordable and warranted be sure to demand top quality solar products look for these products lighting Africa shining the way."""

In [5]:
def generate_demographic_prompt(row):
    """Render one respondent's profile as a numbered interview transcript.

    Each entry of ``demographic_questions`` becomes
    ``"<n>) Interviewer: <question> Me: <answer> "`` with the answer read from
    ``row[question]``. The entries are concatenated in list order and followed
    by a blank line and ``row['backstory']``.
    """
    interview_turns = [
        f"{number}) Interviewer: {item} Me: {row[item]} "
        for number, item in enumerate(demographic_questions, start=1)
    ]
    return "".join(interview_turns) + f"\n\n{row['backstory']}"

def construct_prompts(data):
    """Attach backstories and the prompt columns to one survey split.

    Reads the workbook at ``BACKSTORY_DATA`` and merges its ``ID`` and
    ``backstory`` columns into ``data`` on ``ID`` (pandas' default inner join,
    so rows without a matching ``ID`` are not kept). Returns the merged frame
    with three added columns: ``demographic_prompt``, ``system_prompt``
    (``system_prompt_1`` + demographic prompt + ``system_prompt_2``) and
    ``user_prompt`` (the shared ``question``).
    """
    def wrap_profile(profile):
        # Sandwich the per-row profile between the two fixed prompt parts.
        return system_prompt_1 + profile + system_prompt_2

    backstory_columns = pd.read_excel(BACKSTORY_DATA)[["ID", "backstory"]]
    enriched = data.merge(backstory_columns, on="ID")

    enriched["demographic_prompt"] = enriched.apply(generate_demographic_prompt, axis=1)
    enriched["system_prompt"] = enriched["demographic_prompt"].apply(wrap_profile)
    enriched["user_prompt"] = question

    return enriched


In [6]:
# Build the prompt columns for each split, then preview the training frame.
train_data, valid_data, test_data = [
    construct_prompts(split) for split in (train_data, valid_data, test_data)
]
train_data.head()

Unnamed: 0,ID,Do you come from a rural or urban area?,What region do you come from?,How old are you?,What is your gender?,What is your race?,What is the primary language you speak in your home?,What is your highest level of education?,"What is your religion, if any?",What is your ethnic community or cultural group?,...,Longitude,What is the distance to the nearest health clinic from your location in kilometers?,treatment,What district do you live in?,What percentage of the population in your district voted for the National Democratic Congress (NDC)?,What percentage of the population in your district voted for the New Patriotic Party (NPP)?,backstory,demographic_prompt,system_prompt,user_prompt
0,2204,Urban,EASTERN,64,Man,Black / African,Akan,Some university,Seventh Day Adventist,Akan,...,-1.514778,0.931752,CDC,Ejisu,17.029973,81.507799,I am a 64-year-old man living in the urban are...,1) Interviewer: Do you come from a rural or ur...,Please put yourself in the shoes of a human su...,Have you received a vaccination against COVID-...
1,1165,Urban,GREATER ACCRA,29,Woman,Black / African,Akan,Some primary schooling,"Christian only (i.e., respondents says only ""C...",Akan,...,-1.623514,2.32905,CDC,Afigya-KwabreSouth,16.703917,81.724414,I grew up in an urban area within the Greater ...,1) Interviewer: Do you come from a rural or ur...,Please put yourself in the shoes of a human su...,Have you received a vaccination against COVID-...
2,327,Rural,WESTERN,25,Woman,Black / African,Akan,Primary school completed,"Christian only (i.e., respondents says only ""C...",Akan,...,-1.631652,7.803533,LowCash,WassaEast,49.402169,44.750218,I grew up in a rural area in the Wassa East di...,1) Interviewer: Do you come from a rural or ur...,Please put yourself in the shoes of a human su...,Have you received a vaccination against COVID-...
3,561,Rural,BONO,70,Woman,Black / African,Akan,Some primary schooling,"Christian only (i.e., respondents says only ""C...",Ewe/Anlo,...,-3.015097,0.094058,CDC,Juaboso,56.046707,39.493144,I was born and raised in a rural area of Juabo...,1) Interviewer: Do you come from a rural or ur...,Please put yourself in the shoes of a human su...,Have you received a vaccination against COVID-...
4,1087,Urban,ASHANTI,23,Man,Black / African,Akan,Primary school completed,Roman Catholic,Akan,...,-1.631281,0.78534,CDC,Afigya-KwabreNorth,31.889515,65.889388,I grew up in an urban area within the Ashanti ...,1) Interviewer: Do you come from a rural or ur...,Please put yourself in the shoes of a human su...,Have you received a vaccination against COVID-...


# Formatting for Fine-tuning Conversation

In [7]:
def format_prompts(row: pd.Series) -> dict:
    """Flatten one row into a single-string fine-tuning record.

    Returns a dict with one key, ``"text"``, holding the row's system prompt,
    then a ``User:`` line with the user prompt, then an ``Assistant:`` line with
    the value of the ``target_outcome`` column.
    """
    # A chat-style alternative would emit a "messages" list with
    # system / user / assistant roles instead of one flat string.
    system_text = row["system_prompt"]
    user_text = row["user_prompt"]
    answer = row[target_outcome]
    return {"text": f"{system_text}\nUser:{user_text}\nAssistant:{answer}"}

In [8]:
# Add the fine-tuning record to every split, then inspect the first training example.
for split in (train_data, valid_data, test_data):
    split["prompt_message"] = split.apply(format_prompts, axis=1)
train_data.loc[0, "prompt_message"]

{'text': "Please put yourself in the shoes of a human subject participating in a healthcare survey in Ghana. You will be provided with a demographic profile that describes the area/region/district where you live, your gender, the highest education level you achieved, your religion, your employment status, the distance to your nearest health clinic, the political party you feel closest to, the percentage vote for the New Patriotic Party in your district, and your backstory. The information will be provided to you in the format of a survey interview. You will see a question from the “Interviewer:” and then your human subject response will be preceded by “Me:”. Additionally, we will provide you with some general findings from past studies on Ghana’s COVID-19 vaccination efforts. Lastly, you will watch a video. After you receive your complete human subject profile, you will be asked whether you received the COVID-19 vaccination. Please provide a consistent and coherent response using all t

# Save Excel and JSONL Files

In [9]:
def write_jsonl(dataframe, file_path):
    """Write the ``prompt_message`` column of *dataframe* as a JSON Lines file.

    Args:
        dataframe: DataFrame with a ``prompt_message`` column whose values are
            JSON-serializable (here, the dicts produced by ``format_prompts``).
        file_path: Destination path; an existing file is overwritten.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default encoding.
    with open(file_path, "w", encoding="utf-8") as file:
        # Iterate the column directly; iterrows() would build a Series per row
        # only to read this one field.
        file.writelines(
            json.dumps(message) + "\n" for message in dataframe["prompt_message"]
        )

# Create the output folder; exist_ok=True replaces the separate existence
# check, so an already-present folder is not an error.
output_dir = f"data/{DATA}"
os.makedirs(output_dir, exist_ok=True)

splits = {"train": train_data, "valid": valid_data, "test": test_data}

# Save every split with all of its columns as an Excel workbook.
for split_name, frame in splits.items():
    frame.to_excel(f"{output_dir}/{split_name}.xlsx", index=False)

# Save only the prompt records of every split, one JSON object per line.
for split_name, frame in splits.items():
    write_jsonl(frame[["prompt_message"]], f"{output_dir}/{split_name}.jsonl")