In [1]:
from faker import Faker
import pandas as pd
import numpy as np
import random

# here is an example, we can also use CDC's API (Data.CDC.gov API) to stimulate a more realistic data
# Set random seed for reproducibility
np.random.seed(1)

# Total participants
total_participants = 5000

fake = Faker()

# Define function to generate names based on race, too slow on my PC
# def generate_name_by_race(race):
#    if race == 'Asian':
#       fake.name = Faker(['zh_CN', 'ja_JP', 'ko_KR']).name  # Some Asian names, can be adjusted as needed
#   elif race == 'Black or African American':
#        fake.name = Faker('en_US').name  # Default to US names, more diversity
#    elif race == 'American Indian or Alaska Native':
#        fake.name = Faker('en_US').name  # Default to US names
#    elif race == 'Native Hawaiian or Other Pacific Islander':
#        fake.name = Faker('en_US').name  # Default to US names
#    else:  # 'White' and 'Two or more races'
#        fake.name = Faker('en_US').name  # Default to US names for White and Mixed race#
#    return fake.name() 

def generate_attitudes_and_vaccine_status(n):
    data = {
        'Participant_ID': range(1, n + 1),
        'Attitude_Towards_Vaccines': np.random.randint(1, 6, n)
    }
    df = pd.DataFrame(data)

    df['Previous_Vaccine_Status'] = df['Attitude_Towards_Vaccines'].apply(
        lambda x: np.random.choice(['Yes', 'No'], p=[0.1 + 0.05 * x, 0.9 - 0.05 * x])
    )

    return df


# Correcting the race distribution probabilities to include Hispanic or Latino and sum up to 1
def adjusted_race_distribution():
    races = ['White', 'Hispanic or Latino', 'Black or African American', 'Asian', 
             'American Indian or Alaska Native', 'Native Hawaiian or Other Pacific Islander', 
             'Two or more races']
    # Updated probabilities to sum up to 1
    probabilities = [0.60, 0.185, 0.13, 0.06, 0.01, 0.005, 0.01]
    return np.random.choice(races, p=probabilities)

# Adjusted Education Distribution
def adjusted_education_distribution():
    education_levels = ['Less than high school', 'High school graduate', 
                        'Some college', "Bachelor's degree", 'Graduate degree']
    probabilities = [0.12, 0.27, 0.21, 0.24, 0.16]  # Adjust probabilities to match US data
    return np.random.choice(education_levels, p=probabilities)

# Adjusted Gender Distribution
def adjusted_gender_distribution():
    genders = ['Male', 'Female', 'Other', 'Prefer not to say']
    probabilities = [0.49, 0.49, 0.01, 0.01]  # Adjust to a realistic gender ratio
    return np.random.choice(genders, p=probabilities)

# Population data for US states as of 2020 (in millions)
# Data sourced from the United States Census Bureau
state_population_data = {
    'Alabama': 4.9, 'Alaska': 0.73, 'Arizona': 7.15, 'Arkansas': 3.0, 'California': 39.14,
    'Colorado': 5.7, 'Connecticut': 3.57, 'Delaware': 0.97, 'Florida': 21.48, 'Georgia': 10.61,
    'Hawaii': 1.41, 'Idaho': 1.78, 'Illinois': 12.67, 'Indiana': 6.73, 'Iowa': 3.15,
    'Kansas': 2.91, 'Kentucky': 4.47, 'Louisiana': 4.65, 'Maine': 1.34, 'Maryland': 6.05,
    'Massachusetts': 6.89, 'Michigan': 9.98, 'Minnesota': 5.64, 'Mississippi': 2.97, 'Missouri': 6.13,
    'Montana': 1.07, 'Nebraska': 1.93, 'Nevada': 3.08, 'New Hampshire': 1.36, 'New Jersey': 8.88,
    'New Mexico': 2.1, 'New York': 19.45, 'North Carolina': 10.49, 'North Dakota': 0.76, 'Ohio': 11.68,
    'Oklahoma': 3.95, 'Oregon': 4.22, 'Pennsylvania': 12.8, 'Rhode Island': 1.06, 'South Carolina': 5.12,
    'South Dakota': 0.88, 'Tennessee': 6.83, 'Texas': 28.99, 'Utah': 3.2, 'Vermont': 0.62,
    'Virginia': 8.53, 'Washington': 7.61, 'West Virginia': 1.79, 'Wisconsin': 5.82, 'Wyoming': 0.58
}

# Total US population
total_us_population = sum(state_population_data.values())

# Calculate each state's proportion of the total US population
state_proportions = {state: pop / total_us_population for state, pop in state_population_data.items()}

# Convert to a DataFrame for easier visualization and use
state_proportions_df = pd.DataFrame(list(state_proportions.items()), columns=['State', 'Population Proportion'])
state_proportions_df.head()

states = list(state_proportions.keys())
probabilities = list(state_proportions.values())

# Adjusting the generate_personal_info function with correct race distribution
def generate_personal_info(n):
    data = []
    for _ in range(n):
        age = max(18, min(80, int(np.random.normal(35, 10))))
        person = {
            'Participant_ID': _ + 1,
            'Age': age,
            'Gender': adjusted_gender_distribution(),
            'Location': np.random.choice(states, p=probabilities),
            'Education': adjusted_education_distribution(),
            'Race': adjusted_race_distribution(),
            'Employment_Status': fake.random_element(elements=('Employed', 'Unemployed', 'Student', 'Retired'))
        }
        data.append(person)
    return pd.DataFrame(data)

def generate_attitudes_and_vaccine_status(n, education_data):
    data = {
        'Participant_ID': range(1, n + 1),
        'Attitude_Towards_Vaccines': np.random.randint(1, 6, n)
    }
    df = pd.DataFrame(data)

    # Incorporating the impact of education level on vaccine attitudes
    education_attitude_impact = {
        'Less than high school': -0.5,
        'High school graduate': 0,
        'Some college': 0.5,
        "Bachelor's degree": 1,
        'Graduate degree': 1.5
    }
    df['Education_Level'] = df['Participant_ID'].apply(lambda x: education_data.loc[x - 1, 'Education'])
    df['Attitude_Adjusted'] = df.apply(lambda row: row['Attitude_Towards_Vaccines'] + education_attitude_impact[row['Education_Level']], axis=1)

    df['Previous_Vaccine_Status'] = df['Attitude_Adjusted'].apply(
        lambda x: np.random.choice(['Yes', 'No'], p=[max(0.1, min(1, 0.1 + 0.05 * x)), max(0, 1 - 0.1 - 0.05 * x)])
    )
    return df


# Generate personal information data
baseline_survey_data = generate_personal_info(total_participants)

# Generate attitudes and vaccine status data
attitudes_and_vaccine_status = generate_attitudes_and_vaccine_status(total_participants, baseline_survey_data)
baseline_survey_data.head(10)

Unnamed: 0,Participant_ID,Age,Gender,Location,Education,Race,Employment_Status
0,1,51,Male,Idaho,High school graduate,White,Employed
1,2,28,Male,Illinois,Some college,White,Unemployed
2,3,52,Male,Texas,Less than high school,Hispanic or Latino,Unemployed
3,4,27,Male,New Jersey,High school graduate,White,Retired
4,5,46,Female,Texas,Less than high school,White,Employed
5,6,24,Male,Texas,Less than high school,White,Employed
6,7,35,Female,Illinois,Bachelor's degree,Black or African American,Student
7,8,40,Male,Pennsylvania,Graduate degree,Hispanic or Latino,Retired
8,9,44,Male,Massachusetts,Graduate degree,White,Unemployed
9,10,28,Male,California,Less than high school,Hispanic or Latino,Retired


In [2]:
def introduce_missing_values(df, missing_percentage=0.05, exclude_columns=None):
    """
    Randomly introduce missing values into the dataframe, excluding specified columns.
    
    :param df: pandas DataFrame.
    :param missing_percentage: Percentage of total values in the dataframe to be set as NaN.
    :param exclude_columns: List of column names to exclude from having missing values introduced.
    :return: DataFrame with missing values introduced.
    """
    if exclude_columns is None:
        exclude_columns = []

    # Create a mask for columns where missing values can be introduced
    cols_to_include = [col for col in df.columns if col not in exclude_columns]

    # Total number of values in the included columns
    total_values = df[cols_to_include].size

    # Total number of missing values to introduce
    total_missing = int(total_values * missing_percentage)

    # Randomly select indices to replace with NaN
    rows = np.random.randint(0, df.shape[0], total_missing)
    cols = np.random.choice(cols_to_include, total_missing)

    # Replace selected values with NaN
    for row, col in zip(rows, cols):
        df.at[row, col] = np.nan

    # Convert integer columns back to Nullable Integer type
    for col in df.select_dtypes(include='number').columns:
        if col not in exclude_columns:
            df[col] = df[col].astype('Int64')

    return df

# Introduce missing values, excluding the 'Participant_ID' column
baseline_survey_data_with_missing = introduce_missing_values(
    baseline_survey_data, 
    missing_percentage=0.05, 
    exclude_columns=['Participant_ID']
)

baseline_survey_data_with_missing.head(10)


Unnamed: 0,Participant_ID,Age,Gender,Location,Education,Race,Employment_Status
0,1,,Male,Idaho,High school graduate,White,Employed
1,2,28.0,Male,Illinois,Some college,,Unemployed
2,3,52.0,Male,Texas,Less than high school,Hispanic or Latino,Unemployed
3,4,27.0,Male,New Jersey,High school graduate,White,Retired
4,5,46.0,Female,Texas,Less than high school,,Employed
5,6,24.0,Male,,Less than high school,White,Employed
6,7,35.0,Female,,Bachelor's degree,Black or African American,Student
7,8,40.0,Male,Pennsylvania,Graduate degree,Hispanic or Latino,
8,9,,Male,Massachusetts,Graduate degree,White,Unemployed
9,10,28.0,Male,California,Less than high school,,Retired


In [3]:
attitudes_and_vaccine_status.head(10)

Unnamed: 0,Participant_ID,Attitude_Towards_Vaccines,Education_Level,Attitude_Adjusted,Previous_Vaccine_Status
0,1,1,High school graduate,1.0,No
1,2,4,Some college,4.5,Yes
2,3,5,Less than high school,4.5,No
3,4,4,High school graduate,4.0,Yes
4,5,3,Less than high school,2.5,No
5,6,2,Less than high school,1.5,No
6,7,2,Bachelor's degree,3.0,No
7,8,5,Graduate degree,6.5,Yes
8,9,3,Graduate degree,4.5,Yes
9,10,5,Less than high school,4.5,No


In [4]:
baseline_survey_data_with_missing = pd.merge(baseline_survey_data_with_missing, attitudes_and_vaccine_status, on='Participant_ID', how='inner')

In [5]:
def generate_random_assignment(n):
    # 1/3rd distribution for each group
    groups = ['Reason Ad', 'Emotion Ad', 'Control']
    assigned_group = np.random.choice(groups, n, p=[1/3, 1/3, 1/3])
    data = {
        'Participant_ID': range(1, n + 1),
        'Assigned_Group': assigned_group
    }
    return pd.DataFrame(data)

# Generate random assignment data
random_assignment_data = generate_random_assignment(total_participants)
random_assignment_data.head()

Unnamed: 0,Participant_ID,Assigned_Group
0,1,Reason Ad
1,2,Emotion Ad
2,3,Control
3,4,Control
4,5,Emotion Ad


In [6]:
def generate_endline_survey(n, baseline_data, attitude_status_data):
    # Assuming some participants didn't complete the endline survey
    endline_participants = int(n * 0.9)  # 90% completion rate
    participant_ids = random.sample(list(baseline_data['Participant_ID']), endline_participants)

    # Randomly decide if participants got vaccinated after the baseline survey
    vaccinated = []
    for pid in participant_ids:
        
        initial_attitude = attitude_status_data[attitude_status_data['Participant_ID'] == pid]['Attitude_Towards_Vaccines'].iloc[0]
        previous_status = attitude_status_data[attitude_status_data['Participant_ID'] == pid]['Previous_Vaccine_Status'].iloc[0]

        if previous_status == 'Yes':
            vaccinated.append('Yes')
        else:
            probability_of_vaccination = initial_attitude / 5  # Higher attitude, higher chance of vaccination
            vaccinated.append(np.random.choice(['Yes', 'No'], p=[probability_of_vaccination, 1 - probability_of_vaccination]))

    data = {
        'Participant_ID': participant_ids,
        'Vaccinated_After_Baseline': vaccinated
    }
    return pd.DataFrame(data)

# Generate endline survey data
endline_survey_data = generate_endline_survey(total_participants, baseline_survey_data, attitudes_and_vaccine_status)
endline_survey_data.head()

Unnamed: 0,Participant_ID,Vaccinated_After_Baseline
0,2229,Yes
1,1021,No
2,4831,No
3,1700,No
4,150,Yes


In [7]:
baseline_survey_data_with_missing.to_csv("baseline_survey_data.csv", index=False)
random_assignment_data.to_csv("random_assignment_data.csv", index=False)
endline_survey_data.to_csv("endline_survey_data.csv", index=False)