In [1]:
import os
import random

import numpy as np
import pandas as pd

In [46]:
# First-name pools, one list of 20 names per (race, gender) pair.
# Each pool is written as a whitespace-separated string and split into a list,
# so the order of names is exactly the order in which they are typed.
# Note: "Jia", "Wei", "Li" and "Ying" appear in both the Asian male and the
# Asian female pools.

# --- White ---
white_male_first_names = (
    "James John Robert Michael William "
    "David Richard Joseph Thomas Charles "
    "Christopher Daniel Matthew Anthony Mark "
    "Donald Steven Paul Andrew Joshua"
).split()

white_female_first_names = (
    "Mary Patricia Jennifer Linda Elizabeth "
    "Barbara Susan Jessica Sarah Karen "
    "Nancy Lisa Margaret Betty Sandra "
    "Ashley Kimberly Emily Donna Michelle"
).split()

# --- Asian ---
asian_male_first_names = (
    "Liu Wei Min Ying Hao "
    "Jia Jun Li Chen Tuan "
    "Anh Tran Ngoc Sun Raj "
    "Arjun Ravi Amit Kai Jin"
).split()

asian_female_first_names = (
    "Yumi Soo Lina Mei Hana "
    "Yuna Mina Jia Wei Sakura "
    "Kim Mi Aya Li Trang "
    "An Hanh Priya Aisha Ying"
).split()

# --- Black ---
black_male_first_names = (
    "Darnell Jerome Leroy Tyrone Darius "
    "Malik Marquis DeShawn Andre Jamal "
    "Maurice Tremayne Rashad Trevon Dante "
    "Lamont Terrence Malcolm Kareem Cedric"
).split()

black_female_first_names = (
    "Aaliyah Keisha Latoya Tamika Monique "
    "Jasmine Imani Ebony Shanice Tiana "
    "Kiara Nia Lashonda Tanisha Desiree "
    "Tiara Ayanna Zaria Raven Kiana"
).split()

# --- Hispanic ---
hispanic_male_first_names = (
    "Jose Luis Carlos Juan Jorge "
    "Miguel Angel Francisco Pedro Alejandro "
    "Manuel Roberto Rafael Fernando Ricardo "
    "Diego Eduardo Mario Julio Enrique"
).split()

hispanic_female_first_names = (
    "Maria Sofia Camila Valentina Isabella "
    "Martina Lucia Victoria Ximena Fernanda "
    "Gabriela Daniela Natalia Andrea Juliana "
    "Carolina Ariana Paola Alejandra Viviana"
).split()

# Surname pools, one list of 20 per race; each pool is used for both genders.
# Note: the pools are not disjoint -- e.g. "Garcia" and "Martinez" are in both
# the white and the Hispanic pool, and the white and Black pools share most
# of their entries.
white_surnames = (
    "Smith Johnson Williams Brown Jones "
    "Miller Davis Wilson Moore Taylor "
    "Anderson Thomas Jackson White Harris "
    "Martin Thompson Garcia Martinez Robinson"
).split()

asian_surnames = (
    "Lee Kim Chen Wong Liu "
    "Yang Zhang Lin Huang Wang "
    "Li Nguyen Tran Pham Chung "
    "Huynh Choi Park Shah Singh"
).split()

black_surnames = (
    "Williams Johnson Smith Jones Brown "
    "Jackson Davis Harris Robinson Thomas "
    "Walker White Taylor Thompson Moore "
    "Anderson Lewis King Scott Green"
).split()

hispanic_surnames = (
    "Garcia Martinez Rodriguez Hernandez Lopez "
    "Gonzalez Perez Sanchez Ramirez Torres "
    "Flores Rivera Gomez Diaz Reyes "
    "Cruz Morales Ortiz Gutierrez Ramos"
).split()

# Lookup table: race -> gender -> (first-name pool, surname pool).
# Built from one row per race so the "Male"/"Female" layout is written once;
# both genders of a race point at the same surname list object.
name_mapping = {
    race: {
        "Male": (male_first_names, surnames),
        "Female": (female_first_names, surnames),
    }
    for race, male_first_names, female_first_names, surnames in (
        ("white", white_male_first_names, white_female_first_names, white_surnames),
        ("asian", asian_male_first_names, asian_female_first_names, asian_surnames),
        ("black", black_male_first_names, black_female_first_names, black_surnames),
        ("hispanic", hispanic_male_first_names, hispanic_female_first_names, hispanic_surnames),
    )
}

def sample_names(race, gender, n=10, rng=None):
    """
    Sample full names ("First Last") for a given race and gender.

    Each name is built from one first name and one surname drawn
    independently, with replacement, from the pools registered in
    ``name_mapping[race][gender]``; the same full name can therefore appear
    more than once in the result.

    Args:
    - race (str): A key of ``name_mapping`` (e.g., "white", "asian", "black", "hispanic").
    - gender (str): A key of ``name_mapping[race]`` ("Male" or "Female").
    - n (int): The number of names to sample (default is 10). Values <= 0
      yield an empty list.
    - rng (random.Random | None): Optional random generator to draw from.
      When omitted, the module-level ``random`` functions are used, so the
      result depends on the global RNG state. Pass ``random.Random(seed)``
      for draws that are reproducible and independent of global state.

    Returns:
    - List of ``n`` sampled names (first name + space + last name).

    Raises:
    - ValueError: If ``race`` or ``gender`` is not present in ``name_mapping``.
    """
    if race not in name_mapping or gender not in name_mapping[race]:
        raise ValueError("Race or gender not recognized. Valid options are: race - 'white', 'asian', 'black', 'hispanic'; gender - 'Male', 'Female'.")

    first_names, last_names = name_mapping[race][gender]

    # Both the `random` module and a `random.Random` instance expose .choice().
    chooser = random if rng is None else rng

    # The first name is drawn before the surname within each iteration.
    return [
        f"{chooser.choice(first_names)} {chooser.choice(last_names)}"
        for _ in range(n)
    ]


In [49]:
# Persona attribute catalogue: attribute name -> list of allowed values.
attribute_dict = {
    # Plain Python ints (25..44 inclusive) rather than numpy scalars, so the
    # values print as "25" and stay serialisable (e.g. with json) when the
    # dictionary is inspected or exported.
    "Age": list(range(25, 45)),
    "Sex": ["Male", "Female"],
    "Education": ["Associate's Degree", "Bachelor's Degree", "Master's Degree"],
    "Class of Worker": ["Private", "Public", "Self-Employed"],
    "Marital Status": ["Single", "Married", "Divorced"],
    "Place of Birth": ["New York", "New Jersey", "Connecticut", "Canada", "Pennsylvania", "California", "Florida"],
    # The two Big Five entries list the same five traits under separate keys.
    "Big Five Scores 1": ["High openness", "High conscientiousness", "High extraversion", "High agreeableness", "High neuroticism"],
    "Big Five Scores 2": ["High openness", "High conscientiousness", "High extraversion", "High agreeableness", "High neuroticism"],
    "Defining Quirks": ["Always punctual", "Loves puzzles", "Extremely organized", "Very social", "Introverted"],
    "Personal Time": ["Reading", "Playing sports", "Gaming", "Cooking", "Traveling"],
    "Lifestyle": ["Active", "Sedentary", "Balanced", "Workaholic", "Laid-back"],
    "Political Views": ["Democrat", "Republican", "Independent", "Green", "Libertarian"],
    "Fertility": ["Has children", "Does not have children", "Planning to have children", "Undecided"],
    "Income Bracket": ["Low income", "Middle income", "Upper-middle income", "High income"],
    "Housing Situation": ["Owns home", "Rents"],
    "Relationship with Technology": ["Tech-savvy", "Familiar", "Tech-averse"],
    "Hobbies": ["Gardening", "Photography", "Crafting", "Hiking", "Playing musical instruments"],
    "Communication Style": ["Direct", "Diplomatic", "Reserved", "Open", "Humorous"],
    "Risk Tolerance": ["Risk-averse", "Moderate risk-taker", "High risk-taker"],
    "Travel Frequency": ["Frequent traveler", "Occasional traveler", "Rare traveler", "Never travels"],
    "Pet Ownership": ["Owns a dog", "Owns a cat", "Owns other pets", "No pets"],
}


# Function to generate random personas
def generate_personas(num_personas, attribute_dict, rng=None):
    """
    Build a DataFrame of randomly generated personas.

    For every persona, one value is drawn uniformly at random for each key of
    ``attribute_dict``. Keys are sampled independently of one another, so two
    keys that share the same option list can receive the same value.

    Args:
    - num_personas (int): Number of rows to generate. Values <= 0 give an
      empty frame that still carries one column per attribute.
    - attribute_dict (dict): Maps attribute name -> non-empty sequence of
      allowed values.
    - rng (random.Random | None): Optional random generator to draw from.
      When omitted, the module-level ``random`` functions are used, so the
      result depends on the global RNG state.

    Returns:
    - pandas.DataFrame with ``num_personas`` rows and one column per key of
      ``attribute_dict``, in the dictionary's key order.
    """
    chooser = random if rng is None else rng
    keys = list(attribute_dict)

    personas = [
        {key: chooser.choice(attribute_dict[key]) for key in keys}
        for _ in range(num_personas)
    ]

    # Passing the columns explicitly keeps the schema even when no rows were
    # generated; for a non-empty result the column order is unchanged.
    return pd.DataFrame(personas, columns=keys)


# Fixed seed so that re-running this cell regenerates the same personas.
PERSONA_SEED = 0
random.seed(PERSONA_SEED)

personas_df = generate_personas(250, attribute_dict)
# Sequential identifier (0..n-1) used to refer to a persona from other tables.
personas_df["person_id"] = list(range(len(personas_df)))

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs("./data", exist_ok=True)
personas_df.to_csv("./data/generated_personas.csv", index=False)
print("Generated personas saved to 'generated_personas.csv'")
personas_df.head()

Generated personas saved to 'generated_personas.csv'


Unnamed: 0,Age,Sex,Education,Class of Worker,Marital Status,Place of Birth,Big Five Scores 1,Big Five Scores 2,Defining Quirks,Personal Time,...,Fertility,Income Bracket,Housing Situation,Relationship with Technology,Hobbies,Communication Style,Risk Tolerance,Travel Frequency,Pet Ownership,person_id
0,36,Male,Bachelor's Degree,Public,Single,Connecticut,High openness,High agreeableness,Introverted,Gaming,...,Has children,High income,Rents,Tech-savvy,Photography,Open,Risk-averse,Rare traveler,Owns a dog,0
1,27,Female,Associate's Degree,Self-Employed,Divorced,New Jersey,High neuroticism,High openness,Very social,Cooking,...,Has children,Upper-middle income,Owns home,Tech-averse,Gardening,Direct,Moderate risk-taker,Rare traveler,Owns a cat,1
2,43,Female,Master's Degree,Self-Employed,Divorced,Canada,High conscientiousness,High agreeableness,Loves puzzles,Gaming,...,Undecided,Low income,Owns home,Familiar,Crafting,Humorous,Moderate risk-taker,Frequent traveler,Owns other pets,2
3,30,Female,Bachelor's Degree,Self-Employed,Divorced,Florida,High openness,High openness,Extremely organized,Reading,...,Undecided,Low income,Rents,Tech-averse,Photography,Direct,Moderate risk-taker,Rare traveler,Owns other pets,3
4,32,Female,Associate's Degree,Public,Divorced,Canada,High conscientiousness,High conscientiousness,Always punctual,Cooking,...,Planning to have children,Low income,Owns home,Tech-averse,Hiking,Open,High risk-taker,Occasional traveler,Owns a cat,4


In [50]:
# Fixed seed so that re-running this cell reproduces the same names.
NAME_SEED = 1
random.seed(NAME_SEED)

# One row per (persona, race): every persona receives a name for each of the
# four races, sampled from the pool matching the persona's "Sex" value.
# NOTE(review): names are drawn with replacement from 20 x 20 combinations per
# race/gender, so different personas can end up with the same name and hence
# the same e-mail address -- confirm that downstream code does not treat
# "Email" as a unique key.
results = []

# Only "person_id" and "Sex" are needed, so iterate over those two columns
# directly instead of materialising every row with iterrows().
for person_id, sex in zip(personas_df["person_id"], personas_df["Sex"]):
    for race in ["white", "black", "asian", "hispanic"]:
        name = sample_names(race, sex, 1)[0]
        # E-mail = lower-cased full name without spaces at gmail.com.
        email = name.lower().replace(" ","")+"@gmail.com"

        results.append([person_id, race, name, email])

name_df = pd.DataFrame(results, columns=["person_id", "Race", "Name", "Email"])

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs("./data", exist_ok=True)
name_df.to_csv("./data/generated_names.csv", index=False)
print("Generated names saved to 'generated_names.csv'")
name_df.head()

Generated names saved to 'generated_names.csv'


Unnamed: 0,person_id,Race,Name,Email
0,0,white,Richard Brown,richardbrown@gmail.com
1,0,black,DeShawn Thomas,deshawnthomas@gmail.com
2,0,asian,Sun Zhang,sunzhang@gmail.com
3,0,hispanic,Enrique Reyes,enriquereyes@gmail.com
4,1,white,Margaret Brown,margaretbrown@gmail.com


In [4]:
attribute_dict = {
    "Age": list(np.arange(25, 45)),
    "Sex": ["Male", "Female"],
    "Education": ["Associate's Degree", "Bachelor's Degree", "Master's Degree"],
    "Class of Worker": ["Private", "Public", "Self-Employed"],
    "Marital Status": ["Single", "Married", "Divorced"],
    "Place of Birth": ["New York", "New Jersey", "Connecticut", "Canada", "Pennsylvania", "California", "Florida"],
    "Big Five Scores 1": ["High openness", "High conscientiousness", "High extraversion", "High agreeableness", "High neuroticism"],
    "Big Five Scores 2": ["High openness", "High conscientiousness", "High extraversion", "High agreeableness", "High neuroticism"],
    "Defining Quirks": ["Always punctual", "Loves puzzles", "Extremely organized", "Very social", "Introverted"],
    "Personal Time": ["Reading", "Playing sports", "Gaming", "Cooking", "Traveling"],
    "Lifestyle": ["Active", "Sedentary", "Balanced", "Workaholic", "Laid-back"],
    "Political Views": ["Democrat", "Republican", "Independent", "Green", "Libertarian"],
    "Fertility": ["Has children", "Does not have children", "Planning to have children", "Undecided"],
    "Income Bracket": ["Low income", "Middle income", "Upper-middle income", "High income"],
    "Housing Situation": ["Owns home", "Rents"],
    "Relationship with Technology": ["Tech-savvy", "Familiar", "Tech-averse"],
    "Hobbies": ["Gardening", "Photography", "Crafting", "Hiking", "Playing musical instruments"],
    "Communication Style": ["Direct", "Diplomatic", "Reserved", "Open", "Humorous"],
    "Risk Tolerance": ["Risk-averse", "Moderate risk-taker", "High risk-taker"],
    "Travel Frequency": ["Frequent traveler", "Occasional traveler", "Rare traveler", "Never travels"],
    "Pet Ownership": ["Owns a dog", "Owns a cat", "Owns other pets", "No pets"],
}
df = pd.DataFrame([[k,v] for k,v in attribute_dict.items()], columns=["Category", "Values"])
print(df.to_latex(index=False))

\begin{tabular}{ll}
\toprule
Category & Values \\
\midrule
Age & [np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44)] \\
Sex & ['Male', 'Female'] \\
Education & ["Associate's Degree", "Bachelor's Degree", "Master's Degree"] \\
Class of Worker & ['Private', 'Public', 'Self-Employed'] \\
Marital Status & ['Single', 'Married', 'Divorced'] \\
Place of Birth & ['New York', 'New Jersey', 'Connecticut', 'Canada', 'Pennsylvania', 'California', 'Florida'] \\
Big Five Scores 1 & ['High openness', 'High conscientiousness', 'High extraversion', 'High agreeableness', 'High neuroticism'] \\
Big Five Scores 2 & ['High openness', 'High conscientiousness', 'High extraversion', 'High agreeableness', 'High neuroticism'] \\
Defining Quirks & ['Always punctual', 'Loves puzzles', 'Ex

In [6]:
attribute_dict = {
    "emotional_intelligence": [
        "empathetic", "supportive", "compassionate", "understanding", "caring", 
        "patient", "nurturing"
    ],
    "competitiveness": [
        "competitive", "driven", "goal-oriented", "focused", "ambitious", 
        "outperformed", "won", "achieved"
    ],
    "stability_reliability": [
        "reliable", "consistent", "punctual", "dependable", "steady", "committed", "loyal"
    ],
}
df = pd.DataFrame([[k,v] for k,v in attribute_dict.items()], columns=["Attribute", "Keywords"])
print(df.to_latex(index=False))

\begin{tabular}{ll}
\toprule
Attribute & Keywords \\
\midrule
emotional_intelligence & ['empathetic', 'supportive', 'compassionate', 'understanding', 'caring', 'patient', 'nurturing'] \\
competitiveness & ['competitive', 'driven', 'goal-oriented', 'focused', 'ambitious', 'outperformed', 'won', 'achieved'] \\
stability_reliability & ['reliable', 'consistent', 'punctual', 'dependable', 'steady', 'committed', 'loyal'] \\
\bottomrule
\end{tabular}

