In [90]:
# Import required packages
import pandas as pd
import numpy as np

In [91]:
# Variables to tweak for data creation
np.random.seed(0)
sample_size = 10

In [92]:
# Read data into environment
last_names = pd.read_csv('data/common_surnames_census_2000.csv')
first_names = pd.read_csv('data/ssa_names_db.csv')

In [93]:
# Data preprocessing for Last Names

# Replace unknown value with 0
# Prevents conflicts when finding max(percentages)
last_names2 = last_names.replace('(S)', 0.00)

# Convert percentage columns from strings to floats
for column in last_names2.columns[1:]:
    if last_names2[column].dtype == 'object':
        last_names2[column] = last_names2[column].astype(float)

# Create new column based on the ethnicity label with highest probability
last_names2['predominant'] = last_names2.iloc[:,5:].idxmax(1).str.replace('pct', '')

# Sample evenly through each unique dominant ethnicity
# Prevents most names being white and promotes even representation
last_names_final = last_names2.groupby('predominant').apply(lambda ethnicity: ethnicity.sample(sample_size)).reset_index(drop=True)

In [94]:
# Data preprocessing for First Names
# Multiply sample_size by 3 to keep same dimension as Last Names
# 6 Ethnicities / 2 Genders
first_names_final = first_names.groupby('gender').apply(lambda gender: gender.sample(sample_size*3)).reset_index(drop=True)

In [97]:
# Creating Full Names dataset
# Extract relevant features from First and Last Name datasets
fnames = first_names_final.iloc[:,0]
lnames = last_names_final.iloc[:,0].str.capitalize()
ffeatures = first_names_final.iloc[:,1]
lfeatures = last_names_final.iloc[:,5:]
# Join all features together in final dataset
full_names = pd.concat([fnames,lnames,lfeatures,ffeatures], axis= 1)

In [98]:
full_names

Unnamed: 0,name,name.1,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic,predominant,gender
0,Doria,Jadoo,6.09,13.91,33.04,0.0,39.13,0.0,2prace,F
1,Symia,Ramsumair,0.0,23.14,35.54,0.0,36.36,0.0,2prace,F
2,Dalaysia,Ramsammy,0.0,24.05,33.21,0.0,38.17,2.29,2prace,F
3,Vallery,Sanichar,0.0,8.25,39.81,0.0,46.12,4.85,2prace,F
4,Miakoda,Deonarine,6.27,13.05,33.42,5.74,38.38,3.13,2prace,F
5,Ezabella,Seepersaud,0.0,4.95,34.28,6.71,49.47,0.0,2prace,F
6,Mariame,Alli,18.96,25.52,23.12,2.4,25.91,4.09,2prace,F
7,Raedyn,Fortes,22.27,17.43,13.73,0.4,29.41,16.76,2prace,F
8,Christin,Mahabir,5.28,20.65,29.55,3.03,36.69,4.79,2prace,F
9,Brenda,Sukhai,6.73,10.58,27.88,14.42,40.38,0.0,2prace,F
