In [17]:
# Import required packages
import pandas as pd
import numpy as np

In [18]:
# Variables to tweak for data creation
np.random.seed(0)
sample_size = 10

In [19]:
# Read data into environment
last_names = pd.read_csv('data/common_surnames_census_2000.csv').rename(columns={'pct2prace': 'pctmixed'})
first_names = pd.read_csv('data/ssa_names_db.csv')

In [20]:
# Data preprocessing for Last Names

# Replace unknown value with 0
# Prevents conflicts when finding max(percentages)
last_names2 = last_names.replace('(S)', 0.00)

# Convert percentage columns from strings to floats
for column in last_names2.columns[1:]:
    if last_names2[column].dtype == 'object':
        last_names2[column] = last_names2[column].astype(float)

# Create new column based on the ethnicity label with highest probability
last_names2['predominant'] = last_names2.iloc[:,5:].idxmax(1).str.replace('pct', '')

# Sample evenly through each unique dominant ethnicity
# Prevents most names being white and promotes even representation
last_names_final = last_names2.groupby('predominant').apply(lambda ethnicity: ethnicity.sample(sample_size)).reset_index(drop=True)

In [21]:
# Data preprocessing for First Names
# Multiply sample_size by 3 to keep same dimension as Last Names
# 6 Ethnicities / 2 Genders
first_names_final = first_names.groupby('gender').apply(lambda gender: gender.sample(sample_size*3)).reset_index(drop=True)

In [22]:
# Creating Full Names dataset
# Extract relevant features from First and Last Name datasets
fnames = first_names_final.iloc[:,0]
lnames = last_names_final.iloc[:,0].str.capitalize()
ffeatures = first_names_final.iloc[:,1]
lfeatures = last_names_final.iloc[:,5:]
# Join all features together in final dataset
full_names = pd.concat([fnames,lnames,lfeatures,ffeatures], axis= 1)

In [24]:
full_names

Unnamed: 0,name,name.1,pctwhite,pctblack,pctapi,pctasian,pctmixed,pcthispanic,predominant,gender
0,Ameli,Manaloto,2.99,0.0,92.74,0.0,3.85,0.0,api,F
1,Avayah,Fujiki,0.0,0.0,88.32,0.0,8.76,0.0,api,F
2,British,Gapuz,8.9,0.0,78.77,0.0,0.0,9.59,api,F
3,Kennidee,Wakabayashi,5.31,0.0,86.21,0.0,7.43,0.0,api,F
4,Henri,Nahar,11.75,3.21,73.72,0.0,8.12,0.0,api,F
5,Surabhi,Liam,33.33,5.07,53.62,0.0,0.0,4.35,api,F
6,Nanako,Subido,5.77,0.0,79.49,0.0,6.41,8.33,api,F
7,Aidalyn,Hsiang,2.81,0.0,94.04,0.0,2.81,0.0,api,F
8,Albany,Hanada,4.17,0.0,83.33,0.0,10.42,0.0,api,F
9,Kianna,Bolosan,3.85,0.0,87.09,0.0,7.14,1.65,api,F
