In [36]:
# Import required packages
import pandas as pd
import numpy as np

In [37]:
# Variables to tweak for data creation
np.random.seed(0)
sample_size = 10

In [38]:
# Read data into environment
# See name_data_explaination for data collection methods
# AIAN - American Indian or Alaskan Native
# API - Asian Pacific Islander
last_names = pd.read_csv('data/common_surnames_census_2000.csv').rename(columns={'pct2prace': 'pctmixed'})
first_names = pd.read_csv('data/ssa_names_db.csv')

In [39]:
# Data preprocessing for Last Names

# Fields suppressed for confidentiality are assigned the value (S). 
# Replace confidentiality value with 0
# Prevents conflicts when finding max(percentages)
last_names2 = last_names.replace('(S)', 0.00)

# Convert percentage columns from strings to floats
for column in last_names2.columns[1:]:
    if last_names2[column].dtype == 'object':
        last_names2[column] = last_names2[column].astype(float)

# Create new column based on the ethnicity label with highest probability
last_names2['predominant'] = last_names2.iloc[:,5:].idxmax(1).str.replace('pct', '')

# Sample evenly through each unique dominant ethnicity
# Prevents most names being white and promotes even representation
last_names_final = last_names2.groupby('predominant').apply(lambda ethnicity: ethnicity.sample(sample_size)).reset_index(drop=True)

In [None]:
# Potential other way of handling confidentiality
# 27,649 Names after removal
'''
t = last_names.replace('(S)', None)
t.dropna()
'''

In [40]:
# Data preprocessing for First Names
# Multiply sample_size by 3 to keep same dimension as Last Names
# 6 Ethnicities / 2 Genders
first_names_final = first_names.groupby('gender').apply(lambda gender: gender.sample(sample_size*3)).reset_index(drop=True)

In [41]:
# Creating Full Names dataset
# Extract relevant features from First and Last Name datasets
fnames = first_names_final.iloc[:,0]
lnames = last_names_final.iloc[:,0].str.capitalize()
ffeatures = first_names_final.iloc[:,1]
lfeatures = last_names_final.iloc[:,5:]
# Join all features together in final dataset
full_names = pd.concat([fnames,lnames,lfeatures,ffeatures], axis= 1)

In [42]:
full_names

Unnamed: 0,name,name.1,pctwhite,pctblack,pctapi,pctaian,pctmixed,pcthispanic,predominant,gender
0,Thyme,Jimmie,27.89,7.32,0.0,60.85,1.41,2.54,aian,F
1,Cathryn,Buffalohead,8.62,0.0,0.0,84.48,5.17,0.0,aian,F
2,Yaneisy,Jimmy,17.13,6.42,7.95,59.94,2.75,5.81,aian,F
3,Elleanor,Decora,36.94,0.0,2.25,52.7,4.95,3.15,aian,F
4,Aunna,Custalow,26.72,14.12,0.0,49.62,7.63,0.0,aian,F
5,Daisymae,Silversmith,28.88,0.0,0.0,65.8,3.3,2.01,aian,F
6,Libbie,Prettyweasel,0.0,0.0,0.0,99.04,0.0,0.0,aian,F
7,River,Walkingstick,24.63,0.0,0.0,62.61,8.61,4.15,aian,F
8,Lakeisha,Whitetree,34.29,0.0,0.0,59.05,4.76,0.0,aian,F
9,Katrina,Bigboy,14.06,0.0,0.0,82.03,0.0,0.0,aian,F


In [57]:
t = last_names.replace('(S)', None)

In [59]:
t.dropna()

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pctmixed,pcthispanic
0,SMITH,1,2376206,880.85,880.85,73.35,22.22,0.40,0.85,1.63,1.56
1,JOHNSON,2,1857160,688.44,1569.30,61.55,33.80,0.42,0.91,1.82,1.50
2,WILLIAMS,3,1534042,568.66,2137.96,48.52,46.72,0.37,0.78,2.01,1.60
3,BROWN,4,1380145,511.62,2649.58,60.71,34.54,0.41,0.83,1.86,1.64
4,JONES,5,1362755,505.17,3154.75,57.69,37.73,0.35,0.94,1.85,1.44
...,...,...,...,...,...,...,...,...,...,...,...
151661,ZAPKA,150436,100,0.04,89753.23,99.00,0.00,0.00,0.00,0.00,0.00
151662,ZELDES,150436,100,0.04,89753.26,99.00,0.00,0.00,0.00,0.00,0.00
151664,ZIEGELHOFER,150436,100,0.04,89753.34,99.00,0.00,0.00,0.00,0.00,0.00
151665,ZIELESCH,150436,100,0.04,89753.37,99.00,0.00,0.00,0.00,0.00,0.00


In [None]:
# Getting OpenAI API ready
import openai

openai.api_key = open('key/Group13_Project_API_Key.txt').read().strip('\n')

text = ''

# Only uncomment when ready to start data collection

# completion = openai.ChatCompletion.create(
#   model="gpt-3.5-turbo", # this is "ChatGPT" $0.002 per 1k tokens
#   messages=[{"role": "user", "content": text}]
# )

# reply_content = completion.choices[0].message.content