In [1]:
import csv
import random
import numpy as np
import pandas as pd
from itertools import combinations
from collections import defaultdict, OrderedDict

seed = 1
random.seed(seed)
np.random.seed(seed)

## Dataset Setup

### Sets of First Names

**First Name Set ID** | Gender | Race | Popularity | Decade | Corresponding Last Name Set ID
--- | --- | --- | --- | --- | ---
**1** | M | White | Top | 2000s | 1
**2** | F | White | Top | 2000s | 1
**3** | M | White | Medium | 2000s | 2
**4** | F | White | Medium | 2000s | 2
**5** | M | White | Bottom | 2000s | 3
**6** | F | White | Bottom | 2000s | 3
**7** | M | Black | Medium | 2000s | 4
**8** | F | Black | Medium | 2000s | 4
**9** | M | Asian | Medium | 2000s | 5
**10** | F | Asian | Medium | 2000s | 5
**11** | M | Hispanic | Medium | 2000s | 6
**12** | F | Hispanic | Medium | 2000s | 6
**13** | M | White | Top | 1970s | 1
**14** | F | White | Top | 1970s | 1
**15** | M | White | Top | 1940s | 1
**16** | F | White | Top | 1940s | 1

### Sets of Last Names

**Last Name Set ID** | Gender | Race | Popularity | Decade | Corresponding First Name Set ID
--- | --- | --- | --- | --- | ---
**1** | NA | White | Top | 2000s | 1,2,13,14,15,16
**2** | NA | White | Medium | 2000s | 3,4
**3** | NA | White | Bottom | 2000s | 5,6
**4** | NA | Black | Medium | 2000s | 7,8
**5** | NA | Asian | Medium | 2000s | 9,10
**6** | NA | Hispanic | Medium | 2000s | 11,12


### Ways to Compare the Results

**Dimension** | Name Sets
--- | --- 
**Gender** | Male:{1,3,5,7,9,11,13,15} vs Female:{2,4,6,8,10,12,14,16}
**Race** | White:{3,4} vs Black:{7,8} vs Asian:{9,10} vs Hispanic:{11,12}
**Popularity** | Top:{1,2} vs Medium:{3,4} vs Bottom:{5,6}
**Decade** | 2000s:{1,2} vs 1970s:{13,14} vs 1940s:{15,16}

## Prepare Sets of First Names

In [3]:
# get the count and rank of first names, grouped by gender and decade

genders = ['M', 'F']
years, duration = [1940, 1970, 2000], 10
first_count = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
first_rank = defaultdict(lambda: defaultdict(OrderedDict))

for year in years:
    for i in range(duration):
        with open(f'../Data/General/Raw/Name/firstnames-socialsecurity-{year+i}.txt', 'r') as file:
            for line in file:
                first, gender, count = line[:-1].split(',')
                first_count[year][gender][first] += int(count)
                
    for gender in genders:
        for rank, (first, _) in enumerate(sorted(first_count[year][gender].items(), key=lambda x:x[1], reverse=True)):
            first_rank[year][gender][first] = rank
            
gender_first = {gender: set([first for year in years for first in first_rank[year][gender]]) for gender in genders}
ambiguous_first = gender_first['M'].intersection(gender_first['F'])
gender_first = {gender: gender_first[gender]-ambiguous_first for gender in genders}

In [5]:
# get the race of first names
# discard names where the ratio gap between the largest and second largest races is less than 10%

races = ['hispanic', 'white', 'black', 'asian']
index2race = {index:race for index, race in enumerate(races)}

threshold = 10
first_race = {}
df = pd.read_excel('../Data/General/Raw/Name/firstnames-mortgage-2010.xlsx', sheet_name=None)['Data']

for i in range(len(df)-1):
    first, race_dist = df['firstname'][i], np.array([df['pcthispanic'][i], df['pctwhite'][i], df['pctblack'][i], df['pctapi'][i], df['pctaian'][i], df['pct2prace'][i]])
    second_max, first_max = np.sort(race_dist)[-2:]
    if first_max-second_max >= threshold:
        first = first[0] + first[1:].lower()
        first_race[first] = index2race[race_dist.argmax()]
        
race_first = defaultdict(set)
for first, race in first_race.items():
    race_first[race].add(first)

In [6]:
# generate sets 1,2,13,14,15,16: M/F white top 1940s/1970s/2000s
# make sure that these names are representative (do not appear in the top 50 of other decades)

first_remove = defaultdict(dict)
for year in years:
    for gender in genders:
        first_remove[year][gender] = set(list(first_rank[year][gender].keys())[:50])
        
race = 'white'
size = 20
first_top = defaultdict(lambda: defaultdict(list))
for year in years:
    for gender in genders:
        for first in first_rank[year][gender]:
            if first not in race_first[race]: continue
            
            unique = True
            for other_year in years:
                for other_gender in genders:
                    if year == other_year and gender == other_gender: continue
                    if first in first_remove[other_year][other_gender]: unique = False; break
            
            if unique: 
                first_top[year][gender].append(first)
                if len(first_top[year][gender]) >= size: break

In [7]:
# generate sets 3,4,7,8,9,10,11,12: M/F white/black/asian/hispanic medium 2000s
# sample from the range between rank 400 and rank 8000

year = 2000
range_lower, range_upper = 400, 8000
race_candidates = defaultdict(lambda: defaultdict(list))

for gender in genders:
    for first in list(first_rank[year][gender].keys())[range_lower:range_upper]:
        if first in first_race:
            race_candidates[gender][first_race[first]].append(first)
            
for race in races:
    popular = set() if race!='white' else set([first for year in years for gender in genders for first in first_top[year][gender]])
    gender_overlap = list(set(race_candidates[genders[0]][race]).intersection(set(race_candidates[genders[1]][race])))
    random.shuffle(gender_overlap)    
    for i, gender in enumerate(genders):
        candidates, overlap = [], gender_overlap[i*len(gender_overlap)//2:(i+1)*len(gender_overlap)//2]
        for first in race_candidates[gender][race]:
            if first not in popular and first not in overlap: candidates.append(first)
        race_candidates[gender][race] = candidates
            
first_medium = defaultdict(lambda: defaultdict(dict))
for gender in genders:
    for race in races:
        sampled_indices = sorted(random.sample(range(len(race_candidates[gender][race])), size))
        first_medium[gender][race] = np.array(race_candidates[gender][race])[sampled_indices].tolist()

In [8]:
# generate sets 5,6: M/F white bottom 2000s
# break ties by random sampling

year = 2000
race = 'white'
first_bottom = defaultdict(list)
for gender in genders:
    candidates = set()
    for first in list(first_rank[year][gender].keys())[::-1]:
        if first_count[year][gender][first] > 5: break
        if first in race_first[race]: candidates.add(first)
        
    for other_gender in genders:
        candidates -= set(first_medium[other_gender][race])
        for other_year in years:
            candidates -= set(first_top[other_year][other_gender])
    first_bottom[gender] = random.sample(list(candidates), size)

In [11]:
# assign sets to their ids

sets, sets_info = {}, {}

sets_info[1] = ('M', 'white', 'top', '2000s')
sets[1] = first_top[2000]['M']
sets_info[2] = ('F', 'white', 'top', '2000s')
sets[2] = first_top[2000]['F']

sets_info[3] = ('M', 'white', 'medium', '2000s')
sets[3] = first_medium['M']['white']
sets_info[4] = ('F', 'white', 'medium', '2000s')
sets[4] = first_medium['F']['white']

sets_info[5] = ('M', 'white', 'bottom', '2000s')
sets[5] = first_bottom['M']
sets_info[6] = ('F', 'white', 'bottom', '2000s')
sets[6] = first_bottom['F']

sets_info[7] = ('M', 'black', 'medium', '2000s')
sets[7] = first_medium['M']['black']
sets_info[8] = ('F', 'black', 'medium', '2000s')
sets[8] = first_medium['F']['black']

sets_info[9] = ('M', 'asian', 'medium', '2000s')
sets[9] = first_medium['M']['asian']
sets_info[10] = ('F', 'asian', 'medium', '2000s')
sets[10] = first_medium['F']['asian']

sets_info[11] = ('M', 'hispanic', 'medium', '2000s')
sets[11] = first_medium['M']['hispanic']
sets_info[12] = ('F', 'hispanic', 'medium', '2000s')
sets[12] = first_medium['F']['hispanic']

sets_info[13] = ('M', 'white', 'top', '1970s')
sets[13] = first_top[1970]['M']
sets_info[14] = ('F', 'white', 'top', '1970s')
sets[14] = first_top[1970]['F']

sets_info[15] = ('M', 'white', 'top', '1940s')
sets[15] = first_top[1940]['M']
sets_info[16] = ('F', 'white', 'top', '1940s')
sets[16] = first_top[1940]['F']

In [12]:
# check for any duplicate

first_duplicates = defaultdict(set)
for id_1, id_2 in combinations(range(1,17), 2):
    intersection = set(sets[id_1]).intersection(set(sets[id_2]))
    for first in intersection:
        first_duplicates[first].add(id_1)
        first_duplicates[first].add(id_2)
        
print('Any duplicate:', len(first_duplicates))

Any duplicate: 0


In [32]:
# summarize the sets

for i in range(1,17):
    top, bottom = sets[i][0], sets[i][-1]
    gender, race, popularity, decade = sets_info[i]
    year = int(decade[:-1])
    print(gender, race, popularity, decade, '|', 
          top, first_rank[year][gender][top], first_count[year][gender][top], '|', 
          bottom, first_rank[year][gender][bottom], first_count[year][gender][bottom])

M white top 2000s | Jacob 0 273911 | Evan 44 92128
F white top 2000s | Emily 0 223714 | Anna 23 90215
M white medium 2000s | Jessie 462 5594 | Allison 5891 131
F white medium 2000s | Karissa 577 5092 | Claudine 7959 141
M white bottom 2000s | Marti 24421 5 | Rhea 22496 5
F white bottom 2000s | Kamran 30931 5 | Babette 33443 
M black medium 2000s | Kelvin 427 6259 | Odell 4398 198
F black medium 2000s | Lillie 559 5245 | Felecia 7797 145
M asian medium 2000s | Romeo 541 4475 | Rajeev 7813 86
F asian medium 2000s | Estrella 457 6585 | Xin 7901 143
M hispanic medium 2000s | Leonel 493 5137 | Bernardino 5404 148
F hispanic medium 2000s | Araceli 606 4713 | Natividad 7817 145
M white top 1970s | Brian 7 322825 | Bradley 56 58716
F white top 1970s | Amy 1 269004 | Andrea 27 86431
M white top 1940s | Larry 10 255890 | Billy 41 67826
F white top 1940s | Linda 1 531650 | Brenda 25 112407


In [25]:
# save the name sets

fields = ['SetID', 'Gender', 'Race', 'Popularity', 'Decade', 'Name', 'Rank', 'Count']
with open('../Data/General/Input/names-first.csv', 'w') as file: 
    writer = csv.writer(file)
    writer.writerow(fields)
    
    for set_id in range(1, len(sets_info)+1):
        gender, race, popularity, decade = sets_info[set_id]
        year = int(decade[:-1])
        
        for first in sets[set_id]:
            rank, count = first_rank[year][gender][first], first_count[year][gender][first]
            writer.writerow([set_id, gender, race, popularity, decade, first, rank, count])

## Prepare Sets of Last Names

In [26]:
# get the rank, count, and race of last names
# discard last names where the ratio gap between the largest and second largest races is less than 10%

races = ['hispanic', 'white', 'black', 'asian']
index2race = {5:'hispanic', 0:'white', 1:'black', 2:'asian'}

threshold = 10
last_rank, last_count, last_race = OrderedDict(), {}, {}

with open('../Data/General/Raw/Name/surnames-census-2000.csv', 'r') as file:
    reader = csv.reader(file)
    _ = next(reader)
    for row in reader:
        last, rank, count, race_dist = row[0], int(row[1]), int(row[2]), np.array([0 if each=='(S)' else float(each) for each in row[-6:]])
        second_max, first_max = np.sort(race_dist)[-2:]
        if first_max-second_max >= threshold and race_dist.argmax() in index2race:
            last = last[0] + last[1:].lower()
            last_rank[last] = rank
            last_count[last] = count
            last_race[last] = index2race[race_dist.argmax()]
            
race_last = defaultdict(list)
for last, race in last_race.items():
    race_last[race].append(last)

In [27]:
# generate sets 1,3: NA white top/bottom 2000s

race, size = 'white', 20
last_top = race_last[race][:size]

candidates = [last for last in race_last[race][::-1] if last_count[last]==100][::-1]
sampled_indices = sorted(random.sample(range(len(candidates)), size))
last_bottom = np.array(candidates)[sampled_indices].tolist()

In [28]:
# generate sets 2,4,5,6: NA white/black/asian/hispanic medium 2000s
# sample from the range between rank 400 and rank 8000

range_lower, range_upper = 400, 8000
race_candidates = defaultdict(list)

for last in list(last_rank.keys())[range_lower:range_upper]:
    race_candidates[last_race[last]].append(last)

last_medium = {}
for race in races:
    sampled_indices = sorted(random.sample(range(len(race_candidates[race])), size))
    last_medium[race] = np.array(race_candidates[race])[sampled_indices].tolist()

In [29]:
# assign sets to their ids

sets, sets_info = {}, {}

sets_info[1] = ('NA', 'white', 'top', '2000s')
sets[1] = last_top
sets_info[2] = ('NA', 'white', 'medium', '2000s')
sets[2] = last_medium['white']
sets_info[3] = ('NA', 'white', 'bottom', '2000s')
sets[3] = last_bottom

sets_info[4] = ('NA', 'black', 'medium', '2000s')
sets[4] = last_medium['black']
sets_info[5] = ('NA', 'asian', 'medium', '2000s')
sets[5] = last_medium['asian']
sets_info[6] = ('NA', 'hispanic', 'medium', '2000s')
sets[6] = last_medium['hispanic']

In [31]:
# check for any duplicate

last_duplicates = defaultdict(set)
for id_1, id_2 in combinations(range(1,7), 2):
    intersection = set(sets[id_1]).intersection(set(sets[id_2]))
    for last in intersection:
        last_duplicates[last].add(id_1)
        last_duplicates[last].add(id_2)
        
print('Any duplicate:', len(last_duplicates))

Any duplicate: 0


In [36]:
# summarize the sets

for i in range(1,7):
    top, bottom = sets[i][0], sets[i][-1]
    gender, race, popularity, decade = sets_info[i]
    year = int(decade[:-1])
    print(gender, race, popularity, decade, '|', 
          top, last_rank[top], last_count[top], '|', 
          bottom, last_rank[bottom], last_count[bottom])

NA white top 2000s | Smith 1 2376206 | Young 31 465948
NA white medium 2000s | Whitehead 721 43310 | Kenner 7937 3865
NA white bottom 2000s | Benhamou 150436 100 | Tennity 150436 100
NA black medium 2000s | Booker 902 35101 | Belle 8155 3742
NA asian medium 2000s | Ahmed 1206 26607 | Kobayashi 8097 3772
NA hispanic medium 2000s | Salgado 1012 31627 | Atencio 7794 3933


In [45]:
# save the name sets

fields = ['SetID', 'Gender', 'Race', 'Popularity', 'Decade', 'Last', 'Rank', 'Count']
with open('../Data/General/Input/names-last.csv', 'w') as file: 
    writer = csv.writer(file)
    writer.writerow(fields)
    
    for set_id in range(1, len(sets_info)+1):
        gender, race, popularity, decade = sets_info[set_id]
        year = int(decade[:-1])
        
        for last in sets[set_id]:
            rank, count = last_rank[last], last_count[last]
            writer.writerow([set_id, gender, race, popularity, decade, last, rank, count])