In [1]:
import io
import pathlib
import urllib.request

import numpy as np
import openpyxl
import pandas as pd

# Background (links good as of 2021-01-22)

This fetches data from the following Excel file:

https://dataverse.harvard.edu/api/access/datafile/3078263?format=original&gbrecs=true

This Excel file contains race data by first name. The notebook creates a DataFrame from the `Data` worksheet in this Excel file and then cleans the data.

Information about the contents of these files may be found here:

https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TYJKEZ#

It then writes the data to CSV files for use in Surgeo calculations.


# Constants

In [2]:
# Direct-download URL of the Harvard Dataverse first-name Excel workbook.
HARVARD_FIRST_NAME_URL = 'https://dataverse.harvard.edu/api/access/datafile/3078263?format=original&gbrecs=true'

# Column labels that clean_df() assigns positionally to the raw worksheet.
# The final 'NaN' entry is a placeholder label for the trailing ninth column,
# which has no header and appears empty in the previewed rows.
HARVARD_FIRST_NAME_COLUMNS = [
    'firstname',
    'obs',
    'pcthispanic',
    'pctwhite',
    'pctblack',
    'pctapi',
    'pctaian',
    'pct2prace',
    'NaN',
]

# Columns kept by clean_df(), in output order. 'obs' and the 'NaN'
# placeholder column are dropped.
TARGET_FIRST_NAME_COLUMNS = [
    'firstname',
    'pctwhite',
    'pctblack',
    'pctapi',
    'pctaian',
    'pct2prace',
    'pcthispanic',
]

# Mapping from the worksheet column names to the column names used in the
# frames that are written out as CSV.
TARGET_COLUMN_RENAMES = {
    'firstname': 'name',
    'pcthispanic': 'hispanic',
    'pctwhite': 'white',
    'pctblack': 'black',
    'pctapi': 'api',
    'pctaian': 'native',
    'pct2prace': 'multiple',
}

## Download First Name Data

In [3]:
def url_to_df(url):
    '''Download the Harvard first-name Excel workbook and return it as a DataFrame.

    Parameters
    ----------
    url : str
        Location of the Excel workbook. It must contain a worksheet
        named 'Data' whose first row holds the column headers.

    Returns
    -------
    pandas.DataFrame
        One row per worksheet row below the header, with columns labelled
        by the header row.

    Raises
    ------
    urllib.error.URLError
        If the workbook cannot be downloaded.
    KeyError
        If the workbook has no worksheet named 'Data'.
    '''
    # Download the Excel workbook and buffer it in memory; the HTTP response
    # is closed as soon as the bytes have been read.
    with urllib.request.urlopen(url) as response:
        excel_data = io.BytesIO(response.read())
    # Open the buffered bytes as an openpyxl workbook
    workbook = openpyxl.load_workbook(excel_data)
    # Index the workbook by sheet name; Workbook.get_sheet_by_name is deprecated
    ws_data = workbook['Data']
    # Extract the worksheet data as an iterator of row tuples
    raw_data = ws_data.values
    # The first row holds the column names
    cols = next(raw_data)
    # The remaining rows are the actual data
    values = list(raw_data)
    return pd.DataFrame(values, columns=cols)

In [4]:
# Download and parse the workbook (this cell needs network access).
df_harvard = url_to_df(HARVARD_FIRST_NAME_URL)

In [5]:
# Preview the last raw rows. The recorded output shows an unnamed, empty
# trailing column and an 'ALL OTHER FIRST NAMES' catch-all row.
df_harvard.tail()

Unnamed: 0,firstname,obs,pcthispanic,pctwhite,pctblack,pctapi,pctaian,pct2prace,NaN
4246,ZOFIA,55,0.0,100.0,0.0,0.0,0.0,0.0,
4247,ZOILA,66,86.364,12.121,1.515,0.0,0.0,0.0,
4248,ZOLTAN,47,6.383,87.234,4.255,2.128,0.0,0.0,
4249,ZORAN,52,0.0,100.0,0.0,0.0,0.0,0.0,
4250,ALL OTHER FIRST NAMES,214124,8.226,51.422,11.541,28.117,0.396,0.298,


# Clean Data

In [6]:
def clean_df(df):
    '''Relabel, filter, index and rescale the raw first-name frame.

    The input frame is left untouched; all work is done on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw worksheet data with one column per entry of
        HARVARD_FIRST_NAME_COLUMNS, in that order.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'name' (sorted), holding the renamed
        TARGET_FIRST_NAME_COLUMNS with every value divided by 100.

    Raises
    ------
    ValueError
        If the number of columns in ``df`` differs from
        len(HARVARD_FIRST_NAME_COLUMNS).
    '''
    # Copy first so relabelling the columns does not mutate the caller's frame
    df = df.copy()
    # Change names (positional relabel of the worksheet columns)
    df.columns = HARVARD_FIRST_NAME_COLUMNS
    # Filter columns
    df = df[TARGET_FIRST_NAME_COLUMNS]
    # Rename columns
    df = df.rename(columns=TARGET_COLUMN_RENAMES)
    # Set index to name
    df = df.set_index('name')
    # Sort index
    df = df.sort_index()
    # Convert percentages to 0 to 1 numbers
    df = df / 100
    return df

In [7]:
# NOTE: this rebinds df_harvard, so the cell is not idempotent. clean_df()
# assigns nine column labels, and the cleaned frame has only six columns,
# so re-run the download cell before re-running this one.
df_harvard = clean_df(df_harvard)

In [8]:
# Preview the cleaned frame: indexed by name, percentages divided by 100.
df_harvard.tail()

Unnamed: 0_level_0,white,black,api,native,multiple,hispanic
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ZOE,0.87255,0.04902,0.03922,0.0,0.0,0.03922
ZOFIA,1.0,0.0,0.0,0.0,0.0,0.0
ZOILA,0.12121,0.01515,0.0,0.0,0.0,0.86364
ZOLTAN,0.87234,0.04255,0.02128,0.0,0.0,0.06383
ZORAN,1.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Round every value to 4 decimal places before the frame is written to CSV
df_harvard = df_harvard.round(4)

In [10]:
# Check the rounding (in the recorded output, ZOE/white goes 0.87255 -> 0.8726).
df_harvard.tail()

Unnamed: 0_level_0,white,black,api,native,multiple,hispanic
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ZOE,0.8726,0.049,0.0392,0.0,0.0,0.0392
ZOFIA,1.0,0.0,0.0,0.0,0.0,0.0
ZOILA,0.1212,0.0151,0.0,0.0,0.0,0.8636
ZOLTAN,0.8723,0.0426,0.0213,0.0,0.0,0.0638
ZORAN,1.0,0.0,0.0,0.0,0.0,0.0


# Write data to module as CSV

In [11]:
# Resolve <project>/surgeo/data relative to the working directory; the
# project root is taken to be the parent of the directory the notebook runs in.
current_directory = pathlib.Path.cwd()
project_directory = current_directory.parents[0]
data_directory = project_directory.joinpath('surgeo', 'data')
path_harvard = data_directory.joinpath('prob_race_given_first_name_harvard.csv')
# Persist the cleaned, rounded frame (index 'name' is written as the first column)
df_harvard.to_csv(path_harvard)

# Create the reverse mapping data

In [12]:
# Per-race column sums, scaled down by 100 so that each normalised column
# below sums to 100 rather than 1.
column_totals = df_harvard.sum(axis='index') / 100.0
# Divide every column by its scaled total to get each name's share within a race.
# NOTE(review): every name is weighted equally here, because the 'obs' counts
# were dropped during cleaning. Confirm that this is the intended estimate.
ratio_by_column = df_harvard.div(column_totals, axis='columns').copy()

In [13]:
# Preview the per-race normalised ratios for the last few names.
ratio_by_column.tail()

Unnamed: 0_level_0,white,black,api,native,multiple,hispanic
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ZOE,0.031176,0.020898,0.005225,0.0,0.0,0.008734
ZOFIA,0.035728,0.0,0.0,0.0,0.0,0.0
ZOILA,0.00433,0.00644,0.0,0.0,0.0,0.192423
ZOLTAN,0.031165,0.018168,0.002839,0.0,0.0,0.014216
ZORAN,0.035728,0.0,0.0,0.0,0.0,0.0


In [14]:
# Write the first-name-given-race ratios next to the race-given-first-name CSV
# (data_directory is defined in the earlier CSV-writing cell).
rbc_path = data_directory / 'prob_first_name_given_race_harvard.csv'
ratio_by_column.to_csv(rbc_path)