In [155]:
import pandas as pd
import censusdata
import csv
from pathlib import Path
import os
import sys

# Put the parent of the working directory on the import path (once) so that
# the `etl` package imported right after this can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from etl.sources.census.etl_utils import get_state_fips_codes

# ACS 5-year release that every censusdata call in this notebook targets.
ACS_YEAR = 2019

# Data lives in a `data` directory next to the current working directory;
# this notebook's output goes into a year-specific dataset subdirectory.
DATA_PATH = Path.cwd().parent.joinpath("data")
OUTPUT_PATH = DATA_PATH.joinpath("dataset", f"census_acs_{ACS_YEAR}")

# Column names used for the derived fields.
GEOID_FIELD_NAME = "GEOID10"
UNEMPLOYED_FIELD_NAME = "Unemployed civilians (fraction)"
LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (fraction)"
LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = "Linguistic isolation (total)"

# ACS table C16002 variables needed for the linguistic isolation calculation.
# NOTE(review): presumably total households plus one limited-English count
# per language group -- confirm against the C16002 table shell.
LINGUISTIC_ISOLATION_FIELDS = [
    "C16002_001E",  # used as the denominator of the fraction
    "C16002_004E",  # the remaining four are summed to form the numerator
    "C16002_007E",
    "C16002_010E",
    "C16002_013E",
]

# Display tweaks for readability: keep wide frames on a single block instead
# of wrapping them, and show floats with two digits of precision.
pd.options.display.expand_frame_repr = False
pd.options.display.precision = 2

In [163]:
# For variable discovery, if necessary.
# The year comes from ACS_YEAR so discovery always runs against the same ACS
# release that the download cell uses.
# NOTE(review): the recorded run of this search returned an empty list; the
# phrase may be a table title rather than a variable label -- confirm whether
# the "concept" field was intended.
censusdata.search(
    "acs5",
    ACS_YEAR,
    "label",
    "Monthly Housing Costs as a Percentage of Household Income",
)

[]

In [157]:
# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.
# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx
# Print the variable listing of each table this notebook draws from:
# B23025 (employment status) first, then C16002 (linguistic isolation inputs).
for table_id in ("B23025", "C16002"):
    censusdata.printtable(
        censusdata.censustable(src="acs5", year=ACS_YEAR, table=table_id)
    )

Variable     | Table                          | Label                                                    | Type 
-------------------------------------------------------------------------------------------------------------------
B23025_001E  | EMPLOYMENT STATUS FOR THE POPU | !! Estimate Total:                                       | int  
B23025_002E  | EMPLOYMENT STATUS FOR THE POPU | !! !! Estimate Total: In labor force:                    | int  
B23025_003E  | EMPLOYMENT STATUS FOR THE POPU | !! !! !! Estimate Total: In labor force: Civilian labor  | int  
B23025_004E  | EMPLOYMENT STATUS FOR THE POPU | !! !! !! !! Estimate Total: In labor force: Civilian lab | int  
B23025_005E  | EMPLOYMENT STATUS FOR THE POPU | !! !! !! !! Estimate Total: In labor force: Civilian lab | int  
B23025_006E  | EMPLOYMENT STATUS FOR THE POPU | !! !! !! Estimate Total: In labor force: Armed Forces    | int  
B23025_007E  | EMPLOYMENT STATUS FOR THE POPU | !! !! Estimate Total: Not in labor force     

In [162]:
def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:
    """Build a FIPS code string from a censusgeo index entry.

    The entries of ``censusgeo.params()`` are unpacked as (name, value) pairs;
    the names are ignored and the values are concatenated, in order, with no
    separator.
    """
    return "".join(value for _, value in censusgeo.params())


# Download the employment and linguistic isolation variables one
# state/territory at a time, collecting one DataFrame per download.
dfs = []
for fips in get_state_fips_codes(DATA_PATH):
    print(f"Downloading data for state/territory with FIPS code {fips}")

    # Wildcards request every county and every block group in the state.
    state_block_groups = censusdata.censusgeo(
        [("state", fips), ("county", "*"), ("block group", "*")]
    )
    # Employment fields first, followed by the linguistic isolation fields.
    state_variables = ["B23025_005E", "B23025_003E"] + LINGUISTIC_ISOLATION_FIELDS

    dfs.append(
        censusdata.download(
            src="acs5",
            year=ACS_YEAR,
            geo=state_block_groups,
            var=state_variables,
        )
    )


# Stack the per-state frames into a single frame.
df = pd.concat(dfs)

# The frame is indexed by censusgeo objects; turn each one into a GEOID string.
df[GEOID_FIELD_NAME] = [fips_from_censusdata_censusgeo(geo) for geo in df.index]

df.head()

Downloading data for state/territory with FIPS code 01
Downloading data for state/territory with FIPS code 02
Downloading data for state/territory with FIPS code 04
Downloading data for state/territory with FIPS code 05
Downloading data for state/territory with FIPS code 06
Downloading data for state/territory with FIPS code 08
Downloading data for state/territory with FIPS code 09
Downloading data for state/territory with FIPS code 10
Downloading data for state/territory with FIPS code 11
Downloading data for state/territory with FIPS code 12
Downloading data for state/territory with FIPS code 13
Downloading data for state/territory with FIPS code 15
Downloading data for state/territory with FIPS code 16
Downloading data for state/territory with FIPS code 17
Downloading data for state/territory with FIPS code 18
Downloading data for state/territory with FIPS code 19
Downloading data for state/territory with FIPS code 20
Downloading data for state/territory with FIPS code 21
Downloadin

Unnamed: 0,B23025_005E,B23025_003E,C16002_001E,C16002_004E,C16002_007E,C16002_010E,C16002_013E,GEOID10
"Block Group 2, Census Tract 9620, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:962000> block group:2",32,415,423,0,0,0,0,10399620002
"Block Group 2, Census Tract 9618, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:961800> block group:2",65,515,664,0,0,0,0,10399618002
"Block Group 4, Census Tract 9616, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:961600> block group:4",55,413,328,0,0,0,0,10399616004
"Block Group 2, Census Tract 9616, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:961600> block group:2",15,531,381,0,0,0,0,10399616002
"Block Group 1, Census Tract 9616, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:961600> block group:1",44,698,625,0,0,0,0,10399616001


In [159]:
# Calculate the unemployment fraction as B23025_005E divided by B23025_003E
# (presumably unemployed civilians over the civilian labor force -- confirm
# against the B23025 table shell).
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
df[UNEMPLOYED_FIELD_NAME] = df["B23025_005E"].div(df["B23025_003E"])

df.head()

Unnamed: 0,B23025_005E,B23025_003E,C16002_001E,C16002_004E,C16002_007E,C16002_010E,C16002_013E,GEOID10,Unemployed civilians (fraction)
"Block Group 2, Census Tract 9620, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:962000> block group:2",32,415,423,0,0,0,0,10399620002,0.08
"Block Group 2, Census Tract 9618, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:961800> block group:2",65,515,664,0,0,0,0,10399618002,0.13
"Block Group 4, Census Tract 9616, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:961600> block group:4",55,413,328,0,0,0,0,10399616004,0.13
"Block Group 2, Census Tract 9616, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:961600> block group:2",15,531,381,0,0,0,0,10399616002,0.03
"Block Group 1, Census Tract 9616, Covington County, Alabama: Summary level: 150, state:01> county:039> tract:961600> block group:1",44,698,625,0,0,0,0,10399616001,0.06


In [160]:
# Calculate linguistic isolation: the summed limited-English counts as a
# fraction of C16002_001E.
individual_limited_english_fields = [
    "C16002_004E",
    "C16002_007E",
    "C16002_010E",
    "C16002_013E",
]

# Numerator: row-wise total across the four fields (missing values skipped,
# which is the pandas default).
df[LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[
    individual_limited_english_fields
].sum(axis="columns")

# Fraction: cast the total to float before dividing by the C16002_001E count.
df[LINGUISTIC_ISOLATION_FIELD_NAME] = (
    df[LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float).div(df["C16002_001E"])
)

# Reorder the frame in place, highest linguistic isolation first, so the
# preview shows the most isolated block groups.
df.sort_values(by=LINGUISTIC_ISOLATION_FIELD_NAME, ascending=False, inplace=True)

df.head()

Unnamed: 0,B23025_005E,B23025_003E,C16002_001E,C16002_004E,C16002_007E,C16002_010E,C16002_013E,GEOID10,Unemployed civilians (fraction),Linguistic isolation (total),Linguistic isolation (fraction)
"Block Group 5, Census Tract 308.02, Marshall County, Alabama: Summary level: 150, state:01> county:095> tract:030802> block group:5",0,1232,717,287,16,0,0,10950308025,0.0,303,0.42
"Block Group 1, Census Tract 60, Montgomery County, Alabama: Summary level: 150, state:01> county:101> tract:006000> block group:1",0,406,290,109,0,0,0,11010060001,0.0,109,0.38
"Block Group 1, Census Tract 133, Jefferson County, Alabama: Summary level: 150, state:01> county:073> tract:013300> block group:1",44,353,238,85,0,0,0,10730133001,0.12,85,0.36
"Block Group 4, Census Tract 306, Russell County, Alabama: Summary level: 150, state:01> county:113> tract:030600> block group:4",0,289,463,164,0,0,0,11130306004,0.0,164,0.35
"Block Group 1, Census Tract 303.15, Shelby County, Alabama: Summary level: 150, state:01> county:117> tract:030315> block group:1",25,895,497,169,0,0,0,11170303151,0.03,169,0.34


In [161]:
# Create the year-specific output directory (and any missing parents);
# an already-existing directory is not an error.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Export every indicator derived in this notebook, keyed by GEOID. The
# linguistic isolation fraction is included so that it is not computed and
# then silently dropped from the output file.
columns_to_include = [
    GEOID_FIELD_NAME,
    UNEMPLOYED_FIELD_NAME,
    LINGUISTIC_ISOLATION_FIELD_NAME,
]

# index=False: the censusgeo index is not written; GEOID is the row key.
df[columns_to_include].to_csv(path_or_buf=OUTPUT_PATH / "usa.csv", index=False)