In [4]:
import pandas as pd
import censusdata
import csv
from pathlib import Path
import os
import sys

# Resolve the parent of the current working directory to an absolute path and
# make sure it is on the import path exactly once (presumably so the
# project-local `etl` / `utils` imports that follow can be found).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from etl.sources.census.etl_utils import get_state_fips_codes
from utils import unzip_file_from_url, remove_all_from_dir

# Filesystem layout, all anchored on the `data` directory that lives in the
# parent of the current working directory.
DATA_PATH = Path.cwd().parent / "data"
# Scratch space for downloaded / extracted archives.
TMP_PATH = DATA_PATH / "tmp"
# NOTE(review): this URL points at the Housing + Transportation index site and
# is not referenced by any cell visible in this notebook; it looks like a
# leftover from the notebook this one was copied from. Kept so that nothing
# depending on the name breaks — confirm and remove.
HOUSING_FTP_URL = "https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid="
# Destination directory for this notebook's CSV output.
# This previously pointed at "housing_and_transportation_index", i.e. another
# dataset's output directory, so writing `usa.csv` here would overwrite that
# dataset. The name now matches the `hud_housing` temp directory used for the
# download.
OUTPUT_PATH = DATA_PATH / "dataset" / "hud_housing"

# Name given to the geographic identifier column after renaming.
GEOID_FIELD_NAME = "GEOID10"

# Note: some variable definitions.
# HUD-adjusted median family income (HAMFI).
# The four housing problems are: incomplete kitchen facilities, incomplete plumbing facilities, more than 1 person per room, and cost burden greater than 30%.


# Table 8 is the desired table.
# NOTE(review): the bare string below is a Table 3 column name and has no
# effect other than being echoed as the cell's output; it does not match the
# Table 8 columns used later. Kept to preserve the displayed output — confirm
# and remove.
"T3_est29"

# We measure households earning less than 80% of HUD Area Median Family Income by county
# and paying greater than 50% of their income to housing costs.

'T3_est29'

In [5]:
# Download the HUD zip archive and unpack it.
# The archive is fetched with a single call; there is no per-state loop here.
dfs = []  # not read by any cell visible in this notebook; preserved as-is
zip_file_dir = TMP_PATH / "hud_housing"
hud_zip_url = "https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip"

print("Downloading housing data")
# Positional arguments: source URL, TMP_PATH, zip_file_dir.
# NOTE(review): presumably TMP_PATH receives the downloaded zip and
# zip_file_dir receives the extracted files — confirm against
# `utils.unzip_file_from_url`.
unzip_file_from_url(hud_zip_url, TMP_PATH, zip_file_dir)

2021-06-28 18:54:16,431 [utils       ] INFO     Downloading https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip


Downloading housing data


2021-06-28 18:55:26,195 [utils       ] INFO     Extracting /Users/lucas/Documents/usds/repos/justice40-tool/score/data/tmp/downloaded.zip


In [6]:
# Path of the Table 8 CSV inside the extraction directory. The directory name
# "2012thru2016-140-csv" appears twice on purpose: it is nested inside itself.
tmp_csv_file_path = zip_file_dir.joinpath(
    "2012thru2016-140-csv",
    "2012thru2016-140-csv",
    "140",
    "Table8.csv",
)

# Load the table with pandas' default parsing options.
df = pd.read_csv(tmp_csv_file_path)

# Show the first rows as the cell's output.
df.head()

Unnamed: 0,source,sumlevel,geoid,name,st,cnty,tract,T8_est1,T8_est2,T8_est3,...,T8_moe124,T8_moe125,T8_moe126,T8_moe127,T8_moe128,T8_moe129,T8_moe130,T8_moe131,T8_moe132,T8_moe133
0,2012thru2016,140,14000US01001020100,"Census Tract 201, Autauga County, Alabama",1,1,20100,740,550,15,...,44,11,11,11,11,11,11,11,11,11
1,2012thru2016,140,14000US01001020200,"Census Tract 202, Autauga County, Alabama",1,1,20200,840,525,55,...,30,11,11,11,11,11,11,11,11,11
2,2012thru2016,140,14000US01001020300,"Census Tract 203, Autauga County, Alabama",1,1,20300,1225,860,35,...,29,11,11,11,11,11,11,11,11,11
3,2012thru2016,140,14000US01001020400,"Census Tract 204, Autauga County, Alabama",1,1,20400,1775,1440,50,...,71,11,11,11,11,11,11,11,11,11
4,2012thru2016,140,14000US01001020500,"Census Tract 205, Autauga County, Alabama",1,1,20500,4290,2280,80,...,227,18,18,18,115,115,18,18,18,18


In [None]:
# Rename and reformat the census tract ID.
# The rows are census tracts (sumlevel 140), and the raw `geoid` values carry a
# prefix ending in "US", e.g. `14000US01001020100`. Previously only the column
# was renamed, so the prefixed values were stored under GEOID_FIELD_NAME
# unchanged. Everything up to and including the first "US" is now dropped,
# leaving just the digits (`01001020100`).
# NOTE(review): assumes the bare digit string is the format the rest of the
# pipeline expects for this column — confirm against the consumers of the CSV.
df = df.rename(columns={"geoid": GEOID_FIELD_NAME})
df[GEOID_FIELD_NAME] = df[GEOID_FIELD_NAME].str.replace(
    r"^.*?US", "", regex=True
)

In [10]:
# Calculate housing burden
# Several steps are needed because the figure does not appear to be available
# nationally in a simpler, ready-made format.

# Estimate numbers of the Table 8 lines that make up the owner-occupied
# numerator. Every line listed here has:
#   Line_Type = Subtotal, Tenure = Owner occupied, Facilities = All
# Per-line key: Household income | Cost burden
_OWNER_OCCUPIED_NUMERATOR_EST_NUMBERS = (
    # less than or equal to 30% of HAMFI | greater than 30% but less than or equal to 50%
    7,
    # less than or equal to 30% of HAMFI | greater than 50%
    10,
    # greater than 30% but less than or equal to 50% of HAMFI | greater than 30% but less than or equal to 50%
    20,
    # greater than 30% but less than or equal to 50% of HAMFI | greater than 50%
    23,
    # greater than 50% but less than or equal to 80% of HAMFI | greater than 30% but less than or equal to 50%
    33,
    # greater than 50% but less than or equal to 80% of HAMFI | greater than 50%
    36,
)

# Owner occupied numerator fields: the column names are "T8_est" followed by
# the estimate number.
OWNER_OCCUPIED_NUMERATOR_FIELDS = [
    f"T8_est{est_number}"
    for est_number in _OWNER_OCCUPIED_NUMERATOR_EST_NUMBERS
]

In [None]:
# Write the frame to `usa.csv` inside OUTPUT_PATH, creating the directory
# (and any missing parents) first; the DataFrame index is not written.
output_csv_path = OUTPUT_PATH / "usa.csv"
output_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv_path, index=False)

In [None]:
# cleanup
# Passes the whole temp directory (TMP_PATH, not only its `hud_housing`
# subdirectory) to the project helper `remove_all_from_dir`.
# NOTE(review): presumably this deletes everything under TMP_PATH, which would
# include files left there by other notebooks — confirm against the helper.
remove_all_from_dir(TMP_PATH)