In [None]:
import pandas as pd
import censusdata
import csv
from pathlib import Path
import os
import sys

# Make the parent of the current working directory importable; the `etl` and
# `utils` imports that follow are presumably resolved from there.
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

from etl.sources.census.etl_utils import get_state_fips_codes
from utils import unzip_file_from_url, remove_all_from_dir

# NOTE(review): ACS_YEAR is not referenced by any cell visible in this notebook.
ACS_YEAR = 2019

# Directory layout, all relative to the parent of the current working directory.
DATA_PATH = Path.cwd().parent.joinpath("data")
TMP_PATH = DATA_PATH.joinpath("tmp")
OUTPUT_PATH = DATA_PATH.joinpath("dataset", "housing_and_transportation_index")

# Download endpoint; the download cell appends a state/territory FIPS code as
# the value of the trailing `geoid=` query parameter.
HOUSING_FTP_URL = "https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid="

# Column name given to the block group identifier in the output data.
GEOID_FIELD_NAME = "GEOID10"

In [None]:
# Download each state / territory individually, read the extracted CSV, and
# combine everything into a single DataFrame `df`.
dfs = []
zip_file_dir = TMP_PATH / "housing_and_transportation_index"
for fips in get_state_fips_codes(DATA_PATH):
    print(f"Downloading housing data for state/territory with FIPS code {fips}")
    unzip_file_from_url(f"{HOUSING_FTP_URL}{fips}", TMP_PATH, zip_file_dir)

    # File expected inside the extraction directory for this FIPS code:
    tmp_csv_file_path = zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"

    # Read the block group ID column ("blkgrp", renamed in a later cell) as
    # text. Letting pandas infer a numeric dtype would drop leading zeros and
    # break the `.str` clean-up applied to this column afterwards.
    tmp_df = pd.read_csv(
        filepath_or_buffer=tmp_csv_file_path,
        dtype={"blkgrp": str},
    )

    dfs.append(tmp_df)

# ignore_index=True gives the combined frame one continuous index instead of
# repeating each per-state 0..n index (duplicate labels).
df = pd.concat(dfs, ignore_index=True)

df.head()

In [None]:
# Rename the block group ID column, then strip the literal double-quote
# characters from its values.
df = df.rename(columns={"blkgrp": GEOID_FIELD_NAME})
geoid_without_quotes = df[GEOID_FIELD_NAME].str.replace('"', "", regex=False)
df[GEOID_FIELD_NAME] = geoid_without_quotes

In [None]:
# Create the dataset directory (and any missing parents) if needed, then write
# the combined frame to usa.csv without the DataFrame index column.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

output_csv_path = OUTPUT_PATH.joinpath("usa.csv")
df.to_csv(output_csv_path, index=False)

In [None]:
# Cleanup: clear the temporary directory the downloads were extracted under.
# NOTE(review): remove_all_from_dir is a project helper imported from utils;
# presumably it removes everything inside TMP_PATH, not only this notebook's
# files — confirm before running alongside other jobs that use TMP_PATH.
remove_all_from_dir(TMP_PATH)