In [10]:
import collections
import functools
import IPython
import itertools
import numpy as np
import os
import pandas as pd
import pathlib
import pypandoc
import requests
import string
import sys
import typing
import us
import zipfile

from datetime import datetime
from tqdm.notebook import tqdm_notebook

# Put the directory two levels above the working directory on the import path
# (if it is not there already) so the project imports below can be resolved.
module_path = os.path.abspath("../..")
if module_path not in sys.path:
    sys.path.append(module_path)

from data_pipeline.utils import remove_all_from_dir, get_excel_column_name
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_information
from data_pipeline.etl.sources.ejscreen_areas_of_concern.etl import (
    EJSCREENAreasOfConcernETL,
)


from data_pipeline.score import field_names

# Hook tqdm into pandas so that long-running `apply`-style operations can show a
# notebook progress bar (tqdm exposes this through its `progress_apply` methods).
tqdm_notebook.pandas()

In [11]:
# Show floats with two fixed decimals instead of scientific notation
# (otherwise census tract IDs are rendered in scientific notation).
pd.set_option("display.float_format", "{:.2f}".format)

# Global locations: everything lives under the "data" directory that sits next
# to the current working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath("data")
TEMP_DATA_DIR = DATA_DIR.joinpath("tmp")
COMPARISON_OUTPUTS_DIR = DATA_DIR.joinpath("comparison_outputs")

# Create the working directories (and any missing parents) on first use.
for _required_dir in (TEMP_DATA_DIR, COMPARISON_OUTPUTS_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75

# Field names are kept in variables so that frequently used columns are never
# referenced through hand-typed (and possibly misspelled) string literals.
GEOID_STATE_FIELD_NAME = "GEOID10_STATE"
COUNTRY_FIELD_NAME = "Country"

# Suffixes appended to derived field names.
POPULATION_SUFFIX = " (priority population)"

In [13]:
# Load the CalEnviroScreen 4.0 tract-level data. (The frame keeps the name
# `cejst_df` because later cells refer to it by that name.)
cal_environ_4_dot_0_data_path = (
    DATA_DIR / "dataset" / "calenviroscreen4" / "data06.csv"
)
cejst_df = pd.read_csv(
    cal_environ_4_dot_0_data_path,
    # Read the tract IDs as text so that they are not parsed as numbers.
    dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: "string"},
)

# Create the state ID by taking the first two characters of the tract's FIPS code.
# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.
# The tract column is already "string" dtype, so it is sliced directly: casting
# it with `.astype(str)` first would turn a missing tract ID into the literal
# text "<NA>" and yield a bogus state ID of "<N" instead of a missing value.
cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = cejst_df.loc[
    :, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
].str[0:2]

# Preview the first rows of the loaded frame.
cejst_df.head()

Unnamed: 0,GEOID10_TRACT,Total Population,California County,ZIP,Nearby City \n(to help approximate location only),Longitude,Latitude,calenviroscreen_score,calenviroscreen_percentile,DRAFT CES 4.0\nPercentile Range,...,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,calenviroscreen_priority_community,GEOID10_STATE
0,6019001100,2760,Fresno,93706,Fresno,-119.78,36.71,94.61,100.0,95-100% (highest scores),...,98.43,16.2,97.15,30.7,90.61,93.73,9.72,99.87,True,6
1,6077000700,4177,San Joaquin,95206,Stockton,-121.29,37.94,90.83,99.99,95-100% (highest scores),...,96.43,18.5,98.45,35.2,95.61,93.4,9.68,99.84,True,6
2,6077000100,4055,San Joaquin,95202,Stockton,-121.29,37.95,85.75,99.97,95-100% (highest scores),...,99.5,17.9,98.17,36.4,96.51,95.71,9.92,99.97,True,6
3,6071001600,5527,San Bernardino,91761,Ontario,-117.62,34.06,83.56,99.96,95-100% (highest scores),...,94.82,6.7,57.2,32.1,92.65,80.59,8.36,93.06,True,6
4,6037204920,2639,Los Angeles,90023,Los Angeles,-118.2,34.02,82.9,99.95,95-100% (highest scores),...,93.51,5.6,43.81,25.0,77.95,83.95,8.7,95.78,True,6


In [14]:
# Check how many rows and columns the loaded frame has.
cejst_df.shape

(8035, 60)

In [8]:
# Location of the HUD housing-burden CSV inside the shared data directory.
hud_data_path = DATA_DIR.joinpath("dataset", "hud_housing", "housing_burden.csv")

In [15]:
# Load the HUD housing-burden table, reading the tract IDs as text rather than
# letting pandas parse them as numbers.
housing_burden = pd.read_csv(hud_data_path, dtype={"FIPS_tract_id": "string"})

In [27]:
# List the column names available in the HUD housing-burden table.
housing_burden.columns

Index(['state', 'FIPS_tract_id', 'hbrd_score', 'hbrd_rank'], dtype='object')

In [17]:
# Keep only the HUD rows whose tract ID also occurs in the CalEnviroScreen frame.
housing_burden = housing_burden.loc[
    housing_burden["FIPS_tract_id"].isin(cejst_df["GEOID10_TRACT"].unique())
]

In [18]:
# Check how many rows and columns the HUD housing-burden table has at this point.
housing_burden.shape

(8034, 4)

In [25]:
# Order the HUD rows from the highest housing-burden rank to the lowest.
housing_burden = housing_burden.sort_values("hbrd_rank", ascending=False)

In [26]:
# Narrow the CalEnviroScreen frame to the tract ID and its two housing-burden columns.
cejst_df = cejst_df.loc[
    :, ["GEOID10_TRACT", "Housing Burden", "Housing Burden Pctl"]
]

In [28]:
# Put the HUD and CalEnviroScreen housing-burden values side by side, matched on
# tract ID. No `how` is given, so pandas performs its default inner join and
# tracts present in only one of the two frames are left out of the result.
pd.merge(
    housing_burden,
    cejst_df,
    left_on="FIPS_tract_id",
    right_on="GEOID10_TRACT",
)

Unnamed: 0,state,FIPS_tract_id,hbrd_score,hbrd_rank,GEOID10_TRACT,Housing Burden,Housing Burden Pctl
0,6,06037575500,0.88,100.00,06037575500,,
1,6,06079010902,0.72,99.99,06079010902,64.60,99.97
2,6,06083002926,0.70,99.97,06083002926,66.10,99.99
3,6,06083002924,0.69,99.96,06083002924,63.90,99.95
4,6,06037980010,0.69,99.95,06037980010,,
...,...,...,...,...,...,...,...
8029,6,06111007507,,,06111007507,6.20,2.59
8030,6,06111007513,,,06111007513,7.40,4.65
8031,6,06111007609,,,06111007609,3.40,0.34
8032,6,06111007610,,,06111007610,9.60,11.03
