In [23]:
from pathlib import Path
import numpy as np
import pandas as pd
import csv
import sys
import os

module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

from etl.sources.census.etl_utils import get_state_fips_codes
from utils import unzip_file_from_url, remove_all_from_dir

DATA_PATH = Path.cwd().parent / "data"
TMP_PATH = DATA_PATH / "tmp"
CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip"
CSV_PATH = DATA_PATH / "dataset" / "calenviroscreen4"

# Definining some variable names
CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
CALENVIROSCREEN_PERCENTILE_FIELD_NAME = "calenviroscreen_percentile"
CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = "calenviroscreen_priority_community"
GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"

# Choosing constants.
# None of these numbers are final, but just for the purposes of comparison.
CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD = 75

print(DATA_PATH)

/Users/lucas/Documents/usds/repos/justice40-tool/score/data


In [24]:
# download file from ejscreen ftp
unzip_file_from_url(CALENVIROSCREEN_FTP_URL, TMP_PATH, TMP_PATH)

2021-06-29 16:11:23,918 [utils       ] INFO     Downloading https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip
2021-06-29 16:11:25,063 [utils       ] INFO     Extracting /Users/lucas/Documents/usds/repos/justice40-tool/score/data/tmp/downloaded.zip


In [25]:
# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
calenviroscreen_4_csv_name = "CalEnviroScreen_4.0_2021.csv"
calenviroscreen_data_path = TMP_PATH.joinpath(calenviroscreen_4_csv_name)

# Load comparison index (CalEnviroScreen 4)
calenviroscreen_df = pd.read_csv(
    calenviroscreen_data_path, dtype={"Census Tract": "string"}
)

calenviroscreen_df.rename(
    columns={
        "Census Tract": GEOID_TRACT_FIELD_NAME,
        "DRAFT CES 4.0 Score": CALENVIROSCREEN_SCORE_FIELD_NAME,
        "DRAFT CES 4.0 Percentile": CALENVIROSCREEN_PERCENTILE_FIELD_NAME,
    },
    inplace=True,
)


# Calculate the top K% of prioritized communities
calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME] = (
    calenviroscreen_df[CALENVIROSCREEN_PERCENTILE_FIELD_NAME]
    >= CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD
)

calenviroscreen_df.head()

Unnamed: 0,GEOID10_TRACT,Total Population,California County,ZIP,Nearby City \r\n(to help approximate location only),Longitude,Latitude,calenviroscreen_score,calenviroscreen_percentile,DRAFT CES 4.0\r\nPercentile Range,...,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,calenviroscreen_priority_community
0,6019001100,2760,Fresno,93706,Fresno,-119.781696,36.709695,94.61,100.0,95-100% (highest scores),...,76.6,98.43,16.2,97.15,30.7,90.61,93.73,9.72,99.87,True
1,6077000700,4177,San Joaquin,95206,Stockton,-121.287873,37.943173,90.83,99.99,95-100% (highest scores),...,70.6,96.43,18.5,98.45,35.2,95.61,93.4,9.68,99.84,True
2,6077000100,4055,San Joaquin,95202,Stockton,-121.285363,37.953996,85.75,99.97,95-100% (highest scores),...,81.8,99.5,17.9,98.17,36.4,96.51,95.71,9.92,99.97,True
3,6071001600,5527,San Bernardino,91761,Ontario,-117.618013,34.05778,83.56,99.96,95-100% (highest scores),...,67.1,94.82,6.7,57.2,32.1,92.65,80.59,8.36,93.06,True
4,6037204920,2639,Los Angeles,90023,Los Angeles,-118.197497,34.0175,82.9,99.95,95-100% (highest scores),...,64.9,93.51,5.6,43.81,25.0,77.95,83.95,8.7,95.78,True


In [26]:
# write csv
CSV_PATH.mkdir(parents=True, exist_ok=True)

# Matching other conventions in the ETL scripts, write only for the state (FIPS code 06).
calenviroscreen_df.to_csv(CSV_PATH / "data06.csv", index=False)

In [None]:
# cleanup
remove_all_from_dir(TMP_PATH)