In [None]:
# Before running this script as it currently stands, you'll need to run these notebooks (in any order):
# * score_calc.ipynb
# * calenviroscreen_etl.ipynb
# * hud_recap_etl.ipynb

import functools
import numpy as np
import pandas as pd
from pathlib import Path
import requests
import typing
import zipfile
from datetime import datetime
from tqdm.notebook import tqdm_notebook

# Turn on TQDM for pandas so that we can have progress bars when running `apply`.
tqdm_notebook.pandas()

In [None]:
# Render floats with two fixed decimals rather than scientific notation
# (scientific notation otherwise shows up for census tract IDs).
pd.set_option("display.float_format", "{:.2f}".format)

# Global parameters: data locations (relative to the notebook's parent directory)
# and the percentile threshold for CEJST priority communities.
DATA_DIR = Path.cwd().parent / "data"
TEMP_DATA_DIR = DATA_DIR / "tmp"
CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75

# Field names are kept in variables so that frequently used columns are referenced
# by identifier instead of by retyping (and possibly misspelling) a string.

# Geographic identifiers and population.
GEOID_FIELD_NAME = "GEOID10"
GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
CENSUS_BLOCK_GROUP_POPULATION_FIELD = "census_block_group_population"

# CalEnviroScreen fields.
CALENVIROSCREEN_SCORE_FIELD = "calenviroscreen_score"
CALENVIROSCREEN_PERCENTILE_FIELD = "calenviroscreen_percentile"
CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = "calenviroscreen_priority_community"

# CEJST fields.
CEJST_SCORE_FIELD = "cejst_score"
CEJST_PERCENTILE_FIELD = "cejst_percentile"
CEJST_PRIORITY_COMMUNITY_FIELD = "cejst_priority_community"

In [None]:
# Load CEJST score data, reading the census block group (CBG) ID as a string.
cejst_data_path = DATA_DIR.joinpath("score", "csv", "usa.csv")
cejst_df = pd.read_csv(
    filepath_or_buffer=cejst_data_path,
    dtype={GEOID_FIELD_NAME: "string"},
)

# NOTE: no columns are renamed here (an earlier rename step is disabled); later cells
# refer to the raw score column names such as "Score A (top 25th percentile)".

# Derive each CBG's census tract ID by dropping the last character of the CBG's FIPS code;
# that final character is the part that identifies the CBG within its tract.
# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.
cejst_df.loc[:, GEOID_TRACT_FIELD_NAME] = (
    cejst_df[GEOID_FIELD_NAME].astype(str).str.slice(stop=-1)
)

cejst_df.head()

In [None]:
# Load CalEnviroScreen 4.0, reading the census tract ID as a string.
# The CALENVIROSCREEN_* field-name constants are declared once, in the configuration cell
# near the top of the notebook; they are deliberately not re-declared here so that the
# two copies cannot drift apart.
calenviroscreen_data_path = DATA_DIR / "dataset" / "calenviroscreen4" / "data06.csv"
calenviroscreen_df = pd.read_csv(
    calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: "string"}
)

calenviroscreen_df.head()

In [None]:
# Load the HUD RECAP data, reading the census tract ID as a string.
hud_recap_data_path = DATA_DIR.joinpath("dataset", "hud_recap", "usa.csv")
hud_recap_df = pd.read_csv(
    filepath_or_buffer=hud_recap_data_path,
    dtype={GEOID_TRACT_FIELD_NAME: "string"},
)

hud_recap_df.head()

In [None]:
# Join all dataframes that use tracts.
# Each frame is outer-joined on the tract ID so that a tract present in only one
# dataset is still kept.
census_tract_dfs = [calenviroscreen_df, hud_recap_df]

census_tract_df = functools.reduce(
    lambda left, right: pd.merge(
        left=left, right=right, on=GEOID_TRACT_FIELD_NAME, how="outer"
    ),
    census_tract_dfs,
)

# Sanity check: every tract ID must be exactly 11 characters long.
# The comparison is done element-wise and reduced with `.all()`; comparing the array of
# unique lengths against `[11]` directly would make the `if` raise NumPy's "truth value of
# an array is ambiguous" error (instead of the message below) whenever more than one
# distinct length is present. Missing IDs are mapped to -1 so they also fail the check.
census_tract_id_lengths = census_tract_df[GEOID_TRACT_FIELD_NAME].str.len()
if not census_tract_id_lengths.fillna(-1).eq(11).all():
    raise ValueError("Some of the census tract data has the wrong length.")

# Sanity check: the join must not produce more rows than the expected upper bound.
# NOTE(review): 74134 is presumably the total number of census tracts in the source data - confirm.
if len(census_tract_df) > 74134:
    raise ValueError("Too many rows in the join.")

census_tract_df.head()

In [None]:
# Attach the tract-level indices to the CEJST (census block group level) data.
# Because the join key is the census *tract*, every CBG in a tract is matched to the same
# tract row, so the tract-level (e.g. CES) values are repeated on each of those CBG rows.
merged_df = pd.merge(
    left=cejst_df,
    right=census_tract_df,
    on=GEOID_TRACT_FIELD_NAME,
    how="left",
)

# Sanity check: the merged frame must not exceed the expected upper bound on rows.
if merged_df.shape[0] > 220333:
    raise ValueError("Too many rows in the join.")

# Optional debugging export (disabled):
# merged_df.to_csv(
#     path_or_buf=TEMP_DATA_DIR / "merged.csv",
#     na_rep="",
#     index=False
# )

merged_df.head()

In [None]:
# CEJST priority-community columns, one per score variant (A through E).
cejst_priority_communities_fields = [
    f"Score {score_letter} (top 25th percentile)" for score_letter in "ABCDE"
]

# Priority-community columns of the tract-level datasets used for comparison.
comparison_priority_communities_fields = list(
    (
        "calenviroscreen_priority_community",
        "hud_recap_priority_community",
    )
)

In [None]:
# TODO: parameterize these
# Human-readable names of the two methods being compared. They are interpolated into the
# comparison column names defined below.
method_a_name = "Score A"
method_b_name = "CalEnviroScreen 4.0"
# NOTE(review): the trailing commas make the next two values 1-tuples, not plain strings.
# The summary-statistics cell indexes with them via `.loc[:, ...]` and calls `.squeeze()`
# on the resulting sums; presumably the tuple form is why those sums are not plain scalars.
# `create_comparison_df` is called with string literals rather than with these variables.
# Converting these to strings should be done together with that summary cell - confirm first.
method_a_priority_census_block_groups_field = ("Score A (top 25th percentile)",)
method_b_priority_census_tracts_field = ("calenviroscreen_priority_community",)

# Comparison field names
# Each value below is the name of a column written by `create_comparison_df`.
# Whether a tract (priority under method B or not) contains at least one method A priority CBG.
any_tract_has_at_least_one_method_a_cbg = (
    f"Any tract has at least one {method_a_name} Priority CBG?"
)
# For method B priority tracts only: contains at least one method A priority CBG.
method_b_tract_has_at_least_one_method_a_cbg = (
    f"{method_b_name} priority tract has at least one {method_a_name} CBG?"
)
# For method B priority tracts only: all of its CBGs are method A priority CBGs.
method_b_tract_has_100_percent_method_a_cbg = (
    f"{method_b_name} tract has 100% {method_a_name} priority CBGs?"
)
# For tracts that are not method B priority tracts: contains at least one method A priority CBG.
method_b_non_priority_tract_has_at_least_one_method_a_cbg = (
    f"Non-priority {method_b_name} tract has at least one {method_a_name} priority CBG?"
)
# For tracts that are not method B priority tracts: all of its CBGs are method A priority CBGs.
method_b_non_priority_tract_has_100_percent_method_a_cbg = (
    f"Non-priority {method_b_name} tract has 100% {method_a_name} priority CBGs?"
)


def create_comparison_df(
    df: pd.DataFrame,
    method_a_priority_census_block_groups_field: str,
    method_b_priority_census_tracts_field: str,
    other_census_tract_fields_to_keep: typing.Optional[typing.List[str]] = None,
) -> pd.DataFrame:
    """Compares a CBG-level priority flag with a tract-level priority flag, tract by tract.

    The rows of `df` (one per census block group) are grouped by `GEOID_TRACT_FIELD_NAME`,
    and one output row is computed per census tract. The result is also written as a CSV
    file into `TEMP_DATA_DIR` (created if it does not exist) and its first rows are printed.

    Besides its arguments, this function reads these module-level names:
    `GEOID_TRACT_FIELD_NAME`, `TEMP_DATA_DIR`, and the comparison column names
    (`any_tract_has_at_least_one_method_a_cbg` and the four `method_b_*` names).

    Args:
      df: a pandas dataframe including the data for this comparison, one row per CBG.
      method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority
        community field that defines communities at the level of census block groups (CBGs).
      method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority
        community field that defines communities at the level of census tracts.
      other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level.
        `None` (the default) means no additional fields are kept.

    Returns:
      A pandas dataframe with one row per census tract. Each row holds the tract ID, the method B
      priority flag, the requested extra tract-level fields, and the five comparison columns.
      A comparison column that does not apply to a tract (for example a "non-priority" column
      for a priority tract) is `None`.

    Raises:
      ValueError: if a field in `other_census_tract_fields_to_keep` has more than one distinct
        value among the CBGs of a single tract.
    """
    # Treat a missing list as "no extra fields"; copy it so the caller's list is never touched.
    tract_fields_to_keep = list(other_census_tract_fields_to_keep or [])

    def calculate_comparison(frame: pd.DataFrame) -> pd.Series:
        """
        This method will be applied to a `group_by` object; `frame` holds the CBG rows of one tract.

        Note: It inherits from outer scope `method_a_priority_census_block_groups_field`,
        `method_b_priority_census_tracts_field`, and `tract_fields_to_keep`.
        """
        # Tract-level fields must be constant across all CBGs of the tract.
        for field in tract_fields_to_keep:
            if len(frame[field].unique()) != 1:
                raise ValueError(
                    f"There are different values per CBG for field {field}. "
                    "`other_census_tract_fields_to_keep` can only be used for fields at the census tract level."
                )

        # Keep all the tract values at the census tract level by copying them from the first CBG row.
        tract_columns = [
            GEOID_TRACT_FIELD_NAME,
            method_b_priority_census_tracts_field,
        ] + tract_fields_to_keep
        result = frame[tract_columns].iloc[0].copy()

        # Whether the tract is a method B priority community.
        # Recall that NaN values are not falsy, so a plain truthiness test is not enough. An identity
        # test against `True` is not enough either: values read from a bool-dtype column are
        # `numpy.bool_`, which is never identical to Python's `True`.
        method_b_value = frame[method_b_priority_census_tracts_field].iloc[0]
        is_a_method_b_priority_tract = isinstance(
            method_b_value, (bool, np.bool_)
        ) and bool(method_b_value)

        # Summaries of the method A flag over the tract's CBGs, computed once and reused below.
        method_a_flags = frame[method_a_priority_census_block_groups_field]
        has_at_least_one_method_a_cbg = method_a_flags.sum() > 0
        has_100_percent_method_a_cbg = method_a_flags.mean() == 1

        # Whether the tract (priority under method B or not) includes at least one method A priority CBG.
        result[any_tract_has_at_least_one_method_a_cbg] = has_at_least_one_method_a_cbg

        # Calculate comparison
        # A method B priority tract has at least one CBG that is a method A priority CBG.
        result[method_b_tract_has_at_least_one_method_a_cbg] = (
            has_at_least_one_method_a_cbg if is_a_method_b_priority_tract else None
        )

        # A method B priority tract has all of its contained CBGs as method A priority CBGs.
        result[method_b_tract_has_100_percent_method_a_cbg] = (
            has_100_percent_method_a_cbg if is_a_method_b_priority_tract else None
        )

        # Calculate the inverse
        # A tract that is _not_ a method B priority tract has at least one method A priority CBG.
        result[method_b_non_priority_tract_has_at_least_one_method_a_cbg] = (
            has_at_least_one_method_a_cbg if not is_a_method_b_priority_tract else None
        )

        # A tract that is _not_ a method B priority tract has all of its CBGs as method A priority CBGs.
        result[method_b_non_priority_tract_has_100_percent_method_a_cbg] = (
            has_100_percent_method_a_cbg if not is_a_method_b_priority_tract else None
        )

        return result

    # Group all data by the census tract.
    grouped_df = df.groupby(GEOID_TRACT_FIELD_NAME)

    # Run the comparison function on the groups (with a progress bar).
    comparison_df = grouped_df.progress_apply(calculate_comparison)

    print(comparison_df.head())

    # Write comparison to CSV, making sure the output directory exists first.
    TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)
    comparison_df.to_csv(
        path_or_buf=TEMP_DATA_DIR
        / f"Comparison Output - {method_a_priority_census_block_groups_field} and {method_b_priority_census_tracts_field}.csv",
        na_rep="",
        index=False,
    )

    return comparison_df


# California only: keep the rows whose census tract ID starts with "06".
is_california_row = (
    merged_df[GEOID_TRACT_FIELD_NAME].astype(str).str.startswith("06")
)
cal_df = merged_df[is_california_row]
# cal_df = cal_df[0:1000]
print(len(cal_df))

# Compare CEJST "Score A" priority CBGs against CalEnviroScreen priority tracts,
# carrying the CalEnviroScreen score and percentile along at the tract level.
comparison_df = create_comparison_df(
    df=cal_df,
    other_census_tract_fields_to_keep=[
        CALENVIROSCREEN_SCORE_FIELD,
        CALENVIROSCREEN_PERCENTILE_FIELD,
    ],
    method_a_priority_census_block_groups_field="Score A (top 25th percentile)",
    method_b_priority_census_tracts_field="calenviroscreen_priority_community",
)

In [None]:
# TODO: fix
original_df = cal_df

# Values computed here are referenced by name in the Markdown cell that follows.
total_cbgs = original_df.shape[0]
total_tracts_count = comparison_df.shape[0]

# Note: `squeeze` is used below to reduce the result of `sum()` to a scalar.
# TODO: investigate why sums are sometimes series and sometimes scalar.

# Method A: priority CBGs, as a count and as a share of all CBGs.
method_a_priority_cbgs = (
    original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()
)
method_a_priority_cbgs_percent = "{:.0%}".format(method_a_priority_cbgs / total_cbgs)

# Method B: priority tracts, as a count and as a share of all tracts, plus the remainder.
method_b_priority_tracts_count = (
    comparison_df.loc[:, method_b_priority_census_tracts_field].sum().squeeze()
)
method_b_priority_tracts_count_percent = "{:.0%}".format(
    method_b_priority_tracts_count / total_tracts_count
)
method_b_non_priority_tracts_count = total_tracts_count - method_b_priority_tracts_count

# Tracts containing at least one method A priority CBG.
method_a_tracts_count = (
    comparison_df[any_tract_has_at_least_one_method_a_cbg].sum().squeeze()
)
method_a_tracts_count_percent = "{:.0%}".format(
    method_a_tracts_count / total_tracts_count
)

# Method B priority tracts: overlap with method A priority CBGs.
method_b_tracts_with_at_least_one_method_a_cbg = comparison_df[
    method_b_tract_has_at_least_one_method_a_cbg
].sum()
method_b_tracts_with_at_least_one_method_a_cbg_percent = "{:.0%}".format(
    method_b_tracts_with_at_least_one_method_a_cbg / method_b_priority_tracts_count
)

method_b_tracts_with_at_100_percent_method_a_cbg = comparison_df[
    method_b_tract_has_100_percent_method_a_cbg
].sum()
method_b_tracts_with_at_100_percent_method_a_cbg_percent = "{:.0%}".format(
    method_b_tracts_with_at_100_percent_method_a_cbg / method_b_priority_tracts_count
)

# Method B non-priority tracts: overlap with method A priority CBGs.
method_b_non_priority_tracts_with_at_least_one_method_a_cbg = comparison_df[
    method_b_non_priority_tract_has_at_least_one_method_a_cbg
].sum()
method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent = "{:.0%}".format(
    method_b_non_priority_tracts_with_at_least_one_method_a_cbg
    / method_b_non_priority_tracts_count
)

method_b_non_priority_tracts_with_100_percent_method_a_cbg = comparison_df[
    method_b_non_priority_tract_has_100_percent_method_a_cbg
].sum()
method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent = "{:.0%}".format(
    method_b_non_priority_tracts_with_100_percent_method_a_cbg
    / method_b_non_priority_tracts_count
)

# {{method_a_name}} compared to {{method_b_name}}

(Calculated on {{datetime.today().strftime('%Y-%m-%d')}})

Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.

Within the geographic area analyzed, there are {{method_b_priority_tracts_count}} census tracts designated as priority communities by {{method_b_name}}, out of {{total_tracts_count}} total tracts ({{method_b_priority_tracts_count_percent}}). 

Within the geographic area analyzed, there are {{method_a_priority_cbgs}} census block groups considered priority communities by {{method_a_name}}, out of {{total_cbgs}} CBGs ({{method_a_priority_cbgs_percent}}). They fall within {{method_a_tracts_count}} census tracts ({{method_a_tracts_count_percent}} of the tracts in the geographic area analyzed).

Of all {{method_b_name}} priority census tracts, {{method_b_tracts_with_at_least_one_method_a_cbg}} ({{method_b_tracts_with_at_least_one_method_a_cbg_percent}}) have at least one census block group within them that is considered a priority community by {{method_a_name}}.

Of all {{method_b_name}} priority census tracts, {{method_b_tracts_with_at_100_percent_method_a_cbg}} ({{method_b_tracts_with_at_100_percent_method_a_cbg_percent}}) have 100% of their census block groups considered priority communities by {{method_a_name}}.

Of all census tracts that are __not__ marked as priority communities by {{method_b_name}}, {{method_b_non_priority_tracts_with_at_least_one_method_a_cbg}} ({{method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent}}) have at least one census block group within them that is considered a priority community by {{method_a_name}}.

Of all census tracts that are __not__ marked as priority communities by {{method_b_name}}, {{method_b_non_priority_tracts_with_100_percent_method_a_cbg}} ({{method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent}}) have 100% of their census block groups considered priority communities by {{method_a_name}}.