In [1]:
# Before running this script as it currently stands, you'll need to run these notebooks (in any order):
# * score_calc.ipynb
# * calenviroscreen_etl.ipynb
# * hud_recap_etl.ipynb

import collections
import functools
import IPython
import numpy as np
import pandas as pd
import pathlib
import pypandoc
import requests
import typing
import us
import zipfile

from datetime import datetime
from tqdm.notebook import tqdm_notebook

# Turn on TQDM for pandas so that we can have progress bars when running `apply`.
tqdm_notebook.pandas()

In [2]:
# Suppress scientific notation in pandas (this shows up for census tract IDs)
pd.options.display.float_format = "{:.2f}".format

# Set some global parameters
DATA_DIR = pathlib.Path.cwd().parent / "data"
TEMP_DATA_DIR = pathlib.Path.cwd().parent / "data" / "tmp"
COMPARISON_OUTPUTS_DIR = TEMP_DATA_DIR / "comparison_outputs"

# Make the dirs if they don't exist
TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)
COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75

# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings
# and introducing the risk of misspelling the field name.)

GEOID_FIELD_NAME = "GEOID10"
GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
GEOID_STATE_FIELD_NAME = "GEOID10_STATE"
CENSUS_BLOCK_GROUP_POPULATION_FIELD = "Total population"

CALENVIROSCREEN_SCORE_FIELD = "calenviroscreen_score"
CALENVIROSCREEN_PERCENTILE_FIELD = "calenviroscreen_percentile"
CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = "calenviroscreen_priority_community"

CEJST_SCORE_FIELD = "cejst_score"
CEJST_PERCENTILE_FIELD = "cejst_percentile"
CEJST_PRIORITY_COMMUNITY_FIELD = "cejst_priority_community"

In [3]:
# Load CEJST score data
cejst_data_path = DATA_DIR / "score" / "csv" / "usa.csv"
cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: "string"})

# score_used = "Score A"

# # Rename unclear name "id" to "census_block_group_id", as well as other renamings.
# cejst_df.rename(
#     columns={
#         "Total population": CENSUS_BLOCK_GROUP_POPULATION_FIELD,
#         score_used: CEJST_SCORE_FIELD,
#         f"{score_used} (percentile)": CEJST_PERCENTILE_FIELD,
#     },
#     inplace=True,
#     errors="raise",
# )

# Create the CBG's Census Tract ID by dropping the last number from the FIPS CODE of the CBG.
# The CBG ID is the last one character.
# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.
cejst_df.loc[:, GEOID_TRACT_FIELD_NAME] = (
    cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[:-1]
)

cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (
    cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[0:2]
)

cejst_df.head()

Unnamed: 0,GEOID10,Housing burden (percent),Total population,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter,Particulate matter (PM2.5),Ozone,Traffic proximity and volume,Proximity to RMP sites,...,Score B (percentile),Score B (top 25th percentile),Score C (percentile),Score C (top 25th percentile),Score D (percentile),Score D (top 25th percentile),Score E (percentile),Score E (top 25th percentile),GEOID10_TRACT,GEOID10_STATE
0,10010201001,0.15,636,49.38,0.79,0.28,9.69,36.76,91.02,0.09,...,0.74,False,0.59,False,0.55,False,0.42,False,1001020100,1
1,10010201002,0.15,1287,49.38,0.79,0.28,9.69,36.76,2.62,0.07,...,0.25,False,0.24,False,0.15,False,0.11,False,1001020100,1
2,10010202001,0.25,810,50.32,0.81,0.3,9.71,36.84,4.68,0.08,...,0.74,False,0.48,False,0.62,False,0.45,False,1001020200,1
3,10010202002,0.25,1218,50.32,0.81,0.3,9.71,36.84,218.65,0.09,...,0.74,False,0.59,False,0.64,False,0.65,False,1001020200,1
4,10010203001,0.21,2641,50.77,0.82,0.36,9.75,36.88,69.64,0.08,...,0.59,False,0.41,False,0.45,False,0.42,False,1001020300,1


In [4]:
# Load CalEnviroScreen 4.0
CALENVIROSCREEN_SCORE_FIELD = "calenviroscreen_score"
CALENVIROSCREEN_PERCENTILE_FIELD = "calenviroscreen_percentile"
CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = "calenviroscreen_priority_community"

calenviroscreen_data_path = DATA_DIR / "dataset" / "calenviroscreen4" / "data06.csv"
calenviroscreen_df = pd.read_csv(
    calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: "string"}
)

calenviroscreen_df.head()

Unnamed: 0,GEOID10_TRACT,Total Population,California County,ZIP,Nearby City \r\n(to help approximate location only),Longitude,Latitude,calenviroscreen_score,calenviroscreen_percentile,DRAFT CES 4.0\r\nPercentile Range,...,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,calenviroscreen_priority_community
0,6019001100,2760,Fresno,93706,Fresno,-119.78,36.71,94.61,100.0,95-100% (highest scores),...,76.6,98.43,16.2,97.15,30.7,90.61,93.73,9.72,99.87,True
1,6077000700,4177,San Joaquin,95206,Stockton,-121.29,37.94,90.83,99.99,95-100% (highest scores),...,70.6,96.43,18.5,98.45,35.2,95.61,93.4,9.68,99.84,True
2,6077000100,4055,San Joaquin,95202,Stockton,-121.29,37.95,85.75,99.97,95-100% (highest scores),...,81.8,99.5,17.9,98.17,36.4,96.51,95.71,9.92,99.97,True
3,6071001600,5527,San Bernardino,91761,Ontario,-117.62,34.06,83.56,99.96,95-100% (highest scores),...,67.1,94.82,6.7,57.2,32.1,92.65,80.59,8.36,93.06,True
4,6037204920,2639,Los Angeles,90023,Los Angeles,-118.2,34.02,82.9,99.95,95-100% (highest scores),...,64.9,93.51,5.6,43.81,25.0,77.95,83.95,8.7,95.78,True


In [5]:
# Load HUD data
hud_recap_data_path = DATA_DIR / "dataset" / "hud_recap" / "usa.csv"
hud_recap_df = pd.read_csv(
    hud_recap_data_path, dtype={GEOID_TRACT_FIELD_NAME: "string"}
)

hud_recap_df.head()

Unnamed: 0,GEOID10_TRACT,hud_recap_priority_community
0,1001020100,False
1,1001020200,False
2,1001020300,False
3,1001020400,False
4,1001020500,False


In [6]:
# Join all dataframes that use tracts
census_tract_dfs = [calenviroscreen_df, hud_recap_df]

census_tract_df = functools.reduce(
    lambda left, right: pd.merge(
        left=left, right=right, on=GEOID_TRACT_FIELD_NAME, how="outer"
    ),
    census_tract_dfs,
)

if census_tract_df[GEOID_TRACT_FIELD_NAME].str.len().unique() != [11]:
    raise ValueError("Some of the census tract data has the wrong length.")

if len(census_tract_df) > 74134:
    raise ValueError("Too many rows in the join.")

census_tract_df.head()

Unnamed: 0,GEOID10_TRACT,Total Population,California County,ZIP,Nearby City \r\n(to help approximate location only),Longitude,Latitude,calenviroscreen_score,calenviroscreen_percentile,DRAFT CES 4.0\r\nPercentile Range,...,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,calenviroscreen_priority_community,hud_recap_priority_community
0,6019001100,2760.0,Fresno,93706.0,Fresno,-119.78,36.71,94.61,100.0,95-100% (highest scores),...,98.43,16.2,97.15,30.7,90.61,93.73,9.72,99.87,True,True
1,6077000700,4177.0,San Joaquin,95206.0,Stockton,-121.29,37.94,90.83,99.99,95-100% (highest scores),...,96.43,18.5,98.45,35.2,95.61,93.4,9.68,99.84,True,True
2,6077000100,4055.0,San Joaquin,95202.0,Stockton,-121.29,37.95,85.75,99.97,95-100% (highest scores),...,99.5,17.9,98.17,36.4,96.51,95.71,9.92,99.97,True,True
3,6071001600,5527.0,San Bernardino,91761.0,Ontario,-117.62,34.06,83.56,99.96,95-100% (highest scores),...,94.82,6.7,57.2,32.1,92.65,80.59,8.36,93.06,True,True
4,6037204920,2639.0,Los Angeles,90023.0,Los Angeles,-118.2,34.02,82.9,99.95,95-100% (highest scores),...,93.51,5.6,43.81,25.0,77.95,83.95,8.7,95.78,True,False


In [7]:
# Join tract indices and CEJST data.
# Note: we're joining on the census *tract*, so there will be multiple CBG entries joined to the same census tract row from CES,
# creating multiple rows of the same CES data.
merged_df = cejst_df.merge(
    census_tract_df,
    how="left",
    on=GEOID_TRACT_FIELD_NAME,
)


if len(merged_df) > 220333:
    raise ValueError("Too many rows in the join.")

merged_df.head()

# merged_df.to_csv(
#     path_or_buf=COMPARISON_OUTPUTS_DIR / "merged.csv",
#     na_rep="",
#     index=False
# )

Unnamed: 0,GEOID10,Housing burden (percent),Total population,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter,Particulate matter (PM2.5),Ozone_x,Traffic proximity and volume,Proximity to RMP sites,...,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,calenviroscreen_priority_community,hud_recap_priority_community
0,10010201001,0.15,636,49.38,0.79,0.28,9.69,36.76,91.02,0.09,...,,,,,,,,,,False
1,10010201002,0.15,1287,49.38,0.79,0.28,9.69,36.76,2.62,0.07,...,,,,,,,,,,False
2,10010202001,0.25,810,50.32,0.81,0.3,9.71,36.84,4.68,0.08,...,,,,,,,,,,False
3,10010202002,0.25,1218,50.32,0.81,0.3,9.71,36.84,218.65,0.09,...,,,,,,,,,,False
4,10010203001,0.21,2641,50.77,0.82,0.36,9.75,36.88,69.64,0.08,...,,,,,,,,,,False


In [8]:
cejst_priority_communities_fields = [
    "Score A (top 25th percentile)",
    "Score B (top 25th percentile)",
    "Score C (top 25th percentile)",
    "Score D (top 25th percentile)",
    "Score E (top 25th percentile)",
]

comparison_priority_communities_fields = [
    "calenviroscreen_priority_community",
    "hud_recap_priority_community",
]

In [9]:
# TODO: redo this as part of comp tool method
state_analysis_dicts = []

# Refactor
for field in cejst_priority_communities_fields:
    print(f"Field is {field}")

    for state_id in merged_df[GEOID_STATE_FIELD_NAME].unique():
        state_df = merged_df[
            merged_df[GEOID_TRACT_FIELD_NAME].astype(str).str[0:2] == state_id
        ]

        total_cbgs_in_state = len(state_df)
        priority_cbgs_in_state = state_df.loc[:, field].sum().squeeze()
        total_population_in_state = (
            state_df.loc[:, CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum().squeeze()
        )
        total_population_in_priority_cbgs = (
            state_df.loc[state_df[field], CENSUS_BLOCK_GROUP_POPULATION_FIELD]
            .sum()
            .squeeze()
        )

        state_analysis_dicts.append(
            {
                "Priority community field": field,
                GEOID_STATE_FIELD_NAME: state_id,
                "State name": us.states.lookup(state_id).name,
                "Total CBGs in state": total_cbgs_in_state,
                f"Priority CBGs in state": priority_cbgs_in_state,
                f"Priority CBGs in state (percent)": priority_cbgs_in_state.astype(
                    float
                )
                / total_cbgs_in_state,
                "Total population in state": total_population_in_state,
                "Total population in priority CBGs": total_population_in_priority_cbgs,
                f"Total population in priority CBGs (percent)": total_population_in_priority_cbgs.astype(
                    float
                )
                / total_population_in_state,
            }
        )

state_analysis_df = pd.DataFrame(state_analysis_dicts)

state_analysis_df.to_csv(
    path_or_buf=COMPARISON_OUTPUTS_DIR / "Priority CBGs by state.csv",
    na_rep="",
    index=False,
)

state_analysis_df.head()

Field is Score A (top 25th percentile)
Field is Score B (top 25th percentile)
Field is Score C (top 25th percentile)
Field is Score D (top 25th percentile)
Field is Score E (top 25th percentile)


Unnamed: 0,Priority community field,GEOID10_STATE,State name,Total CBGs in state,Priority CBGs in state,Priority CBGs in state (percent),Total population in state,Total population in priority CBGs,Total population in priority CBGs (percent)
0,Score A (top 25th percentile),1,Alabama,3438,1335,0.39,4864680,1547217,0.32
1,Score A (top 25th percentile),2,Alaska,534,55,0.1,738516,61918,0.08
2,Score A (top 25th percentile),4,Arizona,4178,1234,0.3,6946685,1987208,0.29
3,Score A (top 25th percentile),5,Arkansas,2147,809,0.38,2990671,957601,0.32
4,Score A (top 25th percentile),6,California,23212,6987,0.3,39148760,12388634,0.32


In [15]:
# This cell defines a couple of comparison functions. It does not run them.

# Define a namedtuple for column names, which need to be shared between multiple parts of this comparison pipeline.
# Named tuples are useful here because they provide guarantees that for each instance, all properties are defined and
# can be accessed as properties (rather than as strings).

# Note: if you'd like to add a field used throughout the comparison process, add it in three places.
# For an example `new_field`,
# 1. in this namedtuple, add the field as a string in `field_names` (e.g., `field_names=[..., "new_field"])`)
# 2. in the function `get_comparison_field_names`, define how the field name should be created from input data
#     (e.g., `...new_field=f"New field compares {method_a_name} to {method_b_name}")
# 3. In the function `get_comparison_markdown_content`, add some reporting on the new field to the markdown content.
#     (e.g., `The statistics indicate that {calculation_based_on_new_field} percent of census tracts are different between scores.`)
ComparisonFieldNames = collections.namedtuple(
    typename="ComparisonFieldNames",
    field_names=[
        "any_tract_has_at_least_one_method_a_cbg",
        "method_b_tract_has_at_least_one_method_a_cbg",
        "method_b_tract_has_100_percent_method_a_cbg",
        "method_b_non_priority_tract_has_at_least_one_method_a_cbg",
        "method_b_non_priority_tract_has_100_percent_method_a_cbg",
    ],
)

# Define a namedtuple for indices.
Index = collections.namedtuple(
    typename="Index",
    field_names=[
        "method_name",
        "priority_communities_field",
        # Note: this field only used by indices defined at the census tract level.
        "other_census_tract_fields_to_keep",
    ],
)


def get_comparison_field_names(
    method_a_name: str,
    method_b_name: str,
) -> ComparisonFieldNames:
    comparison_field_names = ComparisonFieldNames(
        any_tract_has_at_least_one_method_a_cbg=(
            f"Any tract has at least one {method_a_name} Priority CBG?"
        ),
        method_b_tract_has_at_least_one_method_a_cbg=(
            f"{method_b_name} priority tract has at least one {method_a_name} CBG?"
        ),
        method_b_tract_has_100_percent_method_a_cbg=(
            f"{method_b_name} tract has 100% {method_a_name} priority CBGs?"
        ),
        method_b_non_priority_tract_has_at_least_one_method_a_cbg=(
            f"Non-priority {method_b_name} tract has at least one {method_a_name} priority CBG?"
        ),
        method_b_non_priority_tract_has_100_percent_method_a_cbg=(
            f"Non-priority {method_b_name} tract has 100% {method_a_name} priority CBGs?"
        ),
    )
    return comparison_field_names


def get_comparison_df(
    df: pd.DataFrame,
    method_a_priority_census_block_groups_field: str,
    method_b_priority_census_tracts_field: str,
    other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],
    comparison_field_names: ComparisonFieldNames,
    output_dir: pathlib.PosixPath,
) -> None:
    """Produces a comparison report for any two given boolean columns representing priority fields.

    Args:
      df: a pandas dataframe including the data for this comparison.
      method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority
        community field that defines communities at the level of census block groups (CBGs).
      method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority
        community field that defines communities at the level of census tracts.
      other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level

    Returns:
      df: a pandas dataframe with one row with the results of this comparison
    """

    def calculate_comparison(frame: pd.DataFrame) -> pd.DataFrame:
        """
        This method will be applied to a `group_by` object.

        Note: It inherits from outer scope `method_a_priority_census_block_groups_field`, `method_b_priority_census_tracts_field`,
        and `other_census_tract_fields_to_keep`.
        """
        # Keep all the tract values at the Census Tract Level
        for field in other_census_tract_fields_to_keep:
            if len(frame[field].unique()) != 1:
                raise ValueError(
                    f"There are different values per CBG for field {field}."
                    "`other_census_tract_fields_to_keep` can only be used for fields at the census tract level."
                )

        df = frame.loc[
            frame.index[0],
            [
                GEOID_TRACT_FIELD_NAME,
                method_b_priority_census_tracts_field,
            ]
            + other_census_tract_fields_to_keep,
        ]

        # Convenience constant for whether the tract is or is not a CalEnviroScreen priority community.
        is_a_method_b_priority_tract = frame.loc[
            frame.index[0], [method_b_priority_census_tracts_field]
        ][0]

        # Recall that NaN values are not falsy, so we need to check if `is_a_method_b_priority_tract` is True.
        is_a_method_b_priority_tract = is_a_method_b_priority_tract is True

        # Calculate whether the tract (whether or not it is a comparison priority tract) includes CBGs that are priority
        # according to the current CBG score.
        df[comparison_field_names.any_tract_has_at_least_one_method_a_cbg] = (
            frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0
        )

        # Calculate comparison
        # A comparison priority tract has at least one CBG that is a priority CBG.
        df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (
            frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0
            if is_a_method_b_priority_tract
            else None
        )

        # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.
        df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (
            frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1
            if is_a_method_b_priority_tract
            else None
        )

        # Calculate the inverse
        # A tract that is _not_ a comparison priority has at least one CBG priority CBG.
        df[
            comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg
        ] = (
            frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0
            if not is_a_method_b_priority_tract
            else None
        )

        # A tract that is _not_ a comparison priority has all of its contained CBGs as CBG priority CBGs.
        df[
            comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg
        ] = (
            frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1
            if not is_a_method_b_priority_tract
            else None
        )

        return df

    # Group all data by the census tract.
    grouped_df = df.groupby(GEOID_TRACT_FIELD_NAME)

    # Run the comparison function on the groups.
    comparison_df = grouped_df.progress_apply(calculate_comparison)

    return comparison_df


def get_comparison_markdown_content(
    original_df: pd.DataFrame,
    comparison_df: pd.DataFrame,
    comparison_field_names: ComparisonFieldNames,
    method_a_name: str,
    method_b_name: str,
    method_a_priority_census_block_groups_field: str,
    method_b_priority_census_tracts_field: str,
) -> str:
    # Prepare some constants for use in the following Markdown content.
    total_cbgs = len(original_df)

    # Note: using squeeze throughout do reduce result of `sum()` to a scalar.
    # TODO: investigate why sums are sometimes series and sometimes scalar.
    method_a_priority_cbgs = (
        original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()
    )
    method_a_priority_cbgs_percent = f"{method_a_priority_cbgs / total_cbgs:.0%}"

    total_tracts_count = len(comparison_df)

    print(f"here we are {method_b_priority_census_tracts_field}")

    method_b_priority_tracts_count = (
        comparison_df.loc[:, method_b_priority_census_tracts_field].sum().squeeze()
    )

    print("here we aren't")
    method_b_priority_tracts_count_percent = (
        f"{method_b_priority_tracts_count / total_tracts_count:.0%}"
    )
    method_b_non_priority_tracts_count = (
        total_tracts_count - method_b_priority_tracts_count
    )

    method_a_tracts_count = (
        comparison_df.loc[
            :, comparison_field_names.any_tract_has_at_least_one_method_a_cbg
        ]
        .sum()
        .squeeze()
    )
    method_a_tracts_count_percent = f"{method_a_tracts_count / total_tracts_count:.0%}"

    # Method A priority community stats
    method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[
        :, comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg
    ].sum()
    method_b_tracts_with_at_least_one_method_a_cbg_percent = f"{method_b_tracts_with_at_least_one_method_a_cbg / method_b_priority_tracts_count:.0%}"

    method_b_tracts_with_at_100_percent_method_a_cbg = comparison_df.loc[
        :, comparison_field_names.method_b_tract_has_100_percent_method_a_cbg
    ].sum()
    method_b_tracts_with_at_100_percent_method_a_cbg_percent = f"{method_b_tracts_with_at_100_percent_method_a_cbg / method_b_priority_tracts_count:.0%}"

    # Method A non-priority community stats
    method_b_non_priority_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[
        :,
        comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg,
    ].sum()

    method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent = f"{method_b_non_priority_tracts_with_at_least_one_method_a_cbg / method_b_non_priority_tracts_count:.0%}"

    method_b_non_priority_tracts_with_100_percent_method_a_cbg = comparison_df.loc[
        :,
        comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg,
    ].sum()
    method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent = f"{method_b_non_priority_tracts_with_100_percent_method_a_cbg / method_b_non_priority_tracts_count:.0%}"

    # Create markdown content for comparisons.
    markdown_content = f"""
# {method_a_name} compared to {method_b_name}

(This report was calculated on {datetime.today().strftime('%Y-%m-%d')}.)

Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.

Within the geographic area analyzed, there are {method_b_priority_tracts_count} census tracts designated as priority communities by {method_b_name}, out of {total_tracts_count} total tracts ({method_b_priority_tracts_count_percent}). 

Within the geographic region analyzed, there are {method_a_priority_cbgs} census block groups considered as priority communities by {method_a_name}, out of {total_cbgs} CBGs ({method_a_priority_cbgs_percent}). They occupy {method_a_tracts_count} census tracts ({method_a_tracts_count_percent}) of the geographic area analyzed.

Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_least_one_method_a_cbg} ({method_b_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by {method_a_name}.

Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_100_percent_method_a_cbg} ({method_b_tracts_with_at_100_percent_method_a_cbg_percent}) of these census tracts have 100% of the included census block groups within them considered priority communities by {method_a_name}.

Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_at_least_one_method_a_cbg} ({method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.

Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_100_percent_method_a_cbg} ({method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score.
"""

    return markdown_content


def write_markdown_and_docx_content(
    markdown_content: str, file_dir: pathlib.PosixPath, file_name_without_extension: str
) -> pathlib.PosixPath:
    """Write Markdown content to both .md and .docx files."""
    # Set the file paths for both files.
    markdown_file_path = file_dir / f"{file_name_without_extension}.md"
    docx_file_path = file_dir / f"{file_name_without_extension}.docx"

    # Write the markdown content to file.
    with open(markdown_file_path, "w") as text_file:
        text_file.write(markdown_content)

    # Convert markdown file to Word doc.
    pypandoc.convert_file(
        source_file=str(markdown_file_path),
        to="docx",
        outputfile=str(docx_file_path),
        extra_args=[],
    )

    return docx_file_path


def execute_comparison(
    df: pd.DataFrame,
    method_a_name: str,
    method_b_name: str,
    method_a_priority_census_block_groups_field: str,
    method_b_priority_census_tracts_field: str,
    other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],
) -> pathlib.PosixPath:
    """Execute an individual comparison by creating the data frame and writing the report.

    Args:
      df: a pandas dataframe including the data for this comparison.
      method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority
        community field that defines communities at the level of census block groups (CBGs).
      method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority
        community field that defines communities at the level of census tracts.
      other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level

    Returns:
      df: a pandas dataframe with one row with the results of this comparison

    """
    comparison_field_names = get_comparison_field_names(
        method_a_name=method_a_name, method_b_name=method_b_name
    )

    # Create or use a directory for outputs grouped by Method A.
    output_dir = COMPARISON_OUTPUTS_DIR / method_a_name
    output_dir.mkdir(parents=True, exist_ok=True)

    comparison_df = get_comparison_df(
        df=df,
        method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,
        method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,
        comparison_field_names=comparison_field_names,
        other_census_tract_fields_to_keep=other_census_tract_fields_to_keep,
        output_dir=output_dir,
    )

    # Choose output path, and create directories if they don't exist.
    file_path = (
        output_dir / f"Comparison Output - {method_a_name} and {method_b_name}.csv"
    )

    # Write comparison to CSV.
    comparison_df.to_csv(
        path_or_buf=file_path,
        na_rep="",
        index=False,
    )

    markdown_content = get_comparison_markdown_content(
        original_df=df,
        comparison_df=comparison_df,
        comparison_field_names=comparison_field_names,
        method_a_name=method_a_name,
        method_b_name=method_b_name,
        method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,
        method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,
    )

    comparison_docx_file_path = write_markdown_and_docx_content(
        markdown_content=markdown_content,
        # TODO: create a directory for each field A
        file_dir=output_dir,
        file_name_without_extension=f"Comparison report - {method_a_name} and {method_b_name}",
    )

    return comparison_docx_file_path


def execute_comparisons(
    df: pd.DataFrame,
    census_block_group_indices: typing.List[Index],
    census_tract_indices: typing.List[Index],
):
    """Create multiple comparison reports."""
    comparison_docx_file_paths = []
    for cbg_index in census_block_group_indices:
        for census_tract_index in census_tract_indices:
            print(
                f"Running comparisons for {cbg_index.method_name} against {census_tract_index.method_name}..."
            )

            comparison_docx_file_path = execute_comparison(
                df=df,
                method_a_name=cbg_index.method_name,
                method_b_name=census_tract_index.method_name,
                method_a_priority_census_block_groups_field=cbg_index.priority_communities_field,
                method_b_priority_census_tracts_field=census_tract_index.priority_communities_field,
                other_census_tract_fields_to_keep=census_tract_index.other_census_tract_fields_to_keep,
            )

            comparison_docx_file_paths.append(comparison_docx_file_path)

    return comparison_docx_file_paths


# California only
cal_df = merged_df[merged_df[GEOID_TRACT_FIELD_NAME].astype(str).str[0:2] == "06"]
# cal_df = cal_df[0:1000]
print(len(cal_df))

census_block_group_indices = [
    Index(
        method_name="Score A",
        priority_communities_field="Score A (top 25th percentile)",
        other_census_tract_fields_to_keep=[],
    )
]

census_tract_indices = [
    Index(
        method_name="CalEnviroScreen 4.0",
        priority_communities_field="calenviroscreen_priority_community",
        other_census_tract_fields_to_keep=[
            CALENVIROSCREEN_SCORE_FIELD,
            CALENVIROSCREEN_PERCENTILE_FIELD,
        ],
    ),
    Index(
        method_name="HUD RECAP",
        priority_communities_field="hud_recap_priority_community",
        other_census_tract_fields_to_keep=[],
    ),
]

file_paths = execute_comparisons(
    df=cal_df[1:100],
    census_block_group_indices=census_block_group_indices,
    census_tract_indices=census_tract_indices,
)

print(file_paths)

23212
Running comparisons for Score A against CalEnviroScreen 4.0...


  0%|          | 0/38 [00:00<?, ?it/s]

here we are calenviroscreen_priority_community
here we aren't
Running comparisons for Score A against HUD RECAP...


  0%|          | 0/38 [00:00<?, ?it/s]

here we are hud_recap_priority_community
here we aren't
[PosixPath('/Users/lucas/Documents/usds/repos/justice40-tool/score/data/tmp/comparison_outputs/Score A/Comparison report - Score A and CalEnviroScreen 4.0.docx'), PosixPath('/Users/lucas/Documents/usds/repos/justice40-tool/score/data/tmp/comparison_outputs/Score A/Comparison report - Score A and HUD RECAP.docx')]
