In [1]:
import pandas as pd
import numpy as np
import pathlib

%load_ext lab_black

# Creating an aggregate burden index

Although we will not be using an aggregate burden index for v1.0 of the CEJST, the USDS team wanted to demonstrate how even duplicating CalEnviroScreen's cumulative index (or a loose interpretation of it) would impact the communities highlighted.

The data team believes that a threshold methodology has significant limitations that an aggregate or cumulative burden index could remediate, and presents the following as an example of such an index. 

In [2]:
# Directory holding the full score CSVs: ../data/score/csv/full, resolved
# relative to the current working directory.
SCORE_DIR = pathlib.Path.cwd().parent.joinpath("data", "score", "csv", "full")

In [3]:
# Load the full score file.
# GEOID10_TRACT is read as a string so tract IDs keep their leading zeros
# (a later cell looks up the tract "08031008387" by string equality).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type DtypeWarning.
# NOTE(review): the truncated warning left under this cell looks like that
# DtypeWarning -- confirm it disappears on a fresh run.
usa = pd.read_csv(
    SCORE_DIR / "usa.csv",
    dtype={"GEOID10_TRACT": str},
    low_memory=False,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
def _percentile_columns(*indicators):
    """Return the score-file column names for the given indicator labels.

    Every column used in the index is named "<indicator label> (percentile)",
    so only the label is spelled out at each call site.
    """
    return [f"{indicator} (percentile)" for indicator in indicators]


## environment
toxins_category = _percentile_columns(
    "Percent pre-1960s housing (lead paint indicator)",
    "Proximity to Risk Management Plan (RMP) facilities",
    "Proximity to NPL sites",
    "Proximity to hazardous waste sites",
    "Wastewater discharge",
)

## sensitive populations
health_category = _percentile_columns(
    "Diagnosed diabetes among adults aged greater than or equal to 18 years",
    "Current asthma among adults aged greater than or equal to 18 years",
    "Coronary heart disease among adults aged greater than or equal to 18 years",
    "Low life expectancy",
)

## exposure
built_environment_category = _percentile_columns(
    "Expected building loss rate (Natural Hazards Risk Index)",
    "Expected agricultural loss rate (Natural Hazards Risk Index)",
    "Expected population loss rate (Natural Hazards Risk Index)",
    "Energy burden",
    "Diesel particulate matter exposure",
    "Traffic proximity and volume",
    "PM2.5 in the air",
)

## socioeconomic
socioeconomic_category = _percentile_columns(
    "Unemployment (percent)",
    "Housing burden (percent)",
    "Low median household income as a percent of area median income",
    "Percent of households in linguistic isolation",
    "Percent of individuals below 200% Federal Poverty Line, imputed and adjusted",
    "Percent individuals age 25 or over with less than high school degree",
    "Percent of individuals < 100% Federal Poverty Line",
)

In [5]:
# Step 1: collapse each indicator group into a single per-row score by taking
# the row-wise mean of its percentile columns (pandas skips NaNs by default).
usa["toxins_cat"] = usa.loc[:, toxins_category].mean(axis="columns")
usa["built_env_cat"] = usa.loc[:, built_environment_category].mean(axis="columns")
usa["health_cat"] = usa.loc[:, health_category].mean(axis="columns")
usa["ses_cat"] = usa.loc[:, socioeconomic_category].mean(axis="columns")

# Step 2: combine the group scores into the two components of the index.
# The toxins group enters the pollution component at half weight.
usa["pollution_burden"] = usa["toxins_cat"].mul(0.5).add(usa["built_env_cat"])
usa["population_characteristics"] = usa["health_cat"].add(usa["ses_cat"])

# Step 3: rescale each component by its observed maximum so the largest value
# in each is 1.
poll_max = usa["pollution_burden"].max()
pop_max = usa["population_characteristics"].max()
usa["scaled_pollution_burden"] = usa["pollution_burden"].div(poll_max)
usa["scaled_population_characteristics"] = usa["population_characteristics"].div(
    pop_max
)

# Step 4: the score is the product of the two scaled components; its percentile
# rank across all rows is what the cutoffs below are applied to.
usa["cal_score"] = usa["scaled_pollution_burden"].mul(
    usa["scaled_population_characteristics"]
)
usa["pct_cal_score"] = usa["cal_score"].rank(method="average", pct=True)

In [6]:
# Show every indicator that feeds the index for one tract, transposed so the
# indicators read down the rows.
index_inputs = [
    *toxins_category,
    *health_category,
    *built_environment_category,
    *socioeconomic_category,
]
usa.loc[usa["GEOID10_TRACT"] == "08031008387", index_inputs].T

Unnamed: 0,11618
Percent pre-1960s housing (lead paint indicator) (percentile),0.019256
Proximity to Risk Management Plan (RMP) facilities (percentile),0.890407
Proximity to NPL sites (percentile),0.816638
Proximity to hazardous waste sites (percentile),0.492804
Wastewater discharge (percentile),0.317463
Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile),0.411229
Current asthma among adults aged greater than or equal to 18 years (percentile),0.838579
Coronary heart disease among adults aged greater than or equal to 18 years (percentile),0.121655
Low life expectancy (percentile),0.663824
Expected building loss rate (Natural Hazards Risk Index) (percentile),0.843358


In [7]:
# For each candidate percentile cutoff, flag rows whose index percentile is at
# or above it, then cross-tabulate that flag against
# "Definition N (communities)": first as raw counts, then as fractions of the
# grand total.
for cutoff in [0.65, 0.8, 0.825, 0.85, 0.9]:
    flag_col = f"pct_cal_score_{cutoff}"
    usa[flag_col] = usa["pct_cal_score"].ge(cutoff)

    for as_share in (False, True):
        display(
            pd.crosstab(
                usa[flag_col],
                usa["Definition N (communities)"],
                normalize=as_share,
            )
        )

Definition N (communities),False,True
pct_cal_score_0.65,Unnamed: 1_level_1,Unnamed: 2_level_1
False,42825,6054
True,5849,19432


Definition N (communities),False,True
pct_cal_score_0.65,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.577468,0.081634
True,0.07887,0.262028


Definition N (communities),False,True
pct_cal_score_0.8,Unnamed: 1_level_1,Unnamed: 2_level_1
False,47299,12415
True,1375,13071


Definition N (communities),False,True
pct_cal_score_0.8,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.637797,0.167408
True,0.018541,0.176254


Definition N (communities),False,True
pct_cal_score_0.825,Unnamed: 1_level_1,Unnamed: 2_level_1
False,47741,13778
True,933,11708


Definition N (communities),False,True
pct_cal_score_0.825,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.643757,0.185787
True,0.012581,0.157875


Definition N (communities),False,True
pct_cal_score_0.85,Unnamed: 1_level_1,Unnamed: 2_level_1
False,48085,15240
True,589,10246


Definition N (communities),False,True
pct_cal_score_0.85,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.648395,0.205502
True,0.007942,0.138161


Definition N (communities),False,True
pct_cal_score_0.9,Unnamed: 1_level_1,Unnamed: 2_level_1
False,48499,18438
True,175,7048


Definition N (communities),False,True
pct_cal_score_0.9,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.653978,0.248625
True,0.00236,0.095038


## Does it square with CalEnviroScreen?

In [8]:
# Load the CalEnviroScreen 4.0 extract used for the comparison below.
# The path is built relative to the working directory (the same convention as
# SCORE_DIR) instead of a user-specific absolute path, so the notebook is not
# tied to one machine.
# NOTE(review): assumes the notebook runs from a directory that sits next to
# `data/` inside data_pipeline -- confirm.
# GEOID10_TRACT is read as a string so tract IDs keep their leading zeros and
# can be matched against the score file's string IDs.
true_ces = pd.read_csv(
    pathlib.Path.cwd().parent.joinpath(
        "data", "dataset", "calenviroscreen4", "data06.csv"
    ),
    dtype={"GEOID10_TRACT": str},
)

In [9]:
# Right join on tract ID: every row of true_ces is kept, and the columns coming
# from usa are NaN wherever no tract ID matches.
ces_merged = pd.merge(usa, true_ces, how="right", on="GEOID10_TRACT")

In [10]:
# Re-rank the index percentile within the merged rows only, then flag the rows
# whose re-ranked percentile is at least 0.9.
ces_merged["new_cal_score"] = ces_merged["pct_cal_score"].rank(
    method="average", pct=True
)
ces_merged["new_cal_flag"] = ces_merged["new_cal_score"].ge(0.9)

In [11]:
# Share of merged rows with new_cal_flag False vs. True.
ces_merged["new_cal_flag"].value_counts(normalize=True)

False    0.900311
True     0.099689
Name: new_cal_flag, dtype: float64

In [12]:
# A row is flagged if it clears the 0.9 cutoff on the index percentile or is
# already marked in "Definition N (communities)".
above_cutoff = ces_merged["pct_cal_score_0.9"]
in_definition_n = ces_merged["Definition N (communities)"]
ces_merged["any_flag"] = above_cutoff | in_definition_n

In [13]:
# For each value of the 0.9-cutoff flag, the share of rows falling in each CES
# percentile range (missing ranges are counted too).
ces_range_by_flag = ces_merged.groupby("pct_cal_score_0.9")[
    "DRAFT CES 4.0\nPercentile Range"
]
ces_range_by_flag.value_counts(normalize=True, dropna=False)

pct_cal_score_0.9  DRAFT CES 4.0\nPercentile Range
False              10-15%                             0.052534
                   20-25%                             0.052534
                   40-45%                             0.052534
                   1-5% (lowest scores)               0.052402
                   15-20%                             0.052402
                   25-30%                             0.052402
                   30-35%                             0.052402
                   35-40%                             0.052402
                   5-10%                              0.052402
                   45-50%                             0.052269
                   50-55%                             0.052269
                   60-65%                             0.052137
                   55-60%                             0.052005
                   65-70%                             0.051343
                   70-75%                             0.050417
    

In [14]:
# Within-group shares of each CES percentile range, laid out wide: one row per
# value of the 0.9-cutoff flag, one column per percentile range.
cutoff_flag_shares = (
    ces_merged.groupby("pct_cal_score_0.9")["DRAFT CES 4.0\nPercentile Range"]
    .value_counts(dropna=False, normalize=True)
    .rename("share")  # give the values a name that does not clash on reset_index
    .reset_index()
)
cutoff_flag_shares.pivot_table(
    index="pct_cal_score_0.9",
    columns="DRAFT CES 4.0\nPercentile Range",
    values="share",
)

DRAFT CES 4.0 Percentile Range,1-5% (lowest scores),10-15%,15-20%,20-25%,25-30%,30-35%,35-40%,40-45%,45-50%,5-10%,50-55%,55-60%,60-65%,65-70%,70-75%,75-80%,80-85%,85-90%,90-95%,95-100% (highest scores)
pct_cal_score_0.9,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
False,0.052402,0.052534,0.052402,0.052534,0.052402,0.052402,0.052402,0.052534,0.052269,0.052402,0.052269,0.052005,0.052137,0.051343,0.050417,0.048697,0.047506,0.041419,0.037846,0.028847
True,,,,,,0.002092,,,0.002092,,0.004184,0.006276,0.006276,0.016736,0.033473,0.058577,0.079498,0.17364,0.232218,0.374477


In [15]:
# Within-group shares of each CES percentile range for any_flag, transposed so
# the percentile ranges read down the rows and the flag values across columns.
any_flag_shares = (
    ces_merged.groupby("any_flag")["DRAFT CES 4.0\nPercentile Range"]
    .value_counts(dropna=False, normalize=True)
    .rename("share")  # give the values a name that does not clash on reset_index
    .reset_index()
)
any_flag_shares.pivot_table(
    index="any_flag",
    columns="DRAFT CES 4.0\nPercentile Range",
    values="share",
).T

any_flag,False,True
DRAFT CES 4.0 Percentile Range,Unnamed: 1_level_1,Unnamed: 2_level_1
1-5% (lowest scores),0.078916,
10-15%,0.076525,0.004309
15-20%,0.075727,0.005303
20-25%,0.07234,0.011269
25-30%,0.071343,0.012595
30-35%,0.06955,0.01591
35-40%,0.066361,0.020882
40-45%,0.062375,0.027842
45-50%,0.056397,0.037454
5-10%,0.078119,0.001326


In [16]:
# Number of rows in each CES percentile range in the CES file itself
# (value_counts leaves out missing values by default).
true_ces["DRAFT CES 4.0\nPercentile Range"].value_counts()

95-100% (highest scores)    397
30-35%                      397
80-85%                      397
10-15%                      397
70-75%                      397
60-65%                      397
20-25%                      397
50-55%                      397
90-95%                      397
40-45%                      397
5-10%                       396
15-20%                      396
25-30%                      396
45-50%                      396
35-40%                      396
55-60%                      396
65-70%                      396
75-80%                      396
85-90%                      396
1-5% (lowest scores)        396
Name: DRAFT CES 4.0\nPercentile Range, dtype: int64