In [1]:
import collections
from pathlib import Path
import pandas as pd
import csv

# First-level score buckets: every scored indicator is assigned to one of these.
BUCKET_SOCIOECONOMIC = "Socioeconomic Factors"
BUCKET_SENSITIVE = "Sensitive populations"
BUCKET_ENVIRONMENTAL = "Environmental effects"
BUCKET_EXPOSURES = "Exposures"
BUCKETS = [BUCKET_SOCIOECONOMIC, BUCKET_SENSITIVE, BUCKET_ENVIRONMENTAL, BUCKET_EXPOSURES]

# Second-level aggregations: the four buckets above are combined pairwise into these.
AGGREGATION_POLLUTION = "Pollution Burden"
AGGREGATION_POPULATION = "Population Characteristics"

# Appended to a column name to form the name of its percentile-rank column.
PERCENTILE_FIELD_SUFFIX = " (percentile)"

# All input and output files live under "data", a sibling of the directory the
# notebook is run from.
data_path = Path.cwd().parent.joinpath("data")
fips_csv_path = data_path.joinpath("fips_states_2010.csv")
csv_path = data_path.joinpath("score", "csv")

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [2]:
# Load the nationwide EJSCREEN 2020 csv.
# "ID" is read as a string rather than a number because a later cell slices
# its leading characters to split the rows by state.
ejscreen_csv = data_path.joinpath("dataset", "ejscreen_2020", "usa.csv")
df = pd.read_csv(
    filepath_or_buffer=ejscreen_csv,
    low_memory=False,
    dtype={"ID": "string"},
)
df.head()

Unnamed: 0,OBJECTID,ID,STATE_NAME,ST_ABBREV,REGION,ACSTOTPOP,D_PM25_2,B_PM25_D2,P_PM25_D2,D_OZONE_2,B_OZONE_D2,P_OZONE_D2,D_DSLPM_2,B_DSLPM_D2,P_DSLPM_D2,D_CANCR_2,B_CANCR_D2,P_CANCR_D2,D_RESP_2,B_RESP_D2,P_RESP_D2,D_PTRAF_2,B_PTRAF_D2,P_PTRAF_D2,D_LDPNT_2,B_LDPNT_D2,P_LDPNT_D2,D_PNPL_2,B_PNPL_D2,P_PNPL_D2,D_PRMP_2,B_PRMP_D2,P_PRMP_D2,D_PTSDF_2,B_PTSDF_D2,P_PTSDF_D2,D_PWDIS_2,B_PWDIS_D2,P_PWDIS_D2,PM25,B_PM25,P_PM25,OZONE,B_OZONE,P_OZONE,DSLPM,B_DSLPM,P_DSLPM,CANCER,B_CANCR,P_CANCR,RESP,B_RESP,P_RESP,PTRAF,B_PTRAF,P_PTRAF,PRE1960PCT,B_LDPNT,P_LDPNT,PNPL,B_PNPL,P_PNPL,PRMP,B_PRMP,P_PRMP,PTSDF,B_PTSDF,P_PTSDF,PWDIS,B_PWDIS,P_PWDIS,VULEOPCT,B_VULEOPCT,P_VULEOPCT,MINORPCT,B_MINORPCT,P_MINORPCT,LOWINCPCT,B_LWINCPCT,P_LWINCPCT,LINGISOPCT,B_LNGISPCT,P_LNGISPCT,LESSHSPCT,B_LESHSPCT,P_LESHSPCT,UNDER5PCT,B_UNDR5PCT,P_UNDR5PCT,OVER64PCT,B_OVR64PCT,P_OVR64PCT,T_MINORPCT,T_LWINCPCT,T_LNGISPCT,T_LESHSPCT,T_UNDR5PCT,T_OVR64PCT,T_VULEOPCT,T_PM25,T_PM25_D2,T_OZONE,T_OZONE_D2,T_DSLPM,T_DSLPM_D2,T_CANCR,T_CANCR_D2,T_RESP,T_RESP_D2,T_PTRAF,T_PTRAF_D2,T_LDPNT,T_LDPNT_D2,T_PNPL,T_PNPL_D2,T_PRMP,T_PRMP_D2,T_PTSDF,T_PTSDF_D2,T_PWDIS,T_PWDIS_D2,Shape_Length,Shape_Area
0,1,10010201001,Alabama,AL,4,636,-492.025529412,6,52.0,-1866.38637046,6,52.0,-14.1482578148,6,51.0,-2506.96648306,6,51.0,-40.0108963305,6,51.0,-4621.05160063,4,35.0,-13.815487,4,31.0,-3.601838,5,40.0,-4.325802,6,53.0,-29.790608,4,38.0,,0,,9.69089945205,7,69.0,36.7602117647,3,24.0,0.2786630687,5,47.0,49.3770316066,9,84.0,0.788051737456,9,89.0,91.0159000855,6,55.0,0.272109,8,79.0,0.070942,8,79.0,0.085201,3,23.0,0.586754,6,57.0,,0,,0.274371,5,43.0,0.163522,4,35.0,0.38522,6,52.0,0.047619,10,92.0,0.208134,8,75.0,0.031447,3,23.0,0.150943,5,46.0,16% (35%ile),39% (52%ile),5% (92%ile),21% (75%ile),3% (23%ile),15% (46%ile),27% (43%ile),9.69 ug/m3 (69%ile),52%ile,36.8 ppb (24%ile),52%ile,0.279 ug/m3 (47%ile),51%ile,49 lifetime risk per million (84%ile),51%ile,0.79 (89%ile),51%ile,91 daily vehicles/meters distance (55%ile),35%ile,0.27 = fraction pre-1960 (79%ile),31%ile,0.071 facilities/km distance (79%ile),40%ile,0.085 facilities/km distance (23%ile),53%ile,0.59 facilities/km distance (57%ile),38%ile,,,13443.155206,6040790.0
1,2,10010201002,Alabama,AL,4,1287,-2053.08341364,4,30.0,-7787.90260177,4,32.0,-59.0366794303,4,30.0,-10460.8622871,3,27.0,-166.954157276,3,26.0,-554.798777021,5,46.0,-24.461822,3,20.0,-13.631646,2,19.0,-15.634259,5,42.0,-95.462908,3,23.0,,0,,9.69089945205,7,69.0,36.7602117647,3,24.0,0.2786630687,5,47.0,49.3770316066,9,84.0,0.788051737456,9,89.0,2.61874365577,2,14.0,0.115464,6,50.0,0.064344,8,76.0,0.073796,2,17.0,0.450601,6,52.0,,0,,0.189588,3,22.0,0.216006,5,44.0,0.16317,2,15.0,0.0,8,71.0,0.040678,2,15.0,0.041958,4,34.0,0.115773,3,28.0,22% (44%ile),16% (15%ile),0% (71%ile),4% (15%ile),4% (34%ile),12% (28%ile),19% (22%ile),9.69 ug/m3 (69%ile),30%ile,36.8 ppb (24%ile),32%ile,0.279 ug/m3 (47%ile),30%ile,49 lifetime risk per million (84%ile),27%ile,0.79 (89%ile),26%ile,2.6 daily vehicles/meters distance (14%ile),46%ile,0.12 = fraction pre-1960 (50%ile),20%ile,0.064 facilities/km distance (76%ile),19%ile,0.074 facilities/km distance (17%ile),42%ile,0.45 facilities/km distance (52%ile),23%ile,,,11917.089598,7834160.0
2,3,10010202001,Alabama,AL,4,810,1846.12693767,8,75.0,7002.78371663,8,75.0,57.8504398228,8,76.0,9566.07732145,8,77.0,153.743919566,8,78.0,889.548733515,7,64.0,27.230838,8,77.0,13.161449,9,85.0,14.802837,7,67.0,124.208381,8,77.0,,0,,9.71124027397,8,70.0,36.8369660131,3,25.0,0.3043125094,6,54.0,50.3207409265,9,89.0,0.808744032223,10,93.0,4.67932150834,2,17.0,0.143243,6,57.0,0.069234,8,78.0,0.077868,3,20.0,0.653377,6,59.0,,0,,0.588895,9,82.0,0.676543,9,83.0,0.501247,8,73.0,0.0,8,71.0,0.135563,6,53.0,0.041975,4,34.0,0.224691,9,83.0,68% (83%ile),50% (73%ile),0% (71%ile),14% (53%ile),4% (34%ile),22% (83%ile),59% (82%ile),9.71 ug/m3 (70%ile),75%ile,36.8 ppb (25%ile),75%ile,0.304 ug/m3 (54%ile),76%ile,50 lifetime risk per million (89%ile),77%ile,0.81 (93%ile),78%ile,4.7 daily vehicles/meters distance (17%ile),64%ile,0.14 = fraction pre-1960 (57%ile),77%ile,0.069 facilities/km distance (78%ile),85%ile,0.078 facilities/km distance (20%ile),67%ile,0.65 facilities/km distance (59%ile),77%ile,,,7770.915121,2900774.0
3,4,10010202002,Alabama,AL,4,1218,1392.07530488,8,72.0,5280.46153188,8,71.0,43.6222271668,8,72.0,7213.31763924,8,73.0,115.930876332,8,74.0,31342.4200742,9,80.0,49.071003,9,83.0,10.839023,9,83.0,12.45938,7,66.0,144.693543,8,78.0,,0,,9.71124027397,8,70.0,36.8369660131,3,25.0,0.3043125094,6,54.0,50.3207409265,9,89.0,0.808744032223,10,93.0,218.647490578,8,76.0,0.342324,9,85.0,0.075614,9,81.0,0.086918,3,24.0,1.009395,7,69.0,,0,,0.471891,8,73.0,0.550082,8,77.0,0.393701,6,54.0,0.010076,8,75.0,0.192,8,71.0,0.041051,4,33.0,0.083744,2,14.0,55% (77%ile),39% (54%ile),1% (75%ile),19% (71%ile),4% (33%ile),8% (14%ile),47% (73%ile),9.71 ug/m3 (70%ile),72%ile,36.8 ppb (25%ile),71%ile,0.304 ug/m3 (54%ile),72%ile,50 lifetime risk per million (89%ile),73%ile,0.81 (93%ile),74%ile,220 daily vehicles/meters distance (76%ile),80%ile,0.34 = fraction pre-1960 (85%ile),83%ile,0.076 facilities/km distance (81%ile),83%ile,0.087 facilities/km distance (24%ile),66%ile,1 facilities/km distance (69%ile),78%ile,,,6506.804784,1793332.0
4,5,10010203001,Alabama,AL,4,2641,-769.374640358,5,48.0,-2911.8926061,5,49.0,-28.752997683,5,42.0,-4008.21527015,5,46.0,-64.4803005962,5,45.0,-5498.0782156,4,33.0,-6.622665,5,42.0,-5.835112,4,32.0,-6.337256,6,51.0,-92.978762,3,24.0,,0,,9.74568328767,8,71.0,36.8849993464,3,27.0,0.3642147717,7,65.0,50.7721395048,10,92.0,0.81677320117,10,94.0,69.6442619361,6,50.0,0.083889,5,41.0,0.073913,9,80.0,0.080274,3,21.0,1.177764,8,74.0,,0,,0.324309,6,53.0,0.340401,7,60.0,0.308217,4,38.0,0.0,8,71.0,0.125473,5,49.0,0.020825,2,14.0,0.142749,5,43.0,34% (60%ile),31% (38%ile),0% (71%ile),13% (49%ile),2% (14%ile),14% (43%ile),32% (53%ile),9.75 ug/m3 (71%ile),48%ile,36.9 ppb (27%ile),49%ile,0.364 ug/m3 (65%ile),42%ile,51 lifetime risk per million (92%ile),46%ile,0.82 (94%ile),45%ile,70 daily vehicles/meters distance (50%ile),33%ile,0.084 = fraction pre-1960 (41%ile),42%ile,0.074 facilities/km distance (80%ile),32%ile,0.08 facilities/km distance (21%ile),51%ile,1.2 facilities/km distance (74%ile),24%ile,,,11070.367848,5461602.0


In [3]:
# One record per EJSCREEN column used by this notebook. Fields, in order:
#   input_field   - column name in the raw EJSCREEN csv
#   renamed_field - human-readable column name used from here on
#   bucket        - score bucket the column feeds into, or None when the column
#                   is not used in the score
DataSet = collections.namedtuple("DataSet", ["input_field", "renamed_field", "bucket"])

data_sets = [
    # Not used in the score (bucket is None).
    DataSet("ID", "Census block group FIPS code", None),
    DataSet("ACSTOTPOP", "Total population", None),
    # Bucket: Exposures
    DataSet("CANCER", "Air toxics cancer risk", BUCKET_EXPOSURES),
    DataSet("RESP", "Respiratory hazard index", BUCKET_EXPOSURES),
    DataSet("DSLPM", "Diesel particulate matter", BUCKET_EXPOSURES),
    DataSet("PM25", "Particulate matter (PM2.5)", BUCKET_EXPOSURES),
    DataSet("OZONE", "Ozone", BUCKET_EXPOSURES),
    DataSet("PTRAF", "Traffic proximity and volume", BUCKET_EXPOSURES),
    # Bucket: Environmental effects
    DataSet("PRMP", "Proximity to RMP sites", BUCKET_ENVIRONMENTAL),
    DataSet("PTSDF", "Proximity to TSDF sites", BUCKET_ENVIRONMENTAL),
    DataSet("PNPL", "Proximity to NPL sites", BUCKET_ENVIRONMENTAL),
    DataSet("PWDIS", "Wastewater discharge", BUCKET_ENVIRONMENTAL),
    DataSet(
        "PRE1960PCT",
        "Percent pre-1960s housing (lead paint indicator)",
        BUCKET_ENVIRONMENTAL,
    ),
    # Bucket: Sensitive populations
    DataSet("UNDER5PCT", "Individuals under 5 years old", BUCKET_SENSITIVE),
    DataSet("OVER64PCT", "Individuals over 64 years old", BUCKET_SENSITIVE),
    # Bucket: Socioeconomic factors
    DataSet(
        "LINGISOPCT",
        "Percent of households in linguistic isolation",
        BUCKET_SOCIOECONOMIC,
    ),
    DataSet(
        "LOWINCPCT",
        "Poverty (Less than 200% of federal poverty line)",
        BUCKET_SOCIOECONOMIC,
    ),
    DataSet(
        "LESSHSPCT",
        "Percent individuals age 25 or over with less than high school degree",
        BUCKET_SOCIOECONOMIC,
    ),
]

In [4]:
# Rename the raw EJSCREEN columns to their human-readable names and keep only
# the columns listed in `data_sets`.
renaming_dict = {data_set.input_field: data_set.renamed_field for data_set in data_sets}
columns_to_keep = [data_set.renamed_field for data_set in data_sets]

# `errors="raise"` makes a missing input column fail loudly instead of being
# silently ignored. `rename` is used without `inplace=True` so the loaded frame
# is not mutated, and the explicit `.copy()` gives the following cells an
# independent frame to add columns to rather than a selection of the raw one.
df = df.rename(columns=renaming_dict, errors="raise")[columns_to_keep].copy()

df.head()

Unnamed: 0,Census block group FIPS code,Total population,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter,Particulate matter (PM2.5),Ozone,Traffic proximity and volume,Proximity to RMP sites,Proximity to TSDF sites,Proximity to NPL sites,Wastewater discharge,Percent pre-1960s housing (lead paint indicator),Individuals under 5 years old,Individuals over 64 years old,Percent of households in linguistic isolation,Poverty (Less than 200% of federal poverty line),Percent individuals age 25 or over with less than high school degree
0,10010201001,636,49.3770316066,0.788051737456,0.2786630687,9.69089945205,36.7602117647,91.0159000855,0.085201,0.586754,0.070942,,0.272109,0.031447,0.150943,0.047619,0.38522,0.208134
1,10010201002,1287,49.3770316066,0.788051737456,0.2786630687,9.69089945205,36.7602117647,2.61874365577,0.073796,0.450601,0.064344,,0.115464,0.041958,0.115773,0.0,0.16317,0.040678
2,10010202001,810,50.3207409265,0.808744032223,0.3043125094,9.71124027397,36.8369660131,4.67932150834,0.077868,0.653377,0.069234,,0.143243,0.041975,0.224691,0.0,0.501247,0.135563
3,10010202002,1218,50.3207409265,0.808744032223,0.3043125094,9.71124027397,36.8369660131,218.647490578,0.086918,1.009395,0.075614,,0.342324,0.041051,0.083744,0.010076,0.393701,0.192
4,10010203001,2641,50.7721395048,0.81677320117,0.3642147717,9.74568328767,36.8849993464,69.6442619361,0.080274,1.177764,0.073913,,0.083889,0.020825,0.142749,0.0,0.308217,0.125473


In [5]:
# Add a percentile-rank column (fraction in (0, 1], via `rank(pct=True)`) next
# to every column listed in `data_sets`. The new column is named after the
# original column plus PERCENTILE_FIELD_SUFFIX.
for field_name in (data_set.renamed_field for data_set in data_sets):
    percentile_column = field_name + PERCENTILE_FIELD_SUFFIX
    df[percentile_column] = df[field_name].rank(pct=True)

df.head()

Unnamed: 0,Census block group FIPS code,Total population,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter,Particulate matter (PM2.5),Ozone,Traffic proximity and volume,Proximity to RMP sites,Proximity to TSDF sites,Proximity to NPL sites,Wastewater discharge,Percent pre-1960s housing (lead paint indicator),Individuals under 5 years old,Individuals over 64 years old,Percent of households in linguistic isolation,Poverty (Less than 200% of federal poverty line),Percent individuals age 25 or over with less than high school degree,Census block group FIPS code (percentile),Total population (percentile),Air toxics cancer risk (percentile),Respiratory hazard index (percentile),Diesel particulate matter (percentile),Particulate matter (PM2.5) (percentile),Ozone (percentile),Traffic proximity and volume (percentile),Proximity to RMP sites (percentile),Proximity to TSDF sites (percentile),Proximity to NPL sites (percentile),Wastewater discharge (percentile),Percent pre-1960s housing (lead paint indicator) (percentile),Individuals under 5 years old (percentile),Individuals over 64 years old (percentile),Percent of households in linguistic isolation (percentile),Poverty (Less than 200% of federal poverty line) (percentile),Percent individuals age 25 or over with less than high school degree (percentile)
0,10010201001,636,49.3770316066,0.788051737456,0.2786630687,9.69089945205,36.7602117647,91.0159000855,0.085201,0.586754,0.070942,,0.272109,0.031447,0.150943,0.047619,0.38522,0.208134,5e-06,0.092785,0.975893,0.98006,0.345107,0.945639,0.14836,0.908502,0.134596,0.40255,0.551021,0.837476,0.503799,0.264221,0.503887,0.729877,0.625015,0.793292
1,10010201002,1287,49.3770316066,0.788051737456,0.2786630687,9.69089945205,36.7602117647,2.61874365577,0.073796,0.450601,0.064344,,0.115464,0.041958,0.115773,0.0,0.16317,0.040678,9e-06,0.512531,0.975893,0.98006,0.345107,0.945639,0.14836,0.302755,0.10982,0.367176,0.518837,0.837476,0.316022,0.375445,0.334149,0.257413,0.246722,0.23855
2,10010202001,810,50.3207409265,0.808744032223,0.3043125094,9.71124027397,36.8369660131,4.67932150834,0.077868,0.653377,0.069234,,0.143243,0.041975,0.224691,0.0,0.501247,0.135563,1.4e-05,0.195479,0.978961,0.984421,0.383599,0.948476,0.150406,0.575556,0.118788,0.418135,0.542719,0.837476,0.354904,0.375627,0.791084,0.257413,0.772002,0.63439
3,10010202002,1218,50.3207409265,0.808744032223,0.3043125094,9.71124027397,36.8369660131,218.647490578,0.086918,1.009395,0.075614,,0.342324,0.041051,0.083744,0.010076,0.393701,0.192,1.8e-05,0.470967,0.978961,0.984421,0.383599,0.948476,0.150406,0.338651,0.138454,0.491887,0.57124,0.837476,0.570312,0.365504,0.189901,0.541916,0.637158,0.765126
4,10010203001,2641,50.7721395048,0.81677320117,0.3642147717,9.74568328767,36.8849993464,69.6442619361,0.080274,1.177764,0.073913,,0.083889,0.020825,0.142749,0.0,0.308217,0.125473,2.3e-05,0.91533,0.980132,0.985846,0.469165,0.953405,0.151723,0.795441,0.124266,0.521343,0.564146,0.837476,0.267429,0.170465,0.464397,0.257413,0.504977,0.603841


In [6]:
# Scores "A" and "B" are both built from the same two percentile columns.
poverty_percentile_column = (
    "Poverty (Less than 200% of federal poverty line) (percentile)"
)
less_than_hs_percentile_column = (
    "Percent individuals age 25 or over with less than high school degree (percentile)"
)

# Score A: row-wise mean of the two percentiles.
df["Score A"] = df[
    [poverty_percentile_column, less_than_hs_percentile_column]
].mean(axis=1)

# Score B: product of the two percentiles.
df["Score B"] = df[poverty_percentile_column].mul(df[less_than_hs_percentile_column])

In [7]:
# Calculate the "CalEnviroScreen for the US" score ("Score C").

# Step 1: one score per bucket, the row-wise mean of the percentile columns of
# every data set assigned to that bucket.
for bucket_name in BUCKETS:
    bucket_percentile_columns = [
        data_set.renamed_field + PERCENTILE_FIELD_SUFFIX
        for data_set in data_sets
        if data_set.bucket == bucket_name
    ]
    df[bucket_name] = df[bucket_percentile_columns].mean(axis=1)

# Step 2: "Pollution Burden" is the weighted mean of the Exposures and
# Environmental effects bucket scores:
# (1.0 * Exposures + 0.5 * Environmental effects) / 1.5.
exposures_weight, environmental_weight = 1.0, 0.5
weighted_pollution_sum = (
    exposures_weight * df[BUCKET_EXPOSURES]
    + environmental_weight * df[BUCKET_ENVIRONMENTAL]
)
df[AGGREGATION_POLLUTION] = weighted_pollution_sum / (
    exposures_weight + environmental_weight
)

# Step 3: "Population Characteristics" is the plain mean of the Sensitive
# populations and Socioeconomic factors bucket scores.
population_buckets = [BUCKET_SENSITIVE, BUCKET_SOCIOECONOMIC]
df[AGGREGATION_POPULATION] = df.loc[:, population_buckets].mean(axis="columns")

# Step 4: the cumulative impact score is the product of the two aggregations.
df["Score C"] = df[AGGREGATION_POLLUTION].mul(df[AGGREGATION_POPULATION])

df.head()

Unnamed: 0,Census block group FIPS code,Total population,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter,Particulate matter (PM2.5),Ozone,Traffic proximity and volume,Proximity to RMP sites,Proximity to TSDF sites,Proximity to NPL sites,Wastewater discharge,Percent pre-1960s housing (lead paint indicator),Individuals under 5 years old,Individuals over 64 years old,Percent of households in linguistic isolation,Poverty (Less than 200% of federal poverty line),Percent individuals age 25 or over with less than high school degree,Census block group FIPS code (percentile),Total population (percentile),Air toxics cancer risk (percentile),Respiratory hazard index (percentile),Diesel particulate matter (percentile),Particulate matter (PM2.5) (percentile),Ozone (percentile),Traffic proximity and volume (percentile),Proximity to RMP sites (percentile),Proximity to TSDF sites (percentile),Proximity to NPL sites (percentile),Wastewater discharge (percentile),Percent pre-1960s housing (lead paint indicator) (percentile),Individuals under 5 years old (percentile),Individuals over 64 years old (percentile),Percent of households in linguistic isolation (percentile),Poverty (Less than 200% of federal poverty line) (percentile),Percent individuals age 25 or over with less than high school degree (percentile),Score A,Score B,Socioeconomic Factors,Sensitive populations,Environmental effects,Exposures,Pollution Burden,Population Characteristics,Score C
0,10010201001,636,49.3770316066,0.788051737456,0.2786630687,9.69089945205,36.7602117647,91.0159000855,0.085201,0.586754,0.070942,,0.272109,0.031447,0.150943,0.047619,0.38522,0.208134,5e-06,0.092785,0.975893,0.98006,0.345107,0.945639,0.14836,0.908502,0.134596,0.40255,0.551021,0.837476,0.503799,0.264221,0.503887,0.729877,0.625015,0.793292,0.709154,0.49582,0.716062,0.384054,0.485888,0.71726,0.640136,0.550058,0.352112
1,10010201002,1287,49.3770316066,0.788051737456,0.2786630687,9.69089945205,36.7602117647,2.61874365577,0.073796,0.450601,0.064344,,0.115464,0.041958,0.115773,0.0,0.16317,0.040678,9e-06,0.512531,0.975893,0.98006,0.345107,0.945639,0.14836,0.302755,0.10982,0.367176,0.518837,0.837476,0.316022,0.375445,0.334149,0.257413,0.246722,0.23855,0.242636,0.058856,0.247562,0.354797,0.429866,0.616302,0.554157,0.301179,0.166901
2,10010202001,810,50.3207409265,0.808744032223,0.3043125094,9.71124027397,36.8369660131,4.67932150834,0.077868,0.653377,0.069234,,0.143243,0.041975,0.224691,0.0,0.501247,0.135563,1.4e-05,0.195479,0.978961,0.984421,0.383599,0.948476,0.150406,0.575556,0.118788,0.418135,0.542719,0.837476,0.354904,0.375627,0.791084,0.257413,0.772002,0.63439,0.703196,0.48975,0.554601,0.583356,0.454404,0.670237,0.598293,0.568979,0.340416
3,10010202002,1218,50.3207409265,0.808744032223,0.3043125094,9.71124027397,36.8369660131,218.647490578,0.086918,1.009395,0.075614,,0.342324,0.041051,0.083744,0.010076,0.393701,0.192,1.8e-05,0.470967,0.978961,0.984421,0.383599,0.948476,0.150406,0.338651,0.138454,0.491887,0.57124,0.837476,0.570312,0.365504,0.189901,0.541916,0.637158,0.765126,0.701142,0.487506,0.648067,0.277702,0.521874,0.630752,0.59446,0.462885,0.275166
4,10010203001,2641,50.7721395048,0.81677320117,0.3642147717,9.74568328767,36.8849993464,69.6442619361,0.080274,1.177764,0.073913,,0.083889,0.020825,0.142749,0.0,0.308217,0.125473,2.3e-05,0.91533,0.980132,0.985846,0.469165,0.953405,0.151723,0.795441,0.124266,0.521343,0.564146,0.837476,0.267429,0.170465,0.464397,0.257413,0.504977,0.603841,0.554409,0.304925,0.45541,0.317431,0.462932,0.722619,0.636056,0.38642,0.245785


In [8]:
# For each score, add its percentile rank and a boolean flag marking rows whose
# percentile rank is at or above 0.75.
for score_name in ("Score A", "Score B", "Score C"):
    score_percentile = df[score_name].rank(pct=True)
    df[score_name + PERCENTILE_FIELD_SUFFIX] = score_percentile
    df[score_name + " (top 25th percentile)"] = score_percentile.ge(0.75)
df.head()

Unnamed: 0,Census block group FIPS code,Total population,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter,Particulate matter (PM2.5),Ozone,Traffic proximity and volume,Proximity to RMP sites,Proximity to TSDF sites,Proximity to NPL sites,Wastewater discharge,Percent pre-1960s housing (lead paint indicator),Individuals under 5 years old,Individuals over 64 years old,Percent of households in linguistic isolation,Poverty (Less than 200% of federal poverty line),Percent individuals age 25 or over with less than high school degree,Census block group FIPS code (percentile),Total population (percentile),Air toxics cancer risk (percentile),Respiratory hazard index (percentile),Diesel particulate matter (percentile),Particulate matter (PM2.5) (percentile),Ozone (percentile),Traffic proximity and volume (percentile),Proximity to RMP sites (percentile),Proximity to TSDF sites (percentile),Proximity to NPL sites (percentile),Wastewater discharge (percentile),Percent pre-1960s housing (lead paint indicator) (percentile),Individuals under 5 years old (percentile),Individuals over 64 years old (percentile),Percent of households in linguistic isolation (percentile),Poverty (Less than 200% of federal poverty line) (percentile),Percent individuals age 25 or over with less than high school degree (percentile),Score A,Score B,Socioeconomic Factors,Sensitive populations,Environmental effects,Exposures,Pollution Burden,Population Characteristics,Score C,Score A (percentile),Score A (top 25th percentile),Score B (percentile),Score B (top 25th percentile),Score C (percentile),Score C (top 25th percentile)
0,10010201001,636,49.3770316066,0.788051737456,0.2786630687,9.69089945205,36.7602117647,91.0159000855,0.085201,0.586754,0.070942,,0.272109,0.031447,0.150943,0.047619,0.38522,0.208134,5e-06,0.092785,0.975893,0.98006,0.345107,0.945639,0.14836,0.908502,0.134596,0.40255,0.551021,0.837476,0.503799,0.264221,0.503887,0.729877,0.625015,0.793292,0.709154,0.49582,0.716062,0.384054,0.485888,0.71726,0.640136,0.550058,0.352112,0.73954,False,0.743311,False,0.823381,True
1,10010201002,1287,49.3770316066,0.788051737456,0.2786630687,9.69089945205,36.7602117647,2.61874365577,0.073796,0.450601,0.064344,,0.115464,0.041958,0.115773,0.0,0.16317,0.040678,9e-06,0.512531,0.975893,0.98006,0.345107,0.945639,0.14836,0.302755,0.10982,0.367176,0.518837,0.837476,0.316022,0.375445,0.334149,0.257413,0.246722,0.23855,0.242636,0.058856,0.247562,0.354797,0.429866,0.616302,0.554157,0.301179,0.166901,0.206805,False,0.24959,False,0.227828,False
2,10010202001,810,50.3207409265,0.808744032223,0.3043125094,9.71124027397,36.8369660131,4.67932150834,0.077868,0.653377,0.069234,,0.143243,0.041975,0.224691,0.0,0.501247,0.135563,1.4e-05,0.195479,0.978961,0.984421,0.383599,0.948476,0.150406,0.575556,0.118788,0.418135,0.542719,0.837476,0.354904,0.375627,0.791084,0.257413,0.772002,0.63439,0.703196,0.48975,0.554601,0.583356,0.454404,0.670237,0.598293,0.568979,0.340416,0.733009,False,0.738859,False,0.801668,True
3,10010202002,1218,50.3207409265,0.808744032223,0.3043125094,9.71124027397,36.8369660131,218.647490578,0.086918,1.009395,0.075614,,0.342324,0.041051,0.083744,0.010076,0.393701,0.192,1.8e-05,0.470967,0.978961,0.984421,0.383599,0.948476,0.150406,0.338651,0.138454,0.491887,0.57124,0.837476,0.570312,0.365504,0.189901,0.541916,0.637158,0.765126,0.701142,0.487506,0.648067,0.277702,0.521874,0.630752,0.59446,0.462885,0.275166,0.730848,False,0.737357,False,0.636636,False
4,10010203001,2641,50.7721395048,0.81677320117,0.3642147717,9.74568328767,36.8849993464,69.6442619361,0.080274,1.177764,0.073913,,0.083889,0.020825,0.142749,0.0,0.308217,0.125473,2.3e-05,0.91533,0.980132,0.985846,0.469165,0.953405,0.151723,0.795441,0.124266,0.521343,0.564146,0.837476,0.267429,0.170465,0.464397,0.257413,0.504977,0.603841,0.554409,0.304925,0.45541,0.317431,0.462932,0.722619,0.636056,0.38642,0.245785,0.568571,False,0.586058,False,0.537042,False


In [9]:
# Write the nationwide csv.
# Create the output directory first: `to_csv` does not create missing parent
# directories, so a run where `csv_path` does not exist yet would fail here.
csv_path.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path / "usa.csv", index=False)

In [10]:
# Write one csv per state listed in the FIPS csv, selecting rows whose block
# group FIPS code starts with that state's code.

# `to_csv` does not create missing parent directories.
csv_path.mkdir(parents=True, exist_ok=True)

# The state prefix of every row does not depend on the state being written,
# so compute it once instead of re-slicing the whole column per state.
state_prefixes = df["Census block group FIPS code"].str[:2]

# `newline=""` is what the csv module documentation requires for files handed
# to `csv.reader`.
with open(fips_csv_path, newline="") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=",")
    # Skip the header row (the default avoids StopIteration on an empty file).
    next(csv_reader, None)

    for row in csv_reader:
        if not row:
            # Blank line in the csv: there is no state code to read.
            continue

        # The state code is read from the first column of each row.
        states_fips = row[0].strip()
        print(f"Generating data{states_fips} csv")
        state_df = df[state_prefixes == states_fips]
        # we need to name the file data01.csv for ogr2ogr csv merge to work
        state_df.to_csv(csv_path / f"data{states_fips}.csv", index=False)

Generating data01 csv
Generating data02 csv
Generating data04 csv
Generating data05 csv
Generating data06 csv
Generating data08 csv
Generating data09 csv
Generating data10 csv
Generating data11 csv
Generating data12 csv
Generating data13 csv
Generating data15 csv
Generating data16 csv
Generating data17 csv
Generating data18 csv
Generating data19 csv
Generating data20 csv
Generating data21 csv
Generating data22 csv
Generating data23 csv
Generating data24 csv
Generating data25 csv
Generating data26 csv
Generating data27 csv
Generating data28 csv
Generating data29 csv
Generating data30 csv
Generating data31 csv
Generating data32 csv
Generating data33 csv
Generating data34 csv
Generating data35 csv
Generating data36 csv
Generating data37 csv
Generating data38 csv
Generating data39 csv
Generating data40 csv
Generating data41 csv
Generating data42 csv
Generating data44 csv
Generating data45 csv
Generating data46 csv
Generating data47 csv
Generating data48 csv
Generating data49 csv
Generating