In [1]:
# These are probably more imports than we need. Copied from comparison tool
import geopandas
import pandas as pd
import os
import sys
import time
import shapely
import warnings
from pandarallel import pandarallel

# pandarallel is a wrapper that parallelizes pandas' `apply`; initializing it here is what
# makes `parallel_apply` (used by the tract lookup further down) available on data frames.
pandarallel.initialize(
    # Show a progress bar while a parallel apply is running.
    progress_bar=True,
    # If nb_workers is not set, it defaults to available cores.
    # nb_workers=8,
)


# Put the directory two levels above the working directory on the import path, so that the
# `data_pipeline` imports that follow can be resolved when the notebook runs in place.
module_path = os.path.abspath("../..")
if module_path not in sys.path:
    sys.path.append(module_path)

from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl import CensusETL
from data_pipeline.utils import unzip_file_from_url

# If field names are necessary, import them.
# from data_pipeline.score import field_names

# Turn on TQDM for pandas so that we can have progress bars when running `apply`.
from tqdm.notebook import tqdm_notebook

tqdm_notebook.pandas()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Define some input fields
# Column names read from the eAMLIS (abandoned mine lands) source file.
EAMLIS_LAT_FIELD = "Latitude"
EAMLIS_LONG_FIELD = "Longitude"
EAMLIS_KEY_FIELD = "AMLIS Key"

# Column names read from the FUDS (formerly used defense sites) source file.
FUDS_LAT_FIELD = "LATITUDE"
FUDS_LONG_FIELD = "LONGITUDE"
# The only FUDS columns retained after filtering; every other column is dropped.
FUDS_COLUMNS_TO_KEEP = [
    FUDS_LAT_FIELD,
    FUDS_LONG_FIELD,
    "FUDSUNIQUEPROPERTYNUMBER",
    "CURRENTOWNER",
    "ELIGIBILITY",
    "EMSMGMTACTIONPLANLINK",
    "FEATUREDESCRIPTION",
    "FEATURENAME",
    "FUDSINSTALLATIONID",
    "HASPROJECTS",
    "STATUS",
    "PROPERTY_HISTORY",
]


# Geojson input fields
# NOTE(review): constructing `CensusETL()` appears to trigger a FIPS download (see this
# cell's log output) just to read a path — confirm whether that side effect is needed here.
GEOJSON_PATH = CensusETL().GEOJSON_PATH / "us.json"
# Column of the tract GeoJSON that holds the census tract ID.
GEOJSON_TRACT_ID_FIELD = "GEOID10"


# Choose output directory for the FUDS results:
FUDS_OUTPUT_DIR = ExtractTransformLoad.DATA_PATH / "formerly_used_defense_sites"
# Create directory if it doesn't exist
FUDS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Choose output directory for the eAMLIS results:
EAMLIS_OUTPUT_DIR = ExtractTransformLoad.DATA_PATH / "abandoned_mine_lands"
# Create directory if it doesn't exist
EAMLIS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

2022-07-15 22:38:19,438 [data_pipeline.etl.sources.census.etl_utils] INFO     Downloading fips from S3 repository
2022-07-15 22:38:19,443 [data_pipeline.utils] INFO     Downloading https://justice40-data.s3.amazonaws.com/data-sources/fips_states_2010.zip
2022-07-15 22:38:19,867 [data_pipeline.utils] INFO     Extracting /Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/downloaded-8d08eaa9-3c28-469c-8e52-f9ef049d9fce.zip


In [10]:
# METHOD DEFINITIONS
def get_census_tract_for_one_coordinate(
    geom_point: shapely.geometry.point.Point,
    census_tract_gdf: geopandas.geodataframe.GeoDataFrame,
) -> "str | None":
    """Return the ID of the single census tract that contains ``geom_point``.

    Every tract polygon in ``census_tract_gdf`` is tested against the point
    (all tracts CONTAIN single point).

    Args:
        geom_point: The point to locate.
        census_tract_gdf: Tract polygons. Must have a ``GEOJSON_TRACT_ID_FIELD``
            column holding the tract IDs.

    Returns:
        The tract ID when exactly one tract contains the point. ``None`` when no
        tract or more than one tract contains it; a ``UserWarning`` is emitted in
        both of those cases.
    """
    # TODO: consider switching this order from `all polygons CONTAIN single point` (e.g., all
    #   tracts CONTAIN single mine) to `all points WITHIN single polygon`, e.g. a spatial join
    #   with `predicate="within"` done as a left join from the points to the tracts. However,
    #   this requires refactoring the whole method to iterate through each polygon rather than
    #   each point.

    # Filter the tracts once and reuse the result for both the count and the ID lookup.
    matching_tracts = census_tract_gdf[census_tract_gdf.contains(geom_point)]
    count_of_census_tract_matches = len(matching_tracts)

    if count_of_census_tract_matches == 1:
        # With only one tract returned, extract the ID.
        return matching_tracts[GEOJSON_TRACT_ID_FIELD].values[0]

    # These are data-quality problems, not API deprecations, so report them with the default
    # user-facing category; `DeprecationWarning` is hidden by default outside `__main__`.
    problem = "no" if count_of_census_tract_matches == 0 else "too many"
    warnings.warn(
        f"Warning: {problem} tract matches for {geom_point}",
        UserWarning,
        stacklevel=2,
    )
    return None


def get_census_tracts_for_geom_points(
    points_gdf: geopandas.geodataframe.GeoDataFrame,
    census_tract_gdf: geopandas.geodataframe.GeoDataFrame,
) -> pd.Series:
    """Look up the census tract ID for every point in ``points_gdf``.

    The lookup runs row by row through pandarallel's ``parallel_apply``, so
    ``pandarallel.initialize`` must have been called beforehand.

    Args:
        points_gdf: Frame whose ``geometry`` column holds one point per row.
        census_tract_gdf: Tract polygons to test each point against.

    Returns:
        One value per input row: the tract ID found by
        ``get_census_tract_for_one_coordinate``, or ``None`` where that lookup found
        no unique tract. A row-wise apply of a scalar-returning function yields a
        Series rather than a GeoDataFrame.
    """
    geometry_column_name = "geometry"
    tract_ids = points_gdf.parallel_apply(
        lambda row: get_census_tract_for_one_coordinate(
            geom_point=row[geometry_column_name], census_tract_gdf=census_tract_gdf
        ),
        axis=1,
    )
    return tract_ids


def get_census_tracts_for_dataframe_with_lat_long(
    coordinates_df: pd.DataFrame,
    latitude_column: str,
    longitude_column: str,
    census_tract_gdf: geopandas.geodataframe.GeoDataFrame,
) -> pd.DataFrame:
    """Return a copy of ``coordinates_df`` with a census tract ID column added.

    The caller's data frame is left unmodified.

    Args:
        coordinates_df: Frame with one latitude and one longitude column.
        latitude_column: Name of the latitude column in ``coordinates_df``.
        longitude_column: Name of the longitude column in ``coordinates_df``.
        census_tract_gdf: Tract polygons to test each coordinate against.

    Returns:
        A new data frame with the input columns plus
        ``ExtractTransformLoad.GEOID_TRACT_FIELD_NAME``, which holds the matched tract
        ID or ``None`` where no unique tract was found. No ``geometry`` column is
        included.
    """
    # First, convert the plain DataFrame into a geopandas data frame with lat/long geometry
    # points. Build it from a copy: constructing a `GeoDataFrame` directly from the input
    # can write the new `geometry` column onto the caller's frame.
    coordinates_geopandas_gdf = geopandas.GeoDataFrame(
        coordinates_df.copy(),
        geometry=geopandas.points_from_xy(
            x=coordinates_df[longitude_column],
            y=coordinates_df[latitude_column],
        ),
    )

    # Find the tract IDs for each point.
    tract_results = get_census_tracts_for_geom_points(
        points_gdf=coordinates_geopandas_gdf, census_tract_gdf=census_tract_gdf
    )

    # Join the tract IDs back on a copy of the original dataframe (aligned on the index), so
    # the new column is never assigned onto the caller's object or onto a slice of another
    # frame.
    coordinates_with_tracts_df = coordinates_df.copy()
    coordinates_with_tracts_df[
        ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
    ] = tract_results

    # The output never carries a `geometry` column. `errors="ignore"` keeps this working
    # whether or not the input happened to have one.
    return coordinates_with_tracts_df.drop(columns=["geometry"], errors="ignore")

In [4]:
# Load the census tract polygons; `t1`/`t2` bracket the read so its cost can be reported.
t1 = time.time()

# Takes ~4 minutes with all of USA (one recorded run reported ~308 seconds).
census_tract_gdf = geopandas.read_file(
    GEOJSON_PATH,
    # Use `pyogrio` because it's vectorized and faster.
    engine="pyogrio",
)

t2 = time.time()

print(f"Code took {str(t2-t1)} seconds.")

print(census_tract_gdf)

Code took 308.0567150115967 seconds.
      STATEFP10 COUNTYFP10 TRACTCE10      GEOID10  NAME10  \
0            27        139    080202  27139080202  802.02   
1            27        139    080204  27139080204  802.04   
2            27        139    080100  27139080100     801   
3            27        139    080302  27139080302  803.02   
4            27        139    080400  27139080400     804   
...         ...        ...       ...          ...     ...   
74129        16        005    001601  16005001601   16.01   
74130        16        005    001300  16005001300      13   
74131        16        005    001000  16005001000      10   
74132        16        005    000900  16005000900       9   
74133        16        005    000800  16005000800       8   

                NAMELSAD10 MTFCC10 FUNCSTAT10   ALAND10  AWATER10  \
0      Census Tract 802.02   G5020          S   5137595    109563   
1      Census Tract 802.04   G5020          S   4730968    120879   
2         Census Tract 

# Start work on FUDS (Formerly Used Defense Sites)

In [17]:
# Data accessed from:
# "https://opendata.arcgis.com/api/v3/datasets/3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/data?format=csv&spatialRefId=4326&where=1%3D1"

# fuds_url = "https://opendata.arcgis.com/api/v3/datasets/3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/data?format=csv&spatialRefId=4326&where=1%3D1"

# Create temporary path. FUDS downloads get their own directory, named after the dataset,
# rather than sharing the abandoned mine lands one.
fuds_tmp_path = ExtractTransformLoad.DATA_PATH / "tmp" / "formerly_used_defense_sites"
# Create directory if it doesn't exist
fuds_tmp_path.mkdir(parents=True, exist_ok=True)

fuds_path_in_s3 = (
    settings.AWS_JUSTICE40_DATASOURCES_URL + "/fuds_all_fy2019.csv.zip"
)

# Download the zipped copy of the source file and extract it into the temporary path.
unzip_file_from_url(
    file_url=fuds_path_in_s3,
    download_path=fuds_tmp_path,
    unzipped_file_path=fuds_tmp_path,
)

fuds_path = fuds_tmp_path / "fuds_all_fy2019.csv"

fuds_source_df = pd.read_csv(filepath_or_buffer=fuds_path)

# Only keep "eligible" sites with projects.
# TODO: confirm this is an appropriate interpretation of the eligible field.
is_eligible_with_projects = (fuds_source_df["ELIGIBILITY"] == "Eligible") & (
    fuds_source_df["HASPROJECTS"] == "Yes"
)

# Filter the rows and drop columns that are not meaningful in one step. The explicit copy
# makes the result an independent frame, so adding columns to it later does not raise
# chained-assignment warnings.
fuds_source_df = fuds_source_df.loc[
    is_eligible_with_projects, FUDS_COLUMNS_TO_KEEP
].copy()

fuds_source_df

2022-07-15 22:52:44,802 [data_pipeline.utils] INFO     Downloading https://justice40-data.s3.amazonaws.com/data-sources/fuds_all_fy2019.csv.zip
2022-07-15 22:53:12,094 [data_pipeline.utils] INFO     Extracting /Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/abandoned_mine_lands/downloaded-17fdff8f-5857-40b5-8de7-c79944f99095.zip


Unnamed: 0,LATITUDE,LONGITUDE,FUDSUNIQUEPROPERTYNUMBER,CURRENTOWNER,ELIGIBILITY,EMSMGMTACTIONPLANLINK,FEATUREDESCRIPTION,FEATURENAME,FUDSINSTALLATIONID,HASPROJECTS,STATUS,PROPERTY_HISTORY
0,30.098611,-93.722222,K06TX0667,LOCAL: CITY INDIVIDUAL OWNERS,Eligible,https://fudsportal.usace.army.mil/ems/inventor...,The site was initially acquired in 1946 and us...,ORANGE PORT OF NAV SHIP STOR,TX69799F675300,Yes,Properties with projects,The site was initially acquired in 1946 and us...
1,33.809700,-95.628304,K06TX0305,"DOD: USACE -AGRICULTURAL, RECREATIONAL, AND FL...",Eligible,https://fudsportal.usace.army.mil/ems/inventor...,Camp Maxey was activated in July 1942. It was ...,CAMP MAXEY,TX69799F668600,Yes,Properties with projects,Camp Maxey was activated in July 1942. It was ...
4,35.746111,-95.412778,K06OK0186,LOCAL: CITY CITY MUNICIPAL AIRFIELD\n,Eligible,https://fudsportal.usace.army.mil/ems/inventor...,HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND...,MUSKOGEE AUX AF,OK69799F639800,Yes,Properties with all projects at site closeout,HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND...
6,36.226944,-95.330000,K06OK0025,PRIV: PRIVATE CURRENTLY USED AS AN INDUSTRIAL ...,Eligible,https://fudsportal.usace.army.mil/ems/inventor...,The DoD began use in the early 1940s when the ...,OKLAHOMA ORDNANCE WORKS,OK69799F636200,Yes,Properties with all projects at site closeout,The DoD began use in the early 1940s when the ...
9,36.023333,-102.541667,K06TX0268,LOCAL: CITY THE SITE IS NOW USED AS A MUNICIPA...,Eligible,https://fudsportal.usace.army.mil/ems/inventor...,"In 1942, the DoD acquired 6,235.16 acres for u...",DALHART AAF,TX69799F665100,Yes,Properties with projects,"In 1942, the DoD acquired 6,235.16 acres for u..."
...,...,...,...,...,...,...,...,...,...,...,...,...
10080,51.379444,179.293889,F10AK0858,FWS: USFWSTRIBE: NATIVE AMERICAN ANCSA NATIVE ...,Eligible,https://fudsportal.usace.army.mil/ems/ems/inve...,,AMCHITKA AF AUXILIARY FIELD,AK09799F709900,Yes,Properties with projects,
10100,59.266111,-135.448889,F10AK1016,OTHER: Private Landowner has not been identifi...,Eligible,https://fudsportal.usace.army.mil/ems/ems/inve...,,HAINES FAIRBANKS PIPELINE,AK09799F980700,Yes,Properties with projects,
10101,61.200278,-149.900278,F10AK1023,STATE: STATE ALL BUILDINGS TURNED OVER TO THE ...,Eligible,https://fudsportal.usace.army.mil/ems/ems/inve...,,CAMP ANCHORAGE ARMY,AK09799FA25200,Yes,Properties with all projects at site closeout,
10102,60.555556,-151.267778,F10AK1024,,Eligible,https://fudsportal.usace.army.mil/ems/ems/inve...,,FORT KENAI ARMY POST,AK09799FA25300,Yes,Properties with all projects at site closeout,


In [12]:
# TODO: delete! 
# fuds_source_df_backup = fuds_source_df

# fuds_source_df = fuds_source_df[0:100]

In [18]:
# Attach a census tract ID to every FUDS site and time how long the lookup takes.
t1 = time.time()

# Original estimate: takes ~8 minutes with 2,900 rows.
# NOTE(review): one recorded run of this cell reported ~2,249 seconds (~37 minutes) with
# pandarallel on 4 workers, so the estimate above may be stale — confirm.
fuds_source_with_tracts_df = get_census_tracts_for_dataframe_with_lat_long(
    coordinates_df=fuds_source_df,
    longitude_column=FUDS_LONG_FIELD,
    latitude_column=FUDS_LAT_FIELD,
    census_tract_gdf=census_tract_gdf,
)

t2 = time.time()

print(f"Code took {str(t2-t1)} seconds.")

fuds_source_with_tracts_df

  return GeometryArray(vectorized.points_from_xy(x, y, z), crs=crs)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=717), Label(value='0 / 717'))), HB…

  lambda frame: get_census_tract_for_one_coordinate(
  lambda frame: get_census_tract_for_one_coordinate(
  lambda frame: get_census_tract_for_one_coordinate(
  lambda frame: get_census_tract_for_one_coordinate(
  lambda frame: get_census_tract_for_one_coordinate(
  lambda frame: get_census_tract_for_one_coordinate(
  lambda frame: get_census_tract_for_one_coordinate(


Code took 2248.5825169086456 seconds.


Unnamed: 0,LATITUDE,LONGITUDE,FUDSUNIQUEPROPERTYNUMBER,CURRENTOWNER,ELIGIBILITY,EMSMGMTACTIONPLANLINK,FEATUREDESCRIPTION,FEATURENAME,FUDSINSTALLATIONID,HASPROJECTS,STATUS,PROPERTY_HISTORY,GEOID10_TRACT
0,30.098611,-93.722222,K06TX0667,LOCAL: CITY INDIVIDUAL OWNERS,Eligible,https://fudsportal.usace.army.mil/ems/inventor...,The site was initially acquired in 1946 and us...,ORANGE PORT OF NAV SHIP STOR,TX69799F675300,Yes,Properties with projects,The site was initially acquired in 1946 and us...,48361020200
1,33.809700,-95.628304,K06TX0305,"DOD: USACE -AGRICULTURAL, RECREATIONAL, AND FL...",Eligible,https://fudsportal.usace.army.mil/ems/inventor...,Camp Maxey was activated in July 1942. It was ...,CAMP MAXEY,TX69799F668600,Yes,Properties with projects,Camp Maxey was activated in July 1942. It was ...,48277000102
4,35.746111,-95.412778,K06OK0186,LOCAL: CITY CITY MUNICIPAL AIRFIELD\n,Eligible,https://fudsportal.usace.army.mil/ems/inventor...,HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND...,MUSKOGEE AUX AF,OK69799F639800,Yes,Properties with all projects at site closeout,HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND...,40101000100
6,36.226944,-95.330000,K06OK0025,PRIV: PRIVATE CURRENTLY USED AS AN INDUSTRIAL ...,Eligible,https://fudsportal.usace.army.mil/ems/inventor...,The DoD began use in the early 1940s when the ...,OKLAHOMA ORDNANCE WORKS,OK69799F636200,Yes,Properties with all projects at site closeout,The DoD began use in the early 1940s when the ...,40097040400
9,36.023333,-102.541667,K06TX0268,LOCAL: CITY THE SITE IS NOW USED AS A MUNICIPA...,Eligible,https://fudsportal.usace.army.mil/ems/inventor...,"In 1942, the DoD acquired 6,235.16 acres for u...",DALHART AAF,TX69799F665100,Yes,Properties with projects,"In 1942, the DoD acquired 6,235.16 acres for u...",48205950200
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10080,51.379444,179.293889,F10AK0858,FWS: USFWSTRIBE: NATIVE AMERICAN ANCSA NATIVE ...,Eligible,https://fudsportal.usace.army.mil/ems/ems/inve...,,AMCHITKA AF AUXILIARY FIELD,AK09799F709900,Yes,Properties with projects,,02016000100
10100,59.266111,-135.448889,F10AK1016,OTHER: Private Landowner has not been identifi...,Eligible,https://fudsportal.usace.army.mil/ems/ems/inve...,,HAINES FAIRBANKS PIPELINE,AK09799F980700,Yes,Properties with projects,,02100000100
10101,61.200278,-149.900278,F10AK1023,STATE: STATE ALL BUILDINGS TURNED OVER TO THE ...,Eligible,https://fudsportal.usace.army.mil/ems/ems/inve...,,CAMP ANCHORAGE ARMY,AK09799FA25200,Yes,Properties with all projects at site closeout,,02020001400
10102,60.555556,-151.267778,F10AK1024,,Eligible,https://fudsportal.usace.army.mil/ems/ems/inve...,,FORT KENAI ARMY POST,AK09799FA25300,Yes,Properties with all projects at site closeout,,02122000600


In [19]:
# Save the FUDS sites with their tract IDs; `index=False` leaves the data frame index out
# of the CSV.
fuds_source_with_tracts_df.to_csv(
    FUDS_OUTPUT_DIR / "formerly_used_defense_sites.csv", index=False
)

In [20]:
# Number of distinct tract IDs among the FUDS sites.
# NOTE(review): sites without a unique tract match carry `None`, which presumably counts
# as one distinct value here — confirm if an exact tract count is needed.
len(fuds_source_with_tracts_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].unique())

2077

# Start work on eAMLIS (abandoned mine lands)

In [9]:
# Create temporary path for the eAMLIS download
tmp_path = ExtractTransformLoad.DATA_PATH / "tmp" / "abandoned_mine_lands"
# Create directory if it doesn't exist
tmp_path.mkdir(parents=True, exist_ok=True)

eamlis_path_in_s3 = (
    settings.AWS_JUSTICE40_DATASOURCES_URL + "/eAMLIS export of all data.tsv.zip"
)

# Download the zipped export and extract it into the temporary path.
unzip_file_from_url(
    file_url=eamlis_path_in_s3,
    download_path=tmp_path,
    unzipped_file_path=tmp_path,
)

eamlis_path = tmp_path / "eAMLIS export of all data.tsv"

# The export is tab-separated.
# NOTE(review): the recorded output shows a warning raised at this `read_csv` call (the
# message itself was not captured); possibly mixed column dtypes — confirm, and pass
# explicit `dtype=` values if so.
eamlis_source_df = pd.read_csv(
    filepath_or_buffer=eamlis_path,
    sep="\t",
)

eamlis_source_df.head()

2022-07-15 22:26:56,037 [data_pipeline.utils] INFO     Downloading https://justice40-data.s3.amazonaws.com/data-sources/eAMLIS export of all data.tsv.zip
2022-07-15 22:27:13,306 [data_pipeline.utils] INFO     Extracting /Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/abandoned_mine_lands/downloaded-60f877ab-7ca0-4e7c-8422-b89d0442a30f.zip
  eamlis_source_df = pd.read_csv(


Unnamed: 0,AMLIS Key,State/Tribe,County,Congressional District,Quadrangle Name,Watershed,HUC Code,FIPS Code,Latitude,Longitude,...,Unfunded Metric Units,Funded Standard Units,Funded Costs,Funded GPRA Acres,Funded Metric Units,Completed Standard Units,Completed Costs,Completed GPRA Acres,Completed Metric Units,Unnamed: 40
0,AK000001,AK,MATANUSKA-SUSITNA,1.0,ANCHORAGE C-8,,,2170,61.6,-149.8,...,0.0,0.0,0.0,0.0,0.0,2.0,10000.0,0.2,2.0,
1,AK000001,AK,MATANUSKA-SUSITNA,1.0,ANCHORAGE C-8,,,2170,61.6,-149.8,...,0.0,0.0,0.0,0.0,0.0,4.0,20000.0,0.4,4.0,
2,AK000001,AK,MATANUSKA-SUSITNA,1.0,ANCHORAGE C-8,,,2170,61.6,-149.8,...,0.0,0.0,0.0,0.0,0.0,900.0,33200.0,12.86,274.3,
3,AK000002,AK,FAIRBANKS NORTH STAR,1.0,Fairbanks D-3,19030004.0,,2090,64.8,-148.0,...,0.0,0.0,0.0,0.0,0.0,8.0,35324.0,0.8,8.0,
4,AK000002,AK,FAIRBANKS NORTH STAR,1.0,Fairbanks D-3,19030004.0,,2090,64.8,-148.0,...,0.0,0.0,0.0,0.0,0.0,1.0,4416.0,0.1,1.0,


In [None]:
mines_df = eamlis_source_df

print(mines_df.columns)

# TODO: investigate how to combine multiple rows for the same lat/long.
# Probably do something like, groupby([lat, long])[value_of_interest].size().
# TODO: Investigate aggregating over mine severity.
# TODO: investigate whether other columns (such as mine problem severity) are needed.
# Keep one row per lat/long pair (arbitrarily the last one) and only the key and coordinate
# columns. We might need additional columns of information.
# The explicit copy makes the result an independent frame, so adding the tract column to it
# later does not raise chained-assignment warnings.
mines_unique_df = (
    mines_df.drop_duplicates(
        subset=[EAMLIS_LAT_FIELD, EAMLIS_LONG_FIELD], keep="last"
    )
    .loc[:, [EAMLIS_KEY_FIELD, EAMLIS_LAT_FIELD, EAMLIS_LONG_FIELD]]
    .copy()
)

mines_unique_df.head()

In [None]:
# Attach a census tract ID to every unique mine location and time how long the lookup takes.
t1 = time.time()

# Takes ~26 minutes with 4,000 rows.
mines_unique_with_tracts_df = get_census_tracts_for_dataframe_with_lat_long(
    coordinates_df=mines_unique_df,
    longitude_column=EAMLIS_LONG_FIELD,
    latitude_column=EAMLIS_LAT_FIELD,
    census_tract_gdf=census_tract_gdf,
)

t2 = time.time()

print(f"Code took {str(t2-t1)} seconds.")

print(mines_unique_with_tracts_df)

In [None]:
# Save the mine locations with their tract IDs; `index=False` leaves the data frame index
# out of the CSV.
mines_unique_with_tracts_df.to_csv(
    EAMLIS_OUTPUT_DIR / "abandoned_mine_lands.csv", index=False
)

In [None]:
# Number of distinct tract IDs among the unique mine locations.
# NOTE(review): locations without a unique tract match carry `None`, which presumably
# counts as one distinct value here — confirm if an exact tract count is needed.
len(mines_unique_with_tracts_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].unique())