In [9]:
# These are probably more imports than we need. Copied from comparison tool
import os
import sys
import warnings
from typing import Optional

import pandas as pd

# TODO: add to poetry
import shapely
from pandarallel import pandarallel

# Initialize pandarallel so that `parallel_apply` (used further down for the
# per-point tract lookup) is available, with a progress bar enabled.
pandarallel.initialize(
    progress_bar=True,
    # If nb_workers is not set, it defaults to available cores.
    nb_workers=8,
)

from tqdm.notebook import tqdm_notebook

# Append the directory two levels above the current working directory to the
# import path (only once, so re-running the cell does not add duplicates).
# Presumably this is what lets the `data_pipeline` imports below resolve —
# confirm against the repository layout.
module_path = os.path.abspath(os.path.join("../.."))
if module_path not in sys.path:
    sys.path.append(module_path)

# from data_pipeline.utils import remove_all_from_dir, get_excel_column_name
from data_pipeline.etl.base import ExtractTransformLoad

# from data_pipeline.etl.sources.census.etl_utils import get_state_information
# from data_pipeline.etl.sources.ejscreen_areas_of_concern.etl import (
#     EJSCREENAreasOfConcernETL,
# )

from data_pipeline.etl.sources.census.etl import CensusETL


# from data_pipeline.score import field_names

# %load_ext lab_black
# Turn on TQDM for pandas so that we can have progress bars when running `apply`.
tqdm_notebook.pandas()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [10]:
# Define some input fields
# Column names in the mine source data: latitude/longitude are used for
# de-duplication and for building geometry points; the key identifies a row.
LAT_FIELD = "Latitude"
LONG_FIELD = "Longitude"
KEY_FIELD = "AMLIS Key"


# GeoJSON file that is loaded into the census tract GeoDataFrame.
# NOTE(review): the earlier "switch to whole US" TODO looks resolved, since
# `us.json` is already the active path — confirm and drop the alternative below.
GEOJSON_PATH = CensusETL().GEOJSON_PATH / "us.json"
# Alternative input (presumably a single-state file for quicker runs):
# GEOJSON_PATH = CensusETL().GEOJSON_PATH / "02.json"
# Name of the column in the GeoJSON that holds the census tract ID.
GEOJSON_TRACT_ID_FIELD = "GEOID10"

2022-07-11 18:35:35,284 [data_pipeline.etl.sources.census.etl_utils] INFO     Downloading fips from S3 repository
2022-07-11 18:35:35,291 [data_pipeline.utils] INFO     Downloading https://justice40-data.s3.amazonaws.com/data-sources/fips_states_2010.zip
2022-07-11 18:35:35,780 [data_pipeline.utils] INFO     Extracting /Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/downloaded-4eec1600-71bb-48c7-825b-32f5830316e4.zip


In [None]:
# Load geojson
import geopandas

# Read the census tract geometries from the GeoJSON file configured above.
census_tract_gdf = geopandas.read_file(GEOJSON_PATH)

# Show only the first rows with the notebook's rich display instead of
# printing the whole frame as plain text.
census_tract_gdf.head()

In [None]:
# TODO: change to `eAMLIS export of all data.tsv.zip`

# Location of the eAMLIS export. Set the `EAMLIS_PATH` environment variable to
# point somewhere else; by default the file is expected in the current user's
# Downloads folder, so the notebook is not tied to one person's machine.
eamlis_path = os.environ.get(
    "EAMLIS_PATH",
    os.path.expanduser("~/Downloads/eAMLIS export of all data.tsv"),
)

# The export is tab-separated.
eamlis_source_df = pd.read_csv(
    filepath_or_buffer=eamlis_path,
    sep="\t",
)

eamlis_source_df.head()

In [None]:
# `mines_df` is a second name for the source frame, not a copy of it.
mines_df = eamlis_source_df

print(mines_df.columns)

# Build one row per distinct lat/long pair, keeping only the key and coordinates.
# TODO: investigate how to combine multiple rows for the same lat/long.
# For now only the last row of each coordinate pair is kept, so information
# from the other rows is lost. We might need additional columns of information.
# TODO: investigate whether other columns (such as mine problem severity) are needed.
mines_unique_df = (
    mines_df.drop_duplicates(
        subset=[LAT_FIELD, LONG_FIELD],
        keep="last",
    )[[KEY_FIELD, LAT_FIELD, LONG_FIELD]]
)

# mines_unique_df = mines_unique_df.head(100)
mines_unique_df.head()

In [None]:
# # Can be deleted from production code
# # Printing for inspection
# # We restrict to North America.
# world = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))

# ax = world[world.continent == "North America"].plot(color="white", edgecolor="black")

# # We can now plot our ``GeoDataFrame``.
# mines_gdf.plot(ax=ax, color="red")

In [None]:
# long = -149.8
# lat = 61.6

# METHOD DEFINITIONS


def get_census_tract_for_one_coordinate(
    geom_point: shapely.geometry.point.Point,
    census_tract_gdf: geopandas.geodataframe.GeoDataFrame,
) -> Optional[str]:
    """Find the single census tract that contains a point.

    Args:
        geom_point: The point to look up.
        census_tract_gdf: Census tract geometries. Must have a column named
            by `GEOJSON_TRACT_ID_FIELD` holding the tract ID.

    Returns:
        The tract ID when exactly one tract contains the point. `None` when
        no tract or more than one tract contains it; a warning is emitted in
        both of those cases.
    """
    # Test the point against every tract geometry; the result is a boolean
    # mask with one entry per tract.
    contains_result = census_tract_gdf.contains(geom_point)

    # Filter once and reuse the result for both the count and the ID lookup.
    matching_tracts = census_tract_gdf[contains_result]
    count_of_census_tract_matches = len(matching_tracts)

    if count_of_census_tract_matches == 1:
        # With only one tract returned, extract the ID.
        return matching_tracts[GEOJSON_TRACT_ID_FIELD].values[0]

    if count_of_census_tract_matches == 0:
        message = f"Warning: no tract matches for {geom_point}"
    else:
        message = f"Warning: too many tract matches for {geom_point}"

    # These are data-quality warnings, not deprecations. `DeprecationWarning`
    # is ignored by Python's default filters unless triggered from `__main__`,
    # which would silently hide unmatched points once this code lives in a module.
    warnings.warn(message, UserWarning, stacklevel=2)
    return None


def get_census_tracts_for_geom_points(
    points_gdf: geopandas.geodataframe.GeoDataFrame,
    census_tract_gdf: geopandas.geodataframe.GeoDataFrame,
) -> pd.Series:
    """Look up the census tract ID for every row of `points_gdf`, in parallel.

    Args:
        points_gdf: Frame whose `geometry` column holds one point per row.
        census_tract_gdf: Census tract geometries to search.

    Returns:
        One value per input row: the tract ID, or `None` where the point did
        not match exactly one tract. Because the per-row function returns a
        scalar, the row-wise apply produces a Series (not a GeoDataFrame).
    """
    # The lookup reads the column literally named "geometry".
    geometry_column_name = "geometry"

    # `parallel_apply` is provided by pandarallel and requires
    # `pandarallel.initialize(...)` to have been called beforehand.
    tract_ids = points_gdf.parallel_apply(
        lambda row: get_census_tract_for_one_coordinate(
            geom_point=row[geometry_column_name],
            census_tract_gdf=census_tract_gdf,
        ),
        axis=1,
    )
    return tract_ids


def get_census_tracts_for_dataframe_with_lat_long(
    coordinates_df: pd.DataFrame,
    latitude_column: str = LAT_FIELD,
    longitude_column: str = LONG_FIELD,
    census_tract_gdf: geopandas.geodataframe.GeoDataFrame = census_tract_gdf,
) -> pd.DataFrame:
    """Add a census tract ID column to a DataFrame of lat/long coordinates.

    The input frame is not modified; a new frame is returned.

    Args:
        coordinates_df: Frame with one coordinate pair per row.
        latitude_column: Name of the latitude column in `coordinates_df`.
        longitude_column: Name of the longitude column in `coordinates_df`.
        census_tract_gdf: Census tract geometries to search. The default is
            the module-level `census_tract_gdf` as it existed when this
            function was defined, so that variable must be loaded first.

    Returns:
        A copy of `coordinates_df` (without any `geometry` column) plus a
        column named `ExtractTransformLoad.GEOID_TRACT_FIELD_NAME` holding
        the tract ID for each row, or `None` where no unique tract was found.
    """
    # Work on a real copy. Plain assignment only creates a second name for the
    # same object, which let the GeoDataFrame construction below add a
    # `geometry` column to the caller's frame.
    coordinates_df_duplicate = coordinates_df.copy()

    # First, convert the plain DataFrame into a geopandas data frame with lat/long geometry points.
    coordinates_geopandas_gdf = geopandas.GeoDataFrame(
        coordinates_df_duplicate,
        geometry=geopandas.points_from_xy(
            x=coordinates_df_duplicate[longitude_column],
            y=coordinates_df_duplicate[latitude_column],
        ),
    )

    # Find the tract IDs for each point.
    tract_results = get_census_tracts_for_geom_points(
        points_gdf=coordinates_geopandas_gdf, census_tract_gdf=census_tract_gdf
    )

    # Build the result from a fresh frame rather than assigning onto the input.
    # As before, the result carries no `geometry` column; `errors="ignore"`
    # covers the normal case where the input never had one.
    coordinates_with_tracts_df = coordinates_df.drop(
        columns="geometry", errors="ignore"
    )

    # Join the tract IDs back on; assignment aligns on the shared index.
    coordinates_with_tracts_df[
        ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
    ] = tract_results

    return coordinates_with_tracts_df


# get_census_tract_for_one_coordinate(geom_point=)

# example_point = geopandas.points_from_xy(x=[long], y=[lat])[0]

# get_census_tract_for_one_coordinate(
#     geom_point=example_point, census_tract_gdf=census_tract_gdf
# )

In [None]:
# Show the de-duplicated input before the tract lookup.
print(mines_unique_df)

# NOTE: this is a second name for the same DataFrame, not a copy.
mines_unique_df2 = mines_unique_df

# Look up the census tract for every coordinate pair, using the function's
# default column names and default census tract GeoDataFrame.
x = get_census_tracts_for_dataframe_with_lat_long(coordinates_df=mines_unique_df2)

# Show the input rows together with their tract ID column.
print(x)
# print(coordinate_df)