In [1]:
import geopandas as gpd
import os
import logging
import subprocess
import shapely.geometry
from datetime import datetime
import pandas as pd
import boto3
from botocore.exceptions import ClientError
import tempfile
import dask
from dask.diagnostics import ProgressBar

In [2]:
# Setup logging
# Configure the root logger once for the whole notebook: INFO level, with a
# timestamp and the level name in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Uncomment for more verbose output while debugging:
# logging.getLogger().setLevel(logging.DEBUG)

In [3]:
# S3 Paths
# All values are key prefixes inside `s3_bucket`, written without leading or
# trailing slashes so they can be joined with "/".
s3_bucket = "gfw2-data"

# Raw inputs: the OSM extracts live under <s3_input_dir>/<s3_osm_dir>.
s3_input_dir = "climate/AFOLU_flux_model/organic_soils/inputs/raw"
s3_osm_dir = "roads/osm_roads/OSM_Roads/OSM_Roads"

# Processed outputs: per-tile roads and canals shapefiles.
# NOTE(review): the outputs are OSM-derived but are written under a "grip"
# prefix; confirm this location is intentional.
s3_processed_dir = "climate/AFOLU_flux_model/organic_soils/inputs/processed/grip"
s3_osm_roads_dir = f"{s3_processed_dir}/roads_by_tile"
s3_osm_canals_dir = f"{s3_processed_dir}/canals_by_tile"

In [4]:
# Paths
# S3 URIs of the regional OSM extracts, one "<region>-latest.osm.pbf" per
# region, all stored under the same raw-input prefix.
# NOTE(review): there is no "south-america" entry in this list; confirm the
# omission is intentional.
regional_pbf_files = [
    f"s3://{s3_bucket}/{s3_input_dir}/{s3_osm_dir}/{region}-latest.osm.pbf"
    for region in (
        "north-america",
        "africa",
        "antarctica",
        "asia",
        "australia-oceania",
        "central-america",
        "europe",
    )
]

In [5]:
# Hardcoded bounds dictionary
# Maps the S3 URI of each regional PBF to its bounding box. process_tile
# unpacks each tuple into shapely.geometry.box, so the order is
# (minx, miny, maxx, maxy).
bounds_dict = {
    f"s3://{s3_bucket}/{s3_input_dir}/{s3_osm_dir}/{region}-latest.osm.pbf": region_bounds
    for region, region_bounds in (
        ("north-america", (-180.0, 5.57228, 180.0, 85.04177)),
        ("africa", (-27.262032, -60.3167, 66.722766, 37.77817)),
        ("antarctica", (-180.0, -90.0, 180.0, -60.0)),
        ("asia", (-180.0, -13.01165, 180.0, 84.52666)),
        ("australia-oceania", (-179.999999, -57.16482, 180.0, 26.27781)),
        ("central-america", (-99.82733, 3.283755, -44.93667, 28.05483)),
        ("europe", (-34.49296, 29.635548, 46.75348, 81.47299)),
    )
}


In [6]:
# Tile index shapefile on S3 (read by read_tiles_shapefile).
tiles_shapefile_path = "s3://" + "/".join((s3_bucket, s3_input_dir, "index/Global_Peatlands.shp"))

# Local scratch directory for intermediate extracts, GeoJSON and shapefiles;
# created up front so the helper functions can write into it directly.
temp_output_dir = "/tmp/osm_temp"
os.makedirs(temp_output_dir, exist_ok=True)

In [7]:
def download_s3_file(bucket, key, local_dir):
    """Download one S3 object into ``local_dir``.

    The local file keeps the base name of ``key``.

    Args:
        bucket: Name of the S3 bucket.
        key: Object key inside the bucket.
        local_dir: Existing local directory to download into.

    Returns:
        The local file path on success, or ``None`` if boto3 raised a
        ``ClientError`` (the failure is logged).
    """
    client = boto3.client('s3')
    destination = os.path.join(local_dir, os.path.basename(key))
    try:
        client.download_file(bucket, key, destination)
    except ClientError as err:
        logging.error(f"Failed to download file from S3: {bucket}/{key}, {err}")
        return None
    return destination

In [8]:
def read_tiles_shapefile():
    """Read the tile index shapefile at ``tiles_shapefile_path``.

    Logs the column names of the loaded frame so the expected ``tile_id``
    column can be checked in the notebook output.

    Returns:
        The GeoDataFrame returned by ``gpd.read_file``.
    """
    logging.info("Reading tiles shapefile from S3")
    read_options = {"anon": False}
    tile_index = gpd.read_file(tiles_shapefile_path, storage_options=read_options)
    logging.info(f"Columns in tiles shapefile: {tile_index.columns}")
    return tile_index


In [9]:
def run_osmium_extract(pbf_file, tile_bounds, tile_id):
    """Cut ``pbf_file`` down to a bounding box with ``osmium extract``.

    The result is written to a timestamped ``.osm.pbf`` file inside
    ``temp_output_dir``; the caller is responsible for deleting it.

    Args:
        pbf_file: Path of the local source PBF.
        tile_bounds: Four-element sequence passed to osmium's ``-b`` option
            in index order (callers pass shapely ``bounds``).
        tile_id: Tile identifier, used only in the output file name.

    Returns:
        Path of the extracted PBF, or ``None`` if osmium exited with a
        non-zero status (any partial output is removed).
    """
    stamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    extract_path = os.path.join(temp_output_dir, f'temp_{tile_id}_{stamp}.osm.pbf')

    first, second, third, fourth = tile_bounds[0], tile_bounds[1], tile_bounds[2], tile_bounds[3]
    bbox_arg = f'{first},{second},{third},{fourth}'
    command = ['osmium', 'extract', '-b', bbox_arg, '-o', extract_path, pbf_file]

    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError as err:
        logging.error(f"Error running osmium extract: {err}")
        if os.path.exists(extract_path):
            os.remove(extract_path)
        return None
    return extract_path

In [10]:
def run_ogr2ogr_local(pbf_file, tile_bounds, tile_id):
    """Convert the ``lines`` layer of a local PBF into a GeoDataFrame.

    ``ogr2ogr`` writes the layer, spatially filtered with ``-spat``, to a
    timestamped GeoJSON file in ``temp_output_dir``. The file is read back
    with geopandas and then deleted.

    Args:
        pbf_file: Path of the local PBF to convert.
        tile_bounds: Four-element sequence passed to ``-spat`` in index order.
        tile_id: Tile identifier, used only in the temporary file name.

    Returns:
        The lines as a GeoDataFrame, or an empty ``GeoDataFrame`` if ogr2ogr
        or the read-back failed (the error is logged and the temporary file
        removed).
    """
    stamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    geojson_path = os.path.join(temp_output_dir, f'temp_{tile_id}_{stamp}.geojson')

    spatial_filter = [str(tile_bounds[0]), str(tile_bounds[1]), str(tile_bounds[2]), str(tile_bounds[3])]
    command = ['ogr2ogr', '-f', 'GeoJSON', geojson_path, pbf_file, '-spat'] + spatial_filter + ['lines']

    try:
        subprocess.check_call(command)
        lines_gdf = gpd.read_file(geojson_path)
        os.remove(geojson_path)
        return lines_gdf
    except Exception as err:
        # A non-zero ogr2ogr exit gets its own message; anything else is
        # reported as a read failure.
        if isinstance(err, subprocess.CalledProcessError):
            logging.error(f"Error running ogr2ogr: {err}")
        else:
            logging.error(f"Error reading GeoJSON file: {err}")
        if os.path.exists(geojson_path):
            os.remove(geojson_path)
        return gpd.GeoDataFrame()


In [11]:
def upload_to_s3(df, s3_path):
    """Write ``df`` as an ESRI Shapefile and upload every part of it to S3.

    A shapefile is a set of files sharing one stem (``.shp`` plus sidecars
    such as ``.shx``, ``.dbf``, ``.prj``, ``.cpg``). Each file produced by
    ``df.to_file`` is uploaded next to the ``.shp`` key, with the same stem
    and its own extension, so the dataset can be read back from S3.

    Args:
        df: GeoDataFrame to write.
        s3_path: Destination of the ``.shp`` object, either as a full
            ``s3://<s3_bucket>/...`` URI or as a key inside ``s3_bucket``.

    Raises:
        Whatever ``df.to_file`` or ``boto3``'s ``upload_file`` raise; errors
        are not swallowed here.
    """
    s3_client = boto3.client('s3')
    s3_key = s3_path.replace(f"s3://{s3_bucket}/", "")
    key_stem = os.path.splitext(s3_key)[0]

    # A private scratch directory keeps this dataset's files apart from other
    # work in temp_output_dir and removes all of them, sidecars included,
    # even if the upload fails.
    with tempfile.TemporaryDirectory(dir=temp_output_dir) as workdir:
        df.to_file(os.path.join(workdir, "upload.shp"))

        # Upload the sidecars first and the .shp last: process_tile treats the
        # presence of the .shp key as "tile already done", so it must only
        # appear once the rest of the dataset is in place.
        parts = sorted(os.listdir(workdir), key=lambda name: name.lower().endswith(".shp"))
        for part in parts:
            extension = os.path.splitext(part)[1]
            if extension.lower() == ".shp":
                target_key = s3_key
            else:
                target_key = f"{key_stem}{extension}"
            s3_client.upload_file(os.path.join(workdir, part), s3_bucket, target_key)

In [12]:
def s3_key_exists(bucket, key):
    """Tell whether an object exists in S3.

    Args:
        bucket: Name of the S3 bucket.
        key: Object key to look up.

    Returns:
        ``True`` if ``head_object`` succeeds, ``False`` if it fails with
        error code ``'404'``.

    Raises:
        ClientError: Re-raised for any error code other than ``'404'``.
    """
    client = boto3.client('s3')
    try:
        client.head_object(Bucket=bucket, Key=key)
    except ClientError as err:
        if err.response['Error']['Code'] != '404':
            raise
        return False
    return True

In [13]:
def extract_features(gdf, tile_geometry):
    """Split OSM lines into roads and canals and clip both to a tile.

    Roads are rows whose ``highway`` value is not null. Canals are rows
    whose ``waterway`` value is one of ``ditch``, ``canal`` or ``drain``.

    Args:
        gdf: GeoDataFrame with ``highway`` and ``waterway`` columns.
        tile_geometry: Geometry used as the clip mask.

    Returns:
        A ``(roads_in_tile, canals_in_tile)`` tuple of clipped GeoDataFrames.
    """
    has_highway = gdf['highway'].notnull()
    waterway = gdf['waterway']
    is_drainage = (waterway.notnull()) & (waterway.isin(['ditch', 'canal', 'drain']))

    road_lines = gdf[has_highway]
    canal_lines = gdf[is_drainage]

    clipped_roads = gpd.clip(road_lines, tile_geometry)
    clipped_canals = gpd.clip(canal_lines, tile_geometry)
    return clipped_roads, clipped_canals

In [14]:
def save_shapefile(gdf, output_path, output_dir):
    """Upload ``gdf`` to ``s3://<s3_bucket>/<output_dir>/<output_path>``.

    Nothing is uploaded when ``gdf`` is empty. Writing the shapefile to disk
    is left entirely to ``upload_to_s3``; this function no longer writes its
    own unused temporary copy, which also left sidecar files behind.

    Args:
        gdf: GeoDataFrame to save.
        output_path: File name of the shapefile (e.g. ``roads_<tile>.shp``).
        output_dir: Key prefix inside ``s3_bucket`` to upload under.
    """
    if gdf.empty:
        return

    s3_key = f"{output_dir}/{output_path}"
    s3_uri = f"s3://{s3_bucket}/{s3_key}"
    # Use logging rather than print so the message carries a timestamp like
    # every other status line in this notebook.
    logging.info(f"Uploading to S3 path: {s3_uri}")
    upload_to_s3(gdf, s3_uri)

In [15]:
def _collect_region_features(tile, tile_bounds, pbf_file, workdir):
    """Extract the roads and canals of one regional PBF that fall inside ``tile``.

    Downloads ``pbf_file`` into ``workdir``, cuts it to ``tile_bounds`` with
    osmium, converts the cut to a GeoDataFrame and splits it with
    ``extract_features``. The downloaded and extracted PBF files are removed
    before returning, whether or not a step failed.

    Returns:
        ``(roads, canals)``, or ``None`` when the osmium extract failed or
        the conversion yielded no features.

    Raises:
        RuntimeError: If the regional PBF could not be downloaded.
    """
    tile_id = tile['tile_id']
    s3_key = pbf_file.replace(f"s3://{s3_bucket}/", "")
    local_pbf_file = download_s3_file(s3_bucket, s3_key, workdir)
    if local_pbf_file is None:
        # Fail the tile explicitly instead of handing None to osmium, which
        # would surface as an unrelated TypeError.
        raise RuntimeError(f"Could not download {pbf_file}")

    extracted_pbf = None
    try:
        extracted_pbf = run_osmium_extract(local_pbf_file, tile_bounds, tile_id)
        if not extracted_pbf:
            return None
        gdf = run_ogr2ogr_local(extracted_pbf, tile_bounds, tile_id)
        if gdf.empty:
            return None
        return extract_features(gdf, tile.geometry)
    finally:
        # Free disk space before the next region is downloaded.
        for leftover in (extracted_pbf, local_pbf_file):
            if leftover and os.path.exists(leftover):
                os.remove(leftover)


def process_tile(tile, pbf_files, bounds_dict):
    """Build and upload the roads and canals shapefiles for one tile.

    For every regional PBF whose bounding box intersects the tile, the
    features are extracted and clipped to the tile. The per-region results
    are then concatenated and uploaded as ``roads_<tile_id>.shp`` and
    ``canals_<tile_id>.shp``. Empty results are not uploaded.

    The tile is skipped when both output keys already exist.
    NOTE(review): a tile with roads but no canals (or the reverse) never gets
    its second file written, so it will be reprocessed on every run.

    Args:
        tile: Row of the tile index with ``tile_id`` and ``geometry``.
        pbf_files: S3 URIs of the regional PBF files to consider.
        bounds_dict: Maps each URI in ``pbf_files`` to the four bounds
            unpacked into ``shapely.geometry.box``.

    Returns:
        None. Any exception raised while processing is logged with its
        traceback and suppressed, so one failing tile does not stop a batch.
    """
    tile_id = tile['tile_id']
    roads_output_path = f"roads_{tile_id}.shp"
    canals_output_path = f"canals_{tile_id}.shp"

    if s3_key_exists(s3_bucket, f"{s3_osm_roads_dir}/{roads_output_path}") and s3_key_exists(s3_bucket, f"{s3_osm_canals_dir}/{canals_output_path}"):
        logging.info(f"Output files for {tile_id} already exist. Skipping tile {tile_id}.")
        return

    tile_bounds = tile.geometry.bounds
    logging.info(f"Processing tile {tile_id} with bounds {tile_bounds}")

    # Collect the per-region frames and concatenate once at the end instead
    # of growing a frame inside the loop.
    roads_parts = []
    canals_parts = []

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            for pbf_file in pbf_files:
                if not tile.geometry.intersects(shapely.geometry.box(*bounds_dict[pbf_file])):
                    continue
                logging.info(f"Extracting relevant part of PBF file {pbf_file} for tile {tile_id}")
                features = _collect_region_features(tile, tile_bounds, pbf_file, tmpdir)
                if features is None:
                    continue
                roads, canals = features
                if not roads.empty:
                    roads_parts.append(roads)
                if not canals.empty:
                    canals_parts.append(canals)

        if roads_parts:
            save_shapefile(pd.concat(roads_parts, ignore_index=True), roads_output_path, s3_osm_roads_dir)
        if canals_parts:
            save_shapefile(pd.concat(canals_parts, ignore_index=True), canals_output_path, s3_osm_canals_dir)
    except Exception as e:
        # exc_info keeps the traceback, which the message alone does not carry.
        logging.error(f"Error processing tile {tile_id}: {e}", exc_info=True)


In [16]:
def process_tiles_parallel(tiles_gdf, pbf_files, bounds_dict):
    """Run ``process_tile`` for every row of ``tiles_gdf`` through dask.

    One delayed task is created per tile and all of them are computed
    together under a dask ``ProgressBar``.

    Args:
        tiles_gdf: Tile index; each row is handed to ``process_tile``.
        pbf_files: S3 URIs of the regional PBF files.
        bounds_dict: Bounds of each regional PBF, keyed by URI.
    """
    lazy_process_tile = dask.delayed(process_tile)
    pending = [
        lazy_process_tile(tile_row, pbf_files, bounds_dict)
        for _, tile_row in tiles_gdf.iterrows()
    ]

    with ProgressBar():
        dask.compute(*pending)

In [17]:
def main(tile_id=None):
    """Process a single tile, or every tile when no id is given.

    Args:
        tile_id: Value of the ``tile_id`` column of the tile to process. If
            falsy (the default), all tiles are processed in parallel.

    Raises:
        IndexError: If ``tile_id`` is given but matches no row of the tile
            index. This is the exception type the lookup has always raised,
            now with a message naming the missing tile.
    """
    # read_tiles_shapefile logs its own "Reading tiles shapefile" message, so
    # it is not repeated here.
    tiles_gdf = read_tiles_shapefile()

    if tile_id:
        matching_tiles = tiles_gdf[tiles_gdf['tile_id'] == tile_id]
        if matching_tiles.empty:
            raise IndexError(f"No tile with tile_id {tile_id!r} found in the tiles shapefile")
        process_tile(matching_tiles.iloc[0], regional_pbf_files, bounds_dict)
    else:
        process_tiles_parallel(tiles_gdf, regional_pbf_files, bounds_dict)

In [18]:
# Run the main function over every tile in parallel (disabled; uncomment to execute)
# main(tile_id=None)

In [None]:
# Run the main function (test tile)
# Processing one tile exercises the whole pipeline without a full parallel run.
test_tile_id = "00N_110E"
main(tile_id=test_tile_id)

2024-05-21 16:17:25,522 - INFO - Reading tiles shapefile from S3
2024-05-21 16:17:25,523 - INFO - Reading tiles shapefile from S3
2024-05-21 16:17:27,727 - INFO - Columns in tiles shapefile: Index(['tile_id', 'download_u', 'ObjectId', 'Shape__Are', 'Shape__Len',
       'geometry'],
      dtype='object')
2024-05-21 16:17:27,791 - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2024-05-21 16:17:29,120 - INFO - Processing tile 00N_110E with bounds (110.0, -10.0, 120.0, 0.0)
2024-05-21 16:17:29,187 - INFO - Extracting relevant part of PBF file s3://gfw2-data/climate/AFOLU_flux_model/organic_soils/inputs/raw/roads/osm_roads/OSM_Roads/OSM_Roads/asia-latest.osm.pbf for tile 00N_110E
