In [None]:
# import sys
# !{sys.executable} -m pip install pip geemap

In [None]:
import warnings
# Silence every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides deprecation warnings from the geospatial libraries used below;
# consider narrowing the filter while developing.
warnings.filterwarnings(action='ignore')

In [None]:
import os, requests, json, geojson
import random, scipy
from collections import defaultdict
from scipy.optimize import curve_fit
import numpy as np
from shapely.geometry import Point, Polygon, MultiPolygon, MultiPoint, shape
from shapely.ops import unary_union

import shapely
from shapely.ops import transform

import pandas as pd
import geopandas as gpd
import ee
import geemap

import boto3

In [None]:
# GBIF occurrence-search endpoint that every request below is sent to.
API_URL = 'https://api.gbif.org/v1/occurrence/search/'

# GBIF dataset key: iNaturalist research-grade observations.
DATASETKEY = '50c9509d-22c7-4a22-a47d-8c48425ef4a7'

# Taxon name -> GBIF taxon key; one extract is produced per entry.
TAXONKEYS = dict(Arthropoda='54', Aves='212', Tracheophyta='7707728')

# Year range, sent to the API as "year=STARTYEAR,ENDYEAR".
STARTYEAR, ENDYEAR = '2016', '2021'

# Page size: number of records requested per API call.
LIMIT = 300

In [None]:
# Working directory plus the S3 bucket, its HTTPS base URL, and the paths used inside it.
out_dir = os.getcwd()
bucket_name = 'cities-indicators'
aws_s3_dir = 'https://{0}.s3.eu-west-3.amazonaws.com'.format(bucket_name)
boundary_ext = '/data/boundaries/'
indicators_file_aws = 'indicators/indicators.csv'

In [None]:
# Read the table of city boundaries over HTTPS from the bucket's boundaries folder.
# Later cells use its 'geo_name' and 'aoi_boundary_name' columns to build boundary ids.
boundary_georef = pd.read_csv('{0}{1}boundary_georef.csv'.format(aws_s3_dir, boundary_ext))
boundary_georef

In [None]:
def boundingbox_wkt(p):
    """Return the axis-aligned bounding box of ``p`` as a WKT ``POLYGON`` string.

    Used because the GBIF API won't accept complex polygons, so the search is
    made on the bounding box and points are filtered against the real
    geometry afterwards.

    Parameters
    ----------
    p : object with a ``bounds`` attribute
        ``p.bounds`` must unpack to ``(minx, miny, maxx, maxy)``.

    Returns
    -------
    str
        A closed five-point ring: top-left, bottom-left, bottom-right,
        top-right, and top-left again.
    """
    minx, miny, maxx, maxy = p.bounds
    # All four corners are required. The previous ring omitted (maxx, maxy) and
    # therefore described a triangle covering only half of the bounding box.
    # The ring runs counter-clockwise in x/y, the orientation GBIF expects for
    # the area *inside* the polygon.
    ring = [
        (minx, maxy),
        (minx, miny),
        (maxx, miny),
        (maxx, maxy),
        (minx, maxy),
    ]
    return 'POLYGON (({0}))'.format(
        ', '.join('{0} {1}'.format(str(x), str(y)) for x, y in ring)
    )

In [None]:
# connect to s3
# The access key pair is taken from the first row of a local credentials CSV
# (columns 'Access key ID' and 'Secret access key').
aws_credentials = pd.read_csv('/home/jovyan/PlanetaryComputerExamples/aws_credentials.csv')
aws_key, aws_secret = aws_credentials.iloc[0][['Access key ID', 'Secret access key']]

# Resource handle reused by the upload step further down.
s3 = boto3.resource('s3', aws_access_key_id=aws_key, aws_secret_access_key=aws_secret)

In [None]:
def do_one_geom(poly, taxonname, boundary_id):
    """Download GBIF observations of one taxon inside one boundary and publish them.

    Pages through the GBIF occurrence search for the bounding box of ``poly``,
    keeps the records that carry a ``species`` field and whose point lies within
    ``poly``, writes them to ``data/<boundary_id>-<taxonname>-<STARTYEAR>-<ENDYEAR>.geojson``
    and uploads that file to the S3 bucket with a public-read ACL.

    Parameters
    ----------
    poly : shapely geometry
        Boundary geometry. A MultiPolygon is first dissolved with ``unary_union``.
        An empty geometry is skipped: nothing is requested, written or uploaded.
    taxonname : str
        A key of ``TAXONKEYS``; selects the GBIF taxon key and names the output.
    boundary_id : str
        Identifier used in the progress output and in the output file names.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``taxonname`` is not a key of ``TAXONKEYS``.
    requests.HTTPError
        If GBIF answers with an HTTP error status.
    """
    print(boundary_id, taxonname)

    # `geom_type` is the supported attribute; the `.type` alias is deprecated in Shapely 2.
    if poly.geom_type == 'MultiPolygon':
        poly = unary_union(poly)

    # `is_empty` covers every kind of empty geometry, not only the one whose
    # WKT happens to be 'GEOMETRYCOLLECTION EMPTY'.
    if poly.is_empty:
        return

    # Let requests build and escape the query string (the WKT contains spaces,
    # commas and parentheses). Parameter names are unchanged.
    query = {
        'dataset_key': DATASETKEY,
        'taxon_key': TAXONKEYS[taxonname],
        'year': '{0},{1}'.format(STARTYEAR, ENDYEAR),
        'geometry': boundingbox_wkt(poly),
        'limit': LIMIT,
        'hasCoordinate': 'true',
    }

    points = []
    species = []
    offset = 0
    # NOTE(review): GBIF documents an upper bound on offset + limit for this
    # endpoint (believed to be 100,000 - confirm). A boundary with more matches
    # will stop with an HTTPError below rather than being silently truncated.
    while True:
        # Timeout so a stalled connection cannot hang the notebook indefinitely.
        resp = requests.get(API_URL, params=dict(query, offset=offset), timeout=120)
        # Fail loudly on an error response instead of a KeyError on the payload.
        resp.raise_for_status()
        results = resp.json()
        print('  {0}: {1}/{2}'.format(taxonname, results['offset'], results['count']))

        # The API was queried with the bounding box only, so each point is
        # tested against the actual boundary here.
        for record in results['results']:
            if 'species' not in record:
                continue
            location = Point(float(record['decimalLongitude']), float(record['decimalLatitude']))
            if location.within(poly):
                points.append(location)
                species.append(record['species'])

        if results['endOfRecords']:
            break
        offset += LIMIT

    g = gpd.GeoDataFrame(geometry=points)
    g['species'] = species

    # File names follow the configured year range instead of a hard-coded "2016-2021".
    filename = '{0}-{1}-{2}-{3}.geojson'.format(boundary_id, taxonname, STARTYEAR, ENDYEAR)
    filepath = 'data/' + filename
    # Create the local output folder on demand so a fresh checkout can run as-is.
    os.makedirs('data', exist_ok=True)
    g.to_file(filepath, driver='GeoJSON')

    # upload in s3
    s3.meta.client.upload_file(
        filepath,
        bucket_name,
        'data/biodiversity/GBIF/{0}_observations/{1}'.format(taxonname, filename),
        ExtraArgs={'ACL': 'public-read'}
    )

In [None]:
# download extracts and upload to AWS
# For each row of boundary_georef: fetch the boundary GeoJSON from S3, take the
# geometry of its first feature, and run one extract per taxon in TAXONKEYS.
for geo_name, aoi_boundary_name in zip(boundary_georef['geo_name'], boundary_georef['aoi_boundary_name']):
    boundary_id = geo_name + '-' + aoi_boundary_name
    boundary_path = '{0}{1}boundary-{2}.geojson'.format(aws_s3_dir, boundary_ext, boundary_id)
    boundary_geo = requests.get(boundary_path).json()
    temp_gdf = gpd.GeoDataFrame.from_features(boundary_geo)
    first_geometry = temp_gdf['geometry'].iloc[0]
    for taxonname in TAXONKEYS:
        do_one_geom(first_geometry, taxonname, boundary_id)

In [None]:
# Marker printed when execution reaches the final cell of the notebook.
print("done")