In [None]:
# import sys
# !{sys.executable} -m pip install pip earthengine-api
# !{sys.executable} -m pip install pip geemap
# !{sys.executable} -m pip install --extra-index-url https://artifactory.vgt.vito.be/api/pypi/python-packages/simple terracatalogueclient
# !{sys.executable} -m pip install pip rasterstats

In [None]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

In [None]:
import os, requests, json, geojson
import random, scipy
from collections import defaultdict
from scipy.optimize import curve_fit
import numpy as np
from shapely.geometry import Point, Polygon, MultiPolygon, shape
from shapely.ops import unary_union

import shapely
from shapely.ops import transform

import pandas as pd
import geopandas as gpd
import ee
import geemap
from terracatalogueclient import Catalogue as Terracat
import boto3

In [None]:
# ee.Authenticate()

In [None]:
# Initialize the Earth Engine client for this session.
# NOTE(review): no other `ee` call is visible in this part of the notebook —
# confirm this cell is still required.
ee.Initialize()

In [None]:
# CSV in the current working directory where per-boundary results are
# accumulated; the processing loop re-reads it on every iteration so that an
# interrupted run can pick up the rows that were already written.
OUTPUT_FILENAME = 'BIO-5-bird-species.csv'

In [None]:
# Endpoint that get_count() queries for occurrence records.
API_URL = 'https://api.gbif.org/v1/occurrence/search/'
DATASETKEY = '50c9509d-22c7-4a22-a47d-8c48425ef4a7'  # iNaturalist research-grade observations
#TAXONKEYS = {'Arthropoda': '54', 'Aves': '212', 'Tracheophyta': '7707728'}
# TAXON is only used as a label in the progress printout; TAXON_KEY is the
# value actually sent to the API as `taxon_key`.
TAXON = 'Aves'
TAXON_KEY = '212'
# Sent together as the `year=<STARTYEAR>,<ENDYEAR>` query parameter.
STARTYEAR = '2016'
ENDYEAR = '2021'
# Page size of each request and the amount the paging offset advances by.
LIMIT = 300

# Number of randomized species-accumulation curve fits averaged per geometry.
NUM_CURVEFITS = 200

In [None]:
# Locations used throughout the notebook.
# NOTE(review): `out_dir` is not referenced anywhere else in the visible cells.
out_dir = os.getcwd()
# S3 bucket that holds both the boundary files and the indicator table.
bucket_name = 'cities-indicators'
# Public HTTPS root of the bucket (region eu-west-3), used for read access.
aws_s3_dir = "https://"+bucket_name+".s3.eu-west-3.amazonaws.com"
# Path of the boundary files below the bucket root.
boundary_ext = '/data/boundaries/'
# Object key of the shared indicator table (read over HTTPS, written via s3://).
indicators_file_aws = 'indicators/indicators.csv'

In [None]:
# Load the list of cities. The processing loop below reads the columns
# 'geo_name', 'aoi_boundary_name' and 'units_boundary_name' from this table to
# build the name of each boundary GeoJSON file.
boundary_georef = pd.read_csv(aws_s3_dir + boundary_ext + 'boundary_georef.csv')
boundary_georef

In [None]:
def boundingbox_wkt(p):
    """Return the bounding box of ``p`` as a WKT polygon string.

    Needed because the GBIF API won't accept complex polygons, so the query
    is made with the bounding rectangle and the exact spatial filtering is
    done client-side afterwards.

    Parameters
    ----------
    p : object exposing ``bounds`` as ``(minx, miny, maxx, maxy)``
        Typically a shapely geometry.

    Returns
    -------
    str
        A closed five-point ring listed anticlockwise:
        top-left, bottom-left, bottom-right, top-right, top-left.
    """
    minx, miny, maxx, maxy = p.bounds
    # The ring must visit all four corners. The previous version omitted the
    # (maxx, maxy) corner, which turned the "box" into a triangle covering
    # only half of the bounding rectangle.
    corners = [
        (minx, maxy),
        (minx, miny),
        (maxx, miny),
        (maxx, maxy),
        (minx, maxy),  # repeat the first vertex to close the ring
    ]
    ring = ', '.join('{0} {1}'.format(str(x), str(y)) for x, y in corners)
    return 'POLYGON (({0}))'.format(ring)

In [None]:
def _negative_exponential(x, a, b, c):
    """Model fitted to the accumulation curve; its limit for large x is -c."""
    return -((a * np.exp(-b * x)) + c)


def _species_accumulation_curve(species_names):
    """Return the running count of distinct species after each observation."""
    seen = set()
    curve = []
    for name in species_names:
        seen.add(name)
        curve.append(len(seen))
    return curve


def _estimate_species_count(species_names, num_fits, max_tries=1000):
    """Estimate total species as the mean asymptote of randomized accumulation curves.

    The observation order is shuffled, a species-accumulation curve is built
    and fitted, and the fitted asymptote is recorded. This is repeated until
    ``num_fits`` usable fits exist or ``max_tries`` attempts have been made.
    Returns -9999 when there are too few points to fit or too many fits fail.
    """
    # The curve has one point per observation; more than 5 points are required.
    if len(species_names) <= 5:
        return -9999

    shuffled = list(species_names)
    x_values = np.arange(1, len(shuffled) + 1)
    asymptotes = []
    for _ in range(max_tries):
        # Different observation orders give different curves, hence the averaging.
        random.shuffle(shuffled)
        curve = _species_accumulation_curve(shuffled)
        try:
            params, _covariance = curve_fit(_negative_exponential, x_values, curve)
        except (RuntimeError, ValueError):
            # The fit did not converge for this ordering; try another one.
            continue
        # Skip diverged fits so that one of them cannot poison the mean.
        if not np.isfinite(params[2]):
            continue
        asymptotes.append(params[2])
        if len(asymptotes) >= num_fits:
            # The model is negated, so the estimate is minus the fitted ``c``.
            return -round(np.mean(asymptotes))
    return -9999


def _fetch_species_within(poly):
    """Page through GBIF records in poly's bounding box; keep species of points inside poly."""
    box = boundingbox_wkt(poly)
    species_names = []
    offset = 0
    while True:
        url = '{0}?dataset_key={1}&taxon_key={2}&year={3},{4}&geometry={5}&limit={6}&offset={7}&hasCoordinate=true'.format(API_URL, DATASETKEY, TAXON_KEY, STARTYEAR, ENDYEAR, box, LIMIT, offset)
        # A timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(url, timeout=120)
        # Fail with the HTTP status instead of a confusing JSON/KeyError later.
        resp.raise_for_status()
        results = resp.json()
        print('  {0}: {1}/{2}'.format(TAXON, results['offset'], results['count']))
        # The API was queried with the bounding box only, so the exact spatial
        # subsetting to the polygon happens here.
        species_names += [
            record['species']
            for record in results['results']
            if 'species' in record
            and Point(float(record['decimalLongitude']), float(record['decimalLatitude'])).within(poly)
        ]
        if results['endOfRecords']:
            return species_names
        offset += LIMIT


def get_count(poly):
    """Estimate the number of species observed inside ``poly``.

    Observation records are downloaded from GBIF, restricted to points that
    fall within ``poly``, and the species count is estimated as the asymptote
    of the species-accumulation curve, averaged over NUM_CURVEFITS randomized
    observation orders.

    Parameters
    ----------
    poly : shapely Polygon or MultiPolygon
        Area of interest. A MultiPolygon is merged with ``unary_union`` first.

    Returns
    -------
    int
        The estimated species count, or -9999 when the geometry is empty,
        fewer than two observations were found, or no estimate could be fitted.

    Raises
    ------
    requests.HTTPError
        If a GBIF request returns an error status.
    """
    # ``geom_type`` replaces the deprecated shapely ``type`` attribute.
    if poly.geom_type == 'MultiPolygon':
        poly = unary_union(poly)

    if poly.is_empty:
        return -9999

    species_names = _fetch_species_within(poly)
    if len(species_names) <= 1:
        return -9999
    return _estimate_species_count(species_names, NUM_CURVEFITS)

In [None]:
def do_one_geom(row):
    """Return the estimated species count for the geometry of one row.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series (or any indexable sequence)
        The geometry is taken from the first position.
        NOTE(review): this relies on the geometry being the first column of
        the GeoDataFrame — confirm against how the frame is built.

    Returns
    -------
    int
        Whatever ``get_count`` returns for the geometry (-9999 when no
        estimate could be made).
    """
    # Use explicit positional access: integer keys on a label-indexed Series
    # (``row[0]``) are deprecated in pandas. Plain sequences still work.
    poly = row.iloc[0] if hasattr(row, 'iloc') else row[0]
    # ``geom_type`` replaces the deprecated shapely ``type`` attribute.
    if poly.geom_type == 'Polygon':
        poly = MultiPolygon([poly])

    return get_count(poly)

In [None]:
# Compute the bird-species indicator for every boundary of every city and
# checkpoint the results to OUTPUT_FILENAME after each boundary.
for i in range(len(boundary_georef)):
    # Reload what has been written so far on every iteration, so that an
    # interrupted run can resume from the checkpoint file.
    if os.path.isfile(OUTPUT_FILENAME):
        so_far_df = pd.read_csv(OUTPUT_FILENAME)
        # The placeholder file written below has no 'geo_id' column yet.
        so_far = so_far_df['geo_id'].tolist() if 'geo_id' in so_far_df.columns else []
    else:
        so_far_df = pd.DataFrame()
        so_far_df.to_csv(OUTPUT_FILENAME)
        so_far = []

    most_recent = []
    for boundary_name in ['aoi_boundary_name', 'units_boundary_name']:
        boundary_value = boundary_georef.loc[i, boundary_name]
        # The boundary name is sometimes missing (NaN/None). Only real strings
        # can be used to build a file name; checking for ``str`` also covers
        # numpy float NaN, which an exact ``type(...) != float`` test lets through.
        if not isinstance(boundary_value, str):
            continue

        boundary_id = boundary_georef.loc[i, 'geo_name']+'-' + boundary_value
        # NOTE(review): this compares a boundary id against the 'geo_id' values
        # stored in the CSV — confirm the two share the same format, otherwise
        # already-processed boundaries are recomputed on resume.
        if boundary_id in so_far:
            continue

        print(boundary_id)
        boundary_path = aws_s3_dir + boundary_ext +'boundary-'+boundary_id+'.geojson'
        boundary_resp = requests.get(boundary_path)
        # A missing or forbidden file should fail with its HTTP status rather
        # than with a JSON decoding error.
        boundary_resp.raise_for_status()
        boundary_geo = boundary_resp.json()
        temp_gdf = gpd.GeoDataFrame.from_features(boundary_geo)
        temp_gdf['BIO_5_numberBirdSpecies'] = temp_gdf.apply(do_one_geom, axis=1)
        most_recent.append(temp_gdf.copy())

        # Checkpoint: previously stored rows plus everything computed for this city.
        result = pd.concat([so_far_df] + most_recent, axis=0)
        result[['geo_id', 'geo_level', 'geo_name', 'geo_parent_name', 'BIO_5_numberBirdSpecies']].to_csv(OUTPUT_FILENAME)

In [None]:
# Reload the accumulated results from the checkpoint CSV written by the loop above.
processedcities = pd.read_csv(OUTPUT_FILENAME)
processedcities

# Merge with indicator table

In [None]:
# Read the shared indicator table from the bucket over HTTPS.
cities_indicators = pd.read_csv(aws_s3_dir +'/'+ indicators_file_aws)
cities_indicators

In [None]:
def merge_indicators(indicator_table, new_indicator_table, indicator_name):
    """Add or replace one indicator column in the indicator table.

    Parameters
    ----------
    indicator_table : pandas.DataFrame
        Existing table with a 'geo_id' column. It is not modified.
    new_indicator_table : pandas.DataFrame
        Table containing 'geo_id' and the ``indicator_name`` column.
    indicator_name : str
        Name of the indicator column to add or replace.

    Returns
    -------
    pandas.DataFrame
        ``indicator_table`` left-joined on 'geo_id' with the new indicator
        values. Rows without a match get NaN; an existing column of the same
        name is dropped first, so the new column ends up last.
    """
    if indicator_name in indicator_table.columns:
        print("replace with new calculations")
        # Drop on a copy: the caller's DataFrame must not lose its column.
        base_table = indicator_table.drop(columns=indicator_name)
    else:
        print("add new indicators")
        base_table = indicator_table

    return base_table.merge(
        new_indicator_table[["geo_id", indicator_name]],
        on='geo_id',
        how='left',
    )

In [None]:
# Attach the freshly computed bird-species counts to the indicator table
# (joined on 'geo_id' inside merge_indicators).
cities_indicators_merged = merge_indicators(indicator_table = cities_indicators,
                                            new_indicator_table = processedcities,
                                            indicator_name = 'BIO_5_numberBirdSpecies')

In [None]:
# Display the merged table for a visual check before it is uploaded.
cities_indicators_merged

## Upload to AWS S3

In [None]:
# connect to s3
# The credentials CSV location can be overridden with the AWS_CREDENTIALS_CSV
# environment variable, so the notebook is not tied to one user's home
# directory; the default is the path the notebook has always used.
aws_credentials_path = os.environ.get(
    'AWS_CREDENTIALS_CSV',
    '/home/jovyan/PlanetaryComputerExamples/aws_credentials.csv',
)
# The first row of the CSV supplies the key pair.
aws_credentials = pd.read_csv(aws_credentials_path)
aws_key = aws_credentials.iloc[0]['Access key ID']
aws_secret = aws_credentials.iloc[0]['Secret access key']

s3 = boto3.resource(
    service_name='s3',
    aws_access_key_id=aws_key,
    aws_secret_access_key=aws_secret
)

In [None]:
# Upload the merged indicator table, overwriting the object at
# s3://<bucket_name>/<indicators_file_aws> (the same key it was read from).
key_data = indicators_file_aws
cities_indicators_merged.to_csv(
    f"s3://{bucket_name}/{key_data}",
    index=False,
    # Credentials are passed through to the s3:// filesystem backend.
    storage_options={
        "key": aws_key,
        "secret": aws_secret
    },
)

In [None]:
# Make the uploaded object publicly readable by setting its ACL to
# 'public-read'; without this the table could not be re-read over plain HTTPS.
# NOTE(review): the last point assumes the bucket has no public bucket policy — confirm.
object_acl = s3.ObjectAcl(bucket_name,key_data)
response = object_acl.put(ACL='public-read')