In [1]:
import sys
!{sys.executable} -m pip install pip earthengine-api
!{sys.executable} -m pip install pip geemap
!{sys.executable} -m pip install --extra-index-url https://artifactory.vgt.vito.be/api/pypi/python-packages/simple terracatalogueclient
!{sys.executable} -m pip install pip rasterstats

Looking in indexes: https://pypi.org/simple, https://artifactory.vgt.vito.be/api/pypi/python-packages/simple
Collecting terracatalogueclient
  Downloading https://artifactory.vgt.vito.be/api/pypi/python-packages/terracatalogueclient/0.1.14/terracatalogueclient-0.1.14-py3-none-any.whl (12 kB)
Collecting requests-auth>=5.3.0
  Using cached https://artifactory.vgt.vito.be/api/pypi/python-packages/packages/packages/60/af/da90802d91cbc45bdc160e9dcc70a07cbb581d748549edc3d42d25e04c8f/requests_auth-6.0.0-py3-none-any.whl (25 kB)
Installing collected packages: requests-auth, terracatalogueclient
Successfully installed requests-auth-6.0.0 terracatalogueclient-0.1.14


In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import os, requests, json, geojson
import random, scipy
from collections import defaultdict
from scipy.optimize import curve_fit
import numpy as np
from shapely.geometry import Point, Polygon, MultiPolygon, shape
from shapely.ops import unary_union

import shapely
from shapely.ops import transform

import pandas as pd
import geopandas as gpd
import ee
import geemap
from terracatalogueclient import Catalogue as Terracat

In [3]:
service_account = 'climate-hazard-demo@data-portal-adaptation.iam.gserviceaccount.com'
credentials = ee.ServiceAccountCredentials(service_account, 'google_cred.json')
ee.Initialize(credentials)

In [4]:
OUTPUT_FILENAME = 'BIO-5.csv'

In [5]:
API_URL = 'https://api.gbif.org/v1/occurrence/search/'
DATASETKEY = '50c9509d-22c7-4a22-a47d-8c48425ef4a7'  # iNaturalist research-grade observations
#TAXONKEYS = {'Arthropoda': '54', 'Aves': '212', 'Tracheophyta': '7707728'}
TAXON = 'Aves'
TAXON_KEY = '212'
STARTYEAR = '2016'
ENDYEAR = '2021'
LIMIT = 300

NUM_CURVEFITS = 200

In [6]:
# define directory
out_dir = os.getcwd()
aws_s3_dir = "https://cities-urbanshift.s3.eu-west-3.amazonaws.com/data"

In [7]:
# get list of urbanshift cities
boundary_georef = pd.read_csv('https://cities-urbanshift.s3.eu-west-3.amazonaws.com/data/boundaries/v_0/boundary_georef.csv')

# remove cities without tree cover data availability
#tml_not_available_cities = ['BRA-Salvador','MEX-Monterrey']
tml_not_available_cities = []
boundary_georef = boundary_georef[~boundary_georef['geo_name'].isin(tml_not_available_cities)].reset_index(drop=True)
boundary_georef

Unnamed: 0,geo_name,level,aoi_boundary_name,units_boundary_name,city_name,country_name,country_code,continent
0,ARG-Mendoza,region,ADM3union,ADM3,Mendoza,Argentina,ARG,America
1,ARG-Mar_del_Plata,city,ADM3,ADM4,Mar del Plata city,Argentina,ARG,America
2,ARG-Mar_del_Plata,region,ADM2,,Mar del Plata region,Argentina,ARG,America
3,ARG-Ushuaia,city,ADM4,ADM5,Ushuaia city,Argentina,ARG,America
4,ARG-Ushuaia,region,ADM3,,Ushuaia region,Argentina,ARG,America
5,ARG-Salta,region,ADM2union,ADM3,Salta,Argentina,ARG,America
6,ARG-Buenos_Aires,region,ADM2union,ADM2,Buenos Aires,Argentina,ARG,America
7,BRA-Teresina,city,ADM4union,ADM4,Teresina city,Brazil,BRA,America
8,BRA-Teresina,region,ADM2union,ADM2,Teresina region,Brazil,BRA,America
9,BRA-Florianopolis,city,ADM4union,ADM4,Florianopolis,Brazil,BRA,America


In [8]:
def boundingbox_wkt(p):
    # Returns WKT for bounding box.
    # Necessary because GBIF API won't accept complex polygons.
    minx, miny, maxx, maxy = p.bounds
    return 'POLYGON (({0} {3}, {0} {2}, {1} {2}, {0} {3}))'.format(str(minx), str(maxx), str(miny), str(maxy))

In [9]:
def get_count(poly):
    outputs = []
    # Get observation records from GBIF
    
    observations = []
    
    if poly.type == 'MultiPolygon':
        poly = unary_union(poly)
    
    if str(poly) != 'GEOMETRYCOLLECTION EMPTY':
        box = boundingbox_wkt(poly)

        offset = -LIMIT
        while offset == -LIMIT or not results['endOfRecords']:
            offset += LIMIT
            url = '{0}?dataset_key={1}&taxon_key={2}&year={3},{4}&geometry={5}&limit={6}&offset={7}&hasCoordinate=true'.format(API_URL, DATASETKEY, TAXON_KEY, STARTYEAR, ENDYEAR, box, LIMIT, offset)
            resp = requests.get(url)
            results = resp.json()
            print('  {0}: {1}/{2}'.format(TAXON, results['offset'], results['count']))
            # Note spatial subsetting of points happens below (twice) as part of the conditions in the list comprehensions
            observations += [{
                'species': i['species'],
                'lat': i['decimalLatitude'],  # We don't really need to save lat/lon for this
                'lon': i['decimalLongitude'],
            } for i in results['results'] if 'species' in i.keys() and Point(float(i['decimalLongitude']), float(i['decimalLatitude'])).within(poly)]
        # Estimate species counts by estimating asymptote of species-accumulation curve created when observation order is randomized
        # Final estimate is average over NUM_CURVEFITS estimates

        #count_estimate = None
        if len(observations) > 1:
            taxon_observations = [i['species'] for i in observations]
            asymptotes = []
            tries = 0
            while len(asymptotes) < NUM_CURVEFITS:   # Different observation-orders give different results, so average over many
                tries += 1
                taxon_observations.sort(key=lambda x: random.random())                    # Randomize order of observations
                sac = []                                                            # Initialize species accumulation curve data
                for obs_count in range(1, len(taxon_observations)):                       # Go through observation list from beginning
                    sac.append(len(set(taxon_observations[:obs_count])))                  # and count unique species from start to index
                if len(sac) > 5 and tries <= 1000:          # Avoid letting infinite-species errors stop the process
                    try:
                        asymptotes.append(scipy.optimize.curve_fit(lambda x,a,b,c: -((a*np.exp(-b*x))+c), list(range(1, len(sac)+1)), sac)[0][2])
                    except:
                        pass
                else:
                    asymptotes.append(-1)
            if -1 in asymptotes:
                count_estimate = -9999
            else:
                count_estimate = -round(np.mean(asymptotes))
            return count_estimate
    return -9999

In [10]:
def do_one_geom(row):
    poly = row[0]
    if poly.type == 'Polygon':
        poly = MultiPolygon([poly])

    
    taxoncount = get_count(poly)
    return taxoncount

In [11]:
for i in range(len(boundary_georef)):
    if not OUTPUT_FILENAME in os.listdir('.'):
        so_far_df = pd.DataFrame()
        so_far_df.to_csv(OUTPUT_FILENAME)
        so_far = []
    else:
        so_far_df = pd.read_csv(OUTPUT_FILENAME)
        so_far = [so_far_df.iloc[j]['geo_id'] for j in range(len(so_far_df))]
    
    most_recent = []
    for boundary_name in ['aoi_boundary_name', 'units_boundary_name']:
        if type(boundary_georef.loc[i, boundary_name]) != float: # sometimes boundary_id is nan
            boundary_id = boundary_georef.loc[i, 'geo_name']+'-' + boundary_georef.loc[i, boundary_name]
            if not boundary_id in so_far:
                print(boundary_id)
                boundary_path = aws_s3_dir +'/boundaries/v_0/boundary-' + boundary_id + '.geojson'
                boundary_geo = requests.get(boundary_path).json()
                temp_gdf = gpd.GeoDataFrame.from_features(boundary_geo)
                temp_gdf['BIO-5'] = temp_gdf.apply(do_one_geom, axis=1)
                most_recent.append(temp_gdf.copy())

                result = pd.concat([so_far_df] + most_recent, axis=0)
                result[['geometry', 'geo_id', 'geo_level', 'geo_name', 'geo_parent_name', 'creation_date', 'BIO-5']].to_csv(OUTPUT_FILENAME)

ARG-Mendoza-ADM3union
  Aves: 0/863
  Aves: 300/863
  Aves: 600/863
ARG-Mendoza-ADM3
  Aves: 0/41
  Aves: 0/1
  Aves: 0/9
  Aves: 0/12
  Aves: 0/4
  Aves: 0/7
  Aves: 0/9
  Aves: 0/55
  Aves: 0/140
  Aves: 0/1
  Aves: 0/1
  Aves: 0/0
  Aves: 0/1
  Aves: 0/3
  Aves: 0/19
  Aves: 0/9
  Aves: 0/4
  Aves: 0/0
  Aves: 0/0
  Aves: 0/1
  Aves: 0/2
  Aves: 0/0
  Aves: 0/1
  Aves: 0/0
  Aves: 0/4
  Aves: 0/0
  Aves: 0/0
  Aves: 0/0
  Aves: 0/5
  Aves: 0/0
  Aves: 0/0
  Aves: 0/0
  Aves: 0/0
  Aves: 0/0
  Aves: 0/1
  Aves: 0/0
  Aves: 0/2
  Aves: 0/7
  Aves: 0/5
  Aves: 0/1
  Aves: 0/5
  Aves: 0/1
  Aves: 0/2
  Aves: 0/0
  Aves: 0/1
  Aves: 0/87
  Aves: 0/26
  Aves: 0/1
  Aves: 0/0
  Aves: 0/8
  Aves: 0/1
  Aves: 0/38
  Aves: 0/10
  Aves: 0/2
  Aves: 0/1
  Aves: 0/36
  Aves: 0/0
  Aves: 0/0
  Aves: 0/2
  Aves: 0/0
  Aves: 0/310
  Aves: 300/310
  Aves: 0/1
  Aves: 0/136
  Aves: 0/2
  Aves: 0/0
  Aves: 0/8
  Aves: 0/8
  Aves: 0/16
  Aves: 0/0
  Aves: 0/6
  Aves: 0/13
  Aves: 0/2
  Aves: 0/156
  Av

In [59]:
outdf = result[['geometry', 'geo_id', 'geo_level', 'geo_name', 'geo_parent_name', 'creation_date', 'SICB-3']]

In [61]:
# connect to s3

import boto3

aws_key = "AKIA4GK7IHHC5RCMFKEG"
aws_secret = "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"

s3 = boto3.resource(
    service_name='s3',
    aws_access_key_id=aws_key,
    aws_secret_access_key=aws_secret
)

# upload to aws
key_data = 'indicators/biodiversity/BIO-3.csv'
bucket_name = 'cities-urbanshift' 
outdf.to_csv(
    f"s3://{bucket_name}/{key_data}",
    index=False,
    storage_options={
        "key": aws_key,
        "secret": aws_secret
    },
)

# make it public
object_acl = s3.ObjectAcl(bucket_name,key_data)
response = object_acl.put(ACL='public-read')

In [1]:
import boto3

aws_key = "AKIA4GK7IHHC5RCMFKEG"
aws_secret = "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"

In [8]:
ofile = open('test.csv', 'w')
ofile.write('test,test,test\n')
ofile.close()

In [9]:
import pandas as pd

In [10]:
outdf = pd.read_csv('test.csv')

In [15]:
outdf.to_csv(
    f"s3://{bucket_name}/{key_data}",
    index=False,
    storage_options={
        "key": aws_key,
        "secret": aws_secret
    },
)

In [16]:
s3 = boto3.resource(
    service_name='s3',
    aws_access_key_id=aws_key,
    aws_secret_access_key=aws_secret
)
key_data = 'indicators/biodiversity/test.csv'
bucket_name = 'cities-urbanshift' 
object_acl = s3.ObjectAcl(bucket_name,key_data)

In [17]:
my_bucket = s3.Bucket(bucket_name)

In [18]:
for my_bucket_object in my_bucket.objects.all():
    print(my_bucket_object.key)

baseline-indicators/
baseline-indicators/biodiversity/
baseline-indicators/biodiversity/data/
baseline-indicators/biodiversity/data/CRI-San_Jose/
baseline-indicators/biodiversity/data/CRI-San_Jose/CRI-San_Jose-ESA-landcover-2020.tif
baseline-indicators/biodiversity/data/CRI-San_Jose/CRI-San_Jose-GAIA_impervious_surfaces-2018.tif
baseline-indicators/biodiversity/data/CRI-San_Jose/CRI-San_Jose-GBIF-species_observation-2020.geojson
baseline-indicators/biodiversity/data/CRI-San_Jose/CRI-San_Jose-GLAD-landclasses-2000.tif
baseline-indicators/biodiversity/data/CRI-San_Jose/CRI-San_Jose-GLAD-landclasses-2020.tif
baseline-indicators/biodiversity/data/CRI-San_Jose/CRI-San_Jose-GLAD_habitat_changes-2000_2020.tif
baseline-indicators/biodiversity/data/CRI-San_Jose/CRI-San_Jose-IUCN-RedList_counts.xlsx
baseline-indicators/biodiversity/data/CRI-San_Jose/CRI-San_Jose-KBA-2022.geojson
baseline-indicators/biodiversity/data/CRI-San_Jose/CRI-San_Jose-LSTmean-20210305.tif
baseline-indicators/biodiversity/