In [1]:
import s3fs
import xarray as xr
import numpy as np
import coiled

In [2]:
# Build a lookup of 'city_<id>' -> (float(col 2), float(col 3), int(col 0)) from the CSV.
# Downstream code unpacks each tuple as (lat, lon, loc_id).
# NOTE(review): assumes the file has no header row and no quoted fields containing commas -- confirm.
CITYLATLON = {}
with open('ghsl_500k.csv', 'r') as ifile:
    for line in ifile:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file) instead of raising IndexError.
            continue
        items = [i.strip() for i in line.split(',')]
        CITYLATLON['city_{0}'.format(items[0])] = (float(items[2]), float(items[3]), int(items[0]))

In [3]:
# First and last year (inclusive) of the analysis period; these bound the ERA5 files
# that get opened and define the record length used for return-period estimates.
PERCENTILE_STARTYEAR, PERCENTILE_ENDYEAR = 1980, 2019

In [10]:
# Per-variable unit conversions applied to ERA5 data, keyed by ERA5 variable name.
ERA_TRANSFORMS = {
    # Kelvin -> degrees Celsius (0 degC == 273.15 K; the previous offset of 273.5 was a typo).
    'air_temperature_at_2_metres': lambda x: x - 273.15,
    # NOTE(review): 86400 is the seconds-per-day factor normally used to turn a per-second
    # rate into a daily total; confirm it matches the units of this hourly accumulation variable.
    'precipitation_amount_1hour_Accumulation': lambda x: x * 86400,
}

In [5]:
def s3open(path):
    """Return an ``s3fs.S3Map`` for ``path`` backed by an anonymous S3 filesystem.

    A new ``S3FileSystem`` is constructed on every call with ``anon=True`` and
    ``default_fill_cache=False``.
    """
    anonymous_fs = s3fs.S3FileSystem(anon=True, default_fill_cache=False)
    store = s3fs.S3Map(path, s3=anonymous_fs)
    return store

In [6]:
# Zero-padded month numbers "01" .. "12", used to fill the {month} slot of the file pattern.
MONTHS = ["{0:02d}".format(month_number) for month_number in range(1, 13)]

In [43]:
# Shut down a cluster left over from an earlier run, if there is one. On a fresh kernel
# `cluster` is not defined yet at this point, so guard the call to keep Run All working.
if "cluster" in globals():
    cluster.shutdown()

In [44]:
# Start a 50-worker Coiled cluster in us-east-1 and attach a client to it.
# NOTE(review): shutdown_on_close=False presumably leaves the cluster running after the
# client goes away, so it has to be shut down explicitly -- confirm against the Coiled docs.
cluster = coiled.Cluster(
    n_workers=50,
    compute_purchase_option="spot_with_fallback",
    shutdown_on_close=False,
    region="us-east-1",
)
client = cluster.get_client()

Output()

Output()

In [11]:
%%time
# ERA5 variable to analyse and the key layout of the monthly zarr stores.
varname = 'precipitation_amount_1hour_Accumulation'
FILE_PATTERN = 'era5-pds/zarr/{year}/{month}/data/{varname}.zarr/'

# One store mapper per (year, month) of the analysis period, in chronological order.
files_mapper = [
    s3open(FILE_PATTERN.format(year=year, month=month, varname=varname))
    for year in range(PERCENTILE_STARTYEAR, PERCENTILE_ENDYEAR + 1)
    for month in MONTHS
]

# The precipitation variable is concatenated along 'time1'; any other variable along 'time0'.
pr_ds = xr.open_mfdataset(
    files_mapper,
    engine='zarr',
    concat_dim='time1' if varname == 'precipitation_amount_1hour_Accumulation' else 'time0',
    combine='nested',
    coords='minimal',
    compat='override',
    parallel=True,
)

# Sum to daily totals, apply the variable's unit transform, then rechunk so each chunk
# holds the complete time axis (lat/lon chunk sizes are left to dask).
pr_dailysum = ERA_TRANSFORMS[varname](pr_ds.resample(time1='D').sum())
pr_dailysum = pr_dailysum.chunk({"time1": -1, "lat": "auto", "lon": "auto"})

CPU times: total: 18.5 s
Wall time: 24.8 s


In [None]:
%%time
import time
def returnperiod_value_daily(timeseries, rp, n_years=None):
    """Estimate the daily value with a return period of ``rp`` years.

    Builds the empirical CDF of the "event" values (those strictly greater
    than 0.01) and reads off the value at the matching quantile.

    Parameters
    ----------
    timeseries : array-like
        Daily values covering ``n_years`` years.
    rp : float
        Return period in years.
    n_years : float, optional
        Length of the record in years. Defaults to the inclusive span
        ``PERCENTILE_STARTYEAR`` .. ``PERCENTILE_ENDYEAR``.

    Returns
    -------
    float
        Interpolated value at the target quantile, clamped to the observed
        range; ``nan`` when the series contains no events.
    """
    if n_years is None:
        n_years = PERCENTILE_ENDYEAR - PERCENTILE_STARTYEAR + 1
    series = np.asarray(timeseries)
    # Only consider actual positive events. The comparison is False for NaN, so NaNs are dropped too.
    events = series[series > 0.01]
    if events.size == 0:
        return np.nan
    vals, counts = np.unique(events, return_counts=True)
    cdf_y = np.cumsum(counts / events.size)
    # An rp-year event is expected to be exceeded n_years / rp times over the record.
    # The CDF is expressed per event, so convert that count to a fraction of all events.
    exceedance_prob = n_years / (rp * events.size)
    # Invert the CDF: np.interp(x, xp, fp) needs the probabilities as xp and the values as fp.
    return np.interp(1 - exceedance_prob, cdf_y, vals)

def pr_rp100_onecity(ts, loc_id):
    """Compute the return-period-100 value for one location.

    ``ts`` must expose a ``.values`` attribute; those values are handed to
    ``returnperiod_value_daily`` with a return period of 100.

    Returns a ``(value, loc_id)`` tuple so the result stays tied to its location.
    """
    return returnperiod_value_daily(ts.values, 100), loc_id

# Number of cities submitted to the cluster per batch.
BATCH_SIZE = 100
city_keys = list(CITYLATLON.keys())

results = []
# Why break this job up? For some reason it seems to help avoid cancellation errors due to client disappearing
# Batches are derived from the number of cities so that none are skipped if the input list changes size.
for idx_start in range(0, len(city_keys), BATCH_SIZE):
    print(idx_start)
    futures = []
    for c in city_keys[idx_start: idx_start + BATCH_SIZE]:
        lat, lon, loc_id = CITYLATLON[c]
        # NOTE(review): nearest-neighbour selection assumes the city longitudes use the same
        # convention (0..360 vs -180..180) as the dataset's `lon` coordinate -- confirm.
        futures.append(client.submit(pr_rp100_onecity, pr_dailysum.sel(lat=lat, lon=lon, method='nearest')[varname], loc_id))
    for f in futures:
        # exception() waits for the task to finish, so a failure is actually visible here;
        # checking f.status right after submit usually just sees a still-pending task.
        if f.exception() is not None:
            f.retry()
        results.append(f.result())



0
100
200
300
400
500
600
700
800


In [None]:
# Write one "<loc_id>\t<value>" line per result; each result is a (value, loc_id) tuple.
# The file is opened once rather than once per row.
# NOTE: append mode is kept, so re-running this cell adds duplicate rows to an existing file.
with open('precip_rp100.txt', 'a') as ofile:
    for res in results:
        ofile.write('{0}\t{1}\n'.format(res[1], res[0]))