In [2]:
import planetary_computer
import xarray as xr
import fsspec
import pystac_client

import s3fs

import pandas as pd


import pandas as pd
import dask
import dask_gateway

import dask.bag as db 
import dask.array as da


from dask.distributed import wait


import planetary_computer

from datetime import datetime

In [3]:
# Connect to the Dask Gateway configured for this environment and fetch its
# cluster options object (worker cores/memory, image, environment variables),
# which can be adjusted before a cluster is launched.
gateway = dask_gateway.Gateway()
cluster_options = gateway.cluster_options()

In [4]:
cluster_options

VBox(children=(HTML(value='<h2>Cluster Options</h2>'), GridBox(children=(HTML(value="<p style='font-weight: bo…

Options<worker_cores=1.0,
        worker_memory=8.0,
        image='pcccr.azurecr.io/public/planetary-computer/python:2023.6.22.0',
        gpu=False,
        environment={'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR',
         'GDAL_HTTP_MERGE_CONSECUTIVE_RANGES': 'YES',
         'GDAL_HTTP_MAX_RETRY': '5',
         'GDAL_HTTP_RETRY_DELAY': '3',
         'USE_PYGEOS': '0'}>


In [35]:
# Launch a new cluster with the options selected above and attach a client.
cluster = gateway.new_cluster(cluster_options)
# Alternative kept for reference: re-attach to an already running cluster by
# its public address instead of creating a new one.
# cluster = dask_gateway.GatewayCluster(public_address="https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.76e890ca286b43558f8cece0d48e0ff6/individual-scheduler-system")
client = cluster.get_client()

# Adaptive scaling with a floor of 100 workers; no `maximum` is passed here.
cluster.adapt(minimum=100)

In [36]:
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.7385d6c95d2a4e07a5f3658b16d6256d/status,


In [7]:
# Find every ERA5 item dated between 1980 and 2014 in the Planetary Computer
# STAC catalog. (`pystac_client` is imported in the notebook's import cell.)
catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1/"
)
search = catalog.search(collections=["era5-pds"], datetime="1980/2014")

# `item_collection()` is the supported replacement for the deprecated
# `get_all_items()`. Items are sorted by their `end_datetime` property so that
# positional slices of `items` follow chronological order; reading the property
# directly avoids serialising every item with `to_dict()` just for the sort key.
items = sorted(
    search.item_collection(),
    key=lambda item: item.properties['end_datetime'],
)



In [30]:
%%time
# Candidate ERA5 variables. Every entry is currently disabled, so the list is
# empty and only the single `variable` selected below is loaded:
#   air_temperature_at_2_metres_1hour_Minimum
#   air_temperature_at_2_metres_1hour_Maximum
#   air_temperature_at_2_metres
#   precipitation_amount_1hour_Accumulation
#   dew_point_temperature_at_2_metres
era5_variables = []

variable = 'dew_point_temperature_at_2_metres'

# Skip the first 400 items, sign each remaining item, and open every asset
# whose file stem (last path component up to the first dot) equals `variable`.
datasets = [
    xr.open_dataset(signed_asset.href, **signed_asset.extra_fields["xarray:open_kwargs"])
    for stac_item in items[400:]
    for signed_asset in planetary_computer.sign(stac_item).assets.values()
    if signed_asset.href.rsplit('/', 1)[-1].split('.', 1)[0] == variable
]


CPU times: user 8.37 s, sys: 640 ms, total: 9.01 s
Wall time: 25.7 s


In [11]:
# Drop the inconsistently present `nameCDM` attribute from the hourly-maximum
# temperature variable; the mismatch raises an exception when merging.
for dataset in datasets:
    if 'air_temperature_at_2_metres_1hour_Maximum' in dataset.data_vars:
        # pop() with a default: datasets that lack the attribute (or a re-run
        # of this cell after it was already removed) must not raise KeyError.
        dataset['air_temperature_at_2_metres_1hour_Maximum'].attrs.pop('nameCDM', None)


In [37]:
%%time

# Stitch the per-item datasets into one dataset along their coordinates.
ds = xr.combine_by_coords(datasets, join="exact", data_vars="minimal")

# Reduce the selected variable to daily means and keep the result on the
# cluster. `ds` is the module-level name that get_zarr_point_data reads.
ds = ds[variable].resample(time='D').mean().to_dataset().persist()

# One time series per city, extracted through the dask bag of (lon, lat) pairs.
data = zarr_cities_bag.map(get_zarr_point_data, variable).compute()

# Rows are timestamps; columns are labelled "<countrycode>_<cityname>".
df = pd.DataFrame(data).T
df.columns = (cities.countrycode + ("_" + cities.cityname.astype(str))).values

# Written to the working directory (an S3 destination was used previously).
df.to_csv(f"{variable}_second_400_chunk_era5.csv")

CPU times: user 2min 38s, sys: 5.7 s, total: 2min 44s
Wall time: 43min 12s


In [None]:
# Resample the hourly-minimum temperature to daily bins, taking the minimum of
# each day, and start a new dataset from that single variable.
daily_ds = ds.air_temperature_at_2_metres_1hour_Minimum.resample(time='D').min().to_dataset()

In [None]:
daily_ds.air_temperature_at_2_metres_1hour_Minimum

In [None]:
# Daily aggregates of the remaining hourly variables.
daily_ds['air_temperature_at_2_metres_1hour_Maximum'] = ds.air_temperature_at_2_metres_1hour_Maximum.resample(time='D').max()
daily_ds['air_temperature_at_2_metres'] = ds.air_temperature_at_2_metres.resample(time='D').mean()
# Hourly accumulations are summed per day, then scaled by 1000 / 86400.
daily_ds['precipitation_amount_1hour_Accumulation'] = ds.precipitation_amount_1hour_Accumulation.resample(time='D').sum() * 1000 / 86400
daily_ds['dew_point_temperature_at_2_metres'] = ds.dew_point_temperature_at_2_metres.resample(time='D').max()

# Relative humidity from the Magnus approximation:
#   RH = 100 * exp(a*Td / (b + Td)) / exp(a*T / (b + T)),  a = 17.625, b = 243.04
# These coefficients are defined for temperatures in degrees Celsius, while
# ERA5 stores 2 m temperature and dew point in kelvin, so convert first.
# NOTE(review): the dew point here is a daily maximum while the air temperature
# is a daily mean, so values above 100 % are possible - confirm the intended
# pairing of aggregates.
dew_point_c = daily_ds.dew_point_temperature_at_2_metres - 273.15
air_temp_c = daily_ds.air_temperature_at_2_metres - 273.15
daily_ds['relative_humidity'] = (
    100
    * da.exp(17.625 * dew_point_c / (243.04 + dew_point_c))
    / da.exp(17.625 * air_temp_c / (243.04 + air_temp_c))
)

# persist() returns a new dataset and leaves the original untouched, so rebind
# the name; otherwise later cells keep using the un-persisted graph.
daily_ds = daily_ds.persist()
daily_ds

In [None]:
def get_era5_data(items):
    """Open the ERA5 assets of ``items`` and return persisted daily aggregates.

    Each item is signed with ``planetary_computer.sign`` and every asset whose
    file stem matches one of the five variables below is opened with the
    ``xarray:open_kwargs`` stored on the asset. The datasets are combined by
    coordinates and resampled to daily bins:

    * hourly minimum temperature -> daily min
    * hourly maximum temperature -> daily max
    * air temperature            -> daily mean
    * hourly precipitation       -> daily sum * 1000 / 86400
    * dew point temperature      -> daily max

    Parameters
    ----------
    items : iterable of STAC items
        Items whose assets should be loaded.

    Returns
    -------
    xarray.Dataset
        The daily dataset, persisted on the cluster; the function blocks until
        the persisted data is ready.
    """
    era5_variables = [
        'air_temperature_at_2_metres_1hour_Minimum',
        'air_temperature_at_2_metres_1hour_Maximum',
        'air_temperature_at_2_metres',
        'precipitation_amount_1hour_Accumulation',
        'dew_point_temperature_at_2_metres',
    ]

    datasets = [
        xr.open_dataset(asset.href, **asset.extra_fields["xarray:open_kwargs"])
        for item in items for asset in planetary_computer.sign(item).assets.values()
        if asset.href.split('/')[-1].split('.')[0] in era5_variables
    ]

    # Drop the inconsistently present `nameCDM` attribute, which raises an
    # exception when merging. pop() gets a default so that datasets without
    # the attribute do not raise KeyError.
    max_name = 'air_temperature_at_2_metres_1hour_Maximum'
    for dataset in datasets:
        if max_name in dataset.data_vars:
            dataset[max_name].attrs.pop('nameCDM', None)

    ds = xr.combine_by_coords(datasets, join="exact", data_vars="minimal")

    daily_ds = ds.air_temperature_at_2_metres_1hour_Minimum.resample(time='D').min().to_dataset()
    daily_ds['air_temperature_at_2_metres_1hour_Maximum'] = ds.air_temperature_at_2_metres_1hour_Maximum.resample(time='D').max()
    daily_ds['air_temperature_at_2_metres'] = ds.air_temperature_at_2_metres.resample(time='D').mean()
    daily_ds['precipitation_amount_1hour_Accumulation'] = ds.precipitation_amount_1hour_Accumulation.resample(time='D').sum() * 1000 / 86400
    daily_ds['dew_point_temperature_at_2_metres'] = ds.dew_point_temperature_at_2_metres.resample(time='D').max()

    # persist() returns a NEW dataset backed by the running computation; the
    # original object stays lazy. Rebind the name so that wait() actually
    # blocks on that work and the caller receives the persisted data.
    daily_ds = daily_ds.persist()
    wait(daily_ds)

    return daily_ds

In [15]:
def get_zarr_point_data(point, variable, dataset=None):
    """Return the time series of ``variable`` at the grid cell nearest ``point``.

    Parameters
    ----------
    point : sequence
        ``(longitude, latitude)`` in degrees. Longitudes may be given in either
        the [-180, 180] or the [0, 360) convention.
    variable : str
        Name of the data variable to extract.
    dataset : optional
        Dataset to read from. Defaults to the notebook-level ``ds``, looked up
        when the function is called.

    Returns
    -------
    pandas.Series
        Values of the selected cell, indexed by ``pandas.Timestamp``.
    """
    if dataset is None:
        dataset = ds

    lon, lat = point[0], point[1]
    # When the grid's longitudes are all non-negative it uses the 0-360
    # convention. A negative (western-hemisphere) longitude would then be
    # "nearest" to the grid edge rather than to the real location, so wrap it
    # into [0, 360) first. Grids with negative longitudes are left untouched.
    if float(dataset.lon.min()) >= 0:
        lon = lon % 360

    data = dataset[variable].sel(lat=lat, lon=lon, method='nearest').compute()

    return pd.Series(data.data, index=[pd.Timestamp(t) for t in data.time.data])

In [12]:
# City table (latitude, longitude, countrycode, cityname, pop2015), stored as
# an ISO-8859-1 encoded, comma-separated file.
cities = pd.read_csv(
    '/home/jovyan/ghslcities_popgte50k.csv',
    encoding="ISO-8859-1",
)
cities.head()

Unnamed: 0,latitude,longitude,countrycode,cityname,pop2015
0,21.340678,-157.893497,USA,Honolulu,512853.6667
1,-17.534103,-149.568053,PYF,Papeete,91521.1246
2,34.923123,-120.434372,USA,Santa Maria,123181.2848
3,36.60772,-121.882378,USA,Monterey,67772.28886
4,34.427664,-119.743693,USA,Santa Barbara,114753.1502


In [13]:
# Dask bag with one (longitude, latitude) pair per city, split into 100
# partitions so that the partition count matches the number of workers.
zarr_cities_bag = db.from_sequence(
    list(zip(cities.longitude.to_numpy(), cities.latitude.to_numpy())),
    npartitions=100,
)

In [None]:
# Small test run on the first 10 items. The result is bound to `ds`, the
# module-level name that get_zarr_point_data reads.
ds = get_era5_data(items[:10])

In [None]:
ds

In [None]:
%%time

# Extract the hourly-minimum temperature series for every city in the bag.
data = zarr_cities_bag.map(get_zarr_point_data, 'air_temperature_at_2_metres_1hour_Minimum').compute()

In [None]:
%%time

# Timing experiment: downsample to daily values before extracting city data.
# The daily aggregate of an hourly *minimum* is taken with min(), consistent
# with the other cells that build daily data from this variable.
# NOTE(review): assigning a daily series back into `ds` relies on `ds` already
# being on a daily time axis (as returned by get_era5_data) - confirm.
ds['air_temperature_at_2_metres_1hour_Minimum'] = ds.air_temperature_at_2_metres_1hour_Minimum.resample(time='D').min()

# persist() returns a new dataset; rebind so that wait() blocks on the
# persisted data and the extraction below reads from it.
ds = ds.persist()
wait(ds)

data = zarr_cities_bag.map(get_zarr_point_data, 'air_temperature_at_2_metres_1hour_Minimum').compute()

In [None]:
era5_variables = [
    'air_temperature_at_2_metres_1hour_Minimum',
    'air_temperature_at_2_metres_1hour_Maximum',
    'air_temperature_at_2_metres',
    'precipitation_amount_1hour_Accumulation',
    'dew_point_temperature_at_2_metres',
]

chunk_size = 10
# Index of the first item to export; presumably 10 because items[:10] are
# loaded separately in the small test run - confirm.
start = 10

# Column labels "<countrycode>_<cityname>" are identical for every file, so
# build them once instead of on every iteration.
city_labels = (cities.countrycode + cities.cityname.apply(lambda x: f"_{x}")).values

# Walk through `items` in steps of `chunk_size`. Stepping with range() up to
# len(items) also covers a trailing chunk shorter than `chunk_size`, which a
# loop count of int(len(items) / chunk_size) would silently skip.
for chunk_start in range(start, len(items), chunk_size):
    chunk_items = items[chunk_start:chunk_start + chunk_size]
    # Bound to the module-level `ds`, which get_zarr_point_data reads.
    ds = get_era5_data(chunk_items)
    for variable in era5_variables:
        data = zarr_cities_bag.map(get_zarr_point_data, variable).compute()
        df = pd.DataFrame(data).T
        df.columns = city_labels

        # One CSV per variable and chunk, named after the chunk's first item.
        df.to_csv(f"./{variable}_{chunk_items[0].id}.csv")
    # Progress marker: index at which the next chunk begins.
    print(chunk_start + chunk_size)


In [None]:
%%time

# Nearest grid cell to Campinas; the longitude is shifted from a negative
# (western-hemisphere) value into [0, 360) before the lookup.
campinas = daily_ds.sel(lat=-22.907104, lon=(-47.063240 + 360) % 360, method='nearest').compute()

In [None]:
# Nearest grid cell to Nairobi; its longitude is already positive, so it is
# passed through unchanged.
nairobi = daily_ds.sel(lon=36.8219, lat=-1.2921, method='nearest').compute()

In [None]:
# Mean, daily-maximum and daily-minimum temperature at Campinas for the first
# 300 rows of the series.
(
    campinas.to_dataframe()[
        [
            'air_temperature_at_2_metres',
            'air_temperature_at_2_metres_1hour_Maximum',
            'air_temperature_at_2_metres_1hour_Minimum',
        ]
    ]
    .iloc[:300]
    .plot()
)

In [None]:
# Relative humidity at Campinas for the first 300 rows of the series.
campinas.to_dataframe().relative_humidity.iloc[:300].plot()

In [None]:
# Precipitation at Nairobi, multiplied by 1000 for plotting.
# NOTE(review): the daily precipitation was already scaled by 1000 / 86400 when
# daily_ds was built - confirm that the extra factor of 1000 here is intended.
(nairobi.to_dataframe().precipitation_amount_1hour_Accumulation * 1000).plot()

Extracting city data from 100 assets (1/8 of the items) for a single variable takes 15 minutes on the 100-worker cluster, which works out to about 2 hours for all the years. This does not account for downsampling the data to daily values.

A smaller test on 10 items took 2 min 42 s.

Let's see the effect of downsampling the data before extracting cities data

It takes 1 min 30 s on the smaller 10-item test case above.

In [None]:
tasmax = 