In [2]:
import coiled

import fsspec
import numpy as np
import rioxarray
import xarray as xr
import fsspec
import pandas as pd
import logging
from flox.xarray import xarray_reduce
import numpy as np
import dask

In [3]:
# Raise the "distributed.client" logger threshold to ERROR so that lower-severity
# records from that logger are not emitted into the notebook output.
logging.getLogger("distributed.client").setLevel(logging.ERROR)

In [4]:
# fsspec filesystem handle for S3, created with the requester-pays option enabled.
# NOTE(review): `fs` is not referenced by any later cell visible in this notebook
# (the zarr stores are opened from plain s3:// URLs) — confirm it is still needed.
fs = fsspec.filesystem("s3", requester_pays=True)

## Start the cluster

- Computation is sensitive to the cpu to memory ratio of instances
- Memory-optimized instance types work best - using r7g.2xlarge (8 vCPU and 64 GB; 1:8 CPU-to-memory ratio)
- Out-of-memory errors occur on instances with less memory per vCPU (even at 1:4)
- Graviton instances performed better than Intel (and they are cheaper!)
- Notebook, the data and cluster are in the same region
- Using spot instances when available

In [5]:
# Launch the Coiled cluster. Instance choices follow the sizing notes in the
# markdown above (memory-optimized Graviton workers, same region as the data).
cluster = coiled.Cluster(
    name="dist_alerts_zonal_stat_count",
    tags={"project": "dist_alerts_zonal_stat"},
    region="us-east-1",
    scheduler_vm_types="r7g.xlarge",
    worker_vm_types="r7g.2xlarge",
    n_workers=50,
    # Use spot instances when available (see notes above).
    compute_purchase_option="spot_with_fallback",
)

# Dask client connected to the cluster created above.
client = cluster.get_client()

Output()

Output()

## Reading the grouping datasets and aligning them to pixel_area
- These are already saved to zarr for performance
- All have the same chunk size (10k by 10k)
- Using optimal data types
- They must all be aligned to the same dataset before computation

In [6]:
# Pixel-area dataset: this is the array that gets summed, and the reference
# that every grouping raster below is aligned against.
pixel_area_url = "s3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/zarr/pixel_area.zarr"
# Opened lazily from zarr; the displayed summary shows 10000 x 10000 chunks.
pixel_area = xr.open_zarr(pixel_area_url)
# Bare expression so the notebook renders the dataset summary.
pixel_area

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f21ad29dc10>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f21ad12dd30>, 363.119359822)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f21b469b4a0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f21ad0ef710>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f21ad12db50>, 363.086545187)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f21ad29e240>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f21ad23c470>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f21ad12d790>, 363.092398628)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f21ad2008c0>


Unnamed: 0,Array,Chunk
Bytes,2.93 TiB,381.47 MiB
Shape,"(1, 560000, 1440000)","(1, 10000, 10000)"
Dask graph,8064 chunks in 2 graph layers,8064 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.93 TiB 381.47 MiB Shape (1, 560000, 1440000) (1, 10000, 10000) Dask graph 8064 chunks in 2 graph layers Data type float32 numpy.ndarray",1440000  560000  1,

Unnamed: 0,Array,Chunk
Bytes,2.93 TiB,381.47 MiB
Shape,"(1, 560000, 1440000)","(1, 10000, 10000)"
Dask graph,8064 chunks in 2 graph layers,8064 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [7]:
# GADM v4.1.85 adm0 raster, used as the `countries` grouping array.
countries = xr.open_zarr(
    "s3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm0.zarr"
)
# xr.align returns its inputs in order; with join="left" the indexes of the first
# object (pixel_area) are used, so element [1] is `countries` conformed to them.
countries_from_clipped = xr.align(pixel_area, countries, join="left")[1]["band_data"]


In [8]:
# GADM v4.1.85 adm1 raster, used as the `regions` grouping array.
regions = xr.open_zarr(
    "s3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm1.zarr"
)
# xr.align returns its inputs in order; with join="left" the indexes of the first
# object (pixel_area) are used, so element [1] is `regions` conformed to them.
regions_from_clipped = xr.align(pixel_area, regions, join="left")[1]["band_data"]

In [9]:
# GADM v4.1.85 adm2 raster, used as the `subregions` grouping array.
subregions = xr.open_zarr(
    "s3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm2.zarr"
)
# xr.align returns its inputs in order; with join="left" the indexes of the first
# object (pixel_area) are used, so element [1] is `subregions` conformed to them.
subregions_from_clipped = xr.align(pixel_area, subregions, join="left")[1]["band_data"]

In [10]:
# SBTN natural lands raster (all classes), used as the `natural_lands` grouping array.
natural_lands = xr.open_zarr(
    "s3://gfw-data-lake/sbtn_natural_lands/zarr/sbtn_natural_lands_all_classes.zarr"
)
# xr.align returns its inputs in order; with join="left" the indexes of the first
# object (pixel_area) are used, so element [1] is `natural_lands` conformed to them.
natural_lands_from_clipped = xr.align(pixel_area, natural_lands, join="left")[1]["band_data"]

In [11]:
# Expected group labels for the `countries` grouping array; passed as the first
# entry of `expected_groups` in the xarray_reduce call in the computation cell.
# NOTE(review): values look like ISO 3166-1 numeric country codes — confirm
# against the adm0 raster before relying on that interpretation.
adm0_ids = [
    4, 8, 10, 12, 16, 20, 24, 28, 31, 32, 36, 40, 44, 48, 50, 51, 52, 56, 60,
    64, 68, 70, 72, 74, 76, 84, 86, 90, 92, 96, 100, 104, 108, 112, 116, 120,
    124, 132, 136, 140, 144, 148, 152, 156, 158, 162, 166, 170, 174, 175, 178,
    180, 184, 188, 191, 192, 196, 203, 204, 208, 212, 214, 218, 222, 226, 231,
    232, 233, 234, 238, 239, 242, 246, 248, 250, 254, 258, 260, 262, 266, 268,
    270, 275, 276, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332,
    334, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388,
    392, 398, 400, 404, 408, 410, 414, 417, 418, 422, 426, 428, 430, 434, 438,
    440, 442, 446, 450, 454, 458, 462, 466, 470, 474, 478, 480, 484, 492, 496,
    498, 499, 500, 504, 508, 512, 516, 520, 524, 528, 531, 533, 534, 535, 540,
    548, 554, 558, 562, 566, 570, 574, 578, 580, 581, 583, 584, 585, 586, 591,
    598, 600, 604, 608, 612, 616, 620, 624, 626, 630, 634, 638, 642, 643, 646,
    652, 654, 659, 660, 662, 663, 666, 670, 674, 678, 682, 686, 688, 690, 694,
    702, 703, 704, 705, 706, 710, 716, 724, 728, 729, 732, 740, 744, 748, 752,
    756, 760, 762, 764, 768, 772, 776, 780, 784, 788, 792, 795, 796, 798, 800,
    804, 807, 818, 826, 831, 832, 833, 834, 840, 850, 854, 858, 860, 862, 876,
    882, 887, 894
]


## Computation

In [12]:
%%time

from flox import ReindexArrayType, ReindexStrategy


countries_from_clipped.name = "countries"
regions_from_clipped.name = "regions"
subregions_from_clipped.name = "subregions"
natural_lands_from_clipped.name = "natural_lands"
gadm_natlands_area = xarray_reduce(
    pixel_area.band_data,
    *(
        countries_from_clipped,
        regions_from_clipped,
        subregions_from_clipped,
        natural_lands_from_clipped
    ),
    func='sum',
    expected_groups=(
        adm0_ids,
        np.arange(1,86),
        np.arange(1,854),
        np.arange(1,22),
    ),
    reindex=ReindexStrategy(
        blockwise=False, array_type=ReindexArrayType.SPARSE_COO
    ),
    fill_value=0
).compute()

CPU times: user 2.92 s, sys: 170 ms, total: 3.09 s
Wall time: 5min 15s


## Transforming sparse array to dataframe and saving to parquet

In [13]:
# Unpack the sparse result: `.coords` is indexed per dimension and holds the
# integer positions of the stored entries; `.data` holds the matching values.
sparse_data = gadm_natlands_area.data
indices, values = sparse_data.coords, sparse_data.data
dim_names = gadm_natlands_area.dims

# Translate positions into coordinate labels, producing one column per dimension.
coord_dict = {
    dim: gadm_natlands_area.coords[dim].values[positions]
    for dim, positions in zip(dim_names, indices)
}
# The summed pixel area goes in a final "value" column.
coord_dict["value"] = values

df = pd.DataFrame(coord_dict)

In [15]:
# Preview the first rows of the long-format result table.
df.head()

Unnamed: 0,countries,regions,subregions,natural_lands,value
0,4,1,1,2,80224.64
1,4,1,1,3,1931114000.0
2,4,1,1,4,2087068.0
3,4,1,1,6,940770900.0
4,4,1,1,7,1962260.0


In [16]:
# Spot check: all rows for countries == 4, regions == 1, subregions == 1.
df.query("countries == 4 and regions == 1 and subregions == 1")

Unnamed: 0,countries,regions,subregions,natural_lands,value
0,4,1,1,2,80224.64
1,4,1,1,3,1931114000.0
2,4,1,1,4,2087068.0
3,4,1,1,6,940770900.0
4,4,1,1,7,1962260.0
5,4,1,1,10,12770230.0
6,4,1,1,12,96675660.0
7,4,1,1,13,18303120.0
8,4,1,1,14,87021.12
9,4,1,1,17,6787.935


In [17]:
# Persist the result table to S3 as parquet, without writing the DataFrame index.
# NOTE(review): the target key sits under the `zarr/` prefix although the file is
# parquet — confirm this is the intended location.
df.to_parquet('s3://gfw-data-lake/sbtn_natural_lands/zarr/area_by_natural_lands_all_adm2_raw.parquet', index=False)