In [1]:
import coiled

import fsspec
import numpy as np
import rioxarray
import xarray as xr
import fsspec
import pandas as pd
import logging
from flox.xarray import xarray_reduce
import numpy as np
import dask

In [2]:
logging.getLogger("distributed.client").setLevel(logging.ERROR)  # or logging.ERROR

## Start the cluster

- Computation is sensitive to the cpu to memory ratio of instances
- Memory optimized instance type is best - using r7g.2xlarge (8vcpu and 64GB; 1:8 cpu to memory)
- Out of memory error on instances with low cpu to memory ratio (even 1:4)
- Graviton instances performed better than intel (and they are cheaper!)
- Notebook, the data and cluster are in the same region
- Using spot instances when available

In [27]:
cluster = coiled.Cluster(
    name="land_cover_zonal_stat",
    region="us-east-1",
    n_workers=50,
    tags={"project": "dist_alerts_zonal_stat"},
    scheduler_vm_types="r7g.xlarge",
    worker_vm_types="r7g.2xlarge",
    compute_purchase_option="spot_with_fallback"
)

client = cluster.get_client()

Dask logger 'distributed.client' is configured to show DEBUG logs on your cluster.
Debug logs can be very verbose, and there may be unexpected costs from your cloud provider for ingesting very large logs.


Output()

Output()

In [4]:
umd_land_cover  = xr.open_zarr(
    's3://gfw-data-lake/umd_lcl_land_cover/v2/raster/epsg-4326/zarr/umd_lcl_land_cover.zarr/'
)
umd_land_cover = umd_land_cover.assign_coords(year=np.arange(2015, 2025)).rename_vars({"2015": "lc_class"})


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0xe9d9eab251d0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0xe9d9ea82ed50>, 6065.808904078)])']
connector: <aiohttp.connector.TCPConnector object at 0xe9d9ebbe9bd0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0xe9d9ebbebd90>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0xe9d9ea82eb70>, 6065.811323913)])']
connector: <aiohttp.connector.TCPConnector object at 0xe9d9ebbe9d10>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0xe9d9eac08190>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0xe9d9ea82ef30>, 6065.818131568)])']
connector: <aiohttp.connector.TCPConnector object at 0xe9d9ebbe9e50>


In [28]:
umd_land_cover.lc_class.max().compute()

In [7]:
pixel_area = xr.open_zarr('s3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/zarr/pixel_area.zarr').band_data.rename("area")
pixel_area = xr.align(umd_land_cover, pixel_area.reindex_like(umd_land_cover, method='nearest', tolerance=1e-5), join="left")[1]
pixel_area

Unnamed: 0,Array,Chunk
Bytes,2.93 TiB,381.47 MiB
Shape,"(1, 560000, 1440000)","(1, 10000, 10000)"
Dask graph,8064 chunks in 2 graph layers,8064 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.93 TiB 381.47 MiB Shape (1, 560000, 1440000) (1, 10000, 10000) Dask graph 8064 chunks in 2 graph layers Data type float32 numpy.ndarray",1440000  560000  1,

Unnamed: 0,Array,Chunk
Bytes,2.93 TiB,381.47 MiB
Shape,"(1, 560000, 1440000)","(1, 10000, 10000)"
Dask graph,8064 chunks in 2 graph layers,8064 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [8]:
countries = xr.open_zarr(
    's3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm0.zarr'
).band_data.rename("country")
    
countries = xr.align(umd_land_cover, countries.reindex_like(umd_land_cover, method='nearest', tolerance=1e-5), join="left")[1].astype(np.int16)

In [9]:
regions = xr.open_zarr(
    's3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm1.zarr'
).band_data.rename("region")

regions = xr.align(umd_land_cover, regions.reindex_like(umd_land_cover, method='nearest', tolerance=1e-5), join="left")[1].astype(np.uint8)

In [10]:
subregions = xr.open_zarr(
    's3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm2.zarr'
).band_data.rename("subregion")

subregions = xr.align(umd_land_cover, subregions.reindex_like(umd_land_cover, method='nearest', tolerance=1e-5), join="left")[1].astype(np.int16)



In [11]:
lc_data_2015 = umd_land_cover.sel(year=2015).lc_class.chunk({"x": 10000, "y": 10000})
lc_data_2024 = umd_land_cover.sel(year=2024).lc_class.chunk({"x": 10000, "y": 10000})


In [35]:
transition = lc_data_2015 * 9 + lc_data_2024
transition.name = "transition"

In [34]:
8 * 9 + 8

80

In [59]:
xmin, xmax = (8.48, 8.56) 
ymax, ymin = (12.03, 11.98)


lc_2024_sub = lc_data_2024.sel(x=slice(xmin, xmax), y=slice(ymax, ymin))
lc_2015_sub = lc_data_2015.sel(x=slice(xmin, xmax), y=slice(ymax, ymin))
countries_sub = countries.sel(x=slice(xmin, xmax), y=slice(ymax, ymin))
regions_sub = regions.sel(x=slice(xmin, xmax), y=slice(ymax, ymin))
subregions_sub = subregions.sel(x=slice(xmin, xmax), y=slice(ymax, ymin))
transition_sub = transition.sel(x=slice(xmin, xmax), y=slice(ymax, ymin))
pixe_area_sub = pixel_area.sel(x=slice(xmin, xmax), y=slice(ymax, ymin))

In [63]:
lc_2024_sub.compute()

In [61]:
lc_2015_sub.compute()

## Computation

In [36]:
%%time

from flox import ReindexArrayType, ReindexStrategy

land_cover_change = xarray_reduce(
    pixel_area,
    *(
        countries,
        regions,
        subregions,
        transition
    ),
    func='sum',
    expected_groups=(
        np.arange(895),
        np.arange(86),
        np.arange(854),
        np.arange(81),
    ),
    reindex=ReindexStrategy(
        blockwise=False, array_type=ReindexArrayType.SPARSE_COO
    ),
    fill_value=0
).compute()

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


CPU times: user 1.8 s, sys: 201 ms, total: 2 s
Wall time: 8min 34s


In [37]:
%%time

from flox import ReindexArrayType, ReindexStrategy

land_cover_composition = xarray_reduce(
    pixel_area,
    *(
        countries,
        regions,
        subregions,
        lc_data_2024
    ),
    func='sum',
    expected_groups=(
        np.arange(895),
        np.arange(86),
        np.arange(854),
        np.arange(9),
    ),
    reindex=ReindexStrategy(
        blockwise=False, array_type=ReindexArrayType.SPARSE_COO
    ),
    fill_value=0
).compute()

CPU times: user 1.67 s, sys: 116 ms, total: 1.78 s
Wall time: 7min 18s


## Transforming sparse array to dataframe and saving to parquet

In [104]:
sparse_data = land_cover_change.data

dim_names = land_cover_change.dims
indices = sparse_data.coords
values = sparse_data.data

coord_dict = {
    dim: land_cover_change.coords[dim].values[indices[i]]
    for i, dim in enumerate(dim_names)
}
coord_dict["area"] = values

land_cover_change_df = pd.DataFrame(coord_dict)

In [105]:
land_cover_change_df = land_cover_change_df[land_cover_change_df.country != 0]

In [127]:
sparse_data = land_cover_composition.data

dim_names = land_cover_composition.dims
indices = sparse_data.coords
values = sparse_data.data

coord_dict = {
    dim: land_cover_composition.coords[dim].values[indices[i]]
    for i, dim in enumerate(dim_names)
}
coord_dict["area"] = values

land_cover_composition_df = pd.DataFrame(coord_dict)

A QC spot-check - counted in qgis that this region has total 26 alerts

In [43]:
land_cover_change_df[(land_cover_change_df.country == 566) & (land_cover_change_df.region == 20) & (land_cover_change_df.subregion == 31)].area.sum()

np.float32(18953610.0)

In [75]:
land_cover_composition_df[(land_cover_composition_df.country == 566) & (land_cover_composition_df.region == 20) & (land_cover_composition_df.subregion == 31)]

Unnamed: 0,country,region,subregion,lc_class,area
203748,566,20,31,0,1505.846
203749,566,20,31,1,36893.08
203750,566,20,31,6,752.9236
203751,566,20,31,7,18914430.0


In [108]:
land_cover_mapping = {
    0: "Bare and sparse vegetation",
    1: "Short vegetation",
    2: "Tree cover",
    3: "Wetland – short vegetation",
    4: "Water",
    5: "Snow/ice",
    6: "Cropland",
    7: "Built-up",
    8: "Cultivated grasslands"
}


In [109]:
land_cover_change_df['land_cover_class_start'] = land_cover_change_df.transition.apply(lambda x: land_cover_mapping[x // 9])
land_cover_change_df['land_cover_class_end'] = land_cover_change_df.transition.apply(lambda x: land_cover_mapping[x % 9])

In [110]:
numeric_to_alpha3 = {
    4: 'AFG', 248: 'ALA', 8: 'ALB', 12: 'DZA', 16: 'ASM', 20: 'AND', 24: 'AGO', 660: 'AIA',
    10: 'ATA', 28: 'ATG', 32: 'ARG', 51: 'ARM', 533: 'ABW', 36: 'AUS', 40: 'AUT', 31: 'AZE',
    44: 'BHS', 48: 'BHR', 50: 'BGD', 52: 'BRB', 112: 'BLR', 56: 'BEL', 84: 'BLZ', 204: 'BEN',
    60: 'BMU', 64: 'BTN', 68: 'BOL', 535: 'BES', 70: 'BIH', 72: 'BWA', 74: 'BVT', 76: 'BRA',
    86: 'IOT', 96: 'BRN', 100: 'BGR', 854: 'BFA', 108: 'BDI', 132: 'CPV', 116: 'KHM', 120: 'CMR',
    124: 'CAN', 136: 'CYM', 140: 'CAF', 148: 'TCD', 152: 'CHL', 156: 'CHN', 162: 'CXR', 166: 'CCK',
    170: 'COL', 174: 'COM', 178: 'COG', 180: 'COD', 184: 'COK', 188: 'CRI', 384: 'CIV', 191: 'HRV',
    192: 'CUB', 531: 'CUW', 196: 'CYP', 203: 'CZE', 208: 'DNK', 262: 'DJI', 212: 'DMA', 214: 'DOM',
    218: 'ECU', 818: 'EGY', 222: 'SLV', 226: 'GNQ', 232: 'ERI', 233: 'EST', 748: 'SWZ', 231: 'ETH',
    238: 'FLK', 234: 'FRO', 242: 'FJI', 246: 'FIN', 250: 'FRA', 254: 'GUF', 258: 'PYF', 260: 'ATF',
    266: 'GAB', 270: 'GMB', 268: 'GEO', 276: 'DEU', 288: 'GHA', 292: 'GIB', 300: 'GRC', 304: 'GRL',
    308: 'GRD', 312: 'GLP', 316: 'GUM', 320: 'GTM', 831: 'GGY', 324: 'GIN', 624: 'GNB', 328: 'GUY',
    332: 'HTI', 334: 'HMD', 336: 'VAT', 340: 'HND', 344: 'HKG', 348: 'HUN', 352: 'ISL', 356: 'IND',
    360: 'IDN', 364: 'IRN', 368: 'IRQ', 372: 'IRL', 833: 'IMN', 376: 'ISR', 380: 'ITA', 388: 'JAM',
    392: 'JPN', 832: 'JEY', 400: 'JOR', 398: 'KAZ', 404: 'KEN', 296: 'KIR', 408: 'PRK', 410: 'KOR',
    414: 'KWT', 417: 'KGZ', 418: 'LAO', 428: 'LVA', 422: 'LBN', 426: 'LSO', 430: 'LBR', 434: 'LBY',
    438: 'LIE', 440: 'LTU', 442: 'LUX', 446: 'MAC', 450: 'MDG', 454: 'MWI', 458: 'MYS', 462: 'MDV',
    466: 'MLI', 470: 'MLT', 584: 'MHL', 474: 'MTQ', 478: 'MRT', 480: 'MUS', 175: 'MYT', 484: 'MEX',
    583: 'FSM', 498: 'MDA', 492: 'MCO', 496: 'MNG', 499: 'MNE', 500: 'MSR', 504: 'MAR', 508: 'MOZ',
    104: 'MMR', 516: 'NAM', 520: 'NRU', 524: 'NPL', 528: 'NLD', 540: 'NCL', 554: 'NZL', 558: 'NIC',
    562: 'NER', 566: 'NGA', 570: 'NIU', 574: 'NFK', 807: 'MKD', 580: 'MNP', 578: 'NOR', 512: 'OMN',
    586: 'PAK', 585: 'PLW', 275: 'PSE', 591: 'PAN', 598: 'PNG', 600: 'PRY', 604: 'PER', 608: 'PHL',
    612: 'PCN', 616: 'POL', 620: 'PRT', 630: 'PRI', 634: 'QAT', 638: 'REU', 642: 'ROU', 643: 'RUS',
    646: 'RWA', 652: 'BLM', 654: 'SHN', 659: 'KNA', 662: 'LCA', 663: 'MAF', 666: 'SPM', 670: 'VCT',
    882: 'WSM', 674: 'SMR', 678: 'STP', 682: 'SAU', 686: 'SEN', 688: 'SRB', 690: 'SYC', 694: 'SLE',
    702: 'SGP', 534: 'SXM', 703: 'SVK', 705: 'SVN', 90: 'SLB', 706: 'SOM', 710: 'ZAF', 239: 'SGS',
    728: 'SSD', 724: 'ESP', 144: 'LKA', 729: 'SDN', 740: 'SUR', 744: 'SJM', 752: 'SWE', 756: 'CHE',
    760: 'SYR', 158: 'TWN', 762: 'TJK', 834: 'TZA', 764: 'THA', 626: 'TLS', 768: 'TGO', 772: 'TKL',
    776: 'TON', 780: 'TTO', 788: 'TUN', 792: 'TUR', 795: 'TKM', 796: 'TCA', 798: 'TUV', 800: 'UGA',
    804: 'UKR', 784: 'ARE', 826: 'GBR', 840: 'USA', 581: 'UMI', 858: 'URY', 860: 'UZB', 548: 'VUT',
    862: 'VEN', 704: 'VNM', 92: 'VGB', 850: 'VIR', 876: 'WLF', 732: 'ESH', 887: 'YEM', 894: 'ZMB',
    716: 'ZWE'
}

In [111]:
land_cover_change_df["country"] = land_cover_change_df.country.apply(lambda x: numeric_to_alpha3[x])

In [114]:
land_cover_change_df.drop("transition", axis=1, inplace=True)

In [116]:
land_cover_change_df.head()

Unnamed: 0,country,region,subregion,area,land_cover_class_start,land_cover_class_end
2784,AFG,1,1,826885400.0,Bare and sparse vegetation,Bare and sparse vegetation
2785,AFG,1,1,54046850.0,Bare and sparse vegetation,Short vegetation
2786,AFG,1,1,69842.62,Bare and sparse vegetation,Wetland – short vegetation
2787,AFG,1,1,13630.26,Bare and sparse vegetation,Water
2788,AFG,1,1,43920.09,Bare and sparse vegetation,Snow/ice


In [118]:
land_cover_change_df[(land_cover_change_df.country == "NGA") & (land_cover_change_df.region == 20) & (land_cover_change_df.subregion == 31)]

Unnamed: 0,country,region,subregion,area,land_cover_class_start,land_cover_class_end
941464,NGA,20,31,1505.846,Short vegetation,Bare and sparse vegetation
941465,NGA,20,31,36893.08,Short vegetation,Short vegetation
941466,NGA,20,31,752.9236,Short vegetation,Cropland
941467,NGA,20,31,26352.12,Short vegetation,Built-up
941468,NGA,20,31,18888110.0,Built-up,Built-up


In [119]:
land_cover_change_df.to_parquet('s3://gfw-data-lake/umd_lcl_land_cover/v2/tabular/statistics/admin_land_cover_change.parquet', index=False)

In [140]:
land_cover_composition_df[(land_cover_composition_df.country == "NGA") & (land_cover_composition_df.region == 20) & (land_cover_composition_df.subregion == 31)]

Unnamed: 0,country,region,subregion,lc_class,area,land_cover_class
203748,NGA,20,31,0,1505.846,Bare and sparse vegetation
203749,NGA,20,31,1,36893.08,Short vegetation
203750,NGA,20,31,6,752.9236,Cropland
203751,NGA,20,31,7,18914430.0,Built-up


In [130]:
land_cover_composition_df['land_cover_class'] = land_cover_composition_df.lc_class.apply(lambda x: land_cover_mapping[x])

In [131]:
land_cover_composition_df = land_cover_composition_df[land_cover_composition_df.country != 0]

land_cover_composition_df['country'] = land_cover_composition_df.country.apply(lambda x: numeric_to_alpha3[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  land_cover_composition_df['country'] = land_cover_composition_df.country.apply(lambda x: numeric_to_alpha3[x])


In [135]:
land_cover_composition_df.to_parquet('s3://gfw-data-lake/umd_lcl_land_cover/v2/tabular/statistics/admin_land_cover_composition_2024.parquet', index=False)