In [1]:
from eumap.raster import read_rasters, save_rasters
from eumap.misc import find_files, ttprint
from eumap import parallel
from pathlib import Path
import numpy as np
import joblib
import bottleneck as bn
import pandas as pd
import geopandas as gpd
import rasterio

## Layer aggregation

In [None]:
def mean_sd(raster_files, outfile):
    data, _ = read_rasters(raster_files=raster_files, n_jobs=4, dtype='float32')
    result = np.stack([bn.nanmean(data, axis=-1), bn.nanstd(data, axis=-1)], axis=-1)
    outfiles = [ outfile.replace('{op}', 'm'), outfile.replace('{op}', 'std') ]
    save_rasters(raster_files[0], outfiles, result)

mean_sd(find_files('/mnt/landmark/land1km/layers1km/static/', 'CHELSA_pr*.tif'), \
        './rasters_4326/clm_pre_chelsea.monthly.sum_{op}_1km_s0..0cm_1981..2010_v2.1.tif')
mean_sd(find_files('/mnt/landmark/land1km/layers1km/static/', 'clm_lst_*nighttime*p50_*.tif'), \
        './rasters_4326/clm_lst_mod11a2.daytime.monthly.p50_{op}_1km_s0..0cm_2000..2021_v1.2.tif')
mean_sd(find_files('/mnt/landmark/land1km/layers1km/static/', 'clm_lst_*daytime*p50_*.tif'), \
        './rasters_4326/clm_lst_mod11a2.nightime.monthly.p50_{op}_1km_s0..0cm_2000..2021_v1.2.tif')
mean_sd(find_files('/mnt/landmark/land1km/layers1km/static/', 'clm_snow.prob*p.90*.tif'), \
        './rasters_4326/clm_snow.prob_esacci.p90_{op}_1km_s0..0cm_2000..2012_v2.0.tif')


## Layer Reprojection

## Load Data

In [2]:
pasture_layer = find_files('/mnt/europa/WRI/rasters_3857', '*short.veg*')

raster_layers = find_files('/mnt/europa/WRI/rasters_3857/', 'clm*.tif') + \
                find_files('/mnt/europa/WRI/rasters_3857/', 'dtm*.tif') + \
                find_files('/mnt/europa/WRI/rasters_3857/', 'lcv_water*.tif') + \
                find_files('/mnt/europa/WRI/rasters_3857/', 'CHELSA*.tif')

raster_layers

[PosixPath('/mnt/europa/WRI/rasters_3857/clm_lst_mod11a2.daytime.monthly.p50_m_1km_s0..0cm_2000..2021_v1.2.tif'),
 PosixPath('/mnt/europa/WRI/rasters_3857/clm_lst_mod11a2.daytime.monthly.p50_std_1km_s0..0cm_2000..2021_v1.2.tif'),
 PosixPath('/mnt/europa/WRI/rasters_3857/clm_lst_mod11a2.nightime.monthly.p50_m_1km_s0..0cm_2000..2021_v1.2.tif'),
 PosixPath('/mnt/europa/WRI/rasters_3857/clm_lst_mod11a2.nightime.monthly.p50_std_1km_s0..0cm_2000..2021_v1.2.tif'),
 PosixPath('/mnt/europa/WRI/rasters_3857/clm_pre_chelsea.monthly.sum_m_1km_s0..0cm_1981..2010_v2.1.tif'),
 PosixPath('/mnt/europa/WRI/rasters_3857/clm_pre_chelsea.monthly.sum_std_1km_s0..0cm_1981..2010_v2.1.tif'),
 PosixPath('/mnt/europa/WRI/rasters_3857/clm_snow.prob_esacci.p90_m_1km_s0..0cm_2000..2012_v2.0.tif'),
 PosixPath('/mnt/europa/WRI/rasters_3857/clm_snow.prob_esacci.p90_std_1km_s0..0cm_2000..2012_v2.0.tif'),
 PosixPath('/mnt/europa/WRI/rasters_3857/dtm_elevation_glo90.copernicus_m_1km_s0..0cm_2019_epsg.4326_v1.0.tif'),
 Po

In [3]:
import rasterio
for r in raster_layers:
    ds = rasterio.open(r)
    print(ds.width, ds.height, Path(r).name)

36136 29694 clm_lst_mod11a2.daytime.monthly.p50_m_1km_s0..0cm_2000..2021_v1.2.tif
36136 29694 clm_lst_mod11a2.daytime.monthly.p50_std_1km_s0..0cm_2000..2021_v1.2.tif
36136 29694 clm_lst_mod11a2.nightime.monthly.p50_m_1km_s0..0cm_2000..2021_v1.2.tif
36136 29694 clm_lst_mod11a2.nightime.monthly.p50_std_1km_s0..0cm_2000..2021_v1.2.tif
36136 29694 clm_pre_chelsea.monthly.sum_m_1km_s0..0cm_1981..2010_v2.1.tif
36136 29694 clm_pre_chelsea.monthly.sum_std_1km_s0..0cm_1981..2010_v2.1.tif
36136 29694 clm_snow.prob_esacci.p90_m_1km_s0..0cm_2000..2012_v2.0.tif
36136 29694 clm_snow.prob_esacci.p90_std_1km_s0..0cm_2000..2012_v2.0.tif
36136 29694 dtm_elevation_glo90.copernicus_m_1km_s0..0cm_2019_epsg.4326_v1.0.tif
36136 29694 dtm_floodmap.500y_jrc.hazardmapping_m_1km_s0..0cm_1500..2016_v1.0.tif
36136 29694 dtm_slope_merit.dem_m_1km_s0..0cm_2017_v1.0.tif
36136 29694 lcv_water.occurrence_jrc.gsw_p_1km_s0..0cm_2020_epsg4326.tif
36136 29694 CHELSA_bio12_1981-2010_V.2.1.tif
36136 29694 CHELSA_bio13_1981-2

In [5]:
data, _ = read_rasters(raster_files=raster_layers, n_jobs=10, dtype='float32')
print(data.shape)

(29694, 36136, 15)


In [4]:
pasture_layer

[PosixPath('/mnt/europa/WRI/rasters_3857/lcv_land.cover_esacci.lc.short.veg_c_1km_s0..0cm_2020_v1.0.tif')]

In [6]:
data_mask, _ = read_rasters(raster_files=pasture_layer, n_jobs=1)
data_mask = (data_mask >= 1)[:,:,0]

In [7]:
fscs_input = np.ascontiguousarray(data[data_mask])

In [8]:
result = {
    'data': data,
    'data_mask': data_mask,
    'fscs_input': fscs_input
}
joblib.dump(result, 'data.joblib', compress = 'lz4')

['data.joblib']

In [2]:
result = joblib.load('data.joblib')

## Feature Space Coverage Sampling

In [3]:
data = result['data']

In [9]:
not_nan_mask = np.all(~np.isnan(data), axis=-1)
not_inf_mask = np.all(~np.isinf(data), axis=-1)
fscs_mask = np.logical_and.reduce([data_mask, not_nan_mask, not_inf_mask])

In [10]:
fscs_input = data[fscs_mask]

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.pipeline import Pipeline

n_cluster = 10000 
n_components = 10
batch_size = int(fscs_input.shape[0] / 1024)

fscs_model = Pipeline(
    steps=[
        ('scaler', StandardScaler()), 
        ('pca', PCA(n_components=n_components, svd_solver='arpack', random_state=0)),
        ('kmeans', MiniBatchKMeans(n_clusters=n_cluster, n_init=5, max_no_improvement=25, compute_labels=True, random_state=0, batch_size=batch_size, verbose=True)),
    ], verbose=True)

fscs_model

Pipeline(steps=[('scaler', StandardScaler()),
                ('pca',
                 PCA(n_components=10, random_state=0, svd_solver='arpack')),
                ('kmeans',
                 MiniBatchKMeans(batch_size=166411, max_no_improvement=25,
                                 n_clusters=10000, n_init=5, random_state=0,
                                 verbose=True))],
         verbose=True)

In [12]:
#fscs_distances = fscs_model.fit_transform(fscs_input_sub[np.random.choice(fscs_input_sub.shape[0], 25000000, replace=False)])
fscs_model.fit(fscs_input)

[Pipeline] ............ (step 1 of 3) Processing scaler, total= 1.1min
[Pipeline] ............... (step 2 of 3) Processing pca, total=  55.9s
Init 1/5 with method: k-means++
Inertia for init 1/5: 86792.085938
Init 2/5 with method: k-means++
Inertia for init 2/5: 86727.406250
Init 3/5 with method: k-means++
Inertia for init 3/5: 86793.140625
Init 4/5 with method: k-means++
Inertia for init 4/5: 86719.640625
Init 5/5 with method: k-means++
Inertia for init 5/5: 86827.664062
Minibatch iteration 1/102500: mean batch inertia: 0.191721, ewa inertia: 0.191721 
Minibatch iteration 2/102500: mean batch inertia: 0.192278, ewa inertia: 0.191722 
Minibatch iteration 3/102500: mean batch inertia: 0.187767, ewa inertia: 0.191714 
Minibatch iteration 4/102500: mean batch inertia: 0.188556, ewa inertia: 0.191708 
Minibatch iteration 5/102500: mean batch inertia: 0.188037, ewa inertia: 0.191701 
Minibatch iteration 6/102500: mean batch inertia: 0.184546, ewa inertia: 0.191687 
Minibatch iteration 7/102

Pipeline(steps=[('scaler', StandardScaler()),
                ('pca',
                 PCA(n_components=10, random_state=0, svd_solver='arpack')),
                ('kmeans',
                 MiniBatchKMeans(batch_size=166411, max_no_improvement=25,
                                 n_clusters=10000, n_init=5, random_state=0,
                                 verbose=True))],
         verbose=True)

In [18]:
joblib.dump(fscs_model, 'fscs_model.joblib', compress = 'lz4')

['fscs_model.joblib']

## Calculating cluster distances

In [45]:
fscs_input.shape

(170405809, 15)

In [19]:
import bottleneck as bn

indices = np.arange(fscs_input.shape[0])

map_fscs_min = []
map_fscs_argmin = []

def map_fscs_distance(fscs_model, input_data, ind):
    fscs_distance = fscs_model.transform(input_data)
    return bn.nanmin(fscs_distance, axis=0), ind[bn.nanargmin(fscs_distance, axis=0)]

ttprint("Calculating cluster distances for all samples")
args = [ (fscs_model, fscs_input[ind], ind) for ind in np.array_split(indices, 10000) ]

for (fscs_min, fscs_argmin) in parallel.job(map_fscs_distance, args, n_jobs=-1, joblib_args={'backend': 'threading'}):
    map_fscs_min.append(fscs_min)
    map_fscs_argmin.append(fscs_argmin)
ttprint("End")
# 00:00

[07:16:23] Calculating cluster distances for all samples
[08:58:05] End


In [20]:
arg_ind = np.argmin(np.stack(map_fscs_min, axis=0), axis=0)
fscs_argmin = np.stack(map_fscs_argmin, axis=0)
print(arg_ind.shape, fscs_argmin.shape)

(10000,) (10000, 10000)


In [61]:
arg_min.shape

(10000,)

In [60]:
arg_min = np.min(np.stack(map_fscs_min, axis=0), axis=0)
nearest_center = np.take_along_axis(fscs_argmin, np.stack([arg_ind], axis=0), axis=0)
#nearest_center_val = np.take_along_axis(fscs_argmin, np.stack([arg_min], axis=0), axis=0)

array([[115080479, 145945787,  65360628, ..., 103997607, 155019070,
        170313051]])

In [22]:
fscs_distances = {
    'map_fscs_min': map_fscs_min,
    'map_fscs_argmin': map_fscs_argmin,
    'nearest_center': nearest_center
}
joblib.dump(fscs_distances, 'fscs_distances.joblib')

['fscs_distances.joblib']

## Predicting cluster

In [None]:
ttprint("Calculating cluster")
clusters = fscs_model.predict(fscs_input)
ttprint("End")

In [64]:
val, count = np.unique(clusters, return_counts=True)

In [86]:
count_df = pd.DataFrame(np.stack([val, count], axis=0).transpose(1,0), columns=['value', 'cluster_size'])
count_df

Unnamed: 0,value,cluster_size
0,0,14261
1,1,17025
2,2,21028
3,3,6254
4,4,30944
...,...,...
9993,9995,12310
9994,9996,28960
9995,9997,2614
9996,9998,31850


## Generating point samples

In [23]:
import rasterio
ds = rasterio.open(raster_layers[0])

lon = np.arange(0.5, ds.width + 0.5)
lat = np.arange(0.5, ds.height + 0.5)
pixel_size = ds.transform[0]
lon_grid, lat_grid = ds.transform * np.meshgrid(lon, lat)

In [71]:
print(fscs_input.shape, lon_grid[fscs_mask].shape)

(170405809, 15) (170405809,)


In [38]:
lon = lon_grid[fscs_mask][nearest_center.flatten()]
lat = lat_grid[fscs_mask][nearest_center.flatten()]
print(lon.shape, lat.shape)

(10000,) (10000,)


In [42]:
samp_clusters = fscs_model.predict(fscs_input[nearest_center.flatten()])

Computing label assignment and total inertia


In [108]:
np.sum(points['cluster_size'])

170405809.0

In [109]:
samples = np.stack([lon, lat, samp_clusters, arg_min], axis=0).transpose(1,0)
samples_df = pd.DataFrame(samples, columns=['X', 'Y', 'cluster_id', 'cluster_distance'])
samples_df = samples_df.join(count_df).drop(columns=['value'])

samples_df = samples_df.sort_values('cluster_size', ascending=False).reset_index(drop=True)
samples_df['priority'] = range(0, samples_df.shape[0])
samples_df

points = gpd.GeoDataFrame(samples_df, geometry=gpd.points_from_xy(samples_df.X, samples_df.Y))
points = points.set_crs(ds.crs)
points

Unnamed: 0,X,Y,cluster_id,cluster_distance,cluster_size,priority,geometry
0,7.708703e+06,4.055308e+06,4778.0,0.161393,172650.0,0,POINT (7708702.815 4055307.859)
1,1.644638e+07,1.171749e+07,858.0,0.028470,172390.0,1,POINT (16446383.333 11717486.478)
2,-1.429353e+07,8.116518e+06,7602.0,0.129038,169294.0,2,POINT (-14293528.619 8116517.600)
3,8.475010e+06,8.062176e+06,954.0,0.033937,157247.0,3,POINT (8475010.372 8062175.908)
4,9.813553e+06,6.584969e+06,8765.0,0.124192,151456.0,4,POINT (9813553.383 6584969.087)
...,...,...,...,...,...,...,...
9995,1.108778e+07,1.357076e+06,7621.0,0.124492,5.0,9995,POINT (11087775.355 1357076.482)
9996,-1.530936e+07,1.026911e+07,6740.0,0.050092,3.0,9996,POINT (-15309357.450 10269114.025)
9997,-1.419039e+07,6.994195e+06,4970.0,0.287460,2.0,9997,POINT (-14190393.159 6994195.301)
9998,4.999456e+06,-2.212840e+06,9998.0,0.087632,,9998,POINT (4999456.273 -2212840.000)


In [91]:
import fiona
points.to_file(f'samples_2.gpkg',  driver="GPKG")

## Generating Tiles

In [120]:
tiles = points.copy()
tiles['geometry'] = tiles['geometry'].buffer(500).envelope
tiles

Unnamed: 0,X,Y,cluster_id,cluster_distance,cluster_size,priority,geometry
0,7.708703e+06,4.055308e+06,4778.0,0.161393,172650.0,0,"POLYGON ((7708202.815 4054807.859, 7709202.815..."
1,1.644638e+07,1.171749e+07,858.0,0.028470,172390.0,1,"POLYGON ((16445883.333 11716986.478, 16446883...."
2,-1.429353e+07,8.116518e+06,7602.0,0.129038,169294.0,2,"POLYGON ((-14294028.619 8116017.600, -14293028..."
3,8.475010e+06,8.062176e+06,954.0,0.033937,157247.0,3,"POLYGON ((8474510.372 8061675.908, 8475510.372..."
4,9.813553e+06,6.584969e+06,8765.0,0.124192,151456.0,4,"POLYGON ((9813053.383 6584469.087, 9814053.383..."
...,...,...,...,...,...,...,...
9995,1.108778e+07,1.357076e+06,7621.0,0.124492,5.0,9995,"POLYGON ((11087275.355 1356576.482, 11088275.3..."
9996,-1.530936e+07,1.026911e+07,6740.0,0.050092,3.0,9996,"POLYGON ((-15309857.450 10268614.025, -1530885..."
9997,-1.419039e+07,6.994195e+06,4970.0,0.287460,2.0,9997,"POLYGON ((-14190893.159 6993695.301, -14189893..."
9998,4.999456e+06,-2.212840e+06,9998.0,0.087632,,9998,"POLYGON ((4998956.273 -2213340.000, 4999956.27..."


In [121]:
tiles.to_file(f'tiles.gpkg',  driver="GPKG")

## Filtering to MVP area

In [122]:
mvp_area = gpd.read_file('/mnt/lacus/raw/WRI_Pasture_Mapping/sampling_design/vectors_3857/mvp_area.gpkg')

  for feature in features_lst:


(4, 2)

In [128]:
tiles_mvp = gpd.sjoin(tiles, mvp_area, how="inner")
samples_mvp = gpd.sjoin(points, mvp_area, how="inner")
tiles_mvp.shape

(1793, 9)

In [129]:
tiles_mvp.to_file(f'lcv_pastures_fscs.tiles_epsg.3857_mvp.gpkg',  driver="GPKG")
samples_mvp.to_file(f'lcv_pastures_fscs.samples_epsg.3857_mvp.gpkg',  driver="GPKG")

In [149]:
n_analyst = 10
out_dir = Path('tiles_split')

out_dir.mkdir(parents=True, exist_ok=True)
tiles_mvp = tiles_mvp.sort_values('cluster_size', ascending=False).reset_index(drop=True)

for idx in np.array_split(range(0,tiles_mvp.shape[0]), n_analyst):
    out_file = out_dir.joinpath(f'lcv_pastures_fscs.tiles_epsg.3857_mvp.{np.min(idx)}.{np.max(idx)}.gpkg')
    tiles_mvp.iloc[idx].to_file(out_file,  driver="GPKG")

## Raster output

In [28]:
land_layer = find_files('/mnt/europa/WRI/rasters_3857/', '*landmask*')
land, _ = read_rasters(raster_files=land_layer, n_jobs=1)
land = np.where(land[:,:,0] == 2, -10000, -1)

In [30]:
land[fscs_mask] = clusters

In [31]:
save_rasters(land_layer[0], ['kmeans.tif'], np.stack([land],axis=-1), dtype='int16', nodata=-10000)

['kmeans.tif']