# Sentinel 2 Spatial Data
Download Sentinel-2 data and calculate vegetation indices (NDVI, EVI2) for the observation plots that lie outside the Black Forest AOI.

Source:
- [Planetary Computer](https://planetarycomputer.microsoft.com/docs/overview/about)



In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import ipyleaflet

import planetary_computer as pc

import pystac_client
import geopandas as gpd
from satstac import Item, ItemCollection
import intake_stac

import rasterio
import rioxarray
import xarray

import dask
from dask import compute, delayed
from dask.distributed import Client

import datetime
import calendar
import re
import os

# Render every column of a DataFrame (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
# Start a local Dask cluster and connect to it.
# The worker count keeps its previous value (58) by default, but can be lowered
# or raised without editing the notebook via the DASK_N_WORKERS environment variable.
N_WORKERS = int(os.environ.get("DASK_N_WORKERS", 58))
client = Client(n_workers=N_WORKERS)

In [4]:
# Root folder for the files written by this notebook: the GeoJSON search
# results (sub-folder "GeoJSON") and the yearly NetCDF exports (sub-folder "year").
BASE_DIR = "./data/monthly ndvi"
# Label of the processed area; used as prefix in the names of the written files.
AOI = "observations"

## Planetary Computer Interface

In [5]:
# Open the STAC catalog of the Planetary Computer
catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1/")

# Explore the available collections; list only the Landsat and Sentinel ones
collections = catalog.get_children()
for collection in collections:
    # the title is an optional STAC field, so fall back to an empty string
    # instead of failing on `None.lower()`
    title = (collection.title or "").lower()
    if 'landsat' in title or 'sentinel' in title:
        print(f"{collection.id:<25} \t- {collection.title}")

COLLECTION = "sentinel-2-l2a"

# Band descriptions of the selected collection as a table
collection_childs = catalog.get_child(COLLECTION)
bands = pd.json_normalize(collection_childs.extra_fields["summaries"]["eo:bands"])
bands

sentinel-1-grd            	- Sentinel 1 Level-1 Ground Range Detected (GRD)
sentinel-1-rtc            	- Sentinel 1 Radiometrically Terrain Corrected (RTC)
landsat-8-c2-l2           	- Landsat 8 Collection 2 Level-2
sentinel-2-l2a            	- Sentinel-2 Level-2A
landsat-c2-l1             	- Landsat Collection 2 Level-1
landsat-c2-l2             	- Landsat Collection 2 Level-2


Unnamed: 0,name,description,gsd,common_name,center_wavelength,full_width_half_max
0,AOT,aerosol optical thickness,,,,
1,B01,coastal aerosol,60.0,coastal,0.443,0.027
2,B02,visible blue,10.0,blue,0.49,0.098
3,B03,visible green,10.0,green,0.56,0.045
4,B04,visible red,10.0,red,0.665,0.038
5,B05,vegetation classification red edge,20.0,rededge,0.704,0.019
6,B06,vegetation classification red edge,20.0,rededge,0.74,0.018
7,B07,vegetation classification red edge,20.0,rededge,0.783,0.028
8,B08,near infrared,10.0,nir,0.842,0.145
9,B8A,vegetation classification red edge,20.0,rededge,0.865,0.033


### Area of Interest
Download data that intersects with AOI

In [6]:
# Read the observation polygons from the GeoJSON file
# (plain string literal: the path contains no placeholders).
geodf = gpd.read_file("./data/wze_observations_out_aoi.geojson")

# Preview of the first rows
geodf.head()

Unnamed: 0,id,count_trees,geometry
0,BW_18A,162,"POLYGON ((8.37418 48.01841, 8.37414 48.01775, ..."
1,BW_13A,154,"POLYGON ((9.66377 48.94567, 9.66371 48.94500, ..."
2,BW_11A,145,"POLYGON ((8.53432 48.36555, 8.53428 48.36489, ..."
3,BY_90754,144,"POLYGON ((11.89871 49.36668, 11.89862 49.36603..."
4,ST_150061,144,"POLYGON ((11.73014 52.43167, 11.73005 52.43101..."


In [7]:
def bounding_box(points):
    """Return the axis-aligned bounding box of a collection of coordinates.

    Parameters
    ----------
    points : iterable of tuples
        Coordinates given as ``(x, y)``. Additional components, e.g. a Z
        value of 3D geometries, are ignored.

    Returns
    -------
    list
        ``[(min_x, min_y), (max_x, max_y)]``

    Raises
    ------
    ValueError
        If ``points`` is empty.
    """
    # materialise once so that one-shot iterators can be traversed twice
    points = list(points)
    # index access instead of tuple-unpacking: works for 2D and 3D coordinates
    x_coordinates = [point[0] for point in points]
    y_coordinates = [point[1] for point in points]

    return [(min(x_coordinates), min(y_coordinates)), (max(x_coordinates), max(y_coordinates))]

# Bounding box of every polygon: once as corner pairs and once flattened to
# (min_x, min_y, max_x, max_y), the order that is handed to the STAC search as `bbox`.
geodf["bbox"] = geodf["geometry"].apply(lambda geom: bounding_box(list(geom.exterior.coords)))
geodf["bbox_search"] = geodf["bbox"].apply(lambda box: (box[0][0], box[0][1], box[1][0], box[1][1]))

# Additional copy of the geometries in EPSG:32632, used later to clip the rasters.
# The reprojected column is assigned directly (row-aligned by index) instead of
# going through copy/rename/merge on "id": the cell can then be re-run without
# producing duplicated or suffixed columns, and repeated ids cannot multiply rows.
geodf["geometry_32632"] = geodf["geometry"].to_crs(32632)

geodf.head()

Unnamed: 0,id,count_trees,geometry,bbox,bbox_search,geometry_32632
0,BW_18A,162,"POLYGON ((8.37418 48.01841, 8.37414 48.01775, ...","[(8.354067836793147, 48.0116098913687), (8.374...","(8.354067836793147, 48.0116098913687, 8.374182...","POLYGON ((453333.515 5318536.165, 453329.904 5..."
1,BW_13A,154,"POLYGON ((9.66377 48.94567, 9.66371 48.94500, ...","[(9.643286471909962, 48.93897799103043), (9.66...","(9.643286471909962, 48.93897799103043, 9.66377...","POLYGON ((548602.512 5421628.042, 548598.900 5..."
2,BW_11A,145,"POLYGON ((8.53432 48.36555, 8.53428 48.36489, ...","[(8.514070792455046, 48.35876363600368), (8.53...","(8.514070792455046, 48.35876363600368, 8.53432...","POLYGON ((465508.674 5357035.904, 465505.062 5..."
3,BY_90754,144,"POLYGON ((11.89871 49.36668, 11.89862 49.36603...","[(11.878072593315167, 49.36020511473224), (11....","(11.878072593315167, 49.36020511473224, 11.898...","POLYGON ((710444.371 5472261.133, 710440.760 5..."
4,ST_150061,144,"POLYGON ((11.73014 52.43167, 11.73005 52.43101...","[(11.708106362545951, 52.42519178301072), (11....","(11.708106362545951, 52.42519178301072, 11.730...","POLYGON ((685599.773 5812557.953, 685596.162 5..."


#### Identify Tiles

In [8]:
# Time window that is searched to find out which tiles cover each observation
time_range = "2016-05-01/2016-08-31"

# tile name -> ids of the observations whose polygon intersects that tile
search_mapping = dict()
all_tiles_path = f"{BASE_DIR}/GeoJSON/{AOI}_all_tiles.geojson"

for _, observation in geodf.iterrows():
    search = catalog.search(
        collections=[COLLECTION],
        bbox=observation["bbox_search"],
        datetime=time_range
    )

    # write the search result to a GeoJSON file and read it back,
    # so that all item attributes are available as columns
    search.get_all_items().save_object(all_tiles_path)
    gf = gpd.read_file(all_tiles_path)

    # one geometry per tile: the most frequent footprint among its items
    df_tiles = gf.groupby("s2:mgrs_tile").agg({"geometry": lambda x: x.value_counts().index[0]}).reset_index()

    # tiles whose footprint intersects the observation polygon
    intersecting_tiles = df_tiles.loc[df_tiles.intersects(observation["geometry"])]
    tile_names = list(intersecting_tiles["s2:mgrs_tile"].unique())

    for tile in tile_names:
        search_mapping.setdefault(tile, []).append(observation["id"])

print(f"Tiles of interest for selected area: {' '.join(list(search_mapping.keys()))}")

Tiles of interest for selected area: 32UMU 32UNV 32UPV 32UQV 32UPD 32UNU 32UPU 32UQU 33UUP 33UUQ 32UMA 32UMB 32UQA 32UNB 32UQC 33UUT 31UGU 32ULD 33UUR 32UNC 32UNF 32UPF 32UMV 32UND 31UGV 32ULE 32UME 32UQD 32UMD 32UNE 33UUU 33UVT 32ULB 32TNT 32UPE 32UQE 33UUV 32TQT 33TUN 32UMC 33UVV 33UVU 32TLT 32TMT 32UPA 32ULC 32ULU 32UNA 32UMF 31UGS 32UPC 32TPT 31UGT 31UGR 32ULA


## Data Preparation
- Download data
- Remove clouds
- Calculate vegetation index
- Save data by month
- Combine data by year
- Clip AOI
- Save AOI

Parallelize using Dask.

In [9]:
def get_scenes_in_year(year: str, bbox_search)->pystac_client.item_search.ItemSearch:
    """Search the catalog for scenes between May 1st and August 31st of a year.

    The matching items are saved to ``{BASE_DIR}/GeoJSON/{AOI}_{year}.geojson``
    and the number of matches is printed.

    Parameters
    ----------
    year : str
        Year that is inserted into the search time range.
    bbox_search
        Bounding box handed to the search as ``bbox``
        (the callers pass ``(min_x, min_y, max_x, max_y)``).

    Returns
    -------
    pystac_client.item_search.ItemSearch
        The search object, so callers can retrieve the items themselves.
    """
    time_range = f"{year}-05-01/{year}-08-31"

    search = catalog.search(
        collections=[COLLECTION],
        bbox=bbox_search,
        datetime=time_range
    )

    # fetch the items once and reuse them for the export and for the count;
    # counting via a second `search.get_items()` would page through the API again
    items = search.get_all_items()
    items.save_object(f"{BASE_DIR}/GeoJSON/{AOI}_{year}.geojson")
    n_matches = sum(1 for _ in items)

    print("-"*75)
    print(f'Total: {n_matches} matches from May to August {year}')
    print("-"*75)
    return search

@dask.delayed
def calc_vegetation_index(intake_stac_scene: intake_stac.catalog.StacItem, search_item)->(xarray.Dataset, str):
    """Compute cloud-masked NDVI and EVI2 of one scene, clipped to the observations.

    Parameters
    ----------
    intake_stac_scene
        Scene whose ``B04`` (red), ``B08`` (near infrared) and ``SCL`` assets
        are opened through their signed ``href``.
    search_item
        Table of observations; its ``"geometry_32632"`` column supplies the
        clip polygons, given in EPSG:32632.

    Returns
    -------
    tuple
        ``(dataset, date)``: a dataset with the variables ``ndvi`` and ``evi2``
        on the dimensions ``(y, x)`` and the scene date taken from the scene
        metadata. ``(None, None)`` is returned when anything fails while
        reading, computing or clipping (best effort, the scene is skipped).
    """
    try:
        # Create DataArrays from the raster data of the RED, NIR and SCL assets
        da_red = rioxarray.open_rasterio(pc.sign(intake_stac_scene.B04.metadata["href"]))
        da_nir = rioxarray.open_rasterio(pc.sign(intake_stac_scene.B08.metadata["href"]))
        scl = rioxarray.open_rasterio(pc.sign(intake_stac_scene.SCL.metadata["href"]))

        # bring the scene classification onto the pixel grid of the NIR band
        scl_high = scl.reindex(x=da_nir.x, y=da_nir.y, method='nearest')
        # pixels of the SCL classes 3, 8, 9 and 10 are discarded
        # (Sentinel-2 L2A: cloud shadow, cloud medium/high probability, thin cirrus)
        no_cloud_map = ((scl_high[0] != 8) & (scl_high[0] != 9) & (scl_high[0] != 10) & (scl_high[0] != 3)).values

        # Calculate the indices from float32 (f4) arrays; first band only
        nir_values = da_nir.values[0].astype('f4')
        red_values = da_red.values[0].astype('f4')
        ndvi = (nir_values - red_values) / (nir_values + red_values)
        # NOTE(review): the "+ 1.0" term of EVI2 presumes reflectance in [0, 1];
        # the bands are used here exactly as read - confirm the intended scaling.
        evi2 = 2.5 * (nir_values - red_values) / (nir_values + 2.4 * red_values + 1.0)
        # NDVI is NaN where NIR + RED == 0; carry these gaps over to EVI2,
        # which would otherwise yield a number there
        evi2 = np.where(~np.isnan(ndvi), evi2, np.nan)

        # The band arrays are laid out as (y, x) - rows first - so the dimensions
        # must be declared in that order; ("x", "y") would attach the transposed
        # image to the coordinates.
        ds = xarray.Dataset(
            data_vars=dict(
                ndvi=(["y", "x"], np.where(no_cloud_map, ndvi, np.nan)),
                evi2=(["y", "x"], np.where(no_cloud_map, evi2, np.nan))
            ),
            coords=dict(
                x=(["x"], da_nir.x.values),
                y=(["y"], da_nir.y.values)
            )
        )

        # Tag the dataset with the CRS of the scene itself: the tiles span several
        # UTM zones, so a fixed EPSG:32632 would mislabel scenes of other zones.
        ds = ds.rio.write_crs(da_nir.rio.crs)

        # release the large intermediates before clipping
        del ndvi, evi2, da_red, da_nir, nir_values, red_values, scl, scl_high, no_cloud_map

        # Clip to the observation polygons. They are stored in EPSG:32632, which is
        # stated explicitly so they are reprojected whenever the scene CRS differs.
        ds = ds.rio.clip(search_item["geometry_32632"], crs=32632)

        return ds, intake_stac_scene.metadata["date"]
    except Exception:
        # best effort: unreadable scenes or scenes without data inside the
        # polygons are skipped; KeyboardInterrupt/SystemExit still propagate
        return None, None
    
def process_scenes(intakeStacItemCollection: intake_stac.catalog.StacItemCollection, seach_item)->xarray.Dataset:
    """Process all scenes of a collection in parallel and reduce them to yearly maxima.

    Parameters
    ----------
    intakeStacItemCollection
        Collection of scenes; iterating it yields the scene names used to
        look up the individual scenes.
    seach_item
        Observations handed unchanged to ``calc_vegetation_index``.

    Returns
    -------
    xarray.Dataset or None
        The per-scene datasets concatenated along a new ``date`` dimension,
        grouped by ``date.year`` and reduced with ``max``. ``None`` when no
        usable scene is left or the concatenation raises ``ValueError``.
    """
    # materialise the scene names once instead of rebuilding the list on every access
    scene_names = list(intakeStacItemCollection)

    delayed_tasks = []
    for s, scene_name in enumerate(scene_names):
        print(f'Current scene ({s}) {scene_name} started at {str(datetime.datetime.now())}')
        delayed_tasks.append(calc_vegetation_index(intakeStacItemCollection[scene_name], seach_item))

    processed_index = compute(*delayed_tasks)

    # Keep only scenes that were processed successfully and still contain pixels
    # after clipping. Dataset and date are filtered as a pair so both lists stay
    # aligned, and a fresh list is built rather than deleting while iterating
    # (which skips the element following each deletion).
    ds_scenes = []
    date_scenes = []
    for ds, scene_date in processed_index:
        if ds is None or scene_date is None:
            continue
        if ds["ndvi"].shape[0] == 0 or ds["ndvi"].shape[1] == 0:
            continue
        ds_scenes.append(ds)
        date_scenes.append(scene_date)

    if not ds_scenes:
        # nothing to combine (no scenes at all or every scene was rejected)
        del processed_index, delayed_tasks
        return None

    try:
        ds_concat_max = xarray.concat(ds_scenes, dim=xarray.Variable('date', pd.to_datetime(date_scenes))).groupby('date.year').max()
    except ValueError:
        ds_concat_max = None

    # RAM mgmt
    del processed_index, delayed_tasks, ds_scenes, date_scenes

    return ds_concat_max

In [None]:
years = [str(year) for year in range(2015, 2021)]

# make sure the export folder exists; otherwise every write below would fail
os.makedirs(f"{BASE_DIR}/year", exist_ok=True)

for tile, observation_ids in search_mapping.items():
    # observations located in this tile (does not depend on the year)
    observations_in_tile = geodf.loc[geodf.id.isin(observation_ids)]

    # Search with the envelope of ALL observations of the tile. Using only the
    # first observation's box can miss scenes that cover other observations of
    # the tile but not the first one.
    boxes = observations_in_tile["bbox_search"].tolist()
    bbox_search = (
        float(min(box[0] for box in boxes)),
        float(min(box[1] for box in boxes)),
        float(max(box[2] for box in boxes)),
        float(max(box[3] for box in boxes)),
    )

    for year in years:
        # get scenes from planetary computer
        search = get_scenes_in_year(year, bbox_search)
        # create pyStacItemCollection
        pyStacItemCollection = search.get_all_items()

        # keep only the items whose id contains the name of the current tile
        item_list_tile_part = [
            item_dict
            for item_dict in (item.to_dict() for item in pyStacItemCollection)
            if tile in item_dict["id"]
        ]

        # pystac to stacstac item
        # necessary to use within intake_stac collection
        stacStacItems = [Item(sentinel_item) for sentinel_item in item_list_tile_part]

        # items to stac item collection
        stacStacItemCollection = ItemCollection(stacStacItems)

        # StacItemCollection to filter Catalog by Item Name
        intakeStacItemCollection = intake_stac.catalog.StacItemCollection(stacStacItemCollection)

        # get dataset (max index values)
        ds = process_scenes(intakeStacItemCollection, observations_in_tile)

        if ds is None:
            # nothing usable for this tile/year - report it instead of failing silently
            print(f"No usable scenes for tile {tile} in {year}; nothing exported")
        else:
            try:
                # export to nc file
                ds.to_netcdf(f"{BASE_DIR}/year/{AOI}_{year}_{tile}.nc", 'w', engine='netcdf4')
            except Exception as error:
                # keep the long-running loop alive, but make the failure visible
                print(f"Export failed for tile {tile} in {year}: {error!r}")

        del ds, stacStacItems, stacStacItemCollection, intakeStacItemCollection