## Python notebook for generating the STAC catalog json and corresponding Item json for Raster & Vector layers

### Tools:
1. Pystac 
2. Rasterio
3. Geopandas
4. Matplotlib

This notebook returns Catalog json for Raster and Vector layers.

### 1. Importing the required modules

In [13]:
import os
import json
import xml.etree.ElementTree as ET
from datetime import datetime

import rasterio
import geopandas as gpd
import matplotlib.pyplot as plt
import pystac
import sys
import constants
from shapely.geometry import mapping, box
from pystac.extensions.table import TableExtension
from pystac import Asset, MediaType
from pystac.extensions.classification import ClassificationExtension, Classification



### 2. Defining the variables used in the notebook

In [14]:

base_dir="data/"
qml_path="data/style_file.qml"
vector_qml_file ="data/swb_style.qml"

raster_filename="saraikela-kharsawan_gobindpur_2023-07-01_2024-06-30_LULCmap_10m.tif"
vector_filename="swb2_saraikela-kharsawan_gobindpur.geojson"

corestack_dir = os.path.join(base_dir, "CorestackCatalogs")
gobindpur_dir = os.path.join(corestack_dir, "gobindpur")
raster_dir = os.path.join(gobindpur_dir, "raster")
vector_dir = os.path.join(gobindpur_dir, "vector")

os.makedirs(raster_dir, exist_ok=True)
os.makedirs(vector_dir, exist_ok=True)


raster_path = os.path.join(base_dir, raster_filename)
vector_path = os.path.join(base_dir, vector_filename)

raster_thumbnail = os.path.join(raster_dir, "raster_thumbnail.png")
vector_thumbnail = os.path.join(vector_dir, "vector_thumbnail.png")

raster_style_file = os.path.join(base_dir, "style_file.qml")
vector_style_file = os.path.join(base_dir, "swb_style.qml")

blocks_info = [
    {
        "block": "gobindpur",
        "location": "jharkhand",
        "raster_file": "saraikela-kharsawan_gobindpur_2023-07-01_2024-06-30_LULCmap_10m.tif",
        "vector_file": "swb2_saraikela-kharsawan_gobindpur.geojson",
        "raster_style_file": "style_file.qml",
        "vector_style_file": "swb_style.qml"
    },
    {
        "block": "mirzapur",
        "location": "uttar_pradesh",
        "raster_file":"Mirzapur_Mirzapur_2023-07-01_2024-06-30_LULCmap_10m.tif",
        "vector_file":"surface_waterbodies_mirzapur_mirzapur.geojson",
        "raster_style_file": "style_file.qml",
        "vector_style_file": "swb_style.qml"
    },
    {
        "block": "koraput",
        "location": "odisha",
        "raster_file": "Narayanpatana_Koraput_2023-07-01_2024-06-30_LULCmap_10m.tif",
        "vector_file": "surface_waterbodies_koraput_narayanpatana.geojson",
        "raster_style_file": "style_file.qml",
        "vector_style_file": "swb_style.qml"
    }
]



### 3. For Raster layers the data range fecthed from filename

In [15]:
def extract_raster_dates_from_filename(raster_filename):
    try:
        print(raster_filename)
        parts = raster_filename.split('_')
        start_date = datetime.strptime(parts[2], "%Y-%m-%d")
        end_date = datetime.strptime(parts[3], "%Y-%m-%d")
        print(start_date)
        print(end_date)
    except Exception as e:
        raise ValueError(f"Failed to extract raster dates from filename '{raster_filename}': {e}")
        
    return start_date, end_date    

In [16]:
extract_raster_dates_from_filename(raster_filename=raster_filename)

saraikela-kharsawan_gobindpur_2023-07-01_2024-06-30_LULCmap_10m.tif
2023-07-01 00:00:00
2024-06-30 00:00:00


(datetime.datetime(2023, 7, 1, 0, 0), datetime.datetime(2024, 6, 30, 0, 0))

### 4. Parsing the QML file for Raster Layers

In [17]:


def parse_qml_classes(qml_path):
    tree = ET.parse(qml_path)
    root = tree.getroot()
    classes = []

    for entry in root.findall(".//paletteEntry"):
        class_info = {}
        for attr_key, attr_value in entry.attrib.items():
            if attr_key == "value":
                try:
                    class_info[attr_key] = int(attr_value)
                except ValueError:
                    class_info[attr_key] = attr_value
            else:
                class_info[attr_key] = attr_value
        classes.append(class_info)
    return classes

### 5. Generating the thumbnails from the files 

In [18]:
def generate_raster_thumbnail(tif_path, out_path):
    with rasterio.open(tif_path) as src:
        arr = src.read(1)
    plt.figure(figsize=(3, 3))
    plt.imshow(arr, cmap="tab20")
    plt.axis('off')
    plt.savefig(out_path, bbox_inches='tight', pad_inches=0)
    plt.close()

def generate_vector_thumbnail(vector_path, out_path):
    gdf = gpd.read_file(vector_path)
    if gdf.crs is None or gdf.crs.to_epsg() != 4326:
        gdf = gdf.to_crs(epsg=4326)
    fig, ax = plt.subplots(figsize=(3, 3))
    fig.patch.set_facecolor("white")
    ax.set_facecolor("white")
    gdf.plot(ax=ax, color="lightblue", edgecolor="blue", linewidth=0.5)
    ax.axis('off')
    plt.savefig(out_path, dpi=150, bbox_inches='tight', pad_inches=0, facecolor=fig.get_facecolor())
    plt.close()


### 6. Creating the Raster items and adding the assets

In [19]:
def create_raster_item(
    block,
    raster_filename,
    raster_path,
    raster_dir,
    raster_thumbnail,
    raster_style_file
):
    try:
        start_date, end_date = extract_raster_dates_from_filename(raster_filename=raster_filename)
    except ValueError as e:
        raise RuntimeError(f"Raster item creation failed")
    
    with rasterio.open(raster_path) as src:
        bounds = src.bounds
        geom = mapping(box(*bounds))
        bbox = [bounds.left, bounds.bottom, bounds.right, bounds.top]

    generate_raster_thumbnail(raster_path, raster_thumbnail)
    style_info = parse_qml_classes(raster_style_file)

    print(style_info)
   
    style_json_path = os.path.join(raster_dir, "legend.json")
    with open(style_json_path, "w") as f:
        json.dump(style_info, f, indent=2)

    

    

    item = pystac.Item(
        id=constants.raster_lulc_id,
        geometry=geom,
        bbox=bbox,
        datetime=start_date,
        start_datetime= start_date,
        end_datetime= end_date,
        properties={
            "title" :constants.raster_lulc_title,
            "description":constants.raster_lulc_description,
            "classification:classes": style_info,
            
        }
    )
    print(item)
    

    item.add_asset("data", Asset(
        href=f"{constants.data_url}/{raster_filename}",
        media_type=MediaType.GEOTIFF,
        roles=["data"],
        title="Raster Layer"
    ))


    ## Using classification extension 
    classification_ext = ClassificationExtension.ext(item.assets["data"], add_if_missing=True)

    stac_classes = []
    for cls in style_info:
        
        stac_class_obj = Classification.create(
            value=int(cls["value"]),
            name=cls.get("name") or cls.get("label"),
            description=cls.get("description"),
        )
        stac_classes.append(stac_class_obj)

    classification_ext.classes = stac_classes




    item.add_asset("thumbnail", Asset(
        href=f"{constants.base_url}/CorestackCatalogs/raster/{os.path.basename(raster_thumbnail)}",
        media_type=MediaType.PNG,
        roles=["thumbnail"],
        title="Raster Thumbnail"
    ))

    item.add_asset("legend", Asset(
        href=f"{constants.base_url}/CorestackCatalogs/raster/{os.path.basename(style_json_path)}",
        media_type=MediaType.JSON,
        roles=["metadata"],
        title="Legend JSON"
    ))

    item.add_asset("style", Asset(
        href=f"{constants.base_url}/CorestackCatalogs/raster/{os.path.basename(raster_style_file)}",
        media_type=MediaType.XML,
        roles=["metadata"],
        title="Raster Style (QML)"
    ))

    item.set_self_href(os.path.join(raster_dir, "item.json"))
    item.save_object()
    return item


### 7.Creating the Vector items and adding the assets

In [20]:


def create_vector_item(
        block,
        vector_filename,
        vector_path,
        vector_dir,
        vector_thumbnail,
        vector_style_file
    ):
    start_date = constants.DEFAULT_START_DATE
    end_date = constants.DEFAULT_END_DATE

    
    gdf = gpd.read_file(vector_path)

   
    geom = mapping(gdf.unary_union)
    bounds = gdf.total_bounds
    bbox = [float(b) for b in bounds]

    
    generate_vector_thumbnail(vector_path, vector_thumbnail)




    item = pystac.Item(
        id=constants.swb_vector_id,
        geometry=geom,
        bbox=bbox,
        datetime=start_date,
        start_datetime=start_date,
        end_datetime=end_date,
        properties={
            "title": constants.swb_vector_title,
            "description": constants.swb_vector_description,
        }
    )

    # Use table extension
    table_ext = TableExtension.ext(item, add_if_missing=True)

    table_ext.columns = [
        {
            "name": col,
            "type": str(dtype),
        }
        for col, dtype in gdf.dtypes.items()
    ]

    
    item.properties["table:summary"] = {
    "number_of_records": gdf.shape[0]
}


    
    item.add_asset("data", Asset(
        href=f"{constants.data_url}/{vector_filename}",
        media_type=MediaType.GEOJSON,
        roles=["data"],
        title="Vector Layer"
    ))

    item.add_asset("thumbnail", Asset(
        href=f"{constants.base_url}/CorestackCatalogs/vector/{os.path.basename(vector_thumbnail)}",
        media_type=MediaType.PNG,
        roles=["thumbnail"],
        title="Vector Thumbnail"
    ))

    item.add_asset("style", Asset(
        href=f"{constants.base_url}/CorestackCatalogs/vector/{os.path.basename(vector_style_file)}",
        media_type=MediaType.XML,
        roles=["style"],
        title="Vector Style"
    ))


    
    item.set_self_href(os.path.join(vector_dir, "item.json"))
    item.save_object()

    return item


In [21]:
def generate_stac_for_block(info):
    base_dir = 'data/'
    corestack_dir = os.path.join(base_dir, 'CorestackCatalogs')

    block = info['block']
    location = info['location']

    raster_filename = info['raster_file']
    vector_filename = info['vector_file']
    raster_style_file = os.path.join(base_dir, info['raster_style_file'])
    vector_style_file = os.path.join(base_dir, info['vector_style_file'])
    raster_path = os.path.join(base_dir, raster_filename)
    vector_path = os.path.join(base_dir, vector_filename)

    location_dir = os.path.join(corestack_dir, location)
    block_dir = os.path.join(location_dir, block)

    raster_dir = os.path.join(block_dir, 'raster')
    vector_dir = os.path.join(block_dir, 'vector')

    os.makedirs(raster_dir, exist_ok=True)
    os.makedirs(vector_dir, exist_ok=True)

    raster_thumbnail = os.path.join(raster_dir, f'{block}_raster_thumbnail.png')
    vector_thumbnail = os.path.join(vector_dir, f'{block}_vector_thumbnail.png')

    raster_item = create_raster_item(block, raster_filename, raster_path, raster_dir, raster_thumbnail, raster_style_file)
    vector_item = create_vector_item(block, vector_filename, vector_path, vector_dir, vector_thumbnail, vector_style_file)

    block_catalog = pystac.Catalog(
        id=block,
        title=f"STAC for {block}",
        description=f"STAC catalog for {block} block data in {location}"
    )
    block_catalog.add_item(raster_item)
    block_catalog.add_item(vector_item)
    block_catalog.set_self_href(os.path.join(block_dir, 'catalog.json'))
    block_catalog.normalize_and_save(block_dir, catalog_type=pystac.CatalogType.SELF_CONTAINED)
    print(f" STAC catalog created for block: {block} in {location}")

    ##Load location catalog
    location_catalog_path = os.path.join(location_dir, 'catalog.json')

    location_catalog_modified = False

    if os.path.exists(location_catalog_path):
        location_catalog = pystac.read_file(location_catalog_path)
        print(f"Loaded existing location catalog: {location}")
    else:
        os.makedirs(location_dir, exist_ok=True)
        location_catalog = pystac.Catalog(
            id=location,
            title=f"STAC for {location}",
            description=f"STAC catalog for data in {location}"
        )
        location_catalog.set_self_href(location_catalog_path)
        print(f"Created new location catalog: {location}")

        location_catalog_modified = True 

    child_id_to_add = block_catalog.id
    existing_child_ids = {child.id for child in location_catalog.get_children()}
    
    
    if child_id_to_add not in existing_child_ids: 
        child_to_add = pystac.read_file(os.path.join(block_dir, 'catalog.json'))
        location_catalog.add_child(child_to_add)
        
        location_catalog_modified = True
        print(f"Added block '{block}' to location catalog '{location}'.")
    else:
        print(f"Block '{block}' already exists in location catalog '{location}'")
    
    
    if location_catalog_modified:
        location_catalog.normalize_and_save(location_dir, catalog_type=pystac.CatalogType.SELF_CONTAINED)
        print(f"Updated location catalog for: {location}")   




In [None]:
def generate_root_catalog(blocks_info, base_dir, corestack_dir):
    root_catalog_path = os.path.join(corestack_dir, "catalog.json")

    
    if os.path.exists(root_catalog_path):
        root_catalog = pystac.read_file(root_catalog_path)
        print("Loaded existing root catalog.")
    else:
        root_catalog = pystac.Catalog(
            id="corestack",
            title="CorestackCatalogs",
            description="Root catalog containing all location-based sub-catalogs"
        )
        
        root_catalog.set_self_href(root_catalog_path) 
        print("Created new root catalog.")
    
    
    existing_root_children_ids = {child.id for child in root_catalog.get_children()}

    

    for info in blocks_info:
        location = info["location"]
        location_catalog_path = os.path.join(corestack_dir, location, "catalog.json")

        
        if os.path.exists(location_catalog_path):
            if location not in existing_root_children_ids: 
                location_catalog = pystac.read_file(location_catalog_path)
                root_catalog.add_child(location_catalog)
                existing_root_children_ids.add(location) 
                print(f"Added location catalog '{location}' to root catalog.")
            else:
                print(f"Location catalog '{location}' already linked in root catalog.")
        else:
            print(f"Warning: Location catalog not found for {location} at {location_catalog_path}.")
                
    root_catalog.set_self_href(os.path.join(corestack_dir, "catalog.json"))
    root_catalog.normalize_and_save(corestack_dir, catalog_type=pystac.CatalogType.SELF_CONTAINED)
    print(f"Root catalog generated at {os.path.join(corestack_dir, 'catalog.json')}")

In [23]:
for block_info in blocks_info:
    print(f"Processing block: {block_info['block']}")
    generate_stac_for_block(block_info)

Processing block: gobindpur
saraikela-kharsawan_gobindpur_2023-07-01_2024-06-30_LULCmap_10m.tif
2023-07-01 00:00:00
2024-06-30 00:00:00


[{'value': 0, 'label': 'clear', 'alpha': '0', 'color': '#000000'}, {'value': 1, 'label': 'built up', 'alpha': '255', 'color': '#ff0000'}, {'value': 2, 'label': 'kharif water', 'alpha': '255', 'color': '#74ccf4'}, {'value': 3, 'label': 'kharif and rabi water', 'alpha': '255', 'color': '#1ca3ec'}, {'value': 4, 'label': 'kharif and rabi and zaid water', 'alpha': '255', 'color': '#0f5e9c'}, {'value': 5, 'label': 'croplands', 'alpha': '255', 'color': '#f1c232'}, {'value': 6, 'label': 'Tree/Forests', 'alpha': '255', 'color': '#38761d'}, {'value': 7, 'label': 'barren lands', 'alpha': '255', 'color': '#a9a9a9'}, {'value': 8, 'label': 'Single Kharif Cropping', 'alpha': '255', 'color': '#bad93e'}, {'value': 9, 'label': 'Single Non-Kharif Cropping', 'alpha': '255', 'color': '#f59d22'}, {'value': 10, 'label': 'Double Cropping', 'alpha': '255', 'color': '#ff9371'}, {'value': 11, 'label': 'Triple Cropping', 'alpha': '255', 'color': '#b3561d'}, {'value': 12, 'label': 'Shrubs_Scrubs', 'alpha': '255', 

  geom = mapping(gdf.unary_union)


 STAC catalog created for block: gobindpur in jharkhand
Loaded existing location catalog: jharkhand
Block 'gobindpur' already exists in location catalog 'jharkhand'
Processing block: mirzapur
Mirzapur_Mirzapur_2023-07-01_2024-06-30_LULCmap_10m.tif
2023-07-01 00:00:00
2024-06-30 00:00:00
[{'value': 0, 'label': 'clear', 'alpha': '0', 'color': '#000000'}, {'value': 1, 'label': 'built up', 'alpha': '255', 'color': '#ff0000'}, {'value': 2, 'label': 'kharif water', 'alpha': '255', 'color': '#74ccf4'}, {'value': 3, 'label': 'kharif and rabi water', 'alpha': '255', 'color': '#1ca3ec'}, {'value': 4, 'label': 'kharif and rabi and zaid water', 'alpha': '255', 'color': '#0f5e9c'}, {'value': 5, 'label': 'croplands', 'alpha': '255', 'color': '#f1c232'}, {'value': 6, 'label': 'Tree/Forests', 'alpha': '255', 'color': '#38761d'}, {'value': 7, 'label': 'barren lands', 'alpha': '255', 'color': '#a9a9a9'}, {'value': 8, 'label': 'Single Kharif Cropping', 'alpha': '255', 'color': '#bad93e'}, {'value': 9, 'l

  geom = mapping(gdf.unary_union)


 STAC catalog created for block: mirzapur in uttar_pradesh
Loaded existing location catalog: uttar_pradesh
Block 'mirzapur' already exists in location catalog 'uttar_pradesh'
Processing block: koraput
Narayanpatana_Koraput_2023-07-01_2024-06-30_LULCmap_10m.tif
2023-07-01 00:00:00
2024-06-30 00:00:00
[{'value': 0, 'label': 'clear', 'alpha': '0', 'color': '#000000'}, {'value': 1, 'label': 'built up', 'alpha': '255', 'color': '#ff0000'}, {'value': 2, 'label': 'kharif water', 'alpha': '255', 'color': '#74ccf4'}, {'value': 3, 'label': 'kharif and rabi water', 'alpha': '255', 'color': '#1ca3ec'}, {'value': 4, 'label': 'kharif and rabi and zaid water', 'alpha': '255', 'color': '#0f5e9c'}, {'value': 5, 'label': 'croplands', 'alpha': '255', 'color': '#f1c232'}, {'value': 6, 'label': 'Tree/Forests', 'alpha': '255', 'color': '#38761d'}, {'value': 7, 'label': 'barren lands', 'alpha': '255', 'color': '#a9a9a9'}, {'value': 8, 'label': 'Single Kharif Cropping', 'alpha': '255', 'color': '#bad93e'}, {'

  geom = mapping(gdf.unary_union)


 STAC catalog created for block: koraput in odisha
Loaded existing location catalog: odisha
Block 'koraput' already exists in location catalog 'odisha'


In [24]:
generate_root_catalog(blocks_info, base_dir="data/", corestack_dir="data/CorestackCatalogs")

Loaded existing root catalog.
Location catalog 'jharkhand' already linked in root catalog. Skipping addition.
Location catalog 'uttar_pradesh' already linked in root catalog. Skipping addition.
Location catalog 'odisha' already linked in root catalog. Skipping addition.
Root catalog generated at data/CorestackCatalogs/catalog.json
