## Essential Functions and Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch
from pathlib import Path
import numpy as np
import tifffile as tiff
from difflib import get_close_matches

## Read data

In [2]:
# Main dataset directory
dataset_dir = Path('spotlite_dataset_loca1_date1_dir1')

images_dir = dataset_dir / 'images'
masks_dir = dataset_dir / 'masks'

assert images_dir.exists(), f"Image folder not found: {images_dir}"
assert masks_dir.exists(), f"Mask folder not found: {masks_dir}"

# Collect all image and mask files (multiple extensions), ordered by base name
image_paths = sorted(
    [*images_dir.glob('*.tif'), *images_dir.glob('*.tiff')],
    key=lambda p: p.stem
)
# NOTE(review): PNG/JPG masks are collected here, but every mask is read with
# tifffile further down — confirm it can decode those formats in this
# environment, or convert such masks to TIFF beforehand.
mask_paths = sorted(
    [*masks_dir.glob('*.tif'), *masks_dir.glob('*.tiff'),
     *masks_dir.glob('*.png'), *masks_dir.glob('*.jpg')],
    key=lambda p: p.stem
)

# Dictionary of masks by base name
mask_dict = {p.stem: p for p in mask_paths}
if len(mask_dict) != len(mask_paths):
    # Two files such as "a.tif" and "a.png" collapse onto the same key.
    print("Warning: several mask files share a base name; only one file per name is kept.")
unused_masks = set(mask_dict.keys())

# Pass 1 — exact base-name matches.
# These are resolved for ALL images first, so that the approximate pass below
# can never take a mask that belongs to an exactly matching image.
matched = {}
for img_path in image_paths:
    stem = img_path.stem
    if stem in mask_dict:
        matched[img_path] = mask_dict[stem]
        unused_masks.discard(stem)

# Pass 2 — approximate matches, restricted to masks nobody has claimed yet,
# so a single mask is never silently paired with several different images.
for img_path in image_paths:
    if img_path in matched:
        continue
    candidates = get_close_matches(img_path.stem, sorted(unused_masks), n=1, cutoff=0.6)
    if candidates:
        sel = candidates[0]
        matched[img_path] = mask_dict[sel]
        unused_masks.discard(sel)
        # Report every fuzzy pairing so it can be audited by eye.
        print(f"Note: {img_path.name} paired with {mask_dict[sel].name} by approximate name match.")
    else:
        # No match found; just notify and skip
        print(f"Warning: no mask found for {img_path.name}; skipping.")

imgs_array, masks_array = [], []

# Load the pairs in image order (same ordering as image_paths)
for img_path in image_paths:
    mask_path = matched.get(img_path)
    if mask_path is None:
        continue

    # Read image and mask
    img = tiff.imread(str(img_path))
    msk = tiff.imread(str(mask_path))

    # Check dimensions: the mask must match the image's first two axes
    if img.shape[:2] != msk.shape:
        raise ValueError(
            f"Incompatible dimensions: {img_path.name} {img.shape[:2]} vs "
            f"{mask_path.name} {msk.shape}"
        )

    imgs_array.append(img)
    masks_array.append(msk)

# Remaining masks
if unused_masks:
    print("Warning: these masks were not used (no corresponding image):")
    for s in sorted(unused_masks):
        print("  ", s)

# Fail with a clear message instead of np.stack's generic error on an empty list
if not imgs_array:
    raise ValueError(f"No image/mask pairs could be loaded from {dataset_dir}")

# Stack arrays
imgs = np.stack(imgs_array, axis=0)   # (N, H, W, C)
masks = np.stack(masks_array, axis=0)  # (N, H, W)

print(f'Final: {len(imgs)} pairs loaded.')
print(f'Shape of imgs: {imgs.shape}')
print(f'Shape of masks: {masks.shape}')


Final: 391 pairs loaded.
Shape of imgs: (391, 512, 512, 4)
Shape of masks: (391, 512, 512)


## Analyse Vegetation Annotation

In [None]:
import numpy as np

def analyze_total_subannotation(imgs, masks, ndvi_threshold=0.2, classes_vegetation=(2, 3, 4),
                                red_band=0, nir_band=3):
    """Compare NDVI-estimated vegetation cover with annotated vegetation cover.

    NDVI is computed per pixel as ``(NIR - R) / (NIR + R + 1e-5)``; a pixel is
    counted as estimated vegetation when its NDVI exceeds ``ndvi_threshold``.
    A pixel is counted as annotated vegetation when its mask value is one of
    ``classes_vegetation``.

    Parameters
    ----------
    imgs : array_like, 4-D
        Image stack indexed as (N, H, W, C).
    masks : array_like
        Label masks with the same number of pixels as one band of ``imgs``.
    ndvi_threshold : float
        NDVI value above which a pixel is considered vegetation.
    classes_vegetation : iterable of int
        Mask values that denote vegetation.
    red_band, nir_band : int
        Channel indices of the red and near-infrared bands in ``imgs``.

    Returns
    -------
    dict
        ``'vegetation_estimated'`` and ``'vegetation_annotated'``: percentages
        of all mask pixels, rounded to whole numbers (0.0 for empty input).

    Raises
    ------
    ValueError
        If ``imgs`` is not 4-D or ``masks`` has a different pixel count.
    """
    imgs = np.asarray(imgs)
    masks = np.asarray(masks)
    if imgs.ndim != 4:
        raise ValueError(f"imgs must be 4-D (N, H, W, C), got shape {imgs.shape}")

    red = imgs[..., red_band]
    nir = imgs[..., nir_band]
    if not np.issubdtype(imgs.dtype, np.floating):
        # Integer arithmetic would wrap around: with unsigned data, NIR - R
        # becomes a large positive number whenever R > NIR, which would count
        # clearly non-vegetated pixels as vegetation.
        red = red.astype(np.float64)
        nir = nir.astype(np.float64)

    total_pixels = masks.size
    if total_pixels != red.size:
        raise ValueError(
            f"masks has {total_pixels} pixels but imgs has {red.size} per band"
        )
    if total_pixels == 0:
        return {'vegetation_estimated': 0.0, 'vegetation_annotated': 0.0}

    # The small epsilon avoids division by zero where both bands are 0.
    ndvi = (nir - red) / (nir + red + 1e-5)
    n_estimated = np.count_nonzero(ndvi > ndvi_threshold)
    # list(...) so that sets and other non-sequence iterables work with np.isin
    n_annotated = np.count_nonzero(np.isin(masks, list(classes_vegetation)))

    return {
        'vegetation_estimated': round(100 * n_estimated / total_pixels, 0),
        'vegetation_annotated': round(100 * n_annotated / total_pixels, 0),
    }

# This threshold attempts to separate real vegetation from soil, shadows, or built-up areas.
# | NDVI          | Interpretation                        |
# | ------------- | ------------------------------------- |
# | < 0           | Water, clouds, dense shadows          |
# | 0.0 – 0.1     | Exposed soil, rocks                   |
# | **0.2 – 0.3** | Sparse vegetation                     |
# | **0.3 – 0.6** | Moderate vegetation (grass, shrubs)   |
# | **> 0.6**     | Dense vegetation (healthy forests)    |

# Why is 0.2 a safe value to detect vegetation?
# It is a conservative cutoff point: it already captures light or sparse vegetation.
# It avoids excluding weak or young vegetation (which would have NDVI between 0.2 and 0.4).
# It is ideal for making general vegetation estimates over large areas.

# NDVI cutoff used for the vegetation estimate (conservative threshold)
THRESHOLD = 0.2

result = analyze_total_subannotation(imgs, masks, ndvi_threshold=THRESHOLD)

# Build the whole report first, then emit it in a single write
report = [f"Analysis with NDVI threshold {THRESHOLD}:"]
for key, value in result.items():
    report.append(f"{key}: {value}%")
print("\n".join(report))


Analysis with NDVI threshold 0.2:
vegetation_estimated: 37.0%
vegetation_annotated: 2.0%
