In [3]:
import json
import os
import shapely.geometry
from shapely.ops import transform
import pyproj

# Make folder if it doesn't exist
folder = "aois_json"
os.makedirs(folder, exist_ok=True)

# City centers: (lon, lat)
cities = {
    "Vienna": (16.3738, 48.2082),
    "Paris": (2.3522, 48.8566),
    "London": (-0.1276, 51.5074),
    "Toronto": (-79.3832, 43.6532),
    "Vancouver": (-123.1207, 49.2827),
    "San Francisco": (-122.4194, 37.7749),
    "Lisbon": (-9.1393, 38.7223),
    "Madrid": (-3.7038, 40.4168),
    "Barcelona": (2.1734, 41.3851),
    "Berlin": (13.4050, 52.5200),
    "Amsterdam": (4.9041, 52.3676),
    "Melbourne": (144.9631, -37.8136),
    "Sydney": (151.2093, -33.8688),
    "Auckland": (174.7633, -36.8485),
    "Seattle": (-122.3321, 47.6062)
}

# AOI size in meters (~5.12 km to match 512x512 pixels at 10 m)
size_m = 512 * 10  

def create_square_aoi(center_lon, center_lat, size_m):
    """
    Create a square AOI of size_m x size_m around the city center.
    """
    utm_zone = int((center_lon + 180) / 6) + 1
    utm = pyproj.Proj(proj='utm', zone=utm_zone, ellps='WGS84')
    wgs84 = pyproj.Proj(proj='latlong', datum='WGS84')

    project_to_utm = pyproj.Transformer.from_proj(wgs84, utm, always_xy=True).transform
    project_to_wgs = pyproj.Transformer.from_proj(utm, wgs84, always_xy=True).transform

    point = shapely.geometry.Point(center_lon, center_lat)
    point_utm = transform(project_to_utm, point)

    half_size = size_m / 2
    square_utm = shapely.geometry.box(
        point_utm.x - half_size,
        point_utm.y - half_size,
        point_utm.x + half_size,
        point_utm.y + half_size
    )

    square_wgs = transform(project_to_wgs, square_utm)
    return square_wgs

# Generate one JSON per city
for city, (lon, lat) in cities.items():
    square = create_square_aoi(lon, lat, size_m)
    polygon_coords = [list(coord) for coord in square.exterior.coords]
    geojson = {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "properties": {"city": city},
                "geometry": {"type": "Polygon", "coordinates": [polygon_coords]}
            }
        ]
    }
    filepath = os.path.join(folder, f"{city.replace(' ', '_')}.geojson")
    with open(filepath, "w") as f:
        json.dump(geojson, f, indent=2)
    print(f"Saved {filepath}")

print("All 15 city AOIs saved as separate GeoJSON files in 'aois_json' folder.")

ModuleNotFoundError: No module named 'shapely'

In [None]:
import rioxarray as rxr
import geopandas as gpd
import numpy as np
import xarray as xr
import os
import glob

# ------------------------------
# Paths and Configuration
# ------------------------------
aoi_base_folder = "aois_json"
sentinel_base_folder = "sentinel_data"

# Define cities and months
cities = ["Amsterdam", "Auckland", "Barcelona", "Berlin", "Kyiv"]
months = ["April", "August", "November"]
band_substrings = ["B02", "B03", "B04", "B08"]

# ------------------------------
# Process each city
# ------------------------------
for city in cities:
    print(f"\n{'='*60}")
    print(f"PROCESSING CITY: {city}")
    print(f"{'='*60}")
    
    # Define paths for this city
    aoi_file = os.path.join(aoi_base_folder, f"{city}.geojson")
    output_file = os.path.join(sentinel_base_folder, city, f"{city}_MultiMonth_stack.tif")
    
    # Check if AOI file exists
    if not os.path.exists(aoi_file):
        print(f"WARNING: AOI file not found for {city}: {aoi_file}")
        print(f"Skipping {city}...")
        continue
    
    # ------------------------------
    # Load AOI and extract geometries
    # ------------------------------
    aoi = gpd.read_file(aoi_file)
    
    # Merge multiple features if needed
    if len(aoi) > 1:
        merged_geom = aoi.unary_union
        geometries = [merged_geom]
    else:
        geometries = [aoi.geometry.iloc[0]]
    
    # Ensure geometries are in WGS84
    for i, g in enumerate(geometries):
        if aoi.crs is None:
            aoi.set_crs("EPSG:4326", inplace=True)
        if aoi.crs.to_epsg() != 4326:
            geometries[i] = g.to_crs("EPSG:4326")
    
    # ------------------------------
    # Process each month for this city
    # ------------------------------
    all_band_arrays = []
    all_band_names = []
    
    for month in months:
        folder_path = os.path.join(sentinel_base_folder, city, f"{city}-{month}-10m")
        print(f"\n=== Processing {city} - {month} ===")
        
        # Check if folder exists
        if not os.path.exists(folder_path):
            print(f"WARNING: Folder not found: {folder_path}")
            print(f"Skipping {month} for {city}...")
            continue
        
        month_band_dict = {}
        
        # Load and clip each band for this month
        for substring in band_substrings:
            matched_files = glob.glob(os.path.join(folder_path, f"*{substring}*"))
            if not matched_files:
                print(f"WARNING: No file found for band '{substring}' in {folder_path}")
                continue
            
            band_path = matched_files[0]
            band = rxr.open_rasterio(band_path, masked=True).squeeze()
            
            # Clip to AOI using plain list of shapely geometries
            band_clipped = band.rio.clip(geometries, crs="EPSG:4326")
            
            # Store with month-specific name
            band_name = f"{substring}-{month}"
            all_band_arrays.append(band_clipped)
            all_band_names.append(band_name)
            month_band_dict[substring] = band_clipped
            
            print(f"Loaded and clipped {band_name} -> shape: {band_clipped.shape}")
        
        # Only calculate indices if all required bands are present
        if len(month_band_dict) < 3:
            print(f"WARNING: Missing bands for {month}, skipping index calculations")
            continue
        
        # ------------------------------
        # Calculate NDVI for this month
        # ------------------------------
        if "B08" in month_band_dict and "B04" in month_band_dict:
            nir = month_band_dict["B08"].astype(np.float32)
            red = month_band_dict["B04"].astype(np.float32)
            
            ndvi = (nir - red) / (nir + red)
            ndvi = xr.where(np.isfinite(ndvi), ndvi, np.nan)
            ndvi_name = f"NDVI-{month}"
            all_band_arrays.append(ndvi)
            all_band_names.append(ndvi_name)
            print(f"Calculated {ndvi_name} -> range: [{float(ndvi.min()):.3f}, {float(ndvi.max()):.3f}]")
        
        # ------------------------------
        # Calculate EVI
        # ------------------------------
        if "B08" in month_band_dict and "B04" in month_band_dict and "B02" in month_band_dict:
            blue = month_band_dict["B02"].astype(np.float32)
            
            evi = 2.5 * (nir - red) / (nir + 6*red - 7.5*blue + 1)
            evi = xr.where(np.isfinite(evi), evi, np.nan)
            evi_name = f"EVI-{month}"
            all_band_arrays.append(evi)
            all_band_names.append(evi_name)
            print(f"Calculated {evi_name} -> range: [{float(evi.min()):.3f}, {float(evi.max()):.3f}]")
        
        # ------------------------------
        # Calculate SAVI
        # ------------------------------
        if "B08" in month_band_dict and "B04" in month_band_dict:
            L = 0.5
            savi = ((nir - red) * (1 + L)) / (nir + red + L)
            savi = xr.where(np.isfinite(savi), savi, np.nan)
            savi_name = f"SAVI-{month}"
            all_band_arrays.append(savi)
            all_band_names.append(savi_name)
            print(f"Calculated {savi_name} -> range: [{float(savi.min()):.3f}, {float(savi.max()):.3f}]")
    
    # ------------------------------
    # Stack all bands from all months for this city
    # ------------------------------
    if not all_band_arrays:
        print(f"\nWARNING: No bands processed for {city}, skipping stack creation")
        continue
    
    print(f"\n=== Creating final stack for {city} ===")
    stack = xr.concat(all_band_arrays, dim="band")
    stack = stack.assign_coords(band=all_band_names)
    
    # Convert entire stack to float32
    stack = stack.astype(np.float32)
    
    print(f"Stacked all bands > shape: {stack.shape}")
    print(f"Total bands: {len(all_band_names)}")
    
    # ------------------------------
    # Save as GeoTIFF
    # ------------------------------
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    stack.rio.to_raster(output_file, dtype=np.float32)
    print(f"\nSaved stacked GeoTIFF: {output_file}")
    print(f"Band order: {all_band_names}")

print(f"\n{'='*60}")
print("ALL CITIES PROCESSED!")
print(f"{'='*60}")


PROCESSING CITY: Amsterdam

=== Processing Amsterdam - April ===
Skipping April for Amsterdam...

=== Processing Amsterdam - August ===
Skipping August for Amsterdam...

=== Processing Amsterdam - November ===
Skipping November for Amsterdam...


PROCESSING CITY: Auckland

=== Processing Auckland - April ===
Skipping April for Auckland...

=== Processing Auckland - August ===
Skipping August for Auckland...

=== Processing Auckland - November ===
Skipping November for Auckland...


PROCESSING CITY: Barcelona

=== Processing Barcelona - April ===
Skipping April for Barcelona...

=== Processing Barcelona - August ===
Skipping August for Barcelona...

=== Processing Barcelona - November ===
Skipping November for Barcelona...


PROCESSING CITY: Berlin

=== Processing Berlin - April ===
Skipping April for Berlin...

=== Processing Berlin - August ===
Skipping August for Berlin...

=== Processing Berlin - November ===
Skipping November for Berlin...


PROCESSING CITY: Kyiv

=== Processing K

In [None]:
import osmnx as ox
import geopandas as gpd
import os

# ------------------------------
# Configuration
# ------------------------------
aoi_base_folder = "aois_json"
output_base_folder = "sentinel_data"
cities = ["Amsterdam", "Auckland", "Barcelona", "Berlin", "Kyiv"]

# Define tags for green areas
tags = {
    "leisure": ["park", "garden"],
    "landuse": ["forest", "grass", "meadow", "village_green"],
    "natural": ["wood", "scrub"]
}

# ------------------------------
# Process each city
# ------------------------------
for city in cities:
    print(f"\n{'='*60}")
    print(f"PROCESSING: {city}")
    print(f"{'='*60}")
    
    # Define paths
    aoi_file = os.path.join(aoi_base_folder, f"{city}.geojson")
    output_file = os.path.join(output_base_folder, city, f"{city}_OSM_green.geojson")
    
    # Check if AOI file exists
    if not os.path.exists(aoi_file):
        print(f"WARNING: AOI file not found: {aoi_file}")
        print(f"Skipping {city}...")
        continue
    
    try:
        # Load AOI
        aoi = gpd.read_file(aoi_file)
        aoi = aoi.to_crs("EPSG:4326")  # ensure WGS84
        polygon = aoi.geometry.iloc[0]  # get shapely polygon
        
        print(f"Loaded AOI for {city}")
        
        # Fetch green features from OSM
        print(f"Fetching OSM green areas...")
        green_features = ox.features_from_polygon(polygon, tags)
        
        # Keep only polygons
        green_features = green_features[green_features.geometry.type.isin(["Polygon", "MultiPolygon"])]
        
        if len(green_features) == 0:
            print(f"WARNING: No green area polygons found for {city}")
            continue
        
        # Calculate total area
        total_area_km2 = green_features.to_crs('EPSG:3857').area.sum() / 1e6
        
        # Save to GeoJSON
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        green_features.to_file(output_file, driver="GeoJSON")
        
        print(f"✓ Saved: {output_file}")
        print(f"  Features: {len(green_features)}")
        print(f"  Total area: {total_area_km2:.2f} km²")
        
    except Exception as e:
        print(f"ERROR processing {city}: {str(e)}")
        continue

print(f"\n{'='*60}")
print("ALL CITIES PROCESSED!")
print(f"{'='*60}")




PROCESSING: Amsterdam
Loaded AOI for Amsterdam
Fetching OSM green areas...
✓ Saved: sentinel_data/Amsterdam/Amsterdam_OSM_green.geojson
  Features: 4399
  Total area: 16.04 km²

PROCESSING: Auckland
Loaded AOI for Auckland
Fetching OSM green areas...
✓ Saved: sentinel_data/Auckland/Auckland_OSM_green.geojson
  Features: 1615
  Total area: 5.75 km²

PROCESSING: Barcelona
Loaded AOI for Barcelona
Fetching OSM green areas...
✓ Saved: sentinel_data/Barcelona/Barcelona_OSM_green.geojson
  Features: 1915
  Total area: 6.38 km²

PROCESSING: Berlin
Loaded AOI for Berlin
Fetching OSM green areas...
✓ Saved: sentinel_data/Berlin/Berlin_OSM_green.geojson
  Features: 3371
  Total area: 20.91 km²

PROCESSING: Kyiv
Loaded AOI for Kyiv
Fetching OSM green areas...
✓ Saved: sentinel_data/Kyiv/Kyiv_OSM_green.geojson
  Features: 1471
  Total area: 24.64 km²

ALL CITIES PROCESSED!


In [None]:
import rasterio
from rasterio.features import rasterize
import geopandas as gpd
import numpy as np
import os

# ------------------------------
# Configuration
# ------------------------------
sentinel_base_folder = "sentinel_data"
cities = ["Amsterdam", "Auckland", "Barcelona", "Berlin", "Kyiv"]

# ------------------------------
# Process each city
# ------------------------------
for city in cities:
    print(f"\n{'='*60}")
    print(f"PROCESSING: {city}")
    print(f"{'='*60}")
    
    # Define paths
    stack_path = os.path.join(sentinel_base_folder, city, f"{city}_MultiMonth_stack.tif")
    osm_path = os.path.join(sentinel_base_folder, city, f"{city}_OSM_green.geojson")
    label_path = os.path.join(sentinel_base_folder, city, f"{city}_OSM_labels.tif")
    
    # Check if required files exist
    if not os.path.exists(stack_path):
        print(f"WARNING: Stack file not found: {stack_path}")
        print(f"Skipping {city}...")
        continue
    
    if not os.path.exists(osm_path):
        print(f"WARNING: OSM green file not found: {osm_path}")
        print(f"Skipping {city}...")
        continue
    
    try:
        # Load Sentinel-2 stack metadata
        with rasterio.open(stack_path) as src:
            transform = src.transform
            out_shape = (src.height, src.width)
            crs = src.crs
        
        print(f"Loaded stack metadata: {out_shape[0]}x{out_shape[1]} pixels")
        
        # Load GeoJSON of green areas from OSM
        green_features = gpd.read_file(osm_path)
        
        # Ensure CRS matches Sentinel-2
        green_features = green_features.to_crs(crs)
        
        # Keep only polygons
        green_features = green_features[green_features.geometry.type.isin(["Polygon", "MultiPolygon"])]
        
        if len(green_features) == 0:
            print(f"WARNING: No polygon features found for {city}")
            continue
        
        print(f"Loaded {len(green_features)} green area polygons")
        
        # ------------------------------
        # Rasterize with all_touched approach
        # ------------------------------
        print("Rasterizing green areas...")
        
        labels = rasterize(
            [(geom, 1) for geom in green_features.geometry],
            out_shape=out_shape,
            transform=transform,
            fill=0,
            all_touched=True,   # include pixels partially covered
            dtype="uint8"
        )
        
        green_count = np.sum(labels == 1)
        total_pixels = labels.size
        print(f"  Completed: {green_count}/{total_pixels} pixels labeled as green ({100*green_count/total_pixels:.2f}%)")
        
        # ------------------------------
        # Save raster labels
        # ------------------------------
        with rasterio.open(
            label_path,
            "w",
            driver="GTiff",
            height=out_shape[0],
            width=out_shape[1],
            count=1,
            dtype="uint8",
            crs=crs,
            transform=transform,
            compress="lzw"  # Add compression to reduce file size
        ) as dst:
            dst.write(labels, 1)
        
        print(f"✓ Saved: {label_path}")
        
    except Exception as e:
        print(f"ERROR processing {city}: {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print(f"\n{'='*60}")
print("ALL CITIES PROCESSED!")
print(f"{'='*60}")


PROCESSING: Amsterdam
Skipping Amsterdam...

PROCESSING: Auckland
Skipping Auckland...

PROCESSING: Barcelona
Skipping Barcelona...

PROCESSING: Berlin
Skipping Berlin...

PROCESSING: Kyiv
Skipping Kyiv...

ALL CITIES PROCESSED!


In [None]:
import rasterio
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Load Sentinel-2 stack
with rasterio.open("sentinel_data/Amsterdam/Amsterdam_MultiMonth_stack.tif") as src:
    X = src.read()  # shape: (bands, height, width)

# Load labels
with rasterio.open("sentinel_data/Amsterdam/Amsterdam_OSM_labels.tif") as src:
    y = src.read(1)  # shape: (height, width)

# Flatten to (n_samples, n_features)
n_bands, h, w = X.shape
X_flat = X.reshape(n_bands, -1).T  # shape: (h*w, n_bands)
y_flat = y.flatten()                # shape: (h*w,)

# Remove NaN values
mask = ~np.isnan(X_flat).any(axis=1)
X_flat = X_flat[mask]
y_flat = y_flat[mask]

# ------------------------------
# Check class distribution
# ------------------------------
unique, counts = np.unique(y_flat, return_counts=True)
total = len(y_flat)
print("="*60)
print("CLASS DISTRIBUTION:")
print("="*60)
for label, count in zip(unique, counts):
    print(f"Class {label}: {count:,} samples ({100*count/total:.2f}%)")
print(f"\nImbalance ratio: {counts[0]/counts[1]:.1f}:1 (non-green:green)")
print("="*60)

# ------------------------------
# Split with stratification
# ------------------------------
# Stratify ensures both train and test have similar class distributions
X_train, X_test, y_train, y_test = train_test_split(
    X_flat, y_flat, 
    test_size=0.2, 
    random_state=42,
    stratify=y_flat  # Important for imbalanced data!
)

print(f"\nTrain set: {len(y_train):,} samples")
print(f"  Green: {np.sum(y_train==1):,} ({100*np.mean(y_train==1):.2f}%)")
print(f"Test set: {len(y_test):,} samples")
print(f"  Green: {np.sum(y_test==1):,} ({100*np.mean(y_test==1):.2f}%)")

# ------------------------------
# Train Random Forest
# ------------------------------
print("\n" + "="*60)
print("TRAINING RANDOM FOREST...")
print("="*60)

clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=25,
    min_samples_split=10,      # Prevent overfitting on minority class
    min_samples_leaf=5,        # Ensure leaves have enough samples
    class_weight="balanced",   # Handle imbalance
    n_jobs=-1,
    random_state=42,
    verbose=1
)
clf.fit(X_train, y_train)

# ------------------------------
# Evaluate with multiple metrics
# ------------------------------
y_pred = clf.predict(X_test)

print("\n" + "="*60)
print("EVALUATION METRICS:")
print("="*60)

# Classification report (precision, recall, F1 for each class)
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=["Non-green", "Green"]))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print("                Predicted")
print("              Non-green  Green")
print(f"Actual Non-g  {cm[0,0]:8d}  {cm[0,1]:6d}")
print(f"       Green  {cm[1,0]:8d}  {cm[1,1]:6d}")

# Additional metrics for imbalanced data
print("\n" + "-"*60)
print("IMBALANCED DATA METRICS:")
print("-"*60)
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score (Green):  {f1_score(y_test, y_pred, pos_label=1):.3f}")
print(f"F1 Score (Macro):  {f1_score(y_test, y_pred, average='macro'):.3f}")

# Calculate specificity and sensitivity manually
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)  # Recall for green class
specificity = tn / (tn + fp)  # Recall for non-green class
print(f"Sensitivity (Green Recall):     {sensitivity:.3f}")
print(f"Specificity (Non-green Recall): {specificity:.3f}")

# ------------------------------
# Feature importance
# ------------------------------
print("\n" + "="*60)
print("BAND IMPORTANCE:")
print("="*60)

# Get band names (you can customize this based on your actual bands)
band_names = [f"Band_{i+1}" for i in range(n_bands)]

# Sort by importance
importance_idx = np.argsort(clf.feature_importances_)[::-1]
for idx in importance_idx[:10]:  # Top 10
    print(f"{band_names[idx]:15s}: {clf.feature_importances_[idx]:.4f}")

print("\n" + "="*60)
print("TRAINING COMPLETE!")
print("="*60)

RasterioIOError: sentinel_data/Amsterdam/Amsterdam_MultiMonth_stack.tif: No such file or directory