In [3]:
import json
import os
import shapely.geometry
from shapely.ops import transform
import pyproj

# Make folder if it doesn't exist
folder = "aois_json"
os.makedirs(folder, exist_ok=True)

# City centers: (lon, lat)
cities = {
    "Vienna": (16.3738, 48.2082),
    "Paris": (2.3522, 48.8566),
    "London": (-0.1276, 51.5074),
    "Toronto": (-79.3832, 43.6532),
    "Vancouver": (-123.1207, 49.2827),
    "San_Francisco": (-122.4194, 37.7749),
    "Lisbon": (-9.1393, 38.7223),
    "Madrid": (-3.7038, 40.4168),
    "Barcelona": (2.1734, 41.3851),
    "Berlin": (13.4050, 52.5200),
    "Amsterdam": (4.9041, 52.3676),
    "Melbourne": (144.9631, -37.8136),
    "Sydney": (151.2093, -33.8688),
    "Auckland": (174.7633, -36.8485),
    "Seattle": (-122.3321, 47.6062)
}

# AOI size in meters (~5.12 km to match 512x512 pixels at 10 m)
size_m = 512 * 10  

def create_square_aoi(center_lon, center_lat, size_m):
    """
    Create a square AOI of size_m x size_m around the city center.
    """
    utm_zone = int((center_lon + 180) / 6) + 1
    utm = pyproj.Proj(proj='utm', zone=utm_zone, ellps='WGS84')
    wgs84 = pyproj.Proj(proj='latlong', datum='WGS84')

    project_to_utm = pyproj.Transformer.from_proj(wgs84, utm, always_xy=True).transform
    project_to_wgs = pyproj.Transformer.from_proj(utm, wgs84, always_xy=True).transform

    point = shapely.geometry.Point(center_lon, center_lat)
    point_utm = transform(project_to_utm, point)

    half_size = size_m / 2
    square_utm = shapely.geometry.box(
        point_utm.x - half_size,
        point_utm.y - half_size,
        point_utm.x + half_size,
        point_utm.y + half_size
    )

    square_wgs = transform(project_to_wgs, square_utm)
    return square_wgs

# Generate one JSON per city
for city, (lon, lat) in cities.items():
    square = create_square_aoi(lon, lat, size_m)
    polygon_coords = [list(coord) for coord in square.exterior.coords]
    geojson = {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "properties": {"city": city},
                "geometry": {"type": "Polygon", "coordinates": [polygon_coords]}
            }
        ]
    }
    filepath = os.path.join(folder, f"{city.replace(' ', '_')}.geojson")
    with open(filepath, "w") as f:
        json.dump(geojson, f, indent=2)
    print(f"Saved {filepath}")

print("All 15 city AOIs saved as separate GeoJSON files in 'aois_json' folder.")

Saved aois_json/Vienna.geojson
Saved aois_json/Paris.geojson
Saved aois_json/London.geojson
Saved aois_json/Toronto.geojson
Saved aois_json/Vancouver.geojson
Saved aois_json/San_Francisco.geojson
Saved aois_json/Lisbon.geojson
Saved aois_json/Madrid.geojson
Saved aois_json/Barcelona.geojson
Saved aois_json/Berlin.geojson
Saved aois_json/Amsterdam.geojson
Saved aois_json/Melbourne.geojson
Saved aois_json/Sydney.geojson
Saved aois_json/Auckland.geojson
Saved aois_json/Seattle.geojson
All 15 city AOIs saved as separate GeoJSON files in 'aois_json' folder.


In [4]:
"""
Sentinel-2 Multi-Month Data Preprocessing Pipeline
Processes Sentinel-2 data for multiple cities and creates multi-temporal stacks
"""

import rasterio
import numpy as np
import os
from pathlib import Path
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.merge import merge

# ============================================================
# CONFIGURATION
# ============================================================

BASE_PATH = "sentinel_data"
CITIES = ["London", "Melbourne", "Paris", "Seattle", "San_Francisco"]

MONTHS = {
    "April": "April-10m",
    "August": "August-10m", 
    "November": "November-10m"
}

BANDS_10M = ["B02", "B03", "B04", "B08"]
OUTPUT_FOLDER = "processed_stacks"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# ============================================================
# HELPER FUNCTIONS
# ============================================================

def find_band_file(folder_path, band_name):
    folder = Path(folder_path)
    
    if not folder.exists():
        print(f"      ERROR: Folder does not exist: {folder}")
        return None
    
    patterns = [
        f"*_{band_name}_10m.jp2",
        f"*_{band_name}.jp2",
        f"**/*_{band_name}_10m.jp2",
        f"**/*_{band_name}.jp2",
        f"**/IMG_DATA/**/*{band_name}*.jp2"
    ]
    
    for pattern in patterns:
        matches = list(folder.glob(pattern))
        if matches:
            print(f"      ✓ Found {band_name}: {matches[0].name}")
            return str(matches[0])
    
    all_jp2_files = list(folder.rglob("*.jp2"))
    for file in all_jp2_files:
        if band_name in file.name:
            print(f"      ✓ Found {band_name} (fuzzy match): {file.name}")
            return str(file)
    
    print(f"      ✗ Could not find {band_name}")
    print(f"      DEBUG: Listing .jp2 files in {folder.name}:")
    if all_jp2_files:
        for f in all_jp2_files[:8]:
            print(f"        - {f.name}")
        if len(all_jp2_files) > 8:
            print(f"        ... and {len(all_jp2_files) - 8} more files")
    else:
        print(f"        - No .jp2 files found in {folder} or subdirectories!")
        all_files = list(folder.rglob("*"))
        print(f"        - Total files/folders: {len(all_files)}")
    
    return None


def load_and_stack_bands(folder_path, bands):
    band_arrays = []
    profile = None
    
    print(f"  Loading bands from: {Path(folder_path).name}")
    
    for band in bands:
        band_file = find_band_file(folder_path, band)
        
        if band_file is None:
            print(f"    WARNING: {band} not found, skipping...")
            continue
            
        print(f"    ✓ Loading {band}: {Path(band_file).name}")
        
        with rasterio.open(band_file) as src:
            band_data = src.read(1)
            band_arrays.append(band_data)
            
            if profile is None:
                profile = src.profile.copy()
    
    if not band_arrays:
        return None, None
    
    stacked = np.stack(band_arrays, axis=0)
    
    profile.update(
        count=len(band_arrays), 
        dtype='uint16',
        driver='GTiff',
        compress='lzw'
    )
    
    return stacked, profile


def create_multimonth_stack(city, base_path, months, bands):
    print(f"\n{'='*60}")
    print(f"PROCESSING CITY: {city}")
    print(f"{'='*60}")
    
    all_month_data = []
    final_profile = None
    
    for month_name, month_suffix in months.items():
        folder_path = f"{base_path}/{city}/{city}-{month_suffix}"
        
        print(f"\n=== Processing {city} - {month_name} ===")
        
        if not os.path.exists(folder_path):
            print(f"  WARNING: Folder not found: {folder_path}")
            print(f"  Skipping {month_name} for {city}...")
            continue
        
        month_stack, profile = load_and_stack_bands(folder_path, bands)
        
        if month_stack is None:
            print(f"  WARNING: No bands loaded for {month_name}, skipping...")
            continue
        
        print(f"  ✓ Loaded {month_stack.shape[0]} bands, shape: {month_stack.shape}")
        
        all_month_data.append(month_stack)
        
        if final_profile is None:
            final_profile = profile
    
    if not all_month_data:
        print(f"\n  ERROR: No data loaded for {city}, skipping stack creation")
        return None
    
    print(f"\n{'='*60}")
    print(f"CREATING MULTI-MONTH STACK")
    print(f"{'='*60}")
    
    full_stack = np.concatenate(all_month_data, axis=0)
    
    if full_stack.dtype != np.uint16:
        print(f"  Converting from {full_stack.dtype} to uint16...")
        full_stack = full_stack.astype(np.uint16)
    
    print(f"  Final stack shape: {full_stack.shape}")
    print(f"  Total bands: {full_stack.shape[0]} ({len(all_month_data)} months × {len(bands)} bands)")
    
    final_profile.update(
        count=full_stack.shape[0],
        dtype='uint16'
    )
    
    output_path = f"{OUTPUT_FOLDER}/{city}_MultiMonth_stack.tif"
    
    print(f"\n  Saving to: {output_path}")
    
    with rasterio.open(output_path, 'w', **final_profile) as dst:
        dst.write(full_stack)
    
    print(f"  ✓ Stack saved successfully!")
    
    print(f"\n  Stack Statistics:")
    print(f"    Min: {np.nanmin(full_stack):.2f}")
    print(f"    Max: {np.nanmax(full_stack):.2f}")
    print(f"    Mean: {np.nanmean(full_stack):.2f}")
    print(f"    NaN pixels: {np.isnan(full_stack).sum():,}")
    
    return output_path


# ============================================================
# MAIN PROCESSING LOOP
# ============================================================

def main():
    print("="*60)
    print("SENTINEL-2 MULTI-MONTH PREPROCESSING PIPELINE")
    print("="*60)
    print(f"\nBase path: {BASE_PATH}")
    print(f"Cities: {', '.join(CITIES)}")
    print(f"Months: {', '.join(MONTHS.keys())}")
    print(f"Bands: {', '.join(BANDS_10M)}")
    print(f"Output folder: {OUTPUT_FOLDER}")
    
    successful_cities = []
    failed_cities = []
    
    for city in CITIES:
        try:
            output_path = create_multimonth_stack(
                city=city,
                base_path=BASE_PATH,
                months=MONTHS,
                bands=BANDS_10M
            )
            
            if output_path:
                successful_cities.append(city)
            else:
                failed_cities.append(city)
                
        except Exception as e:
            print(f"\n  ERROR processing {city}: {str(e)}")
            failed_cities.append(city)
    
    print(f"\n\n{'='*60}")
    print("PROCESSING SUMMARY")
    print(f"{'='*60}")
    print(f"\n✓ Successfully processed: {len(successful_cities)} cities")
    for city in successful_cities:
        print(f"  - {city}")
    
    if failed_cities:
        print(f"\n✗ Failed: {len(failed_cities)} cities")
        for city in failed_cities:
            print(f"  - {city}")
    
    print(f"\n{'='*60}")
    print("PIPELINE COMPLETE!")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()

SENTINEL-2 MULTI-MONTH PREPROCESSING PIPELINE

Base path: sentinel_data
Cities: London, Melbourne, Paris, Seattle, San_Francisco
Months: April, August, November
Bands: B02, B03, B04, B08
Output folder: processed_stacks

PROCESSING CITY: London

=== Processing London - April ===
  Loading bands from: London-April-10m
      ✓ Found B02: T30UXC_20250402T105619_B02_10m.jp2
    ✓ Loading B02: T30UXC_20250402T105619_B02_10m.jp2
      ✓ Found B03: T30UXC_20250402T105619_B03_10m.jp2
    ✓ Loading B03: T30UXC_20250402T105619_B03_10m.jp2
      ✓ Found B04: T30UXC_20250402T105619_B04_10m.jp2
    ✓ Loading B04: T30UXC_20250402T105619_B04_10m.jp2
      ✓ Found B08: T30UXC_20250402T105619_B08_10m.jp2
    ✓ Loading B08: T30UXC_20250402T105619_B08_10m.jp2
  ✓ Loaded 4 bands, shape: (4, 10980, 10980)

=== Processing London - August ===
  Loading bands from: London-August-10m
      ✓ Found B02: T30UXC_20250825T105641_B02_10m.jp2
    ✓ Loading B02: T30UXC_20250825T105641_B02_10m.jp2
      ✓ Found B03: T3

In [5]:
import osmnx as ox
import geopandas as gpd
import os

aoi_base_folder = "aois_json"
output_base_folder = "sentinel_data"
cities = ["London", "Melbourne", "Paris", "Seattle", "San_Francisco"]

tags = {
    "leisure": ["park", "garden"],
    "landuse": ["forest", "grass", "meadow", "village_green"],
    "natural": ["wood", "scrub"]
}

for city in cities:
    print(f"\n{'='*60}")
    print(f"PROCESSING: {city}")
    print(f"{'='*60}")
    
    aoi_file = os.path.join(aoi_base_folder, f"{city}.geojson")
    output_file = os.path.join(output_base_folder, city, f"{city}_OSM_green.geojson")
    
    if not os.path.exists(aoi_file):
        print(f"WARNING: AOI file not found: {aoi_file}")
        print(f"Skipping {city}...")
        continue
    
    try:
        aoi = gpd.read_file(aoi_file)
        aoi = aoi.to_crs("EPSG:4326")
        polygon = aoi.geometry.iloc[0]
        
        print(f"Loaded AOI for {city}")
        
        print(f"Fetching OSM green areas...")
        green_features = ox.features_from_polygon(polygon, tags)
        
        green_features = green_features[green_features.geometry.type.isin(["Polygon", "MultiPolygon"])]
        
        if len(green_features) == 0:
            print(f"WARNING: No green area polygons found for {city}")
            continue
        
        total_area_km2 = green_features.to_crs('EPSG:3857').area.sum() / 1e6
        
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        green_features.to_file(output_file, driver="GeoJSON")
        
        print(f"✓ Saved: {output_file}")
        print(f"  Features: {len(green_features)}")
        print(f"  Total area: {total_area_km2:.2f} km²")
        
    except Exception as e:
        print(f"ERROR processing {city}: {str(e)}")
        continue

print(f"\n{'='*60}")
print("ALL CITIES PROCESSED!")
print(f"{'='*60}")


PROCESSING: London
Loaded AOI for London
Fetching OSM green areas...
✓ Saved: sentinel_data/London/London_OSM_green.geojson
  Features: 1346
  Total area: 17.34 km²

PROCESSING: Melbourne
Loaded AOI for Melbourne
Fetching OSM green areas...
✓ Saved: sentinel_data/Melbourne/Melbourne_OSM_green.geojson
  Features: 1909
  Total area: 14.89 km²

PROCESSING: Paris
Loaded AOI for Paris
Fetching OSM green areas...
✓ Saved: sentinel_data/Paris/Paris_OSM_green.geojson
  Features: 2559
  Total area: 7.99 km²

PROCESSING: Seattle
Loaded AOI for Seattle
Fetching OSM green areas...
✓ Saved: sentinel_data/Seattle/Seattle_OSM_green.geojson
  Features: 419
  Total area: 6.62 km²

PROCESSING: San_Francisco
Loaded AOI for San_Francisco
Fetching OSM green areas...
✓ Saved: sentinel_data/San_Francisco/San_Francisco_OSM_green.geojson
  Features: 645
  Total area: 12.62 km²

ALL CITIES PROCESSED!


In [6]:
import rasterio
from rasterio.features import rasterize
import geopandas as gpd
import numpy as np
import os

sentinel_base_folder = "sentinel_data"
stack_folder = "processed_stacks"        
cities = ["London", "Melbourne", "Paris", "Seattle", "San_Francisco"]

for city in cities:
    print(f"\n{'='*60}")
    print(f"PROCESSING: {city}")
    print(f"{'='*60}")
    
    stack_path = os.path.join(stack_folder, f"{city}_MultiMonth_stack.tif")
    osm_path = os.path.join(sentinel_base_folder, city, f"{city}_OSM_green.geojson")
    label_path = os.path.join(sentinel_base_folder, city, f"{city}_OSM_labels.tif")
    
    if not os.path.exists(stack_path):
        print(f"WARNING: Stack file not found: {stack_path}")
        print(f"Skipping {city}...")
        continue
    
    if not os.path.exists(osm_path):
        print(f"WARNING: OSM green file not found: {osm_path}")
        print(f"Skipping {city}...")
        continue
    
    try:
        with rasterio.open(stack_path) as src:
            transform = src.transform
            out_shape = (src.height, src.width)
            crs = src.crs
        
        print(f"Loaded stack metadata: {out_shape[0]}x{out_shape[1]} pixels")
        
        green_features = gpd.read_file(osm_path)
        green_features = green_features.to_crs(crs)
        green_features = green_features[green_features.geometry.type.isin(["Polygon", "MultiPolygon"])]
        
        if len(green_features) == 0:
            print(f"WARNING: No polygon features found for {city}")
            continue
        
        print(f"Loaded {len(green_features)} green area polygons")
        print("Rasterizing green areas...")
        
        labels = rasterize(
            [(geom, 1) for geom in green_features.geometry],
            out_shape=out_shape,
            transform=transform,
            fill=0,
            all_touched=True,
            dtype="uint8"
        )
        
        green_count = np.sum(labels == 1)
        total_pixels = labels.size
        print(f"  Completed: {green_count}/{total_pixels} pixels labeled as green ({100*green_count/total_pixels:.2f}%)")
        
        with rasterio.open(
            label_path,
            "w",
            driver="GTiff",
            height=out_shape[0],
            width=out_shape[1],
            count=1,
            dtype="uint8",
            crs=crs,
            transform=transform,
            compress="lzw"
        ) as dst:
            dst.write(labels, 1)
        
        print(f"✓ Saved: {label_path}")
        
    except Exception as e:
        print(f"ERROR processing {city}: {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print(f"\n{'='*60}")
print("ALL CITIES PROCESSED!")
print(f"{'='*60}")


PROCESSING: London
Loaded stack metadata: 10980x10980 pixels
Loaded 1346 green area polygons
Rasterizing green areas...
  Completed: 63528/120560400 pixels labeled as green (0.05%)
✓ Saved: sentinel_data/London/London_OSM_labels.tif

PROCESSING: Melbourne
Loaded stack metadata: 10980x10980 pixels
Loaded 1909 green area polygons
Rasterizing green areas...
  Completed: 96494/120560400 pixels labeled as green (0.08%)
✓ Saved: sentinel_data/Melbourne/Melbourne_OSM_labels.tif

PROCESSING: Paris
Loaded stack metadata: 10980x10980 pixels
Loaded 2559 green area polygons
Rasterizing green areas...
  Completed: 39572/120560400 pixels labeled as green (0.03%)
✓ Saved: sentinel_data/Paris/Paris_OSM_labels.tif

PROCESSING: Seattle
Loaded stack metadata: 10980x10980 pixels
Loaded 419 green area polygons
Rasterizing green areas...
  Completed: 34139/120560400 pixels labeled as green (0.03%)
✓ Saved: sentinel_data/Seattle/Seattle_OSM_labels.tif

PROCESSING: San_Francisco
Loaded stack metadata: 10980x

In [7]:
import rasterio
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score
import matplotlib.pyplot as plt
import pandas as pd
import json
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

SENTINEL_BASE = "processed_stacks"
CITY = "London"
OUTPUT_BASE = "results"
SAMPLE_PERCENTAGE = 0.01
os.makedirs(OUTPUT_BASE, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_folder = os.path.join(OUTPUT_BASE, f"{CITY}_{timestamp}")
os.makedirs(run_folder, exist_ok=True)

print("="*60)
print(f"RANDOM FOREST TRAINING - {CITY}")
print("="*60)
print(f"\nResults will be saved to: {run_folder}\n")

print("="*60)
print("LOADING DATA")
print("="*60)

stack_path = os.path.join(SENTINEL_BASE, f"{CITY}_MultiMonth_stack.tif")
labels_path = f"sentinel_data/{CITY}/{CITY}_OSM_labels.tif"

with rasterio.open(stack_path) as src:
    X = src.read()
    profile = src.profile.copy()
    transform = src.transform

print(f"✓ Loaded Sentinel-2 stack: {X.shape}")

with rasterio.open(labels_path) as src:
    y = src.read(1)

print(f"✓ Loaded labels: {y.shape}")

print("\n" + "="*60)
print("PREPARING DATA")
print("="*60)

n_bands, h, w = X.shape
X_flat = X.reshape(n_bands, -1).T
y_flat = y.flatten()

print(f"Original shape: {h} × {w} = {h*w:,} pixels")

mask = ~np.isnan(X_flat).any(axis=1)
X_flat = X_flat[mask]
y_flat = y_flat[mask]

print(f"After removing NaN: {len(y_flat):,} pixels")

print("\n" + "="*60)
print("CLASS DISTRIBUTION")
print("="*60)

unique, counts = np.unique(y_flat, return_counts=True)
total = len(y_flat)

class_distribution = {}
for label, count in zip(unique, counts):
    percentage = 100*count/total
    class_distribution[f"Class_{int(label)}"] = {
        "count": int(count),
        "percentage": round(percentage, 2)
    }
    print(f"Class {int(label)}: {count:,} samples ({percentage:.2f}%)")

if len(counts) == 2:
    imbalance_ratio = counts[0]/counts[1]
    print(f"\nImbalance ratio: {imbalance_ratio:.1f}:1 (non-green:green)")
    class_distribution["imbalance_ratio"] = round(imbalance_ratio, 2)

print("\n" + "="*60)
print("TRAIN-TEST SPLIT")
print("="*60)

X_train, X_test, y_train, y_test = train_test_split(
    X_flat, y_flat, 
    test_size=0.2, 
    random_state=42,
    stratify=y_flat
)


print(f"Initial train set: {len(y_train):,} samples")
print(f"Initial test set: {len(y_test):,} samples")

print("\n" + "="*60)
print(f"SAMPLING {SAMPLE_PERCENTAGE*100}% OF TRAINING DATA")
print("="*60)

if SAMPLE_PERCENTAGE < 1.0:
    n_samples = int(len(X_train) * SAMPLE_PERCENTAGE)
    X_train, y_train = resample(X_train, y_train, 
                                n_samples=n_samples, 
                                random_state=42,
                                stratify=y_train)
    print(f"✓ Sampled {SAMPLE_PERCENTAGE*100}% of training data: {n_samples:,} pixels")
    print(f"  Maintains class balance through stratified sampling")
else:
    print(f"Using 100% of training data (no sampling)")


split_info = {
    "train_samples": int(len(y_train)),
    "test_samples": int(len(y_test)),
    "sample_percentage": SAMPLE_PERCENTAGE * 100,
    "train_green": int(np.sum(y_train==1)),
    "train_green_pct": round(100*np.mean(y_train==1), 2),
    "test_green": int(np.sum(y_test==1)),
    "test_green_pct": round(100*np.mean(y_test==1), 2)
}

print(f"Train set: {split_info['train_samples']:,} samples")
print(f"  Green: {split_info['train_green']:,} ({split_info['train_green_pct']:.2f}%)")
print(f"Test set: {split_info['test_samples']:,} samples")
print(f"  Green: {split_info['test_green']:,} ({split_info['test_green_pct']:.2f}%)")

print("\n" + "="*60)
print("TRAINING RANDOM FOREST")
print("="*60)

clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=25,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
    verbose=1
)

clf.fit(X_train, y_train)

print("\n✓ Training complete!")

print("\n" + "="*60)
print("EVALUATION")
print("="*60)

y_pred = clf.predict(X_test)

print("\nClassification Report:")
report_dict = classification_report(y_test, y_pred, target_names=["Non-green", "Green"], output_dict=True)
print(classification_report(y_test, y_pred, target_names=["Non-green", "Green"]))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print("                Predicted")
print("              Non-green  Green")
print(f"Actual Non-g  {cm[0,0]:8d}  {cm[0,1]:6d}")
print(f"       Green  {cm[1,0]:8d}  {cm[1,1]:6d}")

tn, fp, fn, tp = cm.ravel()
metrics = {
    "balanced_accuracy": round(balanced_accuracy_score(y_test, y_pred), 3),
    "f1_green": round(f1_score(y_test, y_pred, pos_label=1), 3),
    "f1_macro": round(f1_score(y_test, y_pred, average='macro'), 3),
    "sensitivity": round(tp / (tp + fn), 3),
    "specificity": round(tn / (tn + fp), 3),
    "precision_green": round(report_dict['Green']['precision'], 3),
    "recall_green": round(report_dict['Green']['recall'], 3)
}

print("\n" + "-"*60)
print("SUMMARY METRICS:")
print("-"*60)
for key, value in metrics.items():
    print(f"{key:20s}: {value:.3f}")

print("\n" + "="*60)
print("BAND IMPORTANCE")
print("="*60)

months = ['April', 'August', 'November']
bands = ['B02', 'B03', 'B04', 'B08']
band_names = [f"{month}_{band}" for month in months for band in bands]

feature_importance = pd.DataFrame({
    'band': band_names,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Bands:")
for idx, row in feature_importance.head(10).iterrows():
    print(f"{row['band']:20s}: {row['importance']:.4f}")

print("\n" + "="*60)
print("CREATING PREDICTION MAP")
print("="*60)

X_full_flat = X.reshape(n_bands, -1).T
mask_full = ~np.isnan(X_full_flat).any(axis=1)

y_pred_full = np.zeros(X_full_flat.shape[0])
y_pred_full[mask_full] = clf.predict(X_full_flat[mask_full])
y_pred_map = y_pred_full.reshape(h, w)

print("✓ Prediction map created")

print("\n" + "="*60)
print("SAVING RESULTS")
print("="*60)

metadata = {
    "city": CITY,
    "timestamp": timestamp,
    "image_shape": {"height": h, "width": w, "bands": n_bands},
    "class_distribution": class_distribution,
    "split_info": split_info,
    "metrics": metrics,
    "model_params": {
        "n_estimators": 200,
        "max_depth": 25,
        "min_samples_split": 10,
        "min_samples_leaf": 5,
        "class_weight": "balanced"
    }
}

with open(os.path.join(run_folder, "metadata.json"), "w") as f:
    json.dump(metadata, f, indent=2)
print("✓ Saved metadata.json")

report_df = pd.DataFrame(report_dict).transpose()
report_df.to_csv(os.path.join(run_folder, "classification_report.csv"))
print("✓ Saved classification_report.csv")

feature_importance.to_csv(os.path.join(run_folder, "feature_importance.csv"), index=False)
print("✓ Saved feature_importance.csv")

cm_df = pd.DataFrame(cm, 
                     index=['Actual_Non-green', 'Actual_Green'],
                     columns=['Predicted_Non-green', 'Predicted_Green'])
cm_df.to_csv(os.path.join(run_folder, "confusion_matrix.csv"))
print("✓ Saved confusion_matrix.csv")

pred_profile = profile.copy()
pred_profile.update(dtype='uint8', count=1, compress='lzw')

with rasterio.open(os.path.join(run_folder, "prediction_map.tif"), 'w', **pred_profile) as dst:
    dst.write(y_pred_map.astype(np.uint8), 1)
print("✓ Saved prediction_map.tif")

with rasterio.open(os.path.join(run_folder, "ground_truth.tif"), 'w', **pred_profile) as dst:
    dst.write(y.astype(np.uint8), 1)
print("✓ Saved ground_truth.tif")

print("\n" + "="*60)
print("CREATING VISUALIZATIONS")
print("="*60)

fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(cm, cmap='Blues', aspect='auto')
ax.set_xticks([0, 1])
ax.set_yticks([0, 1])
ax.set_xticklabels(['Non-green', 'Green'])
ax.set_yticklabels(['Non-green', 'Green'])
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title(f'Confusion Matrix - {CITY}')

for i in range(2):
    for j in range(2):
        text = ax.text(j, i, f"{cm[i, j]:,}", ha="center", va="center", color="black", fontsize=14)

plt.colorbar(im, ax=ax)
plt.tight_layout()
plt.savefig(os.path.join(run_folder, "confusion_matrix.png"), dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved confusion_matrix.png")

fig, ax = plt.subplots(figsize=(10, 6))
top_features = feature_importance.head(12)
ax.barh(range(len(top_features)), top_features['importance'])
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features['band'])
ax.set_xlabel('Importance')
ax.set_title(f'Top 12 Band Importance - {CITY}')
ax.invert_yaxis()
plt.tight_layout()
plt.savefig(os.path.join(run_folder, "feature_importance.png"), dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved feature_importance.png")

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

rgb = X[[6, 5, 4], :, :].transpose(1, 2, 0)
rgb_norm = np.clip(rgb / 3000, 0, 1)

axes[0].imshow(rgb_norm)
axes[0].set_title("True Color RGB (August)", fontsize=14)
axes[0].axis('off')

axes[1].imshow(y, cmap='RdYlGn', vmin=0, vmax=1)
axes[1].set_title("Ground Truth (OSM)", fontsize=14)
axes[1].axis('off')

axes[2].imshow(y_pred_map, cmap='RdYlGn', vmin=0, vmax=1)
axes[2].set_title("Model Prediction", fontsize=14)
axes[2].axis('off')

plt.tight_layout()
plt.savefig(os.path.join(run_folder, "prediction_comparison.png"), dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved prediction_comparison.png")

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

false_positives = (y == 0) & (y_pred_map == 1)
false_negatives = (y == 1) & (y_pred_map == 0)

axes[0].imshow(rgb_norm)
axes[0].set_title("RGB Image", fontsize=14)
axes[0].axis('off')

axes[1].imshow(false_positives, cmap='Reds', vmin=0, vmax=1)
axes[1].set_title(f"False Positives ({np.sum(false_positives):,} pixels)", fontsize=14)
axes[1].axis('off')

axes[2].imshow(false_negatives, cmap='Blues', vmin=0, vmax=1)
axes[2].set_title(f"False Negatives ({np.sum(false_negatives):,} pixels)", fontsize=14)
axes[2].axis('off')

plt.tight_layout()
plt.savefig(os.path.join(run_folder, "error_analysis.png"), dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved error_analysis.png")

fig, ax = plt.subplots(figsize=(10, 6))
metric_names = list(metrics.keys())
metric_values = list(metrics.values())

bars = ax.bar(range(len(metric_names)), metric_values, color='steelblue')
ax.set_xticks(range(len(metric_names)))
ax.set_xticklabels(metric_names, rotation=45, ha='right')
ax.set_ylabel('Score')
ax.set_title(f'Model Performance Metrics - {CITY}')
ax.set_ylim([0, 1])
ax.axhline(y=0.8, color='r', linestyle='--', alpha=0.3, label='0.8 threshold')
ax.grid(axis='y', alpha=0.3)
ax.legend()

for i, (bar, value) in enumerate(zip(bars, metric_values)):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, 
            f'{value:.3f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig(os.path.join(run_folder, "metrics_summary.png"), dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved metrics_summary.png")

summary_text = f"""
RANDOM FOREST CLASSIFICATION RESULTS
{'='*60}

City: {CITY}
Date: {timestamp}

DATA SUMMARY:
- Image size: {h} × {w} pixels
- Total bands: {n_bands}
- Sample percentage: {SAMPLE_PERCENTAGE*100}%
- Training samples: {split_info['train_samples']:,}
- Test samples: {split_info['test_samples']:,}
- Green pixels (test): {split_info['test_green']:,} ({split_info['test_green_pct']:.2f}%)

MODEL PERFORMANCE:
- Balanced Accuracy: {metrics['balanced_accuracy']:.3f}
- F1 Score (Green): {metrics['f1_green']:.3f}
- F1 Score (Macro): {metrics['f1_macro']:.3f}
- Precision (Green): {metrics['precision_green']:.3f}
- Recall/Sensitivity (Green): {metrics['recall_green']:.3f}
- Specificity (Non-green): {metrics['specificity']:.3f}

CONFUSION MATRIX:
                Predicted
             Non-green    Green
Actual Non-g  {cm[0,0]:8,}  {cm[0,1]:7,}
       Green  {cm[1,0]:8,}  {cm[1,1]:7,}

TOP 5 MOST IMPORTANT BANDS:
"""

for idx, row in feature_importance.head(5).iterrows():
    summary_text += f"  {row['band']:20s}: {row['importance']:.4f}\n"

summary_text += f"""
FILES SAVED:
- metadata.json
- classification_report.csv
- feature_importance.csv
- confusion_matrix.csv
- prediction_map.tif
- ground_truth.tif
- confusion_matrix.png
- feature_importance.png
- prediction_comparison.png
- error_analysis.png
- metrics_summary.png

Results saved to: {run_folder}
"""

with open(os.path.join(run_folder, "SUMMARY.txt"), "w") as f:
    f.write(summary_text)

print("\n" + summary_text)
print("="*60)
print("✓ ALL RESULTS SAVED SUCCESSFULLY!")
print("="*60)

RANDOM FOREST TRAINING - London

Results will be saved to: results/London_20260111_000353

LOADING DATA
✓ Loaded Sentinel-2 stack: (12, 10980, 10980)
✓ Loaded labels: (10980, 10980)

PREPARING DATA
Original shape: 10980 × 10980 = 120,560,400 pixels
After removing NaN: 120,560,400 pixels

CLASS DISTRIBUTION
Class 0: 120,496,872 samples (99.95%)
Class 1: 63,528 samples (0.05%)

Imbalance ratio: 1896.8:1 (non-green:green)

TRAIN-TEST SPLIT
Initial train set: 96,448,320 samples
Initial test set: 24,112,080 samples

SAMPLING 1.0% OF TRAINING DATA
✓ Sampled 1.0% of training data: 964,483 pixels
  Maintains class balance through stratified sampling
Train set: 964,483 samples
  Green: 508 (0.05%)
Test set: 24,112,080 samples
  Green: 12,706 (0.05%)

TRAINING RANDOM FOREST


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   13.2s finished



✓ Training complete!

EVALUATION


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    3.9s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   21.5s
[Parallel(n_jobs=10)]: Done 200 out of 200 | elapsed:   24.3s finished



Classification Report:
              precision    recall  f1-score   support

   Non-green       1.00      1.00      1.00  24099374
       Green       0.15      0.03      0.06     12706

    accuracy                           1.00  24112080
   macro avg       0.57      0.52      0.53  24112080
weighted avg       1.00      1.00      1.00  24112080


Confusion Matrix:
                Predicted
              Non-green  Green
Actual Non-g  24096923    2451
       Green     12274     432

------------------------------------------------------------
SUMMARY METRICS:
------------------------------------------------------------
balanced_accuracy   : 0.517
f1_green            : 0.055
f1_macro            : 0.528
sensitivity         : 0.034
specificity         : 1.000
precision_green     : 0.150
recall_green        : 0.034

BAND IMPORTANCE

Top 10 Most Important Bands:
April_B04           : 0.1893
April_B08           : 0.1493
April_B03           : 0.1339
April_B02           : 0.1013
August_B04  

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.4min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  8.6min
[Parallel(n_jobs=10)]: Done 200 out of 200 | elapsed:  9.2min finished


✓ Prediction map created

SAVING RESULTS
✓ Saved metadata.json
✓ Saved classification_report.csv
✓ Saved feature_importance.csv
✓ Saved confusion_matrix.csv
✓ Saved prediction_map.tif
✓ Saved ground_truth.tif

CREATING VISUALIZATIONS
✓ Saved confusion_matrix.png
✓ Saved feature_importance.png
✓ Saved prediction_comparison.png
✓ Saved error_analysis.png
✓ Saved metrics_summary.png


RANDOM FOREST CLASSIFICATION RESULTS

City: London
Date: 20260111_000353

DATA SUMMARY:
- Image size: 10980 × 10980 pixels
- Total bands: 12
- Sample percentage: 1.0%
- Training samples: 964,483
- Test samples: 24,112,080
- Green pixels (test): 12,706 (0.05%)

MODEL PERFORMANCE:
- Balanced Accuracy: 0.517
- F1 Score (Green): 0.055
- F1 Score (Macro): 0.528
- Precision (Green): 0.150
- Recall/Sensitivity (Green): 0.034
- Specificity (Non-green): 1.000

CONFUSION MATRIX:
                Predicted
             Non-green    Green
Actual Non-g  24,096,923    2,451
       Green    12,274      432

TOP 5 MOST IMPOR