# Multi-City Green Space Detection
## Training Support Vector Machine (SVM) with WorldCover 2021 as Ground Truth

**Training Cities:** up to 11 configured cities (only those with complete input data are used) for robust model training

**Key Features:**
- Uses **WorldCover 2021** as ground truth for training
- Green classes: Tree cover (10), Shrubland (20), Grassland (30), Mangroves (95)
- Multi-temporal Sentinel-2 data (April, August, November)
- 21 bands: 4 spectral bands (B02, B03, B04, B08) x 3 months + 3 vegetation indices (NDVI, EVI, SAVI) x 3 months
- **Cross-city training** for better generalization

**SVM Implementation:**
- Uses SGDClassifier with hinge loss (linear SVM approximation) for scalability
- Optional RBF kernel SVM with subsampling for comparison

## 1. Import Libraries

In [None]:
import json
import os
import glob
import numpy as np
import rasterio
from rasterio.warp import reproject, Resampling
from pathlib import Path
import geopandas as gpd
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tqdm import tqdm
import warnings
# Silence every Python warning for the remainder of the session.
# NOTE(review): a blanket 'ignore' presumably also hides scikit-learn
# convergence warnings raised while fitting - confirm that is intended.
warnings.filterwarnings('ignore')

# Confirmation message so the notebook output shows the import cell ran.
print("Libraries imported successfully")

## 2. Configuration

In [None]:
# Locate the project root so every other path can be built relative to it.
# The notebook may be launched from the project root itself or from
# notebooks/training/ (two levels below the root).
import os


def _has_project_layout(prefix):
    """Return True when both `data` and `models` exist under *prefix*."""
    return all(
        os.path.exists(os.path.join(prefix, sub_dir))
        for sub_dir in ("data", "models")
    )


if _has_project_layout(""):
    # Launched from the project root.
    PROJECT_ROOT = os.getcwd()
elif _has_project_layout("../.."):
    # Launched from notebooks/training/.
    PROJECT_ROOT = os.path.abspath("../..")
else:
    # Neither location matched: fall back to the working directory and say so.
    PROJECT_ROOT = os.getcwd()
    print(f"Warning: Could not detect project root. Using: {PROJECT_ROOT}")

# --- Classification / model settings ---------------------------------
# WorldCover class codes treated as "green": Tree, Shrub, Grass, Mangroves
GREEN_CLASSES = [10, 20, 30, 95]

# Set USE_RBF_KERNEL to True for the RBF kernel (slower but potentially better).
USE_RBF_KERNEL = False
# RBF SVM is slow, so the training set is capped at this many samples.
MAX_SAMPLES_FOR_RBF = 50000

# --- Input locations (all derived from PROJECT_ROOT) -----------------
DATA_PATH = os.path.join(PROJECT_ROOT, "data")
GEOJSON_FOLDER = os.path.join(DATA_PATH, "aois")
MODELS_PATH = os.path.join(PROJECT_ROOT, "models")

# --- Output locations ------------------------------------------------
OUTPUT_FOLDER = os.path.join(PROJECT_ROOT, "outputs", "svm_training")

# Every execution of this cell gets its own timestamped run folder.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
RUN_FOLDER = os.path.join(OUTPUT_FOLDER, f"run_{timestamp}")

# Make sure all folders that are written to exist.
for _folder in (OUTPUT_FOLDER, MODELS_PATH, RUN_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# Cities to train on, in processing order. For each city CITY_FILES records
# the expected location of its Sentinel-2 stack, WorldCover raster and AOI
# GeoJSON.
_CITY_NAMES = [
    "Amsterdam",
    "Auckland",
    "Barcelona",
    "Sydney",
    "Toronto",
    "Vienna",
    "London",
    "Melbourne",
    "Paris",
    "San_Francisco",
    "Seattle",
]

# Stack files whose name prefix differs from the city key
# (Vienna's stack file is named "Wien_MultiMonth_stack.tif").
_STACK_PREFIX_OVERRIDES = {"Vienna": "Wien"}

CITY_FILES = {
    name: {
        "stack": os.path.join(
            DATA_PATH,
            "sentinel_stacks",
            f"{_STACK_PREFIX_OVERRIDES.get(name, name)}_MultiMonth_stack.tif",
        ),
        "worldcover": os.path.join(DATA_PATH, "worldcover", f"{name}_WorldCover_2021.tif"),
        "geojson": os.path.join(GEOJSON_FOLDER, f"{name}.geojson"),
    }
    for name in _CITY_NAMES
}

# Echo the effective configuration so it is captured in the notebook output.
svm_type_label = 'RBF Kernel' if USE_RBF_KERNEL else 'Linear (SGD)'
config_lines = [
    "Configuration loaded",
    f"  Project root: {PROJECT_ROOT}",
    f"  Data path: {DATA_PATH}",
    f"  Models path: {MODELS_PATH}",
    f"  Output folder: {RUN_FOLDER}",
    f"  SVM Type: {svm_type_label}",
    f"  Target cities: {len(CITY_FILES)}",
]
print("\n".join(config_lines))

# One line per configured city.
for city in CITY_FILES:
    print(f"    - {city}")

## 3. Discover Available Cities

In [None]:
# Check which configured cities have all three input files on disk.
# Cities with every file present end up in `complete_cities`; the others are
# described in `missing_cities`.
banner = "=" * 70
print(banner)
print("DISCOVERING AVAILABLE CITIES")
print(banner)

print(f"\nChecking {len(CITY_FILES)} configured cities...")

cities_data = []
missing_cities = []

for city_name, paths in CITY_FILES.items():
    # Report label -> file path, in the order used for the "missing" text.
    required = {
        "Stack": paths["stack"],
        "GeoJSON": paths["geojson"],
        "WorldCover": paths["worldcover"],
    }
    present = {label: os.path.exists(path) for label, path in required.items()}
    flags = {label: ("Y" if found else "N") for label, found in present.items()}

    print(f"  {city_name:15s} - Stack: {flags['Stack']}  GeoJSON: {flags['GeoJSON']}  WorldCover: {flags['WorldCover']}")

    absent = [label for label, found in present.items() if not found]
    if absent:
        missing_cities.append(f"{city_name} (missing: {', '.join(absent)})")
        continue

    cities_data.append({
        "name": city_name,
        "stack_file": required["Stack"],
        "geojson_file": required["GeoJSON"],
        "worldcover_file": required["WorldCover"],
    })

# Later cells refer to the usable cities under this name.
complete_cities = cities_data

print(f"\n{banner}")
print(f"Cities with complete data: {len(complete_cities)}/{len(CITY_FILES)}")
print(banner)

# Nothing to train on: stop here rather than fail later.
if not complete_cities:
    raise ValueError("No cities with complete data found!")

print(f"\nReady to train with {len(complete_cities)} cities")

## 4. Load and Process All Cities

In [None]:
# Load every city, derive binary green / non-green labels from WorldCover and
# draw a class-balanced random sample of pixels per city.
#
# Results:
#   all_X / all_y   - one array of sampled features / labels per included city
#   city_info       - city name repeated once per sampled pixel
#   skipped_cities  - {"name", "reason"} for every city that contributed nothing
print("\n" + "="*70)
print("LOADING AND PROCESSING ALL CITIES")
print("="*70)

# Band count every stack must match; fixed by the first stack that loads.
EXPECTED_BANDS = None
MAX_SAMPLES_PER_CLASS_PER_CITY = 50000  # Reduced for SVM (memory constraints)
# ESA WorldCover encodes "no data" as 0. Such pixels carry no ground truth and
# must not be counted as non-green.
WORLDCOVER_NODATA = 0

all_X = []
all_y = []
city_info = []
skipped_cities = []

for city_data in tqdm(complete_cities, desc="Processing cities"):
    city_name = city_data["name"]
    stack_file = city_data["stack_file"]
    worldcover_file = city_data["worldcover_file"]

    print(f"\n{'='*70}")
    print(f"Processing: {city_name}")
    print(f"{'='*70}")

    # Best effort per city: any failure is recorded and the loop moves on.
    try:
        # Load Sentinel-2 stack (bands, height, width) plus its georeferencing
        with rasterio.open(stack_file) as src:
            X_stack = src.read()
            stack_transform = src.transform
            stack_shape = (src.height, src.width)
            stack_crs = src.crs

        n_bands = X_stack.shape[0]
        print(f"  Loaded Sentinel-2 stack: {X_stack.shape} ({n_bands} bands)")

        if EXPECTED_BANDS is None:
            EXPECTED_BANDS = n_bands
            print(f"  Setting expected bands to {EXPECTED_BANDS}")
        elif n_bands != EXPECTED_BANDS:
            print(f"  SKIPPING: Expected {EXPECTED_BANDS} bands, but found {n_bands}")
            skipped_cities.append({
                "name": city_name,
                "reason": f"Band mismatch (expected {EXPECTED_BANDS}, found {n_bands})",
            })
            continue

        # Load WorldCover and reproject it onto the stack's grid. The target is
        # pre-filled with the no-data code so that any pixel the reprojection
        # leaves untouched is never read as uninitialised memory.
        with rasterio.open(worldcover_file) as src:
            worldcover_data = np.full(stack_shape, WORLDCOVER_NODATA, dtype=np.uint8)
            reproject(
                source=rasterio.band(src, 1),
                destination=worldcover_data,
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=stack_transform,
                dst_crs=stack_crs,
                resampling=Resampling.nearest  # class codes must not be interpolated
            )

        labels = np.isin(worldcover_data, GREEN_CLASSES).astype(np.uint8)
        green_percentage = 100 * labels.sum() / labels.size
        print(f"  WorldCover labels: {labels.shape} ({green_percentage:.2f}% green)")

        # Reshape for sklearn: one row per pixel, one column per band
        X = X_stack.reshape(n_bands, -1).T
        y = labels.ravel()
        has_label = worldcover_data.ravel() != WORLDCOVER_NODATA

        # Keep pixels that have complete features AND a real WorldCover label
        valid_mask = ~np.isnan(X).any(axis=1) & has_label
        X_clean = X[valid_mask]
        y_clean = y[valid_mask]

        print(f"  Valid samples: {len(X_clean):,}")

        # Balanced sampling: the same number of pixels from each class
        green_indices = np.where(y_clean == 1)[0]
        nongreen_indices = np.where(y_clean == 0)[0]

        n_samples_per_class = min(len(green_indices), len(nongreen_indices), MAX_SAMPLES_PER_CLASS_PER_CITY)

        if n_samples_per_class == 0:
            # A city missing one class cannot contribute a balanced sample;
            # record it as skipped instead of appending an empty array.
            print("  SKIPPING: No valid samples for at least one class")
            skipped_cities.append({
                "name": city_name,
                "reason": "No valid samples for at least one class",
            })
            continue

        # Re-seeding per city makes each city's sample independent of the
        # order in which cities are processed.
        np.random.seed(42)
        sampled_green = np.random.choice(green_indices, n_samples_per_class, replace=False)
        sampled_nongreen = np.random.choice(nongreen_indices, n_samples_per_class, replace=False)

        sampled_indices = np.concatenate([sampled_green, sampled_nongreen])
        np.random.shuffle(sampled_indices)

        X_sampled = X_clean[sampled_indices]
        y_sampled = y_clean[sampled_indices]

        print(f"  Balanced sampling: {len(X_sampled):,} samples ({n_samples_per_class:,} per class)")

        all_X.append(X_sampled)
        all_y.append(y_sampled)
        city_info.extend([city_name] * len(X_sampled))

    except Exception as e:
        print(f"  Error processing {city_name}: {e}")
        skipped_cities.append({"name": city_name, "reason": str(e)})

# Stack the per-city samples into one feature matrix and one label vector.
rule = "=" * 70
print(f"\n{rule}")
print("DATA AGGREGATION")
print(rule)

# Every city failed or was skipped: nothing to train on.
if not all_X:
    raise ValueError("No valid city data loaded!")

X_combined = np.concatenate(all_X, axis=0)
y_combined = np.concatenate(all_y)
city_info = np.array(city_info)

# Class counts for the summary below.
n_total = len(y_combined)
n_green = np.sum(y_combined == 1)
n_nongreen = np.sum(y_combined == 0)

print(f"\nCombined dataset:")
print(f"  Cities included: {len(all_X)}")
print(f"  Total samples: {len(X_combined):,}")
print(f"  Features (bands): {X_combined.shape[1]}")
print(f"  Green samples: {n_green:,} ({100*n_green/n_total:.1f}%)")
print(f"  Non-green samples: {n_nongreen:,} ({100*n_nongreen/n_total:.1f}%)")

## 5. Train-Test Split & Feature Normalization

In [None]:
# Hold out 20% of the pooled samples for testing, then standardise features.
section_rule = "=" * 70
print("\n" + section_rule)
print("TRAIN-TEST SPLIT & FEATURE NORMALIZATION")
print(section_rule)

# Stratified 80/20 split with a fixed seed.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y_combined,
}
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, **split_options)

print(f"\nDataset split:")
print(f"  Training samples: {len(X_train):,}")
print(f"  Testing samples: {len(X_test):,}")

# Feature normalization (CRITICAL for SVM!). The scaler is fitted on the
# training split only and then applied unchanged to the test split.
print(f"\nApplying feature normalization (StandardScaler)...")
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

train_mean = X_train_scaled.mean()
train_std = X_train_scaled.std()
print(f"  Features normalized")
print(f"    Mean (train): {train_mean:.4f}")
print(f"    Std (train): {train_std:.4f}")

## 6. Train SVM Model

Two options:
1. **Linear SVM (SGDClassifier)**: Fast, scales to large datasets
2. **RBF Kernel SVM**: Better for non-linear boundaries, but slower

In [None]:
# Fit the classifier selected by USE_RBF_KERNEL on the scaled training data.
# Hyper-parameters live in one dict per model so the values that are printed
# are the values that are used.
print("\n" + "="*70)
print("TRAINING SVM MODEL")
print("="*70)

if not USE_RBF_KERNEL:
    # Linear SVM using SGDClassifier (fast and scalable)
    print(f"\nUsing Linear SVM (SGDClassifier with hinge loss)")

    sgd_params = dict(
        loss='hinge',  # Hinge loss = linear SVM
        penalty='l2',
        alpha=0.0001,
        max_iter=1000,
        tol=1e-3,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
        verbose=1,
    )
    svm = SGDClassifier(**sgd_params)

    print(f"\nSGDClassifier Parameters:")
    print(f"  loss: {sgd_params['loss']} (linear SVM)")
    for key in ('penalty', 'alpha', 'max_iter', 'class_weight'):
        print(f"  {key}: {sgd_params[key]}")
    print(f"  Training samples: {len(X_train_scaled):,}")

    print(f"\nTraining Linear SVM...")
    svm.fit(X_train_scaled, y_train)

else:
    # RBF Kernel SVM (slower but potentially better)
    print(f"\nUsing RBF Kernel SVM")

    # Train on everything by default; subsample only when the set is too large.
    X_train_svm, y_train_svm = X_train_scaled, y_train
    if len(X_train_scaled) > MAX_SAMPLES_FOR_RBF:
        print(f"  Subsampling to {MAX_SAMPLES_FOR_RBF:,} samples for RBF kernel...")
        np.random.seed(42)
        indices = np.random.choice(len(X_train_scaled), MAX_SAMPLES_FOR_RBF, replace=False)
        X_train_svm = X_train_scaled[indices]
        y_train_svm = y_train[indices]

    rbf_params = dict(
        kernel='rbf',
        C=1.0,
        gamma='scale',
        class_weight='balanced',
        random_state=42,
        verbose=True,
        probability=True,  # Enable probability estimates
    )
    svm = SVC(**rbf_params)

    print(f"\nSVM Parameters:")
    for key in ('kernel', 'C', 'gamma'):
        print(f"  {key}: {rbf_params[key]}")
    print(f"  Training samples: {len(X_train_svm):,}")

    print(f"\nTraining RBF SVM (this may take a while)...")
    svm.fit(X_train_svm, y_train_svm)

print(f"\nModel trained successfully")

## 7. Evaluate Model Performance

In [None]:
# Evaluate the trained model on the held-out test split, print the headline
# metrics and confusion matrix, and write them to metrics.json in RUN_FOLDER.
print("\n" + "="*70)
print("MODEL EVALUATION")
print("="*70)

# Make predictions
y_pred = svm.predict(X_test_scaled)

# Calculate metrics (positive class = green = 1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f"\nModel Performance (SVM):")
print(f"  Accuracy:  {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1-Score:  {f1:.4f}")

# Confusion Matrix. Passing labels explicitly guarantees a 2x2 matrix
# (rows = actual, columns = predicted) even if one class is absent, so the
# indexing below cannot fail.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(f"\nConfusion Matrix:")
print(f"                 Predicted")
print(f"               Non-Green  Green")
print(f"Actual Non-Green  {cm[0,0]:>8,}  {cm[0,1]:>8,}")
print(f"       Green      {cm[1,0]:>8,}  {cm[1,1]:>8,}")

# Green detection analysis: share of truly green pixels found / missed
total_actual_green = cm[1,0] + cm[1,1]
green_detected = 100 * cm[1,1] / total_actual_green if total_actual_green > 0 else 0
green_missed = 100 * cm[1,0] / total_actual_green if total_actual_green > 0 else 0

print(f"\nGreen Detection Analysis:")
print(f"  Green correctly detected: {green_detected:.1f}%")
print(f"  Green missed: {green_missed:.1f}%")

# Only cities that were not skipped while loading contributed training data;
# complete_cities alone would also list the skipped ones.
skipped_names = {entry["name"] for entry in skipped_cities}
trained_city_names = [
    city["name"] for city in complete_cities if city["name"] not in skipped_names
]

# Save metrics
metrics = {
    "model": "SVM_RBF" if USE_RBF_KERNEL else "SVM_Linear",
    "ground_truth": "WorldCover_2021",
    "training_cities": trained_city_names,
    "n_cities": len(trained_city_names),
    "skipped_cities": skipped_cities,
    "total_training_samples": int(len(X_train)),
    "total_testing_samples": int(len(X_test)),
    "accuracy": float(accuracy),
    "precision": float(precision),
    "recall": float(recall),
    "f1_score": float(f1),
    "confusion_matrix": cm.tolist()
}

with open(os.path.join(RUN_FOLDER, "metrics.json"), "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

print(f"\nMetrics saved to: {RUN_FOLDER}/metrics.json")

## 8. Visualize Confusion Matrix

In [None]:
# Plot the test-set confusion matrix as an annotated heatmap, save it to the
# run folder and display it.
kernel_label = "RBF" if USE_RBF_KERNEL else "Linear"
# Count the cities that actually contributed samples (one entry in all_X per
# city) rather than every city that passed the file check; this is the same
# count the feature-importance plot reports.
n_trained_cities = len(all_X)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Green', 'Green'],
            yticklabels=['Non-Green', 'Green'],
            cbar_kws={'label': 'Count'})
plt.title(f'Confusion Matrix - SVM ({kernel_label})\n(Trained on {n_trained_cities} cities)',
          fontsize=14, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
plt.savefig(os.path.join(RUN_FOLDER, 'confusion_matrix.png'), dpi=300, bbox_inches='tight')
plt.show()

print("Confusion matrix saved")

## 9. Feature Importance Analysis (Linear SVM Only)

In [None]:
# Rank features by the magnitude of their linear-SVM coefficient and plot them.
# Only the linear model exposes coefficients, so the RBF path just reports that.
if USE_RBF_KERNEL:
    print("Feature importance not available for RBF kernel SVM")
else:
    # |coefficient| of each feature is used as its importance score.
    importances = np.abs(svm.coef_[0])
    n_features = len(importances)

    # Human-readable band names.
    # NOTE(review): assumes a 21-band stack is ordered month by month
    # (Apr, Aug, Nov), each month holding B02, B03, B04, B08, NDVI, EVI, SAVI
    # - confirm against how the stacks were exported.
    if n_features == 21:
        band_names = [
            f"{band}-{month}"
            for month in ("Apr", "Aug", "Nov")
            for band in ("B02", "B03", "B04", "B08", "NDVI", "EVI", "SAVI")
        ]
    else:
        band_names = [f'Band_{i+1}' for i in range(n_features)]

    # Feature positions from most to least important.
    indices = np.argsort(importances)[::-1]
    ranked_values = importances[indices]
    ranked_names = [band_names[i] for i in indices]
    bar_positions = range(n_features)

    # Horizontal bar chart of the ranked importances.
    plt.figure(figsize=(12, max(8, n_features * 0.4)))
    plt.barh(bar_positions, ranked_values, color='steelblue')
    plt.yticks(bar_positions, ranked_names)
    plt.xlabel('|Coefficient| (Feature Importance)', fontsize=12)
    plt.title(f'Linear SVM Feature Importance\n(Trained on {len(all_X)} cities, {n_features} bands)',
              fontsize=14, fontweight='bold')
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(RUN_FOLDER, 'feature_importance.png'), dpi=300, bbox_inches='tight')
    plt.show()

    print("Feature importance plot saved")
    print(f"\nTop 10 most important features:")
    top_ranked = zip(ranked_names[:10], ranked_values[:10])
    for rank, (name, value) in enumerate(top_ranked, start=1):
        print(f"  {rank:2d}. {name:12s}: {value:.4f}")

## 10. Save Trained Model

In [None]:
import joblib

# Persist the fitted classifier and its feature scaler side by side in the
# run folder; both are needed to make predictions later.
model_file = os.path.join(RUN_FOLDER, 'svm_model.pkl')
scaler_file = os.path.join(RUN_FOLDER, 'feature_scaler.pkl')

for artifact, target_path in ((svm, model_file), (scaler, scaler_file)):
    joblib.dump(artifact, target_path)

# Report where the files went and how to load them again.
report_lines = [
    f"Model saved to: {model_file}",
    f"Scaler saved to: {scaler_file}",
    f"\nTo load the model and scaler later:",
    f"  import joblib",
    f"  svm = joblib.load('{model_file}')",
    f"  scaler = joblib.load('{scaler_file}')",
    f"\nIMPORTANT: Always apply scaler.transform(X) before prediction!",
]
for line in report_lines:
    print(line)

## 11. Per-City Performance Analysis

In [None]:
# Apply the trained model to every labelled pixel of each city and compare the
# predictions with the WorldCover labels. Results go to per_city_metrics.json.
#
# NOTE: each city's training samples were drawn from these same rasters, so
# these figures include training pixels and are not a held-out estimate.
print("\n" + "="*70)
print("PER-CITY PERFORMANCE ANALYSIS")
print("="*70)

# ESA WorldCover encodes "no data" as 0. Such pixels carry no ground truth and
# are excluded from the comparison instead of being scored as non-green.
WORLDCOVER_NODATA = 0

per_city_results = []

for city_data in complete_cities:
    city_name = city_data["name"]
    stack_file = city_data["stack_file"]
    worldcover_file = city_data["worldcover_file"]

    print(f"\n{city_name}:")

    # Best effort per city: a failure is printed and the loop moves on.
    try:
        # Load city data
        with rasterio.open(stack_file) as src:
            X_stack = src.read()
            stack_transform = src.transform
            stack_shape = (src.height, src.width)
            stack_crs = src.crs

        # Reproject WorldCover onto the stack's grid. The target is pre-filled
        # with the no-data code so untouched pixels are never read as
        # uninitialised memory.
        with rasterio.open(worldcover_file) as src:
            worldcover_data = np.full(stack_shape, WORLDCOVER_NODATA, dtype=np.uint8)
            reproject(
                source=rasterio.band(src, 1),
                destination=worldcover_data,
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=stack_transform,
                dst_crs=stack_crs,
                resampling=Resampling.nearest  # class codes must not be interpolated
            )

        labels = np.isin(worldcover_data, GREEN_CLASSES).astype(np.uint8)

        # Reshape to one row per pixel and keep pixels that have complete
        # features AND a real WorldCover label
        X = X_stack.reshape(X_stack.shape[0], -1).T
        y = labels.ravel()
        has_label = worldcover_data.ravel() != WORLDCOVER_NODATA
        valid_mask = ~np.isnan(X).any(axis=1) & has_label
        X_city = X[valid_mask]
        y_city = y[valid_mask]

        if len(y_city) == 0:
            # Nothing to score; also avoids dividing by zero below.
            print("  Skipped: no valid labelled pixels")
            continue

        # Apply scaler (the one fitted on the training split)
        X_city_scaled = scaler.transform(X_city)

        # Predict
        y_pred_city = svm.predict(X_city_scaled)

        # Calculate metrics (positive class = green = 1)
        acc = accuracy_score(y_city, y_pred_city)
        prec = precision_score(y_city, y_pred_city, zero_division=0)
        rec = recall_score(y_city, y_pred_city, zero_division=0)
        f1_city = f1_score(y_city, y_pred_city, zero_division=0)

        # Calculate green percentages: ground truth vs. prediction
        gt_green_pct = 100 * np.sum(y_city == 1) / len(y_city)
        pred_green_pct = 100 * np.sum(y_pred_city == 1) / len(y_pred_city)
        diff_pct = pred_green_pct - gt_green_pct

        print(f"  Accuracy:  {acc:.4f}")
        print(f"  Precision: {prec:.4f}")
        print(f"  Recall:    {rec:.4f}")
        print(f"  F1-Score:  {f1_city:.4f}")
        print(f"  Green %:   GT={gt_green_pct:.1f}%  Pred={pred_green_pct:.1f}%  (diff: {diff_pct:+.1f}%)")

        per_city_results.append({
            "city": city_name,
            "accuracy": float(acc),
            "precision": float(prec),
            "recall": float(rec),
            "f1_score": float(f1_city),
            "gt_green_pct": float(gt_green_pct),
            "pred_green_pct": float(pred_green_pct),
            "diff_pct": float(diff_pct)
        })

    except Exception as e:
        print(f"  Error: {e}")

# Save per-city results
with open(os.path.join(RUN_FOLDER, "per_city_metrics.json"), "w", encoding="utf-8") as f:
    json.dump(per_city_results, f, indent=2)

print(f"\n{'='*70}")
print(f"Per-city metrics saved")

## 12. Summary Report

In [None]:
# Print a human-readable recap of the run: configuration, training data,
# overall and per-city performance, and the files that were written.
print("\n" + "="*80)
print("SVM TRAINING - SUMMARY REPORT")
print("="*80)

print(f"\nGround Truth: WorldCover 2021")
print(f"Green Classes: Tree cover (10), Shrubland (20), Grassland (30), Mangroves (95)")

print(f"\nSVM Configuration:")
if USE_RBF_KERNEL:
    print(f"  Type: RBF Kernel SVM")
    print(f"  C: 1.0")
    print(f"  gamma: scale")
else:
    print(f"  Type: Linear SVM (SGDClassifier)")
    print(f"  Loss: hinge")
    print(f"  Alpha: 0.0001")

# Only cities that were not skipped while loading contributed training data;
# complete_cities alone would also list the skipped ones.
skipped_names = {entry["name"] for entry in skipped_cities}
trained_city_names = [
    city["name"] for city in complete_cities if city["name"] not in skipped_names
]

print(f"\nTraining Data:")
print(f"  Cities: {len(trained_city_names)}")
for trained_name in trained_city_names:
    print(f"    - {trained_name}")

if skipped_cities:
    print(f"  Skipped cities: {len(skipped_cities)}")
    for entry in skipped_cities:
        print(f"    - {entry['name']} ({entry['reason']})")

print(f"\n  Total training samples: {len(X_train):,}")
print(f"  Total testing samples:  {len(X_test):,}")

print(f"\nModel Performance (Overall):")
print(f"  Accuracy:  {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1-Score:  {f1:.4f}")

# Unweighted mean over the cities that were evaluated successfully.
if per_city_results:
    print(f"\nPer-City Performance (Average):")
    avg_acc = np.mean([r['accuracy'] for r in per_city_results])
    avg_prec = np.mean([r['precision'] for r in per_city_results])
    avg_rec = np.mean([r['recall'] for r in per_city_results])
    avg_f1 = np.mean([r['f1_score'] for r in per_city_results])
    avg_diff = np.mean([r['diff_pct'] for r in per_city_results])
    print(f"  Accuracy:  {avg_acc:.4f}")
    print(f"  Precision: {avg_prec:.4f}")
    print(f"  Recall:    {avg_rec:.4f}")
    print(f"  F1-Score:  {avg_f1:.4f}")
    print(f"  Avg Green % Difference: {avg_diff:+.2f}%")

print(f"\nOutput Files:")
print(f"  Results folder: {RUN_FOLDER}")
print(f"  - metrics.json")
print(f"  - per_city_metrics.json")
print(f"  - confusion_matrix.png")
# The feature-importance plot is only produced for the linear model.
if not USE_RBF_KERNEL:
    print(f"  - feature_importance.png")
print(f"  - svm_model.pkl")
print(f"  - feature_scaler.pkl")

print("\n" + "="*80)
print("TRAINING COMPLETE!")
print("="*80)

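## 13. Copy Model to Project Models Folder (Optional)

Run this cell to copy the trained model and scaler from the run folder to the project's `models/` folder for easy access. Existing `svm_model.pkl` / `svm_scaler.pkl` files there are overwritten.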

In [None]:
import shutil

# Source files live in this run's folder; the copies go to the shared models
# folder. Note the scaler is renamed from feature_scaler.pkl to svm_scaler.pkl.
src_model = os.path.join(RUN_FOLDER, 'svm_model.pkl')
dst_model = os.path.join(MODELS_PATH, 'svm_model.pkl')

src_scaler = os.path.join(RUN_FOLDER, 'feature_scaler.pkl')
dst_scaler = os.path.join(MODELS_PATH, 'svm_scaler.pkl')

# Copy both files first, then report, so nothing is announced before every
# copy has succeeded.
copy_jobs = (
    ("Model", src_model, dst_model),
    ("Scaler", src_scaler, dst_scaler),
)
for _, source_path, target_path in copy_jobs:
    shutil.copy(source_path, target_path)

for label, _, target_path in copy_jobs:
    print(f"{label} copied to: {target_path}")