In [None]:
# Setup: Configure environment for SMPS
#
# Puts the directory that contains the `smps` package on sys.path and then
# reports whether `import smps` works. Two environments are recognised:
#   * cloud (Colab): the package is expected under /content/SMPS/src
#   * local: the first candidate directory that contains an `smps` folder wins
# Set the SMPS_SRC environment variable to point at a custom `src` directory;
# it is tried before the built-in local candidates.
import sys
import os
from pathlib import Path
import subprocess

# Determine environment
cwd = Path.cwd()
is_cloud = cwd == Path('/content') or 'google.colab' in sys.modules

# Optional override so the notebook is not tied to one developer's checkout
env_src = os.environ.get('SMPS_SRC')

if is_cloud:
    print("‚òÅÔ∏è  Running in cloud environment (Colab)")

    # Check if SMPS already exists
    smps_path = Path('/content/SMPS')
    if not smps_path.exists():
        # Nothing is cloned here (a private repo would need authentication);
        # only the directory skeleton is created so the user can upload into it.
        print("SMPS checkout not found - creating placeholder directories...")
        smps_path.mkdir(parents=True, exist_ok=True)
        (smps_path / 'src').mkdir(exist_ok=True)
        print("‚ö†Ô∏è  SMPS source not available in cloud.")
        print("   Please upload the 'src/smps' directory to /content/SMPS/src/")
        print("   Or run this notebook locally with: Python (SMPS) kernel")

    src_path = smps_path / 'src'
    if src_path.exists():
        # Same de-duplication as the local branch: re-running the cell must
        # not keep growing sys.path.
        if str(src_path) not in sys.path:
            sys.path.insert(0, str(src_path))
        print(f"‚úì Added to path: {src_path}")
else:
    print("üíª Running locally")
    # Try to find src directory
    possible_src_paths = [
        Path('/home/viv/SMPS/src'),  # original developer checkout, kept as a fallback
        cwd.parent / 'src',
        cwd / 'src',
    ]
    if env_src:
        # An explicit override takes priority over every built-in guess
        possible_src_paths.insert(0, Path(env_src).expanduser())

    for src_path in possible_src_paths:
        # Require the package folder itself, not just a directory named `src`
        if src_path.exists() and (src_path / 'smps').exists():
            if str(src_path) not in sys.path:
                sys.path.insert(0, str(src_path))
            print(f"‚úì Added to path: {src_path}")
            break
    else:
        # for/else: reached only when no candidate matched
        print(f"‚ö† Could not find src directory")
        print(f"  Python: {sys.executable}")

# Verify import
try:
    import smps
    print(f"‚úì SMPS version: {getattr(smps, '__version__', 'unknown')}")
except ImportError as e:
    # Report instead of raising so the cell finishes and shows the instructions
    print(f"‚úó Cannot import smps: {e}")
    print("\nTo run this notebook:")
    print("1. Locally: Select 'Python (SMPS)' kernel from VS Code")
    print("2. Cloud: Upload src/smps directory to /content/SMPS/src/")

⚠ Could not find src directory. CWD: /content
  Please ensure you're using the correct Python kernel.
  Current Python: /usr/bin/python3


In [11]:
# Setup and imports
import sys
# Relative fallback for the package location; '../src' is resolved against the
# kernel's current working directory, so this only helps when the notebook is
# started from a directory that sits next to `src`.
sys.path.insert(0, '../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
from pathlib import Path

# SMPS imports
# These fail with ModuleNotFoundError when no sys.path entry contains the
# `smps` package.
from smps.physics.water_balance import TwoBucketWaterBalance, ModelParameters, create_two_bucket_model
from smps.physics.pedotransfer import estimate_soil_parameters_saxton
from smps.data.sources.weather import OpenMeteoSource
from smps.data.sources.soil import SoilGridsSource, MockSoilSource
from smps.data.sources.isda import IsdaAfricaSoilSource
from smps.data.sources.satellite import MODISNDVISource
from smps.data.sources.base import DataFetchRequest
from smps.core.types import SiteMetadata, SoilParameters
from smps.validation import ValidationEngine, ValidationMetrics, print_metrics_comparison

# Set up logging
import logging
# INFO level with timestamp, logger name and level on every record
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

print("‚úì All imports successful")

ModuleNotFoundError: No module named 'smps'

## 1. Define Test Sites with Geocoordinates

We'll use several test sites across Africa with known agricultural activity.

In [None]:
# Define test sites with geocoordinates
# Registry of validation sites, keyed by site id. Each entry carries the
# coordinates (decimal degrees), elevation in metres and descriptive metadata.
TEST_SITES = {
    "tunisia_sfax": dict(
        latitude=34.740,
        longitude=10.760,
        elevation_m=50,
        description="Semi-arid agricultural zone in Tunisia",
        crop_type="olive",
        soil_expected="Sandy loam",
    ),
    "kenya_eldoret": dict(
        latitude=0.5143,
        longitude=35.2698,
        elevation_m=2100,
        description="Highland agricultural zone in Kenya",
        crop_type="maize",
        soil_expected="Clay loam",
    ),
    "ghana_kumasi": dict(
        latitude=6.6885,
        longitude=-1.6244,
        elevation_m=270,
        description="Humid tropical zone in Ghana",
        crop_type="cocoa",
        soil_expected="Clay",
    ),
    "ethiopia_addis": dict(
        latitude=9.0320,
        longitude=38.7497,
        elevation_m=2355,
        description="Ethiopian highlands",
        crop_type="teff",
        soil_expected="Vertisol (clay)",
    ),
}

# One row per site: transpose so the site ids become the index
sites_df = pd.DataFrame(TEST_SITES).T.rename_axis('site_id')
print("Test Sites:")
display(sites_df)

## 2. Fetch Soil Data (iSDA Africa / SoilGrids)

In [None]:
def fetch_soil_data(site_id: str, lat: float, lon: float):
    """Fetch a soil profile for a site, falling back through three sources.

    Sources are tried in order: iSDA Africa (queried with the coordinates),
    SoilGrids (queried by site id only), and finally the mock source. Any
    exception raised by a remote source is printed and the next one is tried.

    Args:
        site_id: Identifier passed to every source.
        lat: Latitude, forwarded to the iSDA source only.
        lon: Longitude, forwarded to the iSDA source only.

    Returns:
        Tuple ``(profile, source_tag)`` where ``source_tag`` is one of
        ``"isda"``, ``"soilgrids"`` or ``"mock"``.
    """
    # (tag, name used in failure message, name used in success message, fetcher).
    # Sources are built inside the lambdas so constructor errors are caught too.
    remote_attempts = (
        ("isda", "iSDA", "iSDA Africa",
         lambda: IsdaAfricaSoilSource().fetch_soil_profile(site_id, latitude=lat, longitude=lon)),
        ("soilgrids", "SoilGrids", "SoilGrids",
         lambda: SoilGridsSource().fetch_soil_profile(site_id)),
    )

    for tag, short_name, long_name, fetch in remote_attempts:
        try:
            fetched = fetch()
            print(f"‚úì Fetched from {long_name}")
            return fetched, tag
        except Exception as exc:
            print(f"{short_name} failed: {exc}")

    # Last resort: mock data (not wrapped in a try, so its errors propagate)
    fallback_profile = MockSoilSource().fetch_soil_profile(site_id)
    print(f"‚ö† Using mock soil data")
    return fallback_profile, "mock"

# Collect one soil profile per test site and print its key properties
soil_profiles = {}

for site_id, site_info in TEST_SITES.items():
    print(f"\n{'='*50}")
    print(f"Site: {site_id}")
    print(f"Location: ({site_info['latitude']}, {site_info['longitude']})")

    profile, source = fetch_soil_data(
        site_id, site_info['latitude'], site_info['longitude']
    )
    soil_profiles[site_id] = profile

    print(f"\nSoil Properties:")
    # (label, profile attribute, format spec, unit suffix)
    for label, attr, spec, unit in (
        ("Sand", "sand_percent", ".1f", "%"),
        ("Clay", "clay_percent", ".1f", "%"),
        ("Silt", "silt_percent", ".1f", "%"),
        ("Porosity", "porosity", ".3f", ""),
        ("Field Capacity", "field_capacity", ".3f", ""),
        ("Wilting Point", "wilting_point", ".3f", ""),
        ("Ksat", "saturated_hydraulic_conductivity_cm_day", ".1f", " cm/day"),
    ):
        print(f"  {label}: {getattr(profile, attr):{spec}}{unit}")

## 3. Fetch Weather Data (Open-Meteo)

In [None]:
# Analysis window; both endpoints are inclusive
START_DATE, END_DATE = date(2023, 1, 1), date(2023, 12, 31)

print("Analysis period: {} to {}".format(START_DATE, END_DATE))
# +1 because the end date itself belongs to the period
print("Duration: {} days".format((END_DATE - START_DATE).days + 1))

def fetch_weather_data(site_id: str, lat: float, lon: float,
                       start_date: date, end_date: date):
    """Fetch daily weather for one site from Open-Meteo.

    Args:
        site_id: Identifier placed in the fetch request.
        lat: Latitude returned by the patched coordinate lookup.
        lon: Longitude returned by the patched coordinate lookup.
        start_date: First day of the request.
        end_date: Last day of the request.

    Returns:
        Whatever ``fetch_daily_weather`` returns, or an empty list when the
        fetch (or the success report) raises.
    """
    source = OpenMeteoSource()

    # Forecast data is explicitly excluded from the request
    request = DataFetchRequest(
        site_id=site_id,
        start_date=start_date,
        end_date=end_date,
        parameters={"include_forecast": False},
    )

    # Replace the source's private coordinate lookup so it always reports the
    # coordinates supplied by the caller.
    def _fixed_coordinates(s):
        return (lat, lon)

    source._get_site_coordinates = _fixed_coordinates

    try:
        daily_records = source.fetch_daily_weather(request)
        print(f"‚úì Fetched {len(daily_records)} days of weather data")
    except Exception as exc:
        # Best effort: report the problem and hand back an empty result
        print(f"Weather fetch failed: {exc}")
        daily_records = []
    return daily_records

# Download the weather series of every test site for the analysis window
weather_data = {}

for site_id, site_info in TEST_SITES.items():
    print(f"\nFetching weather for {site_id}...")
    weather = fetch_weather_data(
        site_id=site_id,
        lat=site_info['latitude'],
        lon=site_info['longitude'],
        start_date=START_DATE,
        end_date=END_DATE,
    )
    weather_data[site_id] = weather

In [None]:
# Turn each site's weather records into a date-indexed DataFrame
weather_dfs = {}

for site_id, data in weather_data.items():
    # Sites whose fetch returned nothing are left out
    if not data:
        continue

    df = (
        pd.DataFrame([d.dict() for d in data])
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .set_index('date')
        .sort_index()
    )
    weather_dfs[site_id] = df

    # Quick summary
    print(f"\n{site_id}:")
    print(f"  Total precipitation: {df['precipitation_mm'].sum():.1f} mm")
    print(f"  Total ET0: {df['et0_mm'].sum():.1f} mm")
    print(f"  Mean temperature: {df['temperature_mean_c'].mean():.1f}¬∞C")

## 4. Fetch Satellite NDVI Data

In [None]:
def fetch_ndvi_data(site_id: str, lat: float, lon: float,
                    start_date: date, end_date: date):
    """Fetch NDVI observations for one site from the MODIS source.

    Args:
        site_id: Identifier placed in the fetch request.
        lat: Latitude returned by the patched coordinate lookup.
        lon: Longitude returned by the patched coordinate lookup.
        start_date: First day of the request.
        end_date: Last day of the request.

    Returns:
        The ``data`` attribute of the fetch result, or an empty list when the
        fetch (or the success report) raises.
    """
    source = MODISNDVISource()

    # Replace the source's private coordinate lookup so it always reports the
    # coordinates supplied by the caller.
    def _fixed_coordinates(s):
        return (lat, lon)

    source._get_site_coordinates = _fixed_coordinates

    request = DataFetchRequest(
        site_id=site_id,
        start_date=start_date,
        end_date=end_date,
    )

    try:
        result = source.fetch(request)
        print(f"‚úì Fetched {len(result.data)} NDVI observations (source: {result.metadata.get('source', 'unknown')})")
        observations_list = result.data
    except Exception as exc:
        # Best effort: report the problem and hand back an empty result
        print(f"NDVI fetch failed: {exc}")
        observations_list = []
    return observations_list

# Download the NDVI series of every test site for the analysis window
ndvi_data = {}

for site_id, site_info in TEST_SITES.items():
    print(f"\nFetching NDVI for {site_id}...")
    ndvi = fetch_ndvi_data(
        site_id=site_id,
        lat=site_info['latitude'],
        lon=site_info['longitude'],
        start_date=START_DATE,
        end_date=END_DATE,
    )
    ndvi_data[site_id] = ndvi

In [None]:
# Turn each site's NDVI records into a date-indexed DataFrame
ndvi_dfs = {}

for site_id, data in ndvi_data.items():
    # Sites whose fetch returned nothing are left out
    if not data:
        continue

    df = (
        pd.DataFrame([d.dict() for d in data])
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .set_index('date')
        .sort_index()
    )
    ndvi_dfs[site_id] = df

    print(f"{site_id}: NDVI range [{df['ndvi'].min():.3f}, {df['ndvi'].max():.3f}], mean={df['ndvi'].mean():.3f}")

## 5. Build Canonical Data Tables

In [None]:
def build_canonical_table(site_id: str, weather_df: pd.DataFrame,
                          ndvi_df: pd.DataFrame, soil_profile) -> pd.DataFrame:
    """Build the canonical daily table for one site from all data sources.

    The weather frame provides the daily index. Vegetation indices are
    left-joined onto it and gap-filled, static soil properties are broadcast
    to every row, and rolling/lagged precipitation features are appended.

    Args:
        site_id: Site identifier (currently not used inside the function).
        weather_df: Date-indexed frame with at least ``precipitation_mm`` and
            ``et0_mm`` columns. It is copied, never modified.
        ndvi_df: Date-indexed frame that may contain ``ndvi`` and/or ``evi``.
            Either column may be absent; a missing ``ndvi`` yields an all-NaN
            ``ndvi`` column instead of a KeyError.
        soil_profile: Object exposing ``sand_percent``, ``clay_percent``,
            ``porosity``, ``field_capacity`` and ``wilting_point``.

    Returns:
        A new DataFrame indexed like ``weather_df`` with the added columns.
    """
    # Start with weather data
    canonical = weather_df.copy()

    # Join only the vegetation columns that actually exist; selecting a
    # missing 'evi' (or 'ndvi') would raise a KeyError.
    vegetation_cols = [col for col in ('ndvi', 'evi') if col in ndvi_df.columns]
    if vegetation_cols:
        canonical = canonical.join(ndvi_df[vegetation_cols], how='left')

    # Downstream code expects an 'ndvi' column to exist even when no NDVI
    # was available for the site.
    if 'ndvi' not in canonical.columns:
        canonical['ndvi'] = np.nan

    # Forward fill missing values between composites (16-day composite),
    # then back fill the days before the first observation.
    for col in ('ndvi', 'evi'):
        if col in canonical.columns:
            canonical[col] = canonical[col].ffill().bfill()

    # Add static soil properties (same value on every row)
    canonical['sand_percent'] = soil_profile.sand_percent
    canonical['clay_percent'] = soil_profile.clay_percent
    canonical['porosity'] = soil_profile.porosity
    canonical['field_capacity'] = soil_profile.field_capacity
    canonical['wilting_point'] = soil_profile.wilting_point

    # Calculate derived features; the first 6 rows of the 7-day windows are
    # NaN because the rolling window is not yet full.
    canonical['precip_cumsum_7d'] = canonical['precipitation_mm'].rolling(7).sum()
    canonical['et0_cumsum_7d'] = canonical['et0_mm'].rolling(7).sum()
    canonical['water_balance_7d'] = canonical['precip_cumsum_7d'] - canonical['et0_cumsum_7d']

    # Antecedent conditions
    canonical['precip_1d_lag'] = canonical['precipitation_mm'].shift(1)
    canonical['precip_3d_sum'] = canonical['precipitation_mm'].rolling(3).sum()

    return canonical

# Build canonical tables for all sites
# A site needs both weather and NDVI data; sites missing either are reported
# and skipped so the gap is visible instead of silent.
canonical_tables = {}

for site_id in TEST_SITES.keys():
    if site_id in weather_dfs and site_id in ndvi_dfs:
        canonical = build_canonical_table(
            site_id,
            weather_dfs[site_id],
            ndvi_dfs[site_id],
            soil_profiles[site_id]
        )
        canonical_tables[site_id] = canonical
        print(f"‚úì Built canonical table for {site_id}: {len(canonical)} rows, {len(canonical.columns)} columns")
    else:
        missing_inputs = [
            label for label, tables in (('weather', weather_dfs), ('NDVI', ndvi_dfs))
            if site_id not in tables
        ]
        print(f"‚ö† Skipping {site_id}: missing {', '.join(missing_inputs)} data")

# Show sample
# Prefer the original example site, but fall back to any site that was built
# so a failed fetch for that one site does not raise a KeyError here.
if 'tunisia_sfax' in canonical_tables:
    sample_site = 'tunisia_sfax'
else:
    sample_site = next(iter(canonical_tables), None)

if sample_site is None:
    print("\nNo canonical tables were built - check the fetch steps above")
else:
    print(f"\nSample canonical table ({sample_site}):")
    display(canonical_tables[sample_site].head())

## 6. Run Physics Model

In [None]:
def run_physics_model(canonical_df: pd.DataFrame, soil_profile,
                      default_et0_mm: float = 3.0,
                      default_ndvi: float = 0.5) -> pd.DataFrame:
    """
    Run the two-bucket water balance model day by day.

    Args:
        canonical_df: Daily table with ``precipitation_mm`` and ``et0_mm``
            columns and optionally ``ndvi``; its index becomes the ``date``
            index of the result.
        soil_profile: Object exposing the texture and hydraulic attributes
            copied into ``SoilParameters`` below.
        default_et0_mm: ET0 used for days where ``et0_mm`` is missing.
        default_ndvi: NDVI used for days where ``ndvi`` is missing or absent.

    Returns:
        DataFrame indexed by ``date`` that always has the same columns:
        ``theta_surface``, ``theta_root``, the six flux columns and
        ``water_balance_error``. Days on which the model raised are kept as
        all-NaN rows, and an empty input gives an empty frame with these columns.
    """
    # Fixed schema: without it, failed days (which only record theta_*) or an
    # empty input would produce a frame lacking columns that callers read,
    # e.g. 'water_balance_error'.
    flux_names = ('evaporation', 'transpiration', 'evapotranspiration',
                  'drainage', 'runoff', 'infiltration')
    output_columns = ['date', 'theta_surface', 'theta_root', *flux_names,
                      'water_balance_error']

    # Create soil parameters
    soil_params = SoilParameters(
        sand_percent=soil_profile.sand_percent,
        silt_percent=soil_profile.silt_percent,
        clay_percent=soil_profile.clay_percent,
        porosity=soil_profile.porosity,
        field_capacity=soil_profile.field_capacity,
        wilting_point=soil_profile.wilting_point,
        saturated_hydraulic_conductivity_cm_day=soil_profile.saturated_hydraulic_conductivity_cm_day
    )

    # Create model
    model = create_two_bucket_model(soil_params)

    # Run simulation
    results = []

    for idx, row in canonical_df.iterrows():
        # Handle missing values: no rain is assumed when precipitation is
        # missing; ET0 and NDVI fall back to the configurable defaults.
        precip = row['precipitation_mm'] if pd.notna(row['precipitation_mm']) else 0.0
        et0 = row['et0_mm'] if pd.notna(row['et0_mm']) else default_et0_mm
        ndvi = row.get('ndvi', np.nan)
        if pd.isna(ndvi):
            ndvi = default_ndvi

        # Run daily step
        try:
            result = model.run_daily(
                precipitation_mm=precip,
                et0_mm=et0,
                ndvi=ndvi,
                check_water_balance=True
            )

            record = {
                'date': idx,
                'theta_surface': result.theta_surface,
                'theta_root': result.theta_root,
                'water_balance_error': result.water_balance_error,
            }
            # Fluxes the model did not report default to 0
            record.update({name: result.fluxes.get(name, 0) for name in flux_names})
            results.append(record)
        except Exception as e:
            # Keep the day as a NaN row so the output stays aligned with the
            # input index; the simulation continues with the next day.
            print(f"Error on {idx}: {e}")
            results.append({
                'date': idx,
                'theta_surface': np.nan,
                'theta_root': np.nan
            })

    # Keys absent from a record (failed days) are filled with NaN
    results_df = pd.DataFrame(results, columns=output_columns).set_index('date')
    return results_df

# Simulate soil moisture for every site that has a canonical table
physics_results = {}

for site_id, canonical in canonical_tables.items():
    print(f"\nRunning physics model for {site_id}...")
    results = run_physics_model(canonical, soil_profiles[site_id])
    physics_results[site_id] = results

    # Summary statistics: mean and range per layer, then the mean absolute
    # water balance error
    for layer_label, column in (("Surface SM", 'theta_surface'), ("Root SM", 'theta_root')):
        print(f"  {layer_label}: mean={results[column].mean():.3f}, range=[{results[column].min():.3f}, {results[column].max():.3f}]")
    print(f"  Water balance error: mean={results['water_balance_error'].abs().mean():.6f} mm")

## 7. Generate Reference Data (Synthetic Observations)

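Since we don't have actual field measurements, we'll generate synthetic "observed" data based on:
- Physics model output with added noise
- Plausible sensor measurement errors
- Some systematic bias

**Caveat:** because these "observations" are derived from the physics model's own output, the metrics below only measure how well the model signal survives the injected noise and bias. They exercise the validation pipeline but say nothing about real-world skill; that requires independent field or satellite measurements.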

In [None]:
def generate_synthetic_observations(physics_df: pd.DataFrame,
                                    soil_profile,
                                    noise_std: float = 0.03,
                                    bias: float = 0.0,
                                    missing_fraction: float = 0.1,
                                    seed=42) -> pd.DataFrame:
    """
    Generate synthetic soil moisture observations.

    This simulates what we might get from:
    - In-situ sensors with measurement noise
    - Satellite soil moisture products (SMAP, SMOS)
    - GRAFS model outputs

    Args:
        physics_df: Frame with ``theta_surface`` and ``theta_root`` columns;
            its index is reused for the result.
        soil_profile: Object exposing ``wilting_point`` and ``porosity``,
            used as lower/upper clipping limits.
        noise_std: Standard deviation of the Gaussian noise added to the
            surface series (the root series uses 0.8 times this value).
        bias: Mean of the surface noise (the root series uses half of it).
        missing_fraction: Probability that a surface observation is replaced
            by NaN. Root observations are never removed.
        seed: Seed of the private random generator. The default reproduces
            the values of the former ``np.random.seed(42)`` implementation;
            pass ``None`` for a non-deterministic draw.

    Returns:
        DataFrame with columns ``sm_obs_surface`` and ``sm_obs_root``.
    """
    # Private legacy generator: same stream as np.random.seed(seed) followed
    # by np.random.* calls, but the global NumPy RNG state is left untouched.
    rng = np.random.RandomState(seed)
    n_rows = len(physics_df)

    obs = pd.DataFrame(index=physics_df.index)

    # Surface observations (like satellite retrievals)
    obs['sm_obs_surface'] = (
        physics_df['theta_surface']
        + rng.normal(bias, noise_std, n_rows)
    )

    # Root zone observations (like sensor measurements)
    obs['sm_obs_root'] = (
        physics_df['theta_root']
        + rng.normal(bias * 0.5, noise_std * 0.8, n_rows)
    )

    # Clip to physical limits
    wp = soil_profile.wilting_point
    por = soil_profile.porosity
    obs['sm_obs_surface'] = obs['sm_obs_surface'].clip(wp, por)
    obs['sm_obs_root'] = obs['sm_obs_root'].clip(wp, por)

    # Add some missing values (surface layer only)
    mask = rng.random_sample(len(obs)) < missing_fraction
    obs.loc[mask, 'sm_obs_surface'] = np.nan

    return obs

# Derive one synthetic observation table per simulated site
observations = {}

for site_id, physics_df in physics_results.items():
    obs = generate_synthetic_observations(
        physics_df,
        soil_profiles[site_id],
        noise_std=0.035,
        bias=0.01,
        missing_fraction=0.15,
    )
    observations[site_id] = obs
    # Count the non-missing values of each layer
    print(f"{site_id}: Generated {obs['sm_obs_surface'].notna().sum()} surface obs, {obs['sm_obs_root'].notna().sum()} root obs")

## 8. Compute Validation Metrics

In [None]:
# Validation engine configured with min_samples=10
validator = ValidationEngine(min_samples=10)

# Per-site metrics: {site_id: {'surface': metrics, 'root': metrics}}
all_metrics = {}
rule = '=' * 60

for site_id, physics in physics_results.items():
    obs = observations[site_id]

    # Observed values go first, model predictions second
    surface_metrics = validator.compute_metrics(
        obs['sm_obs_surface'].values, physics['theta_surface'].values
    )
    root_metrics = validator.compute_metrics(
        obs['sm_obs_root'].values, physics['theta_root'].values
    )

    all_metrics[site_id] = dict(surface=surface_metrics, root=root_metrics)

    print(f"\n{rule}")
    print(f"Site: {site_id}")
    print(f"{rule}")
    for heading, layer_metrics in (
        ("Surface Layer (0-10cm)", surface_metrics),
        ("Root Zone (10-40cm)", root_metrics),
    ):
        print(f"\n{heading}:")
        print(layer_metrics.summary())

In [None]:
# Flatten the nested metrics into one record per (site, layer) pair
summary_data = [
    {
        'Site': site_id,
        'Layer': layer,
        'RMSE': m.rmse,
        'MAE': m.mae,
        'Bias': m.bias,
        'R¬≤': m.r_squared,
        'NSE': m.nse,
        'KGE': m.kge,
        'N': m.n_valid,
    }
    for site_id, metrics in all_metrics.items()
    for layer, m in metrics.items()
]

summary_df = pd.DataFrame(summary_data)

print("\nValidation Metrics Summary:")
display(summary_df.round(4))

## 9. Visualize Results

In [None]:
# Plot time series for each site
# Left column: surface layer with precipitation bars on an inverted twin axis.
# Right column: root zone with a metrics annotation.

# Only sites that made it through the physics run can be plotted; indexing
# physics_results with every TEST_SITES key would raise a KeyError for
# sites that were skipped earlier.
plot_sites = [site_id for site_id in TEST_SITES if site_id in physics_results]

if not plot_sites:
    print("No physics results available - nothing to plot")
else:
    # squeeze=False keeps `axes` two-dimensional even with a single site
    fig, axes = plt.subplots(len(plot_sites), 2, figsize=(16, 4*len(plot_sites)), squeeze=False)

    for i, site_id in enumerate(plot_sites):
        physics = physics_results[site_id]
        obs = observations[site_id]
        canonical = canonical_tables[site_id]

        # Left: Surface soil moisture
        ax1 = axes[i, 0]
        ax1.plot(physics.index, physics['theta_surface'], 'b-', label='Physics Model', alpha=0.8)
        ax1.scatter(obs.index, obs['sm_obs_surface'], c='r', s=10, alpha=0.5, label='Observations')
        ax1.axhline(y=soil_profiles[site_id].field_capacity, color='g', linestyle='--', alpha=0.5, label='Field Capacity')
        ax1.axhline(y=soil_profiles[site_id].wilting_point, color='orange', linestyle='--', alpha=0.5, label='Wilting Point')
        ax1.set_ylabel('VWC (m¬≥/m¬≥)')
        ax1.set_title(f'{site_id} - Surface Layer (0-10cm)')
        if i == 0:
            # One legend is enough; every row uses the same styles
            ax1.legend(loc='upper right')
        ax1.set_ylim(0, 0.6)
        ax1.grid(True, alpha=0.3)

        # Add precipitation bars (inverted so they hang from the top)
        ax1_twin = ax1.twinx()
        ax1_twin.bar(canonical.index, canonical['precipitation_mm'], alpha=0.2, color='blue', width=1)
        ax1_twin.set_ylabel('Precip (mm)', color='blue')
        ax1_twin.set_ylim(0, 100)
        ax1_twin.invert_yaxis()

        # Right: Root zone soil moisture
        ax2 = axes[i, 1]
        ax2.plot(physics.index, physics['theta_root'], 'b-', label='Physics Model', alpha=0.8)
        ax2.scatter(obs.index, obs['sm_obs_root'], c='r', s=10, alpha=0.5, label='Observations')
        ax2.axhline(y=soil_profiles[site_id].field_capacity, color='g', linestyle='--', alpha=0.5)
        ax2.axhline(y=soil_profiles[site_id].wilting_point, color='orange', linestyle='--', alpha=0.5)
        ax2.set_ylabel('VWC (m¬≥/m¬≥)')
        ax2.set_title(f'{site_id} - Root Zone (10-40cm)')
        ax2.set_ylim(0, 0.6)
        ax2.grid(True, alpha=0.3)

        # Add metrics annotation
        m = all_metrics[site_id]['root']
        ax2.text(0.98, 0.95, f'R¬≤={m.r_squared:.3f}\nRMSE={m.rmse:.3f}\nNSE={m.nse:.3f}',
                 transform=ax2.transAxes, ha='right', va='top', fontsize=9,
                 bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.tight_layout()
    # The output directory is otherwise only created by the final "save
    # results" cell, so a fresh checkout would fail here without this mkdir.
    timeseries_path = Path('../data/features/validation_timeseries.png')
    timeseries_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(timeseries_path, dpi=150, bbox_inches='tight')
    plt.show()
    print("‚úì Saved validation_timeseries.png")

In [None]:
# Scatter plots: Predicted vs Observed
# Top row: surface layer, bottom row: root zone. The first two plotted sites
# go into the left column, the remaining ones into the right column.
fig, axes = plt.subplots(2, 2, figsize=(12, 12))

colors = plt.cm.tab10.colors

# Skip sites that have no physics results (e.g. a failed data fetch);
# indexing them would raise a KeyError.
plot_sites = [site_id for site_id in TEST_SITES if site_id in physics_results]

for i, site_id in enumerate(plot_sites):
    physics = physics_results[site_id]
    obs = observations[site_id]

    # Surface layer
    ax1 = axes[0, 0] if i < 2 else axes[0, 1]
    mask = ~obs['sm_obs_surface'].isna()
    ax1.scatter(obs.loc[mask, 'sm_obs_surface'], physics.loc[mask, 'theta_surface'],
                c=[colors[i]], alpha=0.3, s=10, label=site_id)

    # Root zone
    ax2 = axes[1, 0] if i < 2 else axes[1, 1]
    mask = ~obs['sm_obs_root'].isna()
    ax2.scatter(obs.loc[mask, 'sm_obs_root'], physics.loc[mask, 'theta_root'],
                c=[colors[i]], alpha=0.3, s=10, label=site_id)

# Add 1:1 lines and labels
for ax in axes.flat:
    lims = [0.05, 0.55]
    ax.plot(lims, lims, 'k--', alpha=0.5, label='1:1 line')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.set_xlabel('Observed VWC (m¬≥/m¬≥)')
    ax.set_ylabel('Predicted VWC (m¬≥/m¬≥)')
    ax.grid(True, alpha=0.3)
    ax.legend(loc='upper left', fontsize=8)
    ax.set_aspect('equal')

axes[0, 0].set_title('Surface Layer - Sites 1-2')
axes[0, 1].set_title('Surface Layer - Sites 3-4')
axes[1, 0].set_title('Root Zone - Sites 1-2')
axes[1, 1].set_title('Root Zone - Sites 3-4')

plt.tight_layout()
# The output directory is otherwise only created by the final "save results"
# cell, so make sure it exists before writing the figure.
scatter_path = Path('../data/features/validation_scatter.png')
scatter_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(scatter_path, dpi=150, bbox_inches='tight')
plt.show()
print("‚úì Saved validation_scatter.png")

In [None]:
# Bar chart of metrics across sites
# One panel per metric, with surface and root-zone bars side by side per site.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

metrics_to_plot = ['RMSE', 'MAE', 'R¬≤', 'NSE', 'KGE', 'Bias']

# Index once by (Site, Layer) instead of filtering the frame for every bar
metric_table = summary_df.set_index(['Site', 'Layer'])

# Only sites with metrics for both layers can be drawn; a site skipped
# upstream would otherwise fail with an IndexError on `.values[0]`.
sites = [
    s for s in TEST_SITES
    if (s, 'surface') in metric_table.index and (s, 'root') in metric_table.index
]

x = np.arange(len(sites))
width = 0.35

for i, metric in enumerate(metrics_to_plot):
    ax = axes.flat[i]

    surface_vals = [metric_table.loc[(s, 'surface'), metric] for s in sites]
    root_vals = [metric_table.loc[(s, 'root'), metric] for s in sites]

    bars1 = ax.bar(x - width/2, surface_vals, width, label='Surface', color='skyblue')
    bars2 = ax.bar(x + width/2, root_vals, width, label='Root Zone', color='coral')

    ax.set_ylabel(metric)
    ax.set_title(f'{metric} by Site')
    ax.set_xticks(x)
    ax.set_xticklabels([s.replace('_', '\n') for s in sites], fontsize=8)

    # Add reference lines for good performance
    if metric in ['R¬≤', 'NSE', 'KGE']:
        ax.axhline(y=0.7, color='green', linestyle='--', alpha=0.5, label='Good (0.7)')
    elif metric in ['RMSE', 'MAE']:
        ax.axhline(y=0.05, color='green', linestyle='--', alpha=0.5, label='Target (0.05)')
    elif metric == 'Bias':
        ax.axhline(y=0, color='green', linestyle='--', alpha=0.5)

    # The legend is drawn after the reference lines so that their labels
    # appear; drawn earlier, the labelled lines never show up in it.
    ax.legend(fontsize=8)

plt.tight_layout()
# The output directory is otherwise only created by the final "save results"
# cell, so make sure it exists before writing the figure.
comparison_path = Path('../data/features/validation_metrics_comparison.png')
comparison_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(comparison_path, dpi=150, bbox_inches='tight')
plt.show()
print("‚úì Saved validation_metrics_comparison.png")

## 10. Analysis and Conclusions

In [None]:
# Pooled performance: all sites concatenated into one sample per layer
banner = "=" * 70
print(banner)
print("PHYSICS MODEL VALIDATION SUMMARY")
print(banner)


def _pool_pairs(obs_column, pred_column):
    """Concatenate the non-missing observations of all sites with the model values on the same dates."""
    obs_parts, pred_parts = [], []
    for site, site_obs in observations.items():
        valid = site_obs[obs_column].dropna()
        obs_parts.append(valid.values)
        # Select the predictions on exactly the dates that have an observation
        pred_parts.append(physics_results[site].loc[valid.index, pred_column].values)
    return np.concatenate(obs_parts), np.concatenate(pred_parts)


# Calculate overall metrics
all_obs_surface, all_pred_surface = _pool_pairs('sm_obs_surface', 'theta_surface')
all_obs_root, all_pred_root = _pool_pairs('sm_obs_root', 'theta_root')

overall_surface = validator.compute_metrics(all_obs_surface, all_pred_surface)
overall_root = validator.compute_metrics(all_obs_root, all_pred_root)

divider = "-" * 50
print("\nOVERALL PERFORMANCE (All Sites Combined):")
print(divider)
print(f"{'Metric':<15} {'Surface':<15} {'Root Zone':<15}")
print(divider)
# (row label, metrics attribute, format spec applied to both layers)
for row_label, attr, spec in (
    ('RMSE', 'rmse', '<15.4f'),
    ('MAE', 'mae', '<15.4f'),
    ('Bias', 'bias', '<+15.4f'),
    ('R¬≤', 'r_squared', '<15.4f'),
    ('NSE', 'nse', '<15.4f'),
    ('KGE', 'kge', '<15.4f'),
    ('N samples', 'n_valid', '<15'),
):
    print(f"{row_label:<15} {getattr(overall_surface, attr):{spec}} {getattr(overall_root, attr):{spec}}")

print("\n" + banner)
print("PERFORMANCE BENCHMARKS:")
print(banner)
for benchmark_line in (
    "RMSE < 0.05 m¬≥/m¬≥  : Good for soil moisture",
    "R¬≤ > 0.70          : Strong correlation",
    "NSE > 0.50         : Acceptable model performance",
    "KGE > 0.50         : Good overall performance",
    "|Bias| < 0.02      : Low systematic error",
):
    print(benchmark_line)

In [None]:
# Key findings
# Ranks sites by R¬≤ per layer and interprets the sign of the average bias.
print("\n" + "="*70)
print("KEY FINDINGS:")
print("="*70)

# Best performing site
best_site_surface = summary_df[summary_df['Layer']=='surface'].sort_values('R¬≤', ascending=False).iloc[0]
best_site_root = summary_df[summary_df['Layer']=='root'].sort_values('R¬≤', ascending=False).iloc[0]

print(f"\n1. Best surface layer performance: {best_site_surface['Site']} (R¬≤={best_site_surface['R¬≤']:.3f})")
print(f"2. Best root zone performance: {best_site_root['Site']} (R¬≤={best_site_root['R¬≤']:.3f})")

# Sites needing improvement (lowest surface-layer R¬≤)
worst_site = summary_df[summary_df['Layer']=='surface'].sort_values('R¬≤').iloc[0]
print(f"3. Site needing most improvement: {worst_site['Site']} (R¬≤={worst_site['R¬≤']:.3f})")

# Bias analysis (mean over every site and both layers)
# NOTE(review): the over/under wording assumes bias = predicted - observed;
# confirm against ValidationEngine.compute_metrics.
avg_bias = summary_df['Bias'].mean()
print(f"\n4. Average bias across all sites: {avg_bias:+.4f} m¬≥/m¬≥")
if avg_bias > 0:
    print("   ‚Üí Model tends to OVERESTIMATE soil moisture")
elif avg_bias < 0:
    print("   ‚Üí Model tends to UNDERESTIMATE soil moisture")
else:
    # Zero or NaN: neither comparison holds, so do not claim a direction
    print("   ‚Üí No systematic over- or underestimation detected")

# Recommendations
print("\n" + "="*70)
print("RECOMMENDATIONS:")
print("="*70)
print("1. Calibrate model parameters using site-specific observations")
print("2. Consider adding irrigation detection for agricultural sites")
print("3. Use ensemble of models for improved uncertainty quantification")
print("4. Validate against actual field measurements when available")
print("5. Consider seasonal calibration for regions with distinct wet/dry seasons")

In [None]:
# Save results to CSV
# Writes the metrics summary plus one combined table per site (inputs,
# physics output and synthetic observations side by side).
output_dir = Path('../data/features')
output_dir.mkdir(parents=True, exist_ok=True)

# Save metrics summary
summary_df.to_csv(output_dir / 'validation_metrics_summary.csv', index=False)
print(f"‚úì Saved validation_metrics_summary.csv")

# Save detailed results for each site
# Iterate over the sites that actually produced results: looping over
# TEST_SITES would raise a KeyError for any site skipped upstream.
for site_id, canonical in canonical_tables.items():
    if site_id not in physics_results or site_id not in observations:
        continue

    combined = canonical.copy()
    # Column assignment aligns on the shared date index
    combined['theta_surface_physics'] = physics_results[site_id]['theta_surface']
    combined['theta_root_physics'] = physics_results[site_id]['theta_root']
    combined['sm_obs_surface'] = observations[site_id]['sm_obs_surface']
    combined['sm_obs_root'] = observations[site_id]['sm_obs_root']

    combined.to_csv(output_dir / f'validation_results_{site_id}.csv')

print(f"‚úì Saved individual site results")
print(f"\nAll outputs saved to: {output_dir.absolute()}")