# NaN Pattern Comparison Report

This report shows representative examples of files with NaN pattern differences between datasets.
Files are grouped by model + variable, and one representative file per group is shown (up to 10 total).

In [None]:
# Parameters (injected by papermill)
# selected_files: one dict per representative file. The cells below read the
#   keys "group", "group_size", "filename", "ref_path", "new_path" and
#   "analysis"; "analysis" is itself a dict with "total_nan_diff",
#   "n_timesteps", "timesteps_with_diffs", "worst_timestep",
#   "worst_timestep_count" and "variables".
selected_files = []
# shapefile_path: vector file read with geopandas; its boundaries are drawn on
#   every map and its total bounds set the axis limits. A missing file only
#   triggers a warning in the plotting cell.
shapefile_path = "../shp/ecos.shp"
# qc_utils_path: optional directory prepended to sys.path by the import cell;
#   the empty-string default disables that step.
qc_utils_path = ""

In [None]:
import sys

# Prepend the papermill-supplied directory to sys.path (once) so modules stored
# there can be imported; an empty qc_utils_path leaves sys.path untouched.
if qc_utils_path and qc_utils_path not in sys.path:
    sys.path.insert(0, qc_utils_path)

import warnings
from pathlib import Path

import geopandas as gpd
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from matplotlib.colors import ListedColormap

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): presumably done to keep the rendered report uncluttered, but it
# also hides warnings from xarray/geopandas that could point at real problems.
warnings.filterwarnings("ignore")

# Report how many representative files were passed in via `selected_files`.
print(f"Analyzing {len(selected_files)} representative file(s)")

## Overview Statistics

Summary of NaN pattern analysis across the representative files shown in this report.

In [None]:
# Calculate overview statistics for the representative files in `selected_files`.
if not selected_files:
    print("No files with NaN differences.")
else:
    # Count distinct group labels rather than entries, in case two entries
    # carry the same label.
    total_groups = len({entry["group"] for entry in selected_files})
    total_nan_diffs = sum(
        entry["analysis"]["total_nan_diff"] for entry in selected_files
    )

    print(f"Total model+variable groups with differences: {total_groups}")
    print(f"Representative files shown: {len(selected_files)} (max 10)")
    print(f"Total NaN differences in shown files: {total_nan_diffs:,}")
    # Single backslash: emit a blank line before the note instead of printing
    # the two characters "\" and "n".
    print(
        "\nNote: Each file represents a group. Files in the same group have similar NaN patterns."
    )

## Summary by Group

Detailed statistics grouped by model + variable.

In [None]:
# Create summary table: one row per representative file, sorted so the files
# with the most NaN differences come first.
if selected_files:
    table_data = []
    for entry in selected_files:
        # Group labels are split at the first underscore into model and
        # variable. partition() never raises, so a label without "_" yields
        # the whole label as the model and an empty variable instead of
        # aborting the cell with a ValueError.
        model_name, _, variable_name = entry["group"].partition("_")
        analysis = entry["analysis"]
        table_data.append(
            {
                "Model": model_name,
                "Variable": variable_name,
                "Files in Group": entry["group_size"],
                "Representative File": entry["filename"],
                "Total NaN Diffs": analysis["total_nan_diff"],
                "Timesteps": analysis["n_timesteps"],
                "Timesteps with Diffs": analysis["timesteps_with_diffs"],
                "Worst Timestep Diff": analysis["worst_timestep_count"],
            }
        )

    df = pd.DataFrame(table_data)
    df = df.sort_values("Total NaN Diffs", ascending=False)

    print(df.to_string(index=False))
else:
    print("No data to display.")

## NaN Pattern Visualizations

For each representative file, showing the timestep with the greatest NaN pattern difference.

In [None]:
def _select_timestep(data_array, time_idx):
    """Return the values of ``data_array`` with its time axis (if any) removed.

    Parameters
    ----------
    data_array : xarray.DataArray
        Variable taken from one of the compared datasets.
    time_idx : int
        Position along ``time`` to extract when that axis has more than one
        entry.

    Returns
    -------
    numpy.ndarray
        ``data_array.values`` unchanged when the variable has no ``time``
        dimension; otherwise the slice at ``time_idx`` (or at 0 for a
        length-1 axis, so the singleton axis does not reach the plot).
    """
    if "time" not in data_array.dims:
        return data_array.values
    index = time_idx if data_array.sizes["time"] > 1 else 0
    return data_array.isel(time=index).values


def _classify_nan_pattern(data_ref, data_new):
    """Encode where NaNs occur in two equally shaped arrays.

    Returns an unsigned-integer array with the codes
    0 = valid in both, 1 = NaN in ``data_ref`` only,
    2 = NaN in ``data_new`` only, 3 = NaN in both.
    """
    nan_ref = np.isnan(data_ref)
    nan_new = np.isnan(data_new)
    pattern = np.zeros(nan_ref.shape, dtype=np.uint8)
    pattern[nan_ref & ~nan_new] = 1  # NaN in ref only
    pattern[~nan_ref & nan_new] = 2  # NaN in new only
    pattern[nan_ref & nan_new] = 3  # NaN in both
    return pattern


# Load shapefile if available
if Path(shapefile_path).exists():
    gdf = gpd.read_file(shapefile_path)
    bounds = gdf.total_bounds
else:
    print(f"Warning: Shapefile not found at {shapefile_path}")
    gdf = None
    bounds = None

# Loop-invariant plot styling; list position == pattern code (0..3).
colors = ["white", "red", "blue", "lightgray"]
labels = [
    "Valid in both",
    "NaN→Valid (ref has NaN)",
    "Valid→NaN (new has NaN)",
    "NaN in both",
]
pattern_cmap = ListedColormap(colors)
patches = [
    mpatches.Patch(color=color, label=label) for color, label in zip(colors, labels)
]

# Plot each selected file
for i, file_info in enumerate(selected_files, 1):
    analysis = file_info["analysis"]
    print(f"\n[{i}/{len(selected_files)}] {file_info['group']}: {file_info['filename']}")
    print(f"  Total NaN differences: {analysis['total_nan_diff']:,}")
    print(
        f"  Showing timestep {analysis['worst_timestep']} "
        f"with {analysis['worst_timestep_count']:,} differences"
    )

    fig = None
    try:
        # Context managers close both files even if reading or plotting fails.
        with xr.open_dataset(file_info["ref_path"]) as ds_ref, xr.open_dataset(
            file_info["new_path"]
        ) as ds_new:
            # Only the first listed variable is visualised.
            var = analysis["variables"][0]
            time_idx = analysis["worst_timestep"]

            # .values materialises the arrays, so they stay usable after the
            # datasets are closed.
            data_ref = _select_timestep(ds_ref[var], time_idx)
            data_new = _select_timestep(ds_new[var], time_idx)
            lon = ds_ref.lon.values
            lat = ds_ref.lat.values

        nan_pattern = _classify_nan_pattern(data_ref, data_new)
        n_nan_to_valid = int(np.count_nonzero(nan_pattern == 1))
        n_valid_to_nan = int(np.count_nonzero(nan_pattern == 2))

        # Without a shapefile, fall back to this file's own grid extent. The
        # fallback is computed per file so one file's extent is never reused
        # for another file's map.
        if bounds is not None:
            plot_bounds = bounds
        else:
            plot_bounds = [lon.min(), lat.min(), lon.max(), lat.max()]

        fig, ax = plt.subplots(1, 1, figsize=(12, 8))
        fig.suptitle(
            f"{file_info['group']}: {file_info['filename']}\n"
            f"Timestep {time_idx}: {n_nan_to_valid:,} NaN→Valid, {n_valid_to_nan:,} Valid→NaN",
            fontsize=12,
            fontweight="bold",
        )

        # vmin/vmax pin the four codes to the four colormap entries even when
        # a code is absent from this particular timestep.
        ax.pcolormesh(lon, lat, nan_pattern, cmap=pattern_cmap, vmin=0, vmax=3)

        if gdf is not None:
            gdf.boundary.plot(ax=ax, edgecolor="black", linewidth=1.5)

        ax.set_xlim(plot_bounds[0], plot_bounds[2])
        ax.set_ylim(plot_bounds[1], plot_bounds[3])
        ax.set_xlabel("Longitude", fontsize=10)
        ax.set_ylabel("Latitude", fontsize=10)
        ax.set_title("NaN Pattern Differences (Timestep with Max Diff)", fontsize=10)

        # Legend
        ax.legend(handles=patches, loc="lower left", fontsize=8)

        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Best effort: report the failure and move on to the next file so one
        # unreadable dataset does not abort the whole report.
        print(f"  ERROR: {type(e).__name__}: {e}")
    finally:
        # Release the figure so memory does not grow with the number of files.
        if fig is not None:
            plt.close(fig)

## Summary

This report analyzed representative files from each model+variable group that showed NaN pattern differences. Files within the same group exhibit similar NaN patterns and are therefore not all shown to keep the report concise.

**Key Points:**
- Each visualization represents one model+variable combination
- The timestep shown is the one with the maximum NaN pattern difference
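- Red areas: Reference (original) dataset has NaN, new dataset has valid data
- Blue areas: Reference (original) dataset has valid data, new dataset has NaN
- Gray areas: Both datasets have NaN
- White areas: Both datasets have valid data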