# Cryo-TEMPO NetCDF Dataset Structure and Variable Inspection

## Overview
This notebook inspects the structure and variables of the **first observational file** from the Cryo-TEMPO sea ice product for **July 2010**. The Cryo-TEMPO (CryoSat ThEMatic PrOducts) sea ice product provides along-track sea ice freeboard measurements; the file inspected here covers the Antarctic region.

### Dataset Information
| Property | Value |
|----------|-------|
| **File** | `CS_OFFL_SIR_TDP_SI_ANTARC_20100716T000456_20100716T000635_02_03041_C001.nc` |
| **Date** | July 16, 2010 |
| **Region** | Antarctic |
| **Product Type** | TDP_SI (Thematic Data Product - Sea Ice) |
| **Orbit Number** | 03041 |

### Objectives
1. Load and validate the NetCDF file structure
2. Extract and display global attributes (metadata)
3. List all dimensions with their sizes
4. Enumerate all variables with their properties (dtype, shape, units, description)
5. Generate summary statistics for key numeric variables
6. Identify data quality indicators and potential issues

In [1]:
"""
Cryo-TEMPO NetCDF Dataset Inspector

This module provides comprehensive inspection of Cryo-TEMPO sea ice NetCDF files,
following Google Python Style Guide and Amazon engineering best practices.

Author: Xinlong Liu
Date: December 2025
Version: 1.0.0
"""

import os
import logging
from typing import Dict, List, Tuple, Any, Optional
from dataclasses import dataclass
from datetime import datetime

import numpy as np
import pandas as pd
import xarray as xr
import netCDF4 as nc

# Configure logging following Google/Amazon standards
# NOTE: logging.basicConfig() only configures the root logger when it has no
# handlers yet. Once this call has run, any later basicConfig() call in the
# same kernel is ignored unless it passes force=True.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger named after the current module.
logger = logging.getLogger(__name__)


@dataclass
class VariableInfo:
    """Data class to store variable metadata following Google style.

    NOTE(review): nothing in the visible part of this notebook instantiates
    this class, so the field notes below are derived from the field names and
    annotations only - confirm against the actual producer before relying on
    them.
    """
    # Variable name.
    name: str
    # Data type rendered as a string.
    dtype: str
    # Array shape, one entry per dimension.
    shape: Tuple[int, ...]
    # Dimension names; presumably aligned with ``shape`` - TODO confirm.
    dimensions: Tuple[str, ...]
    # Optional unit / descriptive-name strings (None when absent).
    units: Optional[str]
    long_name: Optional[str]
    # Presumably counts of non-NaN and NaN elements - TODO confirm.
    valid_count: int
    nan_count: int
    # Optional fill value; type is unconstrained (Any).
    fill_value: Optional[Any]


class NetCDFInspector:
    """
    Inspector that prints a structured report for one Cryo-TEMPO NetCDF file.

    The file is opened twice: through xarray (used for the variable,
    statistics and data-quality sections) and through netCDF4 (used for the
    global attributes, dimensions and on-disk format).

    The instance can be used as a context manager so that both handles are
    released even when a report section raises.

    Attributes:
        file_path: Path to the NetCDF file to inspect.
        dataset: xarray Dataset; None until load_dataset() has run and again
            after close().
        nc_dataset: netCDF4 Dataset with the same lifecycle as ``dataset``.

    Example:
        >>> with NetCDFInspector(file_path) as inspector:
        ...     inspector.run_full_inspection()
    """

    SEPARATOR = "=" * 80
    SUB_SEPARATOR = "-" * 60

    def __init__(self, file_path: str) -> None:
        """
        Initialize the NetCDF inspector.

        Only the path is checked here; the file is not opened until
        load_dataset() is called.

        Args:
            file_path: Path to the NetCDF file.

        Raises:
            FileNotFoundError: If the specified file does not exist.
            ValueError: If the path does not end with a ``.nc`` extension.
        """
        self.file_path = file_path
        # Handles are defined before validation so the attributes always exist.
        self.dataset: Optional[xr.Dataset] = None
        self.nc_dataset: Optional[nc.Dataset] = None
        self._validate_file_path()

    def __enter__(self) -> "NetCDFInspector":
        """Return the inspector itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, exc_tb) -> None:
        """Close both dataset handles; exceptions from the body propagate."""
        self.close()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _validate_file_path(self) -> None:
        """Validate that the file exists and has a ``.nc`` extension.

        The extension comparison is case-insensitive so ``FILE.NC`` (common
        on Windows) is accepted as well.
        """
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"File not found: {self.file_path}")
        if not self.file_path.lower().endswith('.nc'):
            raise ValueError(f"Expected NetCDF file (.nc), got: {self.file_path}")
        logger.info("File path validated: %s", self.file_path)

    def _release_handles(self) -> None:
        """Close whichever handles are open and reset both to None.

        The attributes are cleared *before* closing so that a repeated call
        never touches an already-closed handle, and the ``finally`` ensures
        the netCDF4 handle is closed even if closing the xarray one fails.
        """
        xr_handle, nc_handle = self.dataset, self.nc_dataset
        self.dataset = None
        self.nc_dataset = None
        try:
            if xr_handle is not None:
                xr_handle.close()
        finally:
            if nc_handle is not None:
                nc_handle.close()

    def _print_section(self, title: str) -> None:
        """Print a numbered section title framed by two separator lines."""
        print(f"\n{self.SEPARATOR}")
        print(title)
        print(self.SEPARATOR)

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Shorten ``text`` to ``limit`` characters, ending in ``...`` if cut."""
        if len(text) <= limit:
            return text
        return text[:limit - 3] + "..."

    @staticmethod
    def _count_valid_and_nan(var_name: Any, var: Any) -> Tuple[Any, Any]:
        """Return ``(valid_count, nan_count)`` for one variable.

        Only floating-point arrays are scanned for NaN; for every other dtype
        all elements count as valid. If the values cannot be read, both
        entries are the string ``"N/A"`` so the report can still be printed.
        """
        try:
            values = var.values
            if np.issubdtype(values.dtype, np.floating):
                nan_count = int(np.count_nonzero(np.isnan(values)))
                return int(values.size) - nan_count, nan_count
            return int(values.size), 0
        except Exception as e:  # best effort: one bad variable must not abort the table
            logger.debug("Could not count values for %s: %s", var_name, e)
            return "N/A", "N/A"

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------

    def load_dataset(self) -> None:
        """Load the NetCDF dataset using both xarray and netCDF4.

        Any handles from a previous load are closed first, and a partially
        opened pair is released again if either open call fails.

        Raises:
            Exception: Whatever xarray or netCDF4 raised while opening the
                file; it is logged and re-raised unchanged.
        """
        logger.info("Loading NetCDF dataset...")
        self._release_handles()
        try:
            self.dataset = xr.open_dataset(self.file_path)
            self.nc_dataset = nc.Dataset(self.file_path, 'r')
        except Exception as e:
            logger.error("Failed to load dataset: %s", e)
            self._release_handles()
            raise
        logger.info("Dataset loaded successfully.")

    # ------------------------------------------------------------------
    # Report sections
    # ------------------------------------------------------------------

    def display_file_info(self) -> None:
        """Display basic file information (name, path, size, mtime, format).

        The format row shows ``Unknown`` when the netCDF4 handle is not open.
        """
        self._print_section("1. FILE INFORMATION")

        file_stats = os.stat(self.file_path)
        file_size_mb = file_stats.st_size / (1024 * 1024)
        modified = datetime.fromtimestamp(file_stats.st_mtime).strftime('%Y-%m-%d %H:%M:%S')
        nc_format = self.nc_dataset.file_format if self.nc_dataset else "Unknown"

        info_table = [
            ["File Name", os.path.basename(self.file_path)],
            ["File Path", self.file_path],
            ["File Size", f"{file_size_mb:.4f} MB ({file_stats.st_size:,} bytes)"],
            ["Last Modified", modified],
            ["NetCDF Format", nc_format],
        ]

        df_info = pd.DataFrame(info_table, columns=["Property", "Value"])
        print(df_info.to_string(index=False))

    def display_global_attributes(self) -> None:
        """Display all global attributes (metadata) of the NetCDF file.

        Values longer than 80 characters are truncated for display.
        """
        self._print_section("2. GLOBAL ATTRIBUTES (METADATA)")

        if self.nc_dataset is None:
            logger.warning("Dataset not loaded. Call load_dataset() first.")
            return

        global_attrs = self.nc_dataset.ncattrs()
        if not global_attrs:
            print("No global attributes found.")
            return

        attr_data = [
            [attr, self._truncate(str(self.nc_dataset.getncattr(attr)), 80)]
            for attr in global_attrs
        ]

        df_attrs = pd.DataFrame(attr_data, columns=["Attribute", "Value"])
        print(df_attrs.to_string(index=False))
        print(f"\nTotal Global Attributes: {len(global_attrs)}")

    def display_dimensions(self) -> None:
        """Display all dimensions in the NetCDF file with size and kind."""
        self._print_section("3. DIMENSIONS")

        if self.nc_dataset is None:
            logger.warning("Dataset not loaded. Call load_dataset() first.")
            return

        dim_data = [
            [dim_name, len(dim), "Unlimited" if dim.isunlimited() else "Fixed"]
            for dim_name, dim in self.nc_dataset.dimensions.items()
        ]

        df_dims = pd.DataFrame(dim_data, columns=["Dimension", "Size", "Type"])
        print(df_dims.to_string(index=False))
        print(f"\nTotal Dimensions: {len(self.nc_dataset.dimensions)}")

    def display_variables(self) -> None:
        """Display a table of data variables followed by coordinate variables."""
        self._print_section("4. VARIABLES")

        if self.dataset is None:
            logger.warning("Dataset not loaded. Call load_dataset() first.")
            return

        var_data = []
        for var_name in self.dataset.data_vars:
            var = self.dataset[var_name]
            valid_count, nan_count = self._count_valid_and_nan(var_name, var)
            var_data.append([
                var_name,
                str(var.dtype),
                str(var.shape),
                str(var.dims),
                var.attrs.get('units', 'N/A'),
                valid_count,
                nan_count,
            ])

        df_vars = pd.DataFrame(
            var_data,
            columns=["Variable", "DType", "Shape", "Dimensions", "Units", "Valid", "NaN"]
        )
        print(df_vars.to_string(index=False))
        print(f"\nTotal Variables: {len(self.dataset.data_vars)}")

        # Also display coordinate variables
        print(f"\n{self.SUB_SEPARATOR}")
        print("Coordinate Variables:")
        print(self.SUB_SEPARATOR)

        coord_data = []
        for coord_name in self.dataset.coords:
            coord = self.dataset.coords[coord_name]
            coord_data.append([
                coord_name,
                str(coord.dtype),
                str(coord.shape),
                coord.attrs.get('units', 'N/A'),
            ])

        df_coords = pd.DataFrame(
            coord_data,
            columns=["Coordinate", "DType", "Shape", "Units"]
        )
        print(df_coords.to_string(index=False))

    def display_variable_details(self) -> None:
        """Display every attribute of each data variable.

        Attribute values longer than 70 characters are truncated for display.
        """
        self._print_section("5. VARIABLE DETAILED ATTRIBUTES")

        if self.dataset is None:
            logger.warning("Dataset not loaded. Call load_dataset() first.")
            return

        for var_name in self.dataset.data_vars:
            var = self.dataset[var_name]
            print(f"\n{self.SUB_SEPARATOR}")
            print(f"Variable: {var_name}")
            print(self.SUB_SEPARATOR)

            if not var.attrs:
                print("  No attributes found.")
                continue
            for attr_name, attr_value in var.attrs.items():
                print(f"  {attr_name}: {self._truncate(str(attr_value), 70)}")

    def display_statistics(self) -> None:
        """Display min/max/mean/median/std for numeric data variables.

        NaN entries of floating-point variables are excluded. Variables with
        no remaining values, or whose statistics cannot be computed or
        formatted, are left out of the table (the latter is logged at DEBUG).
        """
        self._print_section("6. SUMMARY STATISTICS (Numeric Variables)")

        if self.dataset is None:
            logger.warning("Dataset not loaded. Call load_dataset() first.")
            return

        stats_data = []
        for var_name in self.dataset.data_vars:
            var = self.dataset[var_name]

            # Only process numeric types
            if not np.issubdtype(var.dtype, np.number):
                continue

            try:
                valid_values = var.values.ravel()
                if np.issubdtype(valid_values.dtype, np.floating):
                    valid_values = valid_values[~np.isnan(valid_values)]
                if len(valid_values) == 0:
                    continue

                stats_data.append([
                    var_name,
                    f"{np.min(valid_values):.6g}",
                    f"{np.max(valid_values):.6g}",
                    f"{np.mean(valid_values):.6g}",
                    f"{np.median(valid_values):.6g}",
                    f"{np.std(valid_values):.6g}",
                    len(valid_values),
                ])
            except Exception as e:  # best effort per variable
                logger.debug("Could not compute stats for %s: %s", var_name, e)

        if not stats_data:
            print("No numeric variables found for statistics.")
            return

        df_stats = pd.DataFrame(
            stats_data,
            columns=["Variable", "Min", "Max", "Mean", "Median", "Std Dev", "Count"]
        )
        print(df_stats.to_string(index=False))

    def display_data_quality_summary(self) -> None:
        """Display missing-value counts for floating-point data variables.

        A warning line is emitted for every variable whose NaN share exceeds
        50%. Non-floating variables are not listed. Variables that cannot be
        read are skipped and reported at DEBUG level.
        """
        self._print_section("7. DATA QUALITY SUMMARY")

        if self.dataset is None:
            logger.warning("Dataset not loaded. Call load_dataset() first.")
            return

        issues = []
        quality_data = []

        for var_name in self.dataset.data_vars:
            var = self.dataset[var_name]
            try:
                values = var.values.ravel()
                if not np.issubdtype(values.dtype, np.floating):
                    continue

                total = len(values)
                nan_count = int(np.count_nonzero(np.isnan(values)))
                nan_pct = (nan_count / total) * 100 if total > 0 else 0
            except Exception as e:
                # Previously swallowed silently; keep going but leave a trace.
                logger.debug("Could not assess data quality for %s: %s", var_name, e)
                continue

            quality_data.append([
                var_name,
                total,
                total - nan_count,
                nan_count,
                f"{nan_pct:.2f}%",
            ])
            if nan_pct > 50:
                issues.append(f"WARNING: {var_name} has {nan_pct:.1f}% missing values")

        if quality_data:
            df_quality = pd.DataFrame(
                quality_data,
                columns=["Variable", "Total", "Valid", "Missing", "Missing %"]
            )
            print(df_quality.to_string(index=False))

        if issues:
            print(f"\n{self.SUB_SEPARATOR}")
            print("Data Quality Issues Detected:")
            print(self.SUB_SEPARATOR)
            for issue in issues:
                print(f"  ⚠ {issue}")
        else:
            print("\n✓ No significant data quality issues detected.")

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def run_full_inspection(self) -> None:
        """Execute the complete inspection workflow.

        Loads the dataset and prints sections 1-7 in order. The handles stay
        open afterwards; call close() (or use the inspector in a ``with``
        block) to release them.
        """
        logger.info("Starting full NetCDF inspection...")
        start_time = datetime.now()

        print("\n" + "=" * 80)
        print("  CRYO-TEMPO NETCDF DATASET INSPECTION REPORT")
        print("  Generated: " + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print("=" * 80)

        self.load_dataset()
        self.display_file_info()
        self.display_global_attributes()
        self.display_dimensions()
        self.display_variables()
        self.display_variable_details()
        self.display_statistics()
        self.display_data_quality_summary()

        elapsed = (datetime.now() - start_time).total_seconds()
        print(f"\n{self.SEPARATOR}")
        print(f"Inspection completed in {elapsed:.2f} seconds.")
        print(self.SEPARATOR)

        logger.info("Inspection completed in %.2f seconds.", elapsed)

    def close(self) -> None:
        """Close all open dataset handles.

        Safe to call repeatedly: the handles are reset to None, so a second
        call does not try to close them again.
        """
        self._release_handles()
        logger.info("Dataset handles closed.")


# =============================================================================
# MAIN EXECUTION
# =============================================================================

if __name__ == "__main__" or True:  # Always execute in notebook context

    # Define file path for the first Cryo-TEMPO observation in July 2010
    FILE_PATH = r"D:\phd\data\CryoTEMPO\2010\07\CS_OFFL_SIR_TDP_SI_ANTARC_20100716T000456_20100716T000635_02_03041_C001.nc"

    # Reset the name up front: if the constructor raises, ``inspector`` must
    # not refer to an instance left over from an earlier run of this cell
    # (the previous ``'inspector' in dir()`` check would have re-closed it).
    inspector = None

    # Create inspector instance and run full inspection
    try:
        inspector = NetCDFInspector(FILE_PATH)
        inspector.run_full_inspection()
    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")
        print(f"\n❌ ERROR: {e}")
        print("Please verify the file path and ensure the file exists.")
    except Exception as e:
        # Notebook boundary: report the failure instead of raising a traceback.
        logger.error(f"Unexpected error: {e}")
        print(f"\n❌ ERROR: {e}")
    finally:
        # Ensure resources are cleaned up, but only for an inspector that
        # was actually created by this run.
        if inspector is not None:
            inspector.close()

2025-12-07 20:27:17 - INFO - __main__ - File path validated: D:\phd\data\CryoTEMPO\2010\07\CS_OFFL_SIR_TDP_SI_ANTARC_20100716T000456_20100716T000635_02_03041_C001.nc
2025-12-07 20:27:17 - INFO - __main__ - Starting full NetCDF inspection...
2025-12-07 20:27:17 - INFO - __main__ - Loading NetCDF dataset...



  CRYO-TEMPO NETCDF DATASET INSPECTION REPORT
  Generated: 2025-12-07 20:27:17


2025-12-07 20:27:18 - INFO - __main__ - Dataset loaded successfully.
2025-12-07 20:27:18 - INFO - __main__ - Inspection completed in 0.52 seconds.
2025-12-07 20:27:18 - INFO - __main__ - Dataset handles closed.



1. FILE INFORMATION
     Property                                                                                                    Value
    File Name                               CS_OFFL_SIR_TDP_SI_ANTARC_20100716T000456_20100716T000635_02_03041_C001.nc
    File Path D:\phd\data\CryoTEMPO\2010\07\CS_OFFL_SIR_TDP_SI_ANTARC_20100716T000456_20100716T000635_02_03041_C001.nc
    File Size                                                                                0.0979 MB (102,706 bytes)
Last Modified                                                                                      2024-07-13 20:34:24
NetCDF Format                                                                                                  NETCDF4

2. GLOBAL ATTRIBUTES (METADATA)
              Attribute                                                                   Value
   Metadata_Conventions                                          Unidata Dataset Discovery v1.0
                  title                 

# Cryo-TEMPO Variable Validation and Extraction Pipeline

## Overview
This pipeline validates the existence of critical sea ice freeboard variables across the **entire Cryo-TEMPO dataset archive** (July 2010 - May 2024) and consolidates them into a single NetCDF file for downstream analysis.

### Target Variables
| Variable | Description | Unit |
|----------|-------------|------|
| `radar_freeboard` | Radar freeboard height | m |
| `radar_freeboard_uncertainty` | Uncertainty of radar freeboard | m |
| `sea_ice_freeboard` | Sea ice freeboard height | m |
| `sea_ice_freeboard_uncertainty` | Uncertainty of sea ice freeboard | m |
| `snow_depth` | Snow depth on sea ice | m |
| `snow_depth_uncertainty` | Uncertainty of snow depth | m |

### Data Source
- **Directory**: `D:\phd\data\CryoTEMPO`
- **Time Period**: July 2010 to May 2024
- **Structure**: `{base_dir}/{year}/{month}/{files}.nc`

### Pipeline Stages
1. **Discovery**: Scan directory structure to identify all NetCDF files
2. **Validation**: Verify target variables exist in all files
3. **Extraction**: Extract and consolidate variables from all files
4. **Export**: Save consolidated dataset with CF-compliant metadata

In [2]:
"""
Cryo-TEMPO Variable Validation and Extraction Pipeline

This module provides enterprise-grade data validation and extraction capabilities
for Cryo-TEMPO sea ice freeboard products, following Google Python Style Guide
and Amazon engineering best practices.

Author: Xinlong Liu
Date: December 2025
Version: 1.0.0
"""

import os
import sys
import glob
import logging
import warnings
from typing import Dict, List, Tuple, Optional, Set, Any
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import traceback

import numpy as np
import pandas as pd
import xarray as xr
import netCDF4 as nc

# Suppress warnings for cleaner output
# NOTE(review): this silences *every* warning category process-wide,
# including deprecation notices from xarray/netCDF4.
warnings.filterwarnings('ignore')

# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================

# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it this call is silently ignored whenever logging has been
# configured earlier in the same kernel, and the pipe-separated format below
# never takes effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)-8s | %(name)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True,
)
# Named logger shared by all pipeline stages.
logger = logging.getLogger("CryoTEMPO.Pipeline")


# =============================================================================
# CONFIGURATION CONSTANTS
# =============================================================================

@dataclass(frozen=True)
class PipelineConfig:
    """Immutable configuration for the Cryo-TEMPO extraction pipeline.

    Declared with ``frozen=True``, so assigning to a field of an instance
    raises ``dataclasses.FrozenInstanceError``. Every field has a default,
    which means ``PipelineConfig()`` can be created without arguments.
    """
    
    # Base directory containing all Cryo-TEMPO data
    # (files are looked up under ``BASE_DIR/<year>/<two-digit month>/``)
    BASE_DIR: str = r"D:\phd\data\CryoTEMPO"
    
    # Output directory for consolidated dataset
    OUTPUT_DIR: str = r"D:\phd\data\CryoTEMPO"
    
    # Time period boundaries (both the start and the end month are included
    # when the list of periods to scan is generated)
    START_YEAR: int = 2010
    START_MONTH: int = 7  # July
    END_YEAR: int = 2024
    END_MONTH: int = 5    # May
    
    # Target variables to validate and extract
    TARGET_VARIABLES: Tuple[str, ...] = (
        "radar_freeboard",
        "radar_freeboard_uncertainty",
        "sea_ice_freeboard",
        "sea_ice_freeboard_uncertainty",
        "snow_depth",
        "snow_depth_uncertainty",
    )
    
    # Additional variables to extract for context
    # (validation treats these as required alongside TARGET_VARIABLES)
    CONTEXT_VARIABLES: Tuple[str, ...] = (
        "time",
        "latitude",
        "longitude",
    )
    
    # File pattern for Cryo-TEMPO files
    # (glob pattern applied inside each month directory)
    FILE_PATTERN: str = "CS_OFFL_SIR_TDP_SI_ANTARC_*.nc"
    
    # Maximum workers for parallel processing
    # NOTE(review): presumably the thread-pool size; its use is not visible
    # in this part of the notebook - confirm.
    MAX_WORKERS: int = 8
    
    # Chunk size for memory-efficient processing
    # NOTE(review): unit (files vs. records) is not visible here - confirm.
    CHUNK_SIZE: int = 100

# Initialize configuration
# Module-level instance built entirely from the declared field defaults.
CONFIG = PipelineConfig()


# =============================================================================
# DATA CLASSES FOR STRUCTURED RESULTS
# =============================================================================

@dataclass
class FileValidationResult:
    """Result of validating a single NetCDF file.

    Only ``file_path`` and ``is_valid`` are required; the remaining fields
    default to "nothing missing, no error, zero records".
    """
    # Path of the file that was checked.
    file_path: str
    # True when no required variable is missing; False when variables are
    # missing or the file could not be opened.
    is_valid: bool
    # Names of required variables that were not found (fresh list per instance).
    missing_variables: List[str] = field(default_factory=list)
    # Text of the exception raised while opening/reading, if any.
    error_message: Optional[str] = None
    # Number of records along the ``time`` dimension; stays 0 on failure.
    record_count: int = 0


@dataclass
class MonthlyValidationSummary:
    """Summary of validation results for a single month.

    NOTE(review): the code that fills this class lies outside the visible
    part of the notebook; the field notes follow names and types only.
    """
    # Calendar year and month the summary refers to.
    year: int
    month: int
    # File counts; presumably total_files == valid_files + invalid_files
    # - TODO confirm against the producer.
    total_files: int
    valid_files: int
    invalid_files: int
    # Mapping from variable name to a count (presumably the number of files
    # missing that variable); a fresh dict is created per instance.
    missing_variable_counts: Dict[str, int] = field(default_factory=dict)
    # Error strings collected for the month; a fresh list per instance.
    errors: List[str] = field(default_factory=list)


@dataclass
class PipelineReport:
    """Comprehensive report of the entire pipeline execution.

    Only ``start_time`` is required; all counters start at zero and
    ``success`` starts as False.

    NOTE(review): the code that updates this report lies outside the visible
    part of the notebook; the field notes follow names and types only.
    """
    # Start timestamp (required) and end timestamp (None until set).
    start_time: datetime
    end_time: Optional[datetime] = None
    # Aggregate counters over the whole run.
    total_files_scanned: int = 0
    total_files_valid: int = 0
    total_files_invalid: int = 0
    total_records_extracted: int = 0
    # One entry per month; a fresh list is created per instance.
    monthly_summaries: List[MonthlyValidationSummary] = field(default_factory=list)
    # Path of the consolidated output file, if one is recorded.
    output_file_path: Optional[str] = None
    # Overall outcome flag and optional failure description.
    success: bool = False
    error_message: Optional[str] = None


# Echo the active configuration once so the notebook output documents which
# settings the following stages ran with. A single print call with a newline
# separator emits exactly the same lines as one print per row.
print(
    "=" * 80,
    "  CRYO-TEMPO VARIABLE VALIDATION AND EXTRACTION PIPELINE",
    "  Configuration Loaded Successfully",
    "=" * 80,
    f"\n  Base Directory    : {CONFIG.BASE_DIR}",
    f"  Time Period       : {CONFIG.START_YEAR}/{CONFIG.START_MONTH:02d} - {CONFIG.END_YEAR}/{CONFIG.END_MONTH:02d}",
    f"  Target Variables  : {len(CONFIG.TARGET_VARIABLES)}",
    f"  Max Workers       : {CONFIG.MAX_WORKERS}",
    "=" * 80,
    sep="\n",
)

  CRYO-TEMPO VARIABLE VALIDATION AND EXTRACTION PIPELINE
  Configuration Loaded Successfully

  Base Directory    : D:\phd\data\CryoTEMPO
  Time Period       : 2010/07 - 2024/05
  Target Variables  : 6
  Max Workers       : 8


## Stage 1: File Discovery

This stage scans the directory structure to identify all NetCDF files within the specified time period. The discovery process:
- Iterates through each year-month combination
- Validates directory existence
- Counts files per month for initial assessment

In [3]:
"""
Stage 1: File Discovery
Scan the directory structure to identify all Cryo-TEMPO NetCDF files.
"""


class FileDiscoveryEngine:
    """
    Engine for discovering Cryo-TEMPO NetCDF files across the archive.

    Files are expected under ``<BASE_DIR>/<year>/<two-digit month>/`` and are
    matched against the configured glob pattern. Results are kept in
    ``file_inventory``, keyed by ``(year, month)``.
    """

    def __init__(self, config: PipelineConfig) -> None:
        """
        Initialize the file discovery engine.

        Args:
            config: Pipeline configuration object. The engine reads BASE_DIR,
                FILE_PATTERN and the START_/END_ year and month fields.
        """
        self.config = config
        self.file_inventory: Dict[Tuple[int, int], List[str]] = {}

    def generate_time_periods(self) -> List[Tuple[int, int]]:
        """
        Generate list of (year, month) tuples for the target time period.

        The first year starts at START_MONTH, the last year stops at
        END_MONTH (inclusive); every year in between covers months 1-12.

        Returns:
            List of (year, month) tuples from start to end period, in
            chronological order.
        """
        cfg = self.config
        periods: List[Tuple[int, int]] = []

        for year in range(cfg.START_YEAR, cfg.END_YEAR + 1):
            first_month = cfg.START_MONTH if year == cfg.START_YEAR else 1
            last_month = cfg.END_MONTH if year == cfg.END_YEAR else 12
            periods.extend((year, month) for month in range(first_month, last_month + 1))

        logger.info("Generated %d time periods to scan.", len(periods))
        return periods

    def discover_files_for_period(self, year: int, month: int) -> List[str]:
        """
        Discover all NetCDF files for a specific year-month.

        Args:
            year: Target year.
            month: Target month (1-12).

        Returns:
            Sorted list of matching file paths; empty (with a warning logged)
            when the month directory does not exist.
        """
        month_dir = os.path.join(self.config.BASE_DIR, str(year), f"{month:02d}")

        # isdir rather than exists: a plain file with that name is not a
        # usable month directory either.
        if not os.path.isdir(month_dir):
            logger.warning("Directory not found: %s", month_dir)
            return []

        # Escape the directory part so glob metacharacters in the archive
        # path (e.g. "[" or "*") are matched literally; only FILE_PATTERN is
        # meant to act as a pattern.
        pattern = os.path.join(glob.escape(month_dir), self.config.FILE_PATTERN)
        return sorted(glob.glob(pattern))

    def run_discovery(self) -> Dict[Tuple[int, int], List[str]]:
        """
        Execute the full file discovery process.

        Every generated period gets an entry in ``file_inventory``, including
        periods for which no files were found (empty list).

        Returns:
            Dictionary mapping (year, month) to list of file paths.
        """
        logger.info("Starting file discovery process...")
        start_time = datetime.now()

        total_files = 0
        for year, month in self.generate_time_periods():
            files = self.discover_files_for_period(year, month)
            self.file_inventory[(year, month)] = files
            total_files += len(files)

        elapsed = (datetime.now() - start_time).total_seconds()
        logger.info("Discovery completed: %d files found in %.2fs", total_files, elapsed)

        return self.file_inventory

    def print_discovery_summary(self) -> None:
        """Print the per-year file counts and the grand total."""
        print("\n" + "=" * 80)
        print("  FILE DISCOVERY SUMMARY")
        print("=" * 80)

        # Single pass over the inventory: aggregate month entries per year.
        yearly_counts: Dict[int, int] = defaultdict(int)
        for (year, _month), files in self.file_inventory.items():
            yearly_counts[year] += len(files)
        total_files = sum(yearly_counts.values())

        print("\nFiles per Year:")
        print("-" * 40)
        for year in sorted(yearly_counts):
            print(f"  {year}: {yearly_counts[year]:,} files")

        print(f"\n{'=' * 40}")
        print(f"  TOTAL FILES: {total_files:,}")
        print("=" * 40)


# Execute file discovery
# Builds the engine from the shared CONFIG, scans every configured
# (year, month) directory and keeps the resulting mapping in the module-level
# name ``file_inventory`` before printing the per-year summary.
discovery_engine = FileDiscoveryEngine(CONFIG)
file_inventory = discovery_engine.run_discovery()
discovery_engine.print_discovery_summary()

2025-12-07 20:51:52 - INFO - CryoTEMPO.Pipeline - Starting file discovery process...
2025-12-07 20:51:52 - INFO - CryoTEMPO.Pipeline - Generated 167 time periods to scan.
2025-12-07 20:54:25 - INFO - CryoTEMPO.Pipeline - Discovery completed: 179876 files found in 153.02s



  FILE DISCOVERY SUMMARY

Files per Year:
----------------------------------------
  2010: 5,312 files
  2011: 13,957 files
  2012: 13,388 files
  2013: 12,664 files
  2014: 12,964 files
  2015: 12,787 files
  2016: 12,817 files
  2017: 12,682 files
  2018: 12,791 files
  2019: 12,862 files
  2020: 13,089 files
  2021: 12,870 files
  2022: 13,005 files
  2023: 13,125 files
  2024: 5,563 files

  TOTAL FILES: 179,876


## Stage 2: Variable Validation

This stage validates that all target variables, together with the context variables (`time`, `latitude`, `longitude`), exist in every NetCDF file. The validation process:
- Opens each file and checks for required variables
- Tracks missing variables per file
- Generates comprehensive validation reports
- Identifies any data quality issues

In [4]:
"""
Stage 2: Variable Validation
Validate that all target variables exist in every NetCDF file.
"""


class VariableValidator:
    """
    Validator for checking variable existence across NetCDF files.

    Each file is opened with xarray and the names of its data variables and
    coordinates are compared against ``config.TARGET_VARIABLES`` plus
    ``config.CONTEXT_VARIABLES``. Every per-file result is appended to
    ``validation_results``; monthly aggregates are returned to the caller.
    """

    def __init__(self, config: PipelineConfig) -> None:
        """
        Initialize the variable validator.

        Args:
            config: Pipeline configuration object. ``TARGET_VARIABLES`` and
                ``CONTEXT_VARIABLES`` are read from it.
        """
        self.config = config
        # One entry per validated file, in processing order.
        self.validation_results: List[FileValidationResult] = []

    @staticmethod
    def _count_records(ds) -> int:
        """Return the length of the 'time' dimension (or coordinate), else 0."""
        # Dataset.sizes is the supported name -> length mapping; using
        # Dataset.dims as a mapping is deprecated in xarray.
        if 'time' in ds.sizes:
            return int(ds.sizes['time'])
        # Fallback kept from the original logic: 'time' exists only as a
        # coordinate that is not itself a dimension.
        if 'time' in ds.coords:
            return len(ds['time'])
        return 0

    def validate_single_file(self, file_path: str) -> FileValidationResult:
        """
        Validate a single NetCDF file for required variables.

        Args:
            file_path: Path to the NetCDF file.

        Returns:
            FileValidationResult with validation outcome. If the file cannot
            be opened or inspected, the result is marked invalid and carries
            the exception text in ``error_message`` instead of raising.
        """
        try:
            with xr.open_dataset(file_path) as ds:
                available_vars = set(ds.data_vars) | set(ds.coords)
                required_vars = (
                    set(self.config.TARGET_VARIABLES)
                    | set(self.config.CONTEXT_VARIABLES)
                )

                # Sorted so that the reported order is reproducible between runs.
                missing = sorted(required_vars - available_vars)
                record_count = self._count_records(ds)

                return FileValidationResult(
                    file_path=file_path,
                    is_valid=not missing,
                    missing_variables=missing,
                    record_count=record_count
                )

        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort a
            # scan over the whole archive; the failure is recorded instead.
            return FileValidationResult(
                file_path=file_path,
                is_valid=False,
                error_message=str(e)
            )

    def validate_monthly_files(
        self,
        year: int,
        month: int,
        files: List[str]
    ) -> MonthlyValidationSummary:
        """
        Validate all files for a specific month.

        Args:
            year: Target year.
            month: Target month.
            files: List of file paths to validate.

        Returns:
            MonthlyValidationSummary with aggregated results.
        """
        valid_count = 0
        invalid_count = 0
        missing_var_counts: Dict[str, int] = defaultdict(int)
        errors: List[str] = []

        for file_path in files:
            result = self.validate_single_file(file_path)
            self.validation_results.append(result)

            if result.is_valid:
                valid_count += 1
                continue

            invalid_count += 1
            for var in result.missing_variables:
                missing_var_counts[var] += 1
            if result.error_message:
                errors.append(f"{os.path.basename(file_path)}: {result.error_message}")

        return MonthlyValidationSummary(
            year=year,
            month=month,
            total_files=len(files),
            valid_files=valid_count,
            invalid_files=invalid_count,
            missing_variable_counts=dict(missing_var_counts),
            errors=errors
        )

    def run_validation(
        self,
        file_inventory: Dict[Tuple[int, int], List[str]]
    ) -> List[MonthlyValidationSummary]:
        """
        Execute validation across all discovered files.

        Months with no files are skipped. A progress line is logged after the
        last non-empty month of each year.

        Args:
            file_inventory: Dictionary mapping (year, month) to file lists.

        Returns:
            List of MonthlyValidationSummary objects, in chronological order.
        """
        logger.info("Starting variable validation process...")
        start_time = datetime.now()

        # Drop empty months up front so the "last month of the year" test
        # below only looks at months that are actually processed.
        pending = [
            (period, files)
            for period, files in sorted(file_inventory.items())
            if files
        ]
        total_files = sum(len(files) for _, files in pending)

        summaries = []
        processed = 0

        for index, ((year, month), files) in enumerate(pending):
            summaries.append(self.validate_monthly_files(year, month, files))
            processed += len(files)

            # Progress update once per year: emitted when the next processed
            # month belongs to another year, or when nothing is left. This
            # still fires when December (or the final month) has no files.
            is_last_period_of_year = (
                index + 1 == len(pending)
                or pending[index + 1][0][0] != year
            )
            if is_last_period_of_year:
                logger.info(f"Validated {year}: {processed}/{total_files} files processed")

        elapsed = (datetime.now() - start_time).total_seconds()
        logger.info(f"Validation completed in {elapsed:.2f}s")

        return summaries

    def print_validation_report(self, summaries: List[MonthlyValidationSummary]) -> bool:
        """
        Print comprehensive validation report.

        Args:
            summaries: List of monthly validation summaries.

        Returns:
            True if no file is invalid, False otherwise. With zero scanned
            files this is vacuously True; the percentages are then printed
            as 0.00% instead of dividing by zero.
        """
        print("\n" + "=" * 80)
        print("  VARIABLE VALIDATION REPORT")
        print("=" * 80)

        total_files = sum(s.total_files for s in summaries)
        total_valid = sum(s.valid_files for s in summaries)
        total_invalid = sum(s.invalid_files for s in summaries)

        def percent_of_total(count: int) -> float:
            # Guard against an empty scan (no summaries or only empty months).
            return 100 * count / total_files if total_files else 0.0

        print(f"\nTarget Variables: {', '.join(self.config.TARGET_VARIABLES)}")
        print(f"\n{'=' * 60}")
        print(f"  Total Files Scanned : {total_files:,}")
        print(f"  Valid Files         : {total_valid:,} ({percent_of_total(total_valid):.2f}%)")
        print(f"  Invalid Files       : {total_invalid:,} ({percent_of_total(total_invalid):.2f}%)")
        print(f"{'=' * 60}")

        # Check for any missing variables
        all_missing = defaultdict(int)
        for s in summaries:
            for var, count in s.missing_variable_counts.items():
                all_missing[var] += count

        if all_missing:
            print("\n⚠ Missing Variables Detected:")
            print("-" * 40)
            # Most frequently missing variables first.
            for var, count in sorted(all_missing.items(), key=lambda x: -x[1]):
                print(f"  {var}: missing in {count} files")

        # Collect all errors
        all_errors = [error for s in summaries for error in s.errors]

        if all_errors:
            print(f"\n⚠ Errors Detected ({len(all_errors)} total):")
            print("-" * 40)
            for error in all_errors[:10]:  # Show first 10
                print(f"  {error}")
            if len(all_errors) > 10:
                print(f"  ... and {len(all_errors) - 10} more errors")

        all_valid = (total_invalid == 0)

        print("\n" + "=" * 60)
        if all_valid:
            print("  ✓ ALL FILES VALIDATED SUCCESSFULLY")
            print("  All target variables exist in all datasets.")
        else:
            print("  ⚠ VALIDATION COMPLETED WITH ISSUES")
        print("=" * 60)

        return all_valid


# Execute validation
# `validator` stays at module level: its accumulated `validation_results`
# are read again by the corrupted-file analysis stage.
validator = VariableValidator(CONFIG)
# One summary per non-empty (year, month) entry of the inventory.
validation_summaries = validator.run_validation(file_inventory)
# Prints the report and returns True only when no file was flagged invalid.
all_valid = validator.print_validation_report(validation_summaries)

2025-12-07 20:56:03 - INFO - CryoTEMPO.Pipeline - Starting variable validation process...
2025-12-07 20:58:24 - INFO - CryoTEMPO.Pipeline - Validated 2010: 5312/179876 files processed
2025-12-07 21:08:00 - INFO - CryoTEMPO.Pipeline - Validated 2011: 19269/179876 files processed
2025-12-07 21:17:26 - INFO - CryoTEMPO.Pipeline - Validated 2012: 32657/179876 files processed
2025-12-07 21:23:25 - INFO - CryoTEMPO.Pipeline - Validated 2013: 45321/179876 files processed
2025-12-07 21:30:58 - INFO - CryoTEMPO.Pipeline - Validated 2014: 58285/179876 files processed
2025-12-07 21:40:49 - INFO - CryoTEMPO.Pipeline - Validated 2015: 71072/179876 files processed
2025-12-07 21:52:47 - INFO - CryoTEMPO.Pipeline - Validated 2016: 83889/179876 files processed
2025-12-07 22:01:33 - INFO - CryoTEMPO.Pipeline - Validated 2017: 96571/179876 files processed
2025-12-07 22:08:26 - INFO - CryoTEMPO.Pipeline - Validated 2018: 109362/179876 files processed
2025-12-07 22:15:26 - INFO - CryoTEMPO.Pipeline - Valid


  VARIABLE VALIDATION REPORT

Target Variables: radar_freeboard, radar_freeboard_uncertainty, sea_ice_freeboard, sea_ice_freeboard_uncertainty, snow_depth, snow_depth_uncertainty

  Total Files Scanned : 179,876
  Valid Files         : 179,872 (100.00%)
  Invalid Files       : 4 (0.00%)

⚠ Errors Detected (4 total):
----------------------------------------
  CS_OFFL_SIR_TDP_SI_ANTARC_20101128T231145_20101128T231416_02_05014_C001.nc: did not find a match in any of xarray's currently installed IO backends ['netcdf4', 'scipy']. Consider explicitly selecting one of the installed engines via the ``engine`` parameter, or installing additional IO dependencies, see:
https://docs.xarray.dev/en/stable/getting-started-guide/installing.html
https://docs.xarray.dev/en/stable/user-guide/io.html
  CS_OFFL_SIR_TDP_SI_ANTARC_20101221T084100_20101221T084146_02_05339_C001.nc: did not find a match in any of xarray's currently installed IO backends ['netcdf4', 'scipy']. Consider explicitly selecting one o

## Stage 2.1: Corrupted File Analysis

The validation identified **4 files** (0.002% of total) that cannot be read by xarray's NetCDF backends. This typically indicates:
- Corrupted files during download/transfer
- Incomplete write operations
- File system errors

### Action Plan
1. Investigate the corrupted files using low-level diagnostics
2. Log these files for exclusion from extraction
3. Proceed with valid files only (179,872 files)

### Affected Files
| File | Year | Month | Orbit |
|------|------|-------|-------|
| `CS_OFFL_SIR_TDP_SI_ANTARC_20101128T231145_20101128T231416_02_05014_C001.nc` | 2010 | 11 | 05014 |
| `CS_OFFL_SIR_TDP_SI_ANTARC_20101221T084100_20101221T084146_02_05339_C001.nc` | 2010 | 12 | 05339 |
| `CS_OFFL_SIR_TDP_SI_ANTARC_20110226T143225_20110226T143344_03_00971_C001.nc` | 2011 | 02 | 00971 |
| `CS_OFFL_SIR_TDP_SI_ANTARC_20110226T143448_20110226T143510_03_00971_C001.nc` | 2011 | 02 | 00971 |

In [8]:
"""
Stage 2.1: Corrupted File Investigation
Diagnose and document corrupted NetCDF files for exclusion.
"""

import struct
from typing import NamedTuple


class CorruptedFileInfo(NamedTuple):
    """Diagnostic record for a file that failed validation with a read error."""
    # Path exactly as it was handed to the analyzer.
    file_path: str
    # Base name of `file_path`.
    file_name: str
    # Size reported by the file system; -1 when the size could not be read.
    file_size_bytes: int
    # True for a zero-byte file; also set to True when the file was inaccessible.
    is_empty: bool
    # True when the leading bytes match a NetCDF-3 or NetCDF-4/HDF5 signature.
    has_nc_signature: bool
    # Diagnosis text: the signature-check outcome (detected format or failure
    # reason), or "Access error: ..." when the file could not be inspected.
    error_type: str


class CorruptedFileAnalyzer:
    """
    Analyzer for diagnosing corrupted NetCDF files.

    This class provides low-level file inspection (size and leading magic
    bytes) to understand why certain files cannot be read by standard
    NetCDF libraries.
    """

    # NetCDF-4/HDF5 magic number
    NC4_SIGNATURE = b'\x89HDF\r\n\x1a\n'
    # NetCDF-3 (classic) magic number
    NC3_SIGNATURE = b'CDF'
    # Version byte that must follow 'CDF' in a classic-format file:
    # 1 = classic, 2 = 64-bit offset, 5 = 64-bit data (CDF-5).
    NC3_VERSION_BYTES = (1, 2, 5)

    def __init__(self) -> None:
        """Initialize the analyzer."""
        self.corrupted_files: List[CorruptedFileInfo] = []

    def check_file_signature(self, file_path: str) -> Tuple[bool, str]:
        """
        Check if file has valid NetCDF signature.

        Args:
            file_path: Path to the file.

        Returns:
            Tuple of (is_valid, format_detected). When the signature is not
            recognised, the second element describes the failure instead
            (empty/truncated, unknown header bytes, or read error).
        """
        try:
            with open(file_path, 'rb') as f:
                header = f.read(len(self.NC4_SIGNATURE))
        except Exception as e:
            # Deliberate best-effort: diagnostics must never abort the run.
            return False, f"Read error: {str(e)}"

        # The shortest recognisable magic is 'CDF' plus one version byte.
        if len(header) < len(self.NC3_SIGNATURE) + 1:
            return False, "Empty or truncated"

        # Require a known version byte so arbitrary data that merely starts
        # with the letters 'CDF' is not mistaken for a NetCDF-3 file.
        if (
            header.startswith(self.NC3_SIGNATURE)
            and header[len(self.NC3_SIGNATURE)] in self.NC3_VERSION_BYTES
        ):
            return True, "NetCDF-3 (Classic)"

        if header == self.NC4_SIGNATURE:
            return True, "NetCDF-4 (HDF5)"

        return False, f"Unknown format (header: {header.hex()})"

    def analyze_file(self, file_path: str) -> CorruptedFileInfo:
        """
        Perform comprehensive analysis of a potentially corrupted file.

        Args:
            file_path: Path to the file.

        Returns:
            CorruptedFileInfo with diagnostic information. If the file cannot
            be accessed, the size is reported as -1 and ``error_type`` holds
            the access error text.
        """
        file_name = os.path.basename(file_path)

        try:
            file_size = os.path.getsize(file_path)
            has_signature, diagnosis = self.check_file_signature(file_path)
        except Exception as e:
            return CorruptedFileInfo(
                file_path=file_path,
                file_name=file_name,
                file_size_bytes=-1,
                is_empty=True,
                has_nc_signature=False,
                error_type=f"Access error: {str(e)}"
            )

        return CorruptedFileInfo(
            file_path=file_path,
            file_name=file_name,
            file_size_bytes=file_size,
            is_empty=(file_size == 0),
            has_nc_signature=has_signature,
            error_type=diagnosis
        )

    def analyze_corrupted_files(
        self,
        validation_results: List[FileValidationResult]
    ) -> List[CorruptedFileInfo]:
        """
        Analyze all files that failed validation.

        Only results that are invalid *and* carry an error message are
        analyzed; files that were readable but lacked variables are not.

        Args:
            validation_results: List of validation results.

        Returns:
            List of CorruptedFileInfo for failed files. The same list replaces
            any previous content of ``self.corrupted_files``.
        """
        self.corrupted_files = [
            self.analyze_file(result.file_path)
            for result in validation_results
            if not result.is_valid and result.error_message is not None
        ]
        return self.corrupted_files

    def print_analysis_report(self) -> None:
        """Print detailed analysis report for corrupted files."""
        print("\n" + "=" * 80)
        print("  CORRUPTED FILE ANALYSIS REPORT")
        print("=" * 80)

        if not self.corrupted_files:
            print("\n  ✓ No corrupted files detected.")
            return

        print(f"\n  Total Corrupted Files: {len(self.corrupted_files)}")
        print("-" * 80)

        for i, info in enumerate(self.corrupted_files, 1):
            # A negative size is the marker for "size could not be read";
            # show that explicitly rather than a meaningless "-1 bytes".
            if info.file_size_bytes >= 0:
                size_text = f"{info.file_size_bytes:,} bytes"
            else:
                size_text = "unknown (file could not be accessed)"

            print(f"\n  [{i}] {info.file_name}")
            print(f"      Path: {info.file_path}")
            print(f"      Size: {size_text}")
            print(f"      Empty: {info.is_empty}")
            print(f"      Valid NC Signature: {info.has_nc_signature}")
            print(f"      Diagnosis: {info.error_type}")

        print("\n" + "-" * 80)
        print("  RECOMMENDATION: Exclude these files from extraction.")
        print("  These files should be re-downloaded from the source if needed.")
        print("=" * 80)

    def get_exclusion_list(self) -> List[str]:
        """
        Get list of file paths to exclude from extraction.

        Returns:
            List of file paths to exclude, exactly as they were recorded
            during analysis.
        """
        return [info.file_path for info in self.corrupted_files]


# Analyze corrupted files
analyzer = CorruptedFileAnalyzer()
# Only validation results that are invalid and carry an error message are
# inspected; results without an error message are ignored here.
corrupted_files_info = analyzer.analyze_corrupted_files(validator.validation_results)
analyzer.print_analysis_report()

# Store exclusion list for extraction stage
# The list holds the paths of the files analyzed above.
EXCLUSION_LIST = analyzer.get_exclusion_list()
print(f"\n  Files to exclude: {len(EXCLUSION_LIST)}")


  CORRUPTED FILE ANALYSIS REPORT

  Total Corrupted Files: 4
--------------------------------------------------------------------------------

  [1] CS_OFFL_SIR_TDP_SI_ANTARC_20101128T231145_20101128T231416_02_05014_C001.nc
      Path: D:\phd\data\CryoTEMPO\2010\11\CS_OFFL_SIR_TDP_SI_ANTARC_20101128T231145_20101128T231416_02_05014_C001.nc
      Size: 0 bytes
      Empty: True
      Valid NC Signature: False
      Diagnosis: Empty or truncated

  [2] CS_OFFL_SIR_TDP_SI_ANTARC_20101221T084100_20101221T084146_02_05339_C001.nc
      Path: D:\phd\data\CryoTEMPO\2010\12\CS_OFFL_SIR_TDP_SI_ANTARC_20101221T084100_20101221T084146_02_05339_C001.nc
      Size: 0 bytes
      Empty: True
      Valid NC Signature: False
      Diagnosis: Empty or truncated

  [3] CS_OFFL_SIR_TDP_SI_ANTARC_20110226T143225_20110226T143344_03_00971_C001.nc
      Path: D:\phd\data\CryoTEMPO\2011\02\CS_OFFL_SIR_TDP_SI_ANTARC_20110226T143225_20110226T143344_03_00971_C001.nc
      Size: 0 bytes
      Empty: True
      Vali