# Test STAC Discovery Functions

This notebook tests the campaign and flight line discovery functions from the xopr.stac module.

In [8]:
import sys
from pathlib import Path

# # Add src to path for local development
# sys.path.insert(0, str(Path.cwd().parent / "src"))

from xopr.stac import discover_campaigns, discover_flight_lines

In [9]:
# Set data root path
data_root = Path("/home/thomasteisberg/Documents/opr/opr_test_dataset_1")
print(f"Data root: {data_root}")
print(f"Exists: {data_root.exists()}")

Data root: /home/thomasteisberg/Documents/opr/opr_test_dataset_1
Exists: True


In [10]:
# Discover campaigns
campaigns = discover_campaigns(data_root)
print(f"Found {len(campaigns)} campaigns:")
for campaign in campaigns:
    print(f"  {campaign}")

Found 2 campaigns:
  {'name': '2016_Antarctica_DC8', 'year': '2016', 'location': 'Antarctica', 'aircraft': 'DC8', 'path': '/home/thomasteisberg/Documents/opr/opr_test_dataset_1/2016_Antarctica_DC8'}
  {'name': '2022_Antarctica_BaslerMKB', 'year': '2022', 'location': 'Antarctica', 'aircraft': 'BaslerMKB', 'path': '/home/thomasteisberg/Documents/opr/opr_test_dataset_1/2022_Antarctica_BaslerMKB'}


In [11]:
# Test flight line discovery for first campaign
if campaigns:
    campaign = campaigns[0]
    campaign_path = Path(campaign['path'])
    print(f"\nTesting flight line discovery for: {campaign['name']}")
    print(f"Campaign path: {campaign_path}")
    
    try:
        flight_lines = discover_flight_lines(campaign_path, "CSARP_standard")
        print(f"\nFound {len(flight_lines)} flight lines:")
        
        # Show first few flight lines
        for i, flight in enumerate(flight_lines[:3]):
            print(f"\nFlight {i+1}: {flight['flight_id']}")
            print(f"  Date: {flight['date']}")
            print(f"  Flight num: {flight['flight_num']}")
            print(f"  MAT files: {len(flight['mat_files'])}")
            print(f"  CSV file: {flight['csv_file']}")
            
            # Check if files exist
            csv_exists = Path(flight['csv_file']).exists()
            mat_exists = all(Path(f).exists() for f in flight['mat_files'])
            print(f"  CSV exists: {csv_exists}")
            print(f"  All MAT files exist: {mat_exists}")
            
    except Exception as e:
        print(f"Error discovering flight lines: {e}")
        import traceback
        traceback.print_exc()


Testing flight line discovery for: 2016_Antarctica_DC8
Campaign path: /home/thomasteisberg/Documents/opr/opr_test_dataset_1/2016_Antarctica_DC8

Found 55 flight lines:

Flight 1: 20161014_03
  Date: 20161014
  Flight num: 03
  MAT files: 1
  CSV file: /home/thomasteisberg/Documents/opr/opr_test_dataset_1/2016_Antarctica_DC8/csv/Data_20161014_03.csv
  CSV exists: True
  All MAT files exist: True

Flight 2: 20161014_04
  Date: 20161014
  Flight num: 04
  MAT files: 10
  CSV file: /home/thomasteisberg/Documents/opr/opr_test_dataset_1/2016_Antarctica_DC8/csv/Data_20161014_04.csv
  CSV exists: True
  All MAT files exist: True

Flight 3: 20161014_05
  Date: 20161014
  Flight num: 05
  MAT files: 41
  CSV file: /home/thomasteisberg/Documents/opr/opr_test_dataset_1/2016_Antarctica_DC8/csv/Data_20161014_05.csv
  CSV exists: True
  All MAT files exist: True


In [12]:
# Test with second campaign if available
if len(campaigns) > 1:
    campaign = campaigns[1]
    campaign_path = Path(campaign['path'])
    print(f"\nTesting flight line discovery for: {campaign['name']}")
    
    try:
        flight_lines = discover_flight_lines(campaign_path, "CSARP_standard")
        print(f"Found {len(flight_lines)} flight lines")
        
        # Show first flight line details
        if flight_lines:
            flight = flight_lines[0]
            print(f"\nFirst flight: {flight['flight_id']}")
            print(f"  MAT files: {len(flight['mat_files'])}")
            print(f"  CSV exists: {Path(flight['csv_file']).exists()}")
            
    except Exception as e:
        print(f"Error: {e}")
else:
    print("\nOnly one campaign found.")


Testing flight line discovery for: 2022_Antarctica_BaslerMKB
Found 23 flight lines

First flight: 20221210_01
  MAT files: 18
  CSV exists: True


In [13]:
# Test building a STAC catalog for one campaign
if campaigns and flight_lines:
    from xopr.stac import (
        create_catalog, create_collection, create_item_from_flight_data, 
        build_collection_extent
    )
    
    print("\\n" + "="*50)
    print("TESTING CATALOG CREATION")
    print("="*50)
    
    # Create root catalog
    catalog = create_catalog(catalog_id="OPR_Test")
    print(f"Created catalog: {catalog.id}")
    
    # Use first campaign and limit to first 2 flight lines
    campaign = campaigns[0]
    campaign_name = campaign['name']
    test_flights = flight_lines[:2]  # Just first 2 flights
    
    print(f"\\nProcessing {len(test_flights)} flights from {campaign_name}:")
    
    collection_items = []
    base_url = "https://data.cresis.ku.edu/data/rds/"
    
    for i, flight_data in enumerate(test_flights):
        print(f"\\n  Flight {i+1}: {flight_data['flight_id']}")
        
        try:
            # Create STAC items for this flight
            items = create_item_from_flight_data(
                flight_data=flight_data,
                base_url=base_url,
                campaign_name=campaign_name,
                data_product="CSARP_standard"
            )
            
            # Take only first item to keep it simple
            if items:
                collection_items.append(items[0])
                print(f"    ✅ Created STAC item: {items[0].id}")
                print(f"    Geometry type: {items[0].geometry['type']}")
                print(f"    Bbox: {items[0].bbox}")
                print(f"    DateTime: {items[0].datetime}")
                print(f"    Assets: {list(items[0].assets.keys())}")
            else:
                print("    ⚠️  No items created")
                
        except Exception as e:
            print(f"    ❌ Error creating items: {e}")
            import traceback
            traceback.print_exc()
    
    # Create collection if we have items
    if collection_items:
        print(f"\\nCreating collection with {len(collection_items)} items...")
        
        try:
            # Build extent from items
            extent = build_collection_extent(collection_items)
            print(f"Collection extent calculated successfully")
            
            # Create collection
            collection = create_collection(
                collection_id=campaign_name,
                description=f"Test collection for {campaign['year']} {campaign['aircraft']} flights over {campaign['location']}",
                extent=extent
            )
            
            # Add items to collection
            collection.add_items(collection_items)
            print(f"✅ Collection created: {collection.id}")
            print(f"   Items: {len(list(collection.get_items()))}")
            
            # Add collection to catalog
            catalog.add_child(collection)
            print(f"✅ Collection added to catalog")
            
            # Show catalog structure
            print(f"\\nCatalog structure:")
            print(f"  Catalog: {catalog.id}")
            for child in catalog.get_children():
                if hasattr(child, 'get_items'):
                    item_count = len(list(child.get_items()))
                    print(f"    Collection: {child.id} ({item_count} items)")
            
        except Exception as e:
            print(f"❌ Error creating collection: {e}")
            import traceback
            traceback.print_exc()
    else:
        print("\\n⚠️  No items created, skipping collection creation")
else:
    print("\\nSkipping catalog creation - no campaigns or flight lines found")

TESTING CATALOG CREATION
Created catalog: OPR_Test
\nProcessing 2 flights from 2016_Antarctica_DC8:
\n  Flight 1: 20221210_01
    ✅ Created STAC item: CSARP_standard_Data_20221210_01_005
    Geometry type: LineString
    Bbox: [155.014586, -77.340594, 165.105848, -74.843745]
    DateTime: 2022-12-09 18:13:14.763149
    Assets: ['data', 'thumbnails', 'flight_path']
\n  Flight 2: 20221212_01
    ✅ Created STAC item: CSARP_standard_Data_20221212_01_003
    Geometry type: LineString
    Bbox: [154.595853, -77.361667, 166.191141, -74.820952]
    DateTime: 2022-12-11 16:54:50.984210
    Assets: ['data', 'thumbnails', 'flight_path']
\nCreating collection with 2 items...
Collection extent calculated successfully
✅ Collection created: 2016_Antarctica_DC8
   Items: 2
✅ Collection added to catalog
\nCatalog structure:
  Catalog: OPR_Test
    Collection: 2016_Antarctica_DC8 (2 items)


In [14]:
# Test saving catalog to geoparquet
if 'catalog' in locals() and catalog:
    import json
    import stac_geoparquet
    from pathlib import Path
    
    print("\n" + "="*50)
    print("TESTING GEOPARQUET EXPORT")
    print("="*50)
    
    try:
        # Create output directory
        output_dir = Path("./test_stac_output")
        output_dir.mkdir(exist_ok=True)
        
        # Create temporary NDJSON file
        ndjson_file = output_dir / "test_catalog.json"
        parquet_file = output_dir / "test_catalog.parquet"
        
        print(f"Exporting catalog to: {parquet_file}")
        
        # Write all items to NDJSON (fix the newline issue)
        item_count = 0
        with open(ndjson_file, 'w') as f:
            for item in catalog.get_all_items():
                json.dump(item.to_dict(), f, separators=(",", ":"))
                f.write("\n")  # Fixed: single backslash for actual newline
                item_count += 1
        
        print(f"✅ Written {item_count} items to NDJSON")
        
        # Convert to parquet
        stac_geoparquet.arrow.parse_stac_ndjson_to_parquet(str(ndjson_file), str(parquet_file))
        
        # Clean up temporary file
        ndjson_file.unlink()
        
        print(f"✅ Geoparquet saved: {parquet_file}")
        print(f"   File size: {parquet_file.stat().st_size / 1024:.1f} KB")
        
        # Test reading the parquet file back
        print(f"\nTesting parquet file read...")
        import pyarrow.parquet as pq
        table = pq.read_table(parquet_file)
        print(f"✅ Parquet file readable")
        print(f"   Schema: {len(table.schema)} columns")
        print(f"   Rows: {len(table)}")
        print(f"   Columns: {[field.name for field in table.schema][:5]}...")  # Show first 5 column names
        
    except Exception as e:
        print(f"❌ Error exporting to geoparquet: {e}")
        import traceback
        traceback.print_exc()
else:
    print("\nSkipping geoparquet export - no catalog created")


TESTING GEOPARQUET EXPORT
Exporting catalog to: test_stac_output/test_catalog.parquet
✅ Written 2 items to NDJSON
✅ Geoparquet saved: test_stac_output/test_catalog.parquet
   File size: 1274.1 KB

Testing parquet file read...
✅ Parquet file readable
   Schema: 10 columns
   Rows: 2
   Columns: ['assets', 'bbox', 'collection', 'geometry', 'id']...
