# 02: Data Verification and Download

**Purpose:** Check what SEVIR data you have and download missing files

**What this does:**
- Check which SEVIR modalities exist
- Count files per modality
- Identify missing data
- Provide download instructions/scripts

**What this does NOT do:**
- Load actual data into memory
- Create models
- Run training

**Expected time:** 5 minutes (or 30-90 min if downloading data)

---

**Prerequisites:** Run `01_Setup_and_Environment.ipynb` first

## Step 1: Set Paths

In [None]:
from google.colab import drive
import os
import glob
import pandas as pd

# Google Drive has to be mounted before any of the paths below can resolve.
print("Mounting Google Drive...")
drive.mount("/content/drive", force_remount=False)
print("✅ Drive mounted\n")

# Data locations on Drive; keep these in sync with 01_Setup_and_Environment.
DRIVE_ROOT = "/content/drive/MyDrive/SEVIR_Data"
SEVIR_ROOT = DRIVE_ROOT + "/data/sevir"
CATALOG_PATH = DRIVE_ROOT + "/data/SEVIR_CATALOG.csv"

for label, location in (("SEVIR root", SEVIR_ROOT), ("Catalog", CATALOG_PATH)):
    print(f"{label}: {location}")

# Report (without failing) whether the expected folders are present.
print("\nVerifying paths...")
for label, location in (("SEVIR_Data", DRIVE_ROOT), ("sevir folder", SEVIR_ROOT)):
    print(f"{label} exists: {os.path.exists(location)}")

## Step 2: Check What Data Files Exist

In [None]:
# Scan the SEVIR folder tree and build `data_status`, a nested dict of the form
#   data_status[year][modality] -> {'exists', 'files', 'size_gb', 'complete'}
# which the later cells of this notebook read.
print("="*70)
print("SEVIR DATA CHECK")
print("="*70)

modalities = {
    'vil': 'VIL (Radar) - TARGET MODALITY',
    'ir069': 'GOES-16 C09 (Water Vapor 6.9μm)',
    'ir107': 'GOES-16 C13 (IR Window 10.7μm)',
    'lght': 'GOES-16 GLM (Lightning)'
}

# File counts listed for 2019 (from AWS S3). They are used as a reference for
# every year, so a shortfall is only reported as a hint and never changes the
# 'complete' flag.
expected_counts_2019 = {
    'vil': 5,
    'ir069': 5,
    'ir107': 5,
    'lght': 11
}

# Years to check (add more years as needed)
years_to_check = ['2019', '2018', '2017']

data_status = {}

for year in years_to_check:
    print(f"\n{'='*70}")
    print(f"YEAR {year}")
    print(f"{'='*70}")

    year_data = {}

    for mod in modalities:
        mod_path = f"{SEVIR_ROOT}/{mod}/{year}"

        if os.path.exists(mod_path):
            h5_files = glob.glob(f"{mod_path}/*.h5")
            n_files = len(h5_files)
            total_gb = sum(os.path.getsize(f) for f in h5_files) / 1e9

            # Lenient on purpose: one file is enough to count as usable.
            has_files = n_files > 0
            status = "✅" if has_files else "❌"

            # Flag a likely partial download without failing the check.
            expected = expected_counts_2019.get(mod, 5)  # Default to 5 if not specified
            note = ""
            if has_files and n_files < expected:
                note = f"  ⚠️ fewer than the {expected} files listed for 2019"

            print(f"{status} {mod.upper():8s}: {n_files:3d} files ({total_gb:.1f} GB){note}")

            year_data[mod] = {
                'exists': True,
                'files': n_files,
                'size_gb': total_gb,
                'complete': has_files
            }
        else:
            print(f"❌ {mod.upper():8s}: NOT FOUND")

            year_data[mod] = {
                'exists': False,
                'files': 0,
                'size_gb': 0,
                'complete': False
            }

    data_status[year] = year_data

print("\n" + "="*70)
print("SUMMARY")
print("="*70)

# One line per year: file count and size summed over all modalities.
for year in years_to_check:
    total_files = sum(d['files'] for d in data_status[year].values())
    total_gb = sum(d['size_gb'] for d in data_status[year].values())
    print(f"{year}: {total_files:3d} files, {total_gb:6.1f} GB")

print("="*70)

## Step 3: Check Catalog

In [None]:
# Compare what the catalog lists per modality with the file counts gathered in
# Step 2 (reads `data_status` and `years_to_check` from that cell).
if not os.path.exists(CATALOG_PATH):
    print(f"❌ Catalog not found: {CATALOG_PATH}")
    print("   Download from: https://sevir.mit.edu/")
else:
    catalog = pd.read_csv(CATALOG_PATH, low_memory=False)

    print("📋 SEVIR Catalog Analysis\n")

    for mod in ('vil', 'ir069', 'ir107', 'lght'):
        rows = catalog.loc[catalog['img_type'] == mod]
        unique_files = rows['file_name'].nunique()
        unique_events = rows['id'].nunique()

        print(f"{mod.upper():8s}:")
        print(f"  Catalog lists {unique_files} files")
        print(f"  Catalog lists {unique_events} events")

        # Files on disk for this modality, summed over every checked year.
        files_on_disk = sum(
            data_status[yr][mod]['files']
            for yr in years_to_check
            if yr in data_status and mod in data_status[yr]
        )

        if files_on_disk > 0:
            # Guard against a modality the catalog does not list at all.
            if unique_files > 0:
                coverage = files_on_disk / unique_files * 100
            else:
                coverage = 0
            print(f"  You have {files_on_disk} files across all years ({coverage:.1f}% coverage)")
        else:
            print("  You have 0 files")
        print()

## Step 4: Data Completeness Assessment

In [None]:
# Decide whether training can proceed and whether a download is needed.
# Reads `data_status` / `years_to_check` from Step 2 and sets `need_download`,
# which the download cell (Step 5) reads.
print("="*70)
print("DATA COMPLETENESS ASSESSMENT")
print("="*70)

any_vil_data = False
all_years_complete = True

for year in years_to_check:
    year_status = data_status[year]
    vil_exists = year_status['vil']['complete']
    all_mods_exist = all(entry['complete'] for entry in year_status.values())

    # Fold this year's result into the notebook-wide flags.
    any_vil_data = any_vil_data or vil_exists
    all_years_complete = all_years_complete and all_mods_exist

    # ✅ = every modality present, ⚠️ = VIL only, ❌ = no VIL.
    if all_mods_exist:
        status_emoji = "✅"
    elif vil_exists:
        status_emoji = "⚠️"
    else:
        status_emoji = "❌"

    vil_mark = '✅' if vil_exists else '❌'
    all_mark = '✅' if all_mods_exist else '❌'
    print(f"\n{status_emoji} {year}: VIL={vil_mark}, All={all_mark}")

print("\n" + "="*70)

if not any_vil_data:
    # VIL is the target modality, so nothing else matters without it.
    print("\n❌ CRITICAL: No VIL data found")
    print("   VIL is the TARGET modality - training will fail without it")
    print("   Action Required: Download VIL data")
    need_download = True
else:
    print("\n✅ GOOD: Have VIL data for at least one year")
    print("   Can proceed with training")

    need_download = not all_years_complete
    if need_download:
        print("\n⚠️  Some years/modalities incomplete")
        print("   Can train, but performance may be degraded")
        print("   Recommendation: Download missing data for best results")

print("\n" + "="*70)

## Step 5: Download Data (If Needed)

**Set `DOWNLOAD = True` to enable download**

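**WARNING:** This is a large download. The cell below estimates ~110 GB per year across all four modalities, so expect it to take 30-90 minutes or longer depending on how many years you select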

In [None]:
DOWNLOAD = False  # ⚠️ SET TO True TO DOWNLOAD

# Specify which years to download (edit this list!)
YEARS_TO_DOWNLOAD = ['2018', '2017']  # 2019 already exists

if DOWNLOAD and need_download:
    print("="*70)
    print("DOWNLOADING SEVIR DATA FROM AWS S3")
    print("="*70)
    print(f"\nWill download years: {YEARS_TO_DOWNLOAD}")
    print("\nℹ️  Note: Per year, AWS S3 contains:")
    print("   - VIL: ~60 GB (5 files)")
    print("   - IR069: ~25 GB (5 files)")
    print("   - IR107: ~25 GB (5 files)")
    print("   - LGHT: ~1 GB (11 files)")
    print("   Total per year: ~110 GB\n")
    
    # Install AWS CLI
    print("Installing AWS CLI...")
    !pip install -q awscli
    print("✅ AWS CLI ready\n")
    
    # Download each year and modality
    for year in YEARS_TO_DOWNLOAD:
        print(f"\n{'='*70}")
        print(f"DOWNLOADING YEAR {year}")
        print(f"{'='*70}")
        
        year_status = data_status.get(year, {})
        
        for mod in ['vil', 'ir069', 'ir107', 'lght']:
            mod_status = year_status.get(mod, {'complete': False})
            
            # Skip if already have files
            if mod_status['complete'] and mod_status['files'] > 0:
                print(f"\n✅ {mod.upper():8s} ({year}): Already have {mod_status['files']} files, skipping")
                continue
            
            print(f"\n⬇️  {mod.upper():8s} ({year}): Starting download...")
            
            target_dir = f"{SEVIR_ROOT}/{mod}/{year}"
            
            # Create directory
            !mkdir -p "{target_dir}"
            
            # Download with sync (only downloads what's missing!)
            !aws s3 sync s3://sevir/data/{mod}/{year}/ "{target_dir}" --no-sign-request --region us-east-1
            
            # Verify what we have now
            files = glob.glob(f"{target_dir}/*.h5")
            total_gb = sum(os.path.getsize(f) for f in files) / 1e9
            print(f"   ✅ Now have {len(files)} files ({total_gb:.1f} GB) for {mod}/{year}")
    
    print("\n" + "="*70)
    print("✅ DOWNLOAD COMPLETE")
    print("="*70)
    print("\n📋 RECOMMENDED: Re-run Step 2 (cell above) to verify all downloads")
    
elif not DOWNLOAD:
    print("⏭️  Download skipped")
    print("\n📝 To download:")
    print("   1. Edit YEARS_TO_DOWNLOAD list above (remove years you already have)")
    print("   2. Set DOWNLOAD=True")
    print("   3. Re-run this cell")
    
    if need_download:
        print("\n⚠️  You have incomplete data")
        print(f"   Years to download: {YEARS_TO_DOWNLOAD}")
else:
    print("✅ Data already complete, no download needed")

## Step 6: Sample a Few Files (Sanity Check)

In [None]:
import h5py
import numpy as np

# Sanity check: for every checked year, open the first VIL file found and read
# one entry from it. Reads `data_status` / `years_to_check` from Step 2.
print("Testing file loading (sanity check)...\n")

# Test VIL from each year that has data
for year in years_to_check:
    vil_status = data_status.get(year, {}).get('vil')
    if vil_status is None:
        # Step 2 recorded nothing for this year; there is nothing to test.
        continue

    if not (vil_status['exists'] and vil_status['files'] > 0):
        print(f"⏭️  No VIL files for {year}\n")
        continue

    files = sorted(glob.glob(f"{SEVIR_ROOT}/vil/{year}/*.h5"))
    if not files:
        # The Step 2 status can be stale if files were moved or deleted since.
        print(f"⏭️  No VIL files for {year}\n")
        continue

    test_file = files[0]
    print(f"Testing VIL ({year}): {os.path.basename(test_file)}")

    # Any failure while opening or reading means the file is unusable, so it
    # is reported and the loop moves on to the next year.
    try:
        with h5py.File(test_file, 'r') as h5:
            # Check structure
            print(f"  Keys: {list(h5.keys())[:5]}...")  # Show first 5 keys

            if 'vil' not in h5:
                # Previously this case produced no success or failure line.
                print("  ❌ No 'vil' dataset in this file\n")
                continue

            data = h5['vil']
            print(f"  Shape: {data.shape}")
            print(f"  Dtype: {data.dtype}")

            # Index a single entry rather than the whole dataset.
            sample = data[0]
            print(f"  Sample shape: {sample.shape}")
            print(f"  Sample range: [{sample.min():.1f}, {sample.max():.1f}]")
            print("  ✅ File loads correctly\n")
    except Exception as e:
        print(f"  ❌ Error loading file: {e}\n")

---

## Summary

**What we checked:**
- Which SEVIR modalities exist
- File counts and sizes
- Catalog coverage (files on disk vs. files listed in the catalog)
- Basic file readability (first VIL file of each year)

**Next steps:**
1. If data is incomplete, set `DOWNLOAD=True` and re-run Step 5
2. Once data is ready, proceed to `03_Test_DataLoader.ipynb`
3. That notebook will test actually loading and processing the data