# 02: Data Verification and Download

**Purpose:** Check what SEVIR data you have and download missing files

**What this does:**
- Check which SEVIR modalities exist
- Count files per modality
- Identify missing data
- Provide download instructions/scripts

**What this does NOT do:**
- Load full datasets into memory (Step 6 reads only a single sample as a sanity check)
- Create models
- Run training

**Expected time:** 5 minutes (or 30-90 min if downloading data)

---

**Prerequisites:** Run `01_Setup_and_Environment.ipynb` first

## Step 1: Set Paths

In [None]:
import os
import glob
import pandas as pd

# Root folder of the SEVIR data on Google Drive.
# Keep this identical to the value used in 01_Setup.
DRIVE_ROOT = "/content/drive/MyDrive/SEVIR_Data"

# Derived locations: the per-modality HDF5 tree and the catalog CSV.
SEVIR_ROOT = f"{DRIVE_ROOT}/data/sevir"
CATALOG_PATH = f"{DRIVE_ROOT}/data/SEVIR_CATALOG.csv"

# Echo the resolved paths so a wrong root is obvious before later steps run.
print(
    f"SEVIR root: {SEVIR_ROOT}",
    f"Catalog: {CATALOG_PATH}",
    sep="\n",
)

## Step 2: Check What Data Files Exist

In [None]:
# Scan <SEVIR_ROOT>/<modality>/2019 for .h5 files, print a per-modality
# report, and record the result in `data_status` for the later steps.
print("="*70)
print("SEVIR DATA CHECK")
print("="*70)

# Modality key (also its sub-directory name under SEVIR_ROOT) -> display label.
modalities = {
    'vil': 'VIL (Radar) - TARGET MODALITY',
    'ir069': 'GOES-16 C09 (Water Vapor 6.9μm)',
    'ir107': 'GOES-16 C13 (IR Window 10.7μm)',
    'lght': 'GOES-16 GLM (Lightning)'
}

# Approximate number of files per modality in the full dataset (display only).
EXPECTED_FILES = 174
# A modality is treated as "complete" once at least this many files exist.
# Defined once so the status icon and the 'complete' flag cannot drift apart.
MIN_COMPLETE_FILES = 100

# Per-modality summary: {'exists': bool, 'files': int, 'complete': bool}
data_status = {}

for mod, desc in modalities.items():
    mod_path = f"{SEVIR_ROOT}/{mod}/2019"

    # isdir (not exists): a stray regular file at this path is not usable data.
    if not os.path.isdir(mod_path):
        print(f"\n❌ {mod.upper():8s} - {desc}")
        print(f"   MISSING: {mod_path}")

        data_status[mod] = {
            'exists': False,
            'files': 0,
            'complete': False
        }
        continue

    h5_files = glob.glob(f"{mod_path}/*.h5")
    n_files = len(h5_files)
    total_gb = sum(os.path.getsize(f) for f in h5_files) / 1e9
    is_complete = n_files >= MIN_COMPLETE_FILES

    # ✅ enough files, ⚠️ some files but below the threshold, ❌ empty directory
    if is_complete:
        status = "✅"
    elif n_files > 0:
        status = "⚠️"
    else:
        status = "❌"

    print(f"\n{status} {mod.upper():8s} - {desc}")
    print(f"   Path: {mod_path}")
    print(f"   Files: {n_files:3d} / ~{EXPECTED_FILES} expected")
    print(f"   Size: {total_gb:.1f} GB")

    data_status[mod] = {
        'exists': True,
        'files': n_files,
        'complete': is_complete
    }

print("\n" + "="*70)

## Step 3: Check Catalog

In [None]:
# Cross-check the catalog's per-modality listing against the on-disk file
# counts recorded in `data_status` by Step 2.
if not os.path.exists(CATALOG_PATH):
    print(f"❌ Catalog not found: {CATALOG_PATH}")
    print("   Download from: https://sevir.mit.edu/")
else:
    catalog = pd.read_csv(CATALOG_PATH, low_memory=False)

    print("📋 SEVIR Catalog Analysis\n")

    for mod in ('vil', 'ir069', 'ir107', 'lght'):
        rows = catalog[catalog['img_type'] == mod]
        listed_files = rows['file_name'].nunique()
        listed_events = rows['id'].nunique()

        print(f"{mod.upper():8s}:")
        print(f"  Catalog lists {listed_files} files")
        print(f"  Catalog lists {listed_events} events")

        on_disk = data_status[mod]
        if on_disk['exists']:
            have = on_disk['files']
            # NOTE(review): `have` counts only files under .../2019 (Step 2),
            # while `listed_files` is not filtered by year. If the catalog
            # also lists other years, this percentage understates coverage
            # of the 2019 subset - confirm against the catalog contents.
            if listed_files > 0:
                pct = have / listed_files * 100
            else:
                pct = 0  # modality absent from the catalog: avoid dividing by zero
            print(f"  You have {have} files ({pct:.1f}% coverage)")
        print()

## Step 4: Data Completeness Assessment

In [None]:
# Turn the per-modality flags from Step 2 into one verdict and set
# `need_download`, which Step 5 reads to decide whether to fetch data.
print("="*70)
print("DATA COMPLETENESS ASSESSMENT")
print("="*70)

# VIL is checked on its own because it is the target modality.
vil_complete = data_status['vil']['complete']
all_complete = all(entry['complete'] for entry in data_status.values())

if all_complete:
    verdict = [
        "\n✅ EXCELLENT: All modalities complete!",
        "   Ready for full multimodal training",
    ]
elif vil_complete:
    verdict = [
        "\n⚠️  PARTIAL: VIL complete, but other modalities incomplete",
        "   Can train, but model will use zeros for missing modalities",
        "   Performance will be degraded",
        "   \n   Recommendation: Download all modalities for best results",
    ]
else:
    verdict = [
        "\n❌ INCOMPLETE: VIL data missing or incomplete",
        "   VIL is the TARGET modality - training will fail without it",
        "   \n   Action Required: Download VIL data (critical)",
    ]

for line in verdict:
    print(line)

# Anything short of "all modalities complete" means more data is needed.
need_download = not all_complete

print("\n" + "="*70)

## Step 5: Download Data (If Needed)

**Set `DOWNLOAD = True` to enable download**

**WARNING:** This will download ~50 GB and take 30-90 minutes

In [None]:
DOWNLOAD = False  # ⚠️ SET TO True TO DOWNLOAD

if DOWNLOAD and need_download:
    print("="*70)
    print("DOWNLOADING SEVIR DATA FROM AWS S3")
    print("="*70)
    
    # Install AWS CLI
    !pip install -q awscli
    
    # Download each modality
    download_info = {
        'vil': ('~25 GB', '15-45 min', 'CRITICAL - TARGET MODALITY'),
        'ir069': ('~9 GB', '5-15 min', 'Water vapor channel'),
        'ir107': ('~9 GB', '5-15 min', 'IR window channel'),
        'lght': ('~6 GB', '5-15 min', 'Lightning data')
    }
    
    for mod, (size, time, desc) in download_info.items():
        if not data_status[mod]['complete']:
            print(f"\n{'='*70}")
            print(f"Downloading {mod.upper()} - {desc}")
            print(f"Size: {size}, Time: {time}")
            print(f"{'='*70}")
            
            target_dir = f"{SEVIR_ROOT}/{mod}/2019"
            !mkdir -p {target_dir}
            !aws s3 sync s3://sevir/data/{mod}/2019/ {target_dir} --no-sign-request --region us-east-1
            
            # Verify
            files = glob.glob(f"{target_dir}/*.h5")
            print(f"\n✅ Downloaded {len(files)} files for {mod}")
        else:
            print(f"\n✅ {mod.upper()} already complete, skipping")
    
    print("\n" + "="*70)
    print("✅ DOWNLOAD COMPLETE")
    print("="*70)
    print("\nRe-run Step 2 to verify all files downloaded correctly")
    
elif not DOWNLOAD:
    print("⏭️  Download skipped (set DOWNLOAD=True to enable)")
    if need_download:
        print("\n⚠️  WARNING: You need more data for full training")
        print("   Set DOWNLOAD=True above and re-run this cell")
else:
    print("✅ Data already complete, no download needed")

## Step 6: Sample One VIL File (Sanity Check)

In [None]:
import h5py
import numpy as np

# Open the first (alphabetically) .h5 file of each listed modality and read
# one sample, to confirm the files are readable. Every outcome prints a line,
# so an empty result is never mistaken for success.
print("Testing file loading (sanity check)...\n")

for mod in ['vil']:
    if not (data_status[mod]['exists'] and data_status[mod]['files'] > 0):
        print(f"⏭️  {mod.upper()}: no files recorded by Step 2, nothing to test\n")
        continue

    mod_path = f"{SEVIR_ROOT}/{mod}/2019"
    files = sorted(glob.glob(f"{mod_path}/*.h5"))

    # data_status may be stale if files were removed since Step 2 ran.
    if not files:
        print(f"⚠️  {mod.upper()}: no .h5 files in {mod_path} (re-run Step 2)\n")
        continue

    test_file = files[0]
    print(f"Testing {mod.upper()}: {os.path.basename(test_file)}")

    # Deliberately broad: this is a report-only check, and any failure to
    # open or read the file should be shown rather than abort the cell.
    try:
        with h5py.File(test_file, 'r') as h5:
            # Check structure
            print(f"  Keys: {list(h5.keys())}")

            if mod in h5:
                data = h5[mod]
                print(f"  Shape: {data.shape}")
                print(f"  Dtype: {data.dtype}")

                # Load first sample only, not the whole dataset
                sample = data[0]
                print(f"  Sample shape: {sample.shape}")
                print(f"  Sample range: [{sample.min():.1f}, {sample.max():.1f}]")
                print(f"  ✅ File loads correctly\n")
            else:
                print(f"  ❌ Dataset '{mod}' not found in file\n")
    except Exception as e:
        print(f"  ❌ Error loading file: {e}\n")

---

## Summary

**What we checked:**
- Which SEVIR modalities exist
- File counts and sizes
- Catalog completeness
- That a sample VIL file opens and reads correctly

**Next steps:**
1. If data is incomplete, set `DOWNLOAD=True` and re-run Step 5
2. Once data is ready, proceed to `03_Test_DataLoader.ipynb`
3. That notebook will test actually loading and processing the data