# 01_setup_and_download.ipynb

**Purpose**: Initialize the project environment and download raw ERA5-Land data.
**Runtime**: ~2-4 hours for full download.
**Strategy**: Monthly downloads to avoid CDS 'cost limits exceeded' errors.

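**Note**: If you already have dataset files (`data_0*.nc` or `era5land_*.nc`) in `data/raw`, you can skip the download in Section 4. Still run the setup cells in Section 1 first, because the validation cell (Section 5) needs `PROJECT_ROOT` and the imports.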

## 1. Environment Setup

In [None]:
# Mount Google Drive at /content/drive (this import only exists on Google Colab)
from google.colab import drive
drive.mount('/content/drive')

# Define Project Root: the later cells build every project path from this folder
PROJECT_ROOT = '/content/drive/MyDrive/WeatherPaper'
print(f"Project Root: {PROJECT_ROOT}")

# Install the third-party packages via an IPython shell escape; the next cell
# imports cdsapi, xarray and yaml
!pip install cdsapi xarray netCDF4 pyyaml

In [None]:
import os
import yaml
import cdsapi
import xarray as xr
import glob

# Make sure each project folder exists under PROJECT_ROOT; folders that are
# already present are left untouched (exist_ok=True)
subdirs = ['config', 'data/raw', 'data/processed', 'checkpoints', 'outputs', 'figures']
project_dirs = [os.path.join(PROJECT_ROOT, relative_dir) for relative_dir in subdirs]
for project_dir in project_dirs:
    os.makedirs(project_dir, exist_ok=True)
print("Directory structure verified.")

## 2. Config & Credentials

In [None]:
# 1. Scoping: bounding box of the study region plus the train/val/test years
region_bounds = dict(name="Odisha", north=20.0, south=17.0, east=85.0, west=81.0)
year_splits = dict(
    train_years=list(range(2015, 2022)),  # 2015 through 2021 inclusive
    val_years=[2022, 2023],
    test_years=[2024, 2025],
)
scope_config = {"region": region_bounds, "time_split": year_splits}

# Persist the scope as YAML inside the project's config folder
scope_path = os.path.join(PROJECT_ROOT, 'config/project_scope.yaml')
with open(scope_path, 'w') as f:
    yaml.dump(scope_config, f)

# 2. Variables: CDS request names mapped to the short names used internally
# NOTE(review): 'mean_sea_level_pressure' does not seem to be part of the
# ERA5-Land variable list (which offers 'surface_pressure') - confirm against
# the CDS catalogue before relying on 'msl'.
internal_names = {
    'total_precipitation': 'tp',
    '2m_temperature': 't2m',
    'mean_sea_level_pressure': 'msl',
}
var_config = {
    "cds_variables": list(internal_names),  # request names, in mapping order
    "internal_names": internal_names,
}

# Persist the variable table as YAML inside the project's config folder
variables_path = os.path.join(PROJECT_ROOT, 'config/variables.yaml')
with open(variables_path, 'w') as f:
    yaml.dump(var_config, f)

# 3. Credentials (Replace with your own key!)
# The URL and key are written to ~/.cdsapirc, the file the cdsapi client is
# configured from.
cdsapirc_path = os.path.join(os.path.expanduser('~'), '.cdsapirc')
url = "https://cds.climate.copernicus.eu/api"
key = "YOUR_CDS_API_KEY_HERE"  # Get from: https://cds.climate.copernicus.eu/user

# True only when this cell actually (re)wrote the credentials file
credentials_written = False

if not key.strip() or key == "YOUR_CDS_API_KEY_HERE":
    # Never clobber a working credentials file with the placeholder: running
    # this cell without editing the key used to destroy valid credentials.
    if os.path.exists(cdsapirc_path):
        print(f"Config saved. Placeholder key ignored; keeping existing {cdsapirc_path}.")
    else:
        print("Config saved. ⚠ CDS API key not set: replace the placeholder "
              "and re-run this cell before downloading.")
else:
    with open(cdsapirc_path, 'w') as f:
        f.write(f"url: {url}\nkey: {key}\n")
    # The file holds a secret, so restrict it to the current user
    os.chmod(cdsapirc_path, 0o600)
    credentials_written = True
    print("Config and Credentials Saved.")

## 3. Check for Existing Dataset

If you already have downloaded files (`data_0*.nc` or `era5land_*.nc`) in `data/raw`, you can skip the download in Section 4.

In [None]:
# Look in the raw-data folder for NetCDF files that are already present
raw_dir = os.path.join(PROJECT_ROOT, 'data/raw')

# One glob per naming convention, plus a catch-all for any NetCDF file
existing_era5, existing_data0, existing_any = (
    glob.glob(os.path.join(raw_dir, pattern))
    for pattern in ("era5land_*.nc", "data_0*.nc", "*.nc")
)

print(f"Files in {raw_dir}:")
for label, matches in (
    ("era5land_*.nc", existing_era5),
    ("data_0*.nc", existing_data0),
    ("Total *.nc", existing_any),
):
    print(f"  {label}: {len(matches)} files")

# Tell the user which section to run next
next_step = (
    "\n✓ Dataset files found! You can skip to Section 5 (Validation)."
    if existing_any
    else "\n⚠ No dataset files found. Run Section 4 to download."
)
print(next_step)

## 4. (Optional) Download ERA5-Land Data
Skip this section if you already have data files.

In [None]:
def download_era5_month(year, month, output_folder, region_box,
                        variables=None, client=None):
    """Download one month of hourly ERA5-Land data as a NetCDF file.

    The file is stored as ``era5land_<year>_<MM>.nc`` inside ``output_folder``.
    An existing file larger than 1 MB is kept and nothing is downloaded; a
    smaller one is treated as a broken download, deleted and fetched again.

    The data is first written to ``<file>.part`` and only renamed to its final
    name once the transfer has finished, so an interrupted download never
    leaves a truncated ``.nc`` file behind.

    Args:
        year: Calendar year (an int, or anything ``int()`` accepts).
        month: Month number, 1-12 (int).
        output_folder: Existing directory that receives the file.
        region_box: Mapping with ``'north'``, ``'west'``, ``'south'`` and
            ``'east'`` keys; they are sent in that order as the request area.
        variables: Optional list of CDS variable names. Defaults to total
            precipitation, 2 m temperature and mean sea level pressure.
        client: Optional object exposing ``retrieve(name, request, target)``.
            A new ``cdsapi.Client()`` is created when omitted.

    Returns:
        None.

    Raises:
        Whatever ``client.retrieve`` raises; the partial file is removed first.
    """
    import calendar  # local import keeps this cell self-contained

    month_str = f"{month:02d}"
    output_file = os.path.join(output_folder, f"era5land_{year}_{month_str}.nc")

    if os.path.exists(output_file):
        if os.path.getsize(output_file) > 1 * 1024 * 1024:  # > 1MB check
            print(f"Skipping {year}-{month_str}: Exists.")
            return
        print(f"Redownloading {year}-{month_str}: Too small.")
        os.remove(output_file)

    if variables is None:
        # NOTE(review): 'mean_sea_level_pressure' does not seem to be part of
        # the ERA5-Land variable list - confirm against the CDS catalogue.
        variables = ['total_precipitation', '2m_temperature', 'mean_sea_level_pressure']

    # Request only the days this month really has (28-31)
    days_in_month = calendar.monthrange(int(year), month)[1]

    # NOTE(review): the request keeps the original 'format' key; confirm the
    # CDS endpoint in use still accepts it.
    request = {
        'format': 'netcdf',
        'variable': list(variables),
        'year': str(year),
        'month': month_str,
        'day': [f"{day:02d}" for day in range(1, days_in_month + 1)],
        'time': [f"{hour:02d}:00" for hour in range(24)],
        'area': [
            region_box['north'],
            region_box['west'],
            region_box['south'],
            region_box['east'],
        ],
    }

    print(f"Downloading {year}-{month_str}...")
    if client is None:
        client = cdsapi.Client()

    partial_file = output_file + ".part"
    try:
        client.retrieve('reanalysis-era5-land', request, partial_file)
        # Atomic rename: the final name only ever points at a complete file
        os.replace(partial_file, output_file)
    finally:
        # After a successful rename this path no longer exists; on failure
        # it holds the truncated download and must not be left behind
        if os.path.exists(partial_file):
            os.remove(partial_file)
    print(f"Finished {year}-{month_str}")

In [None]:
# Execute Download (Monthly)
# SKIP THIS CELL IF YOU ALREADY HAVE DATA
# Take the years from the train/val/test splits so the download cannot drift
# out of sync with the configuration (a fixed range(2015, 2025) stopped at
# 2024 although test_years includes 2025).
years = sorted({yr for split in scope_config['time_split'].values() for yr in split})
months = range(1, 13)
raw_dir = os.path.join(PROJECT_ROOT, 'data/raw')
region_cfg = scope_config['region']

failed = []  # (year, month) pairs whose download raised
for y in years:
    for m in months:
        try:
            download_era5_month(y, m, raw_dir, region_cfg)
        except Exception as e:
            # Best effort: one failing month must not abort the whole run
            failed.append((y, m))
            print(f"FAILED {y}-{m:02d}: {e}")

if failed:
    failed_labels = ", ".join(f"{fy}-{fm:02d}" for fy, fm in failed)
    print(f"\n⚠ {len(failed)} month(s) failed: {failed_labels}")
    print("Re-run this cell to retry; months that already exist are skipped.")
else:
    print("\nAll requested months are present.")

## 5. Dataset Validation

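Validate that a sample of the NetCDF files (the first five, in sorted order) is readable and has the expected structure: a `time` or `valid_time` coordinate and the `tp` variable.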

In [None]:
# Validation - Works with both naming conventions
raw_dir = os.path.join(PROJECT_ROOT, 'data/raw')

# Find all NetCDF files
files = sorted(glob.glob(os.path.join(raw_dir, "*.nc")))
print(f"Total Files: {len(files)}")

if not files:
    raise FileNotFoundError(f"No NetCDF files found in {raw_dir}")

# Validate structure of (at most) the first 5 files
sample_files = files[:5]
print("\nValidating file structure...")
valid_count = 0
issues = []

for path in sample_files:
    try:
        # The context manager closes the file even if inspecting it raises
        with xr.open_dataset(path) as ds:
            # Check for time coordinate (could be 'time' or 'valid_time')
            has_time = 'time' in ds.coords or 'valid_time' in ds.coords
            has_tp = 'tp' in ds.data_vars
    except Exception as e:
        # Unreadable file: keep a short form of the error and move on
        issues.append(f"{os.path.basename(path)}: {str(e)[:50]}")
        continue

    if has_time and has_tp:
        valid_count += 1
    else:
        issues.append(f"{os.path.basename(path)}: missing time or tp")

# Report against the real sample size, which is below 5 when fewer files exist
print(f"✓ {valid_count}/{len(sample_files)} sample files validated successfully")

if issues:
    print("\nIssues found:")
    for iss in issues:
        print(f"  ⚠ {iss}")
else:
    print("\n=== ALL VALIDATIONS PASSED ===")
    print("You can proceed to 02_preprocessing.ipynb")

In [None]:
# Print sample file structure for reference
if files:
    sample_file = files[0]
    print(f"Sample file: {os.path.basename(sample_file)}")
    print(f"Size: {os.path.getsize(sample_file)/1e6:.2f} MB")

    # The context manager closes the file even if one of the prints raises
    with xr.open_dataset(sample_file) as ds:
        print(f"\nCoordinates: {list(ds.coords)}")
        print(f"Variables: {list(ds.data_vars)}")
        # Dataset.sizes is the name -> length mapping; using Dataset.dims as
        # a mapping is deprecated in recent xarray releases
        print(f"Dimensions: {dict(ds.sizes)}")

        # Show time coordinate info
        time_coord = 'valid_time' if 'valid_time' in ds.coords else 'time'
        if time_coord in ds.coords:
            times = ds[time_coord].values
            print(f"\nTime coordinate: '{time_coord}'")
            if len(times):  # an empty time axis has no first/last entry
                print(f"Time range: {str(times[0])[:19]} to {str(times[-1])[:19]}")
            print(f"Time steps: {len(times)}")