# Download Weather Data for ERCOT Region

This notebook downloads 11 years of daily weather data (2014-2024) from the Open-Meteo API.

**Run this once**, then upload the resulting `weather_data_ercot.csv` file to Google Drive. You'll load it in your main training notebook.

Estimated time: 2-5 minutes

In [None]:
import pandas as pd
import numpy as np
import requests
import time
from datetime import datetime, timedelta

# Intro banner: title framed by two 80-character rules, followed by a short
# description of what is about to be downloaded.
rule = "=" * 80
print(
    "\n".join(
        [
            rule,
            "WEATHER DATA DOWNLOAD",
            rule,
            "\nStarting download of ERCOT weather data...",
            "Date range: 2014-01-01 to 2024-12-31",
            "Location: ERCOT region center (31.5°N, 97.5°W)",
        ]
    )
)

In [None]:
def _fetch_daily_chunk(api_url, params, max_retries):
    """Request one date range from the API, retrying a bounded number of times.

    Returns the decoded JSON payload, or None when every attempt was either
    rate limited (HTTP 429) or raised a ``requests`` exception.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(api_url, params=params, timeout=60)
            if response.status_code != 429:
                response.raise_for_status()
                return response.json()
            # 429: back off longer than for ordinary failures.
            wait_seconds = 60
            print("⚠️  Rate limited!")
        except requests.exceptions.RequestException as e:
            wait_seconds = 30
            print(f"  ❌ Request failed: {e}")

        # Do not sleep after the last attempt; there is nothing left to wait for.
        if attempt < attempts:
            print(
                f"  Retrying in {wait_seconds} seconds "
                f"(attempt {attempt + 1} of {attempts})..."
            )
            time.sleep(wait_seconds)

    return None


def download_weather_data_chunked(
    start_date="2014-01-01",
    end_date="2024-12-31",
    latitude=31.5,
    longitude=-97.5,
    chunk_months=6,
    delay_between_requests=10,
    max_retries=5,
):
    """
    Download daily weather data from the Open-Meteo archive API in date chunks.

    The range ``start_date``..``end_date`` (both inclusive) is split into
    consecutive, non-overlapping chunks of about ``chunk_months`` months, and
    each chunk is fetched with a pause of ``delay_between_requests`` seconds
    in between to avoid rate limiting.

    No unit parameters are sent with the request, so values come back in the
    API's default units (Open-Meteo documents these as °C, mm and km/h --
    confirm against the API docs before labelling the columns).

    Args:
        start_date: First day to download, as a date string ("YYYY-MM-DD").
        end_date: Last day to download (inclusive), as a date string.
        latitude: Latitude of the location, in decimal degrees.
        longitude: Longitude of the location, in decimal degrees.
        chunk_months: Length of each request window, in months.
        delay_between_requests: Seconds to sleep between successful chunks.
        max_retries: Maximum attempts per chunk before the download is
            abandoned (values below 1 are treated as 1).

    Returns:
        A DataFrame sorted by ``date`` with columns ``date``, ``temp_max``,
        ``temp_min``, ``temp_mean``, ``precipitation``, ``wind_speed_max``,
        ``humidity_max`` and ``humidity_min``; or None if nothing was
        downloaded or a chunk still failed after ``max_retries`` attempts.
    """
    API_URL = "https://archive-api.open-meteo.com/v1/archive"

    # Output column name -> daily variable name requested from the API.
    column_map = {
        "temp_max": "temperature_2m_max",
        "temp_min": "temperature_2m_min",
        "temp_mean": "temperature_2m_mean",
        "precipitation": "precipitation_sum",
        "wind_speed_max": "windspeed_10m_max",
        "humidity_max": "relative_humidity_2m_max",
        "humidity_min": "relative_humidity_2m_min",
    }

    all_data = []
    current_date = pd.to_datetime(start_date)
    end_datetime = pd.to_datetime(end_date)

    chunk_count = 0

    # "<=" (not "<") so that a final chunk consisting only of end_date itself,
    # or a one-day request, is still downloaded.
    while current_date <= end_datetime:
        # Clamp the chunk to the requested range.
        chunk_end = min(current_date + pd.DateOffset(months=chunk_months), end_datetime)

        chunk_start_str = current_date.strftime("%Y-%m-%d")
        chunk_end_str = chunk_end.strftime("%Y-%m-%d")
        chunk_count += 1

        print(f"\n[Chunk {chunk_count}] Downloading {chunk_start_str} to {chunk_end_str}...")

        params = {
            "latitude": latitude,
            "longitude": longitude,
            "start_date": chunk_start_str,
            "end_date": chunk_end_str,
            "daily": list(column_map.values()),
            "timezone": "America/Chicago",
        }

        data = _fetch_daily_chunk(API_URL, params, max_retries)
        if data is None:
            # A silent gap in the series is worse than a failed download, so
            # stop here instead of skipping the chunk or retrying forever.
            print(
                f"❌ Giving up on {chunk_start_str} to {chunk_end_str} "
                f"after repeated failures; aborting download."
            )
            return None

        if "daily" in data:
            daily_data = data["daily"]

            # Create dataframe for this chunk
            columns = {"date": pd.to_datetime(daily_data["time"])}
            columns.update(
                {name: daily_data[variable] for name, variable in column_map.items()}
            )
            chunk_df = pd.DataFrame(columns)

            all_data.append(chunk_df)
            print(f"  ✓ Downloaded {len(chunk_df)} days")

            # Wait before next request
            if chunk_end < end_datetime:
                print(f"  Waiting {delay_between_requests}s before next request...")
                time.sleep(delay_between_requests)
        else:
            print(f"  ⚠️  No daily data in response")

        # The API treats end_date as inclusive, so the next chunk starts the
        # day after this one ended.
        current_date = chunk_end + timedelta(days=1)

    if not all_data:
        print("❌ No data downloaded!")
        return None

    # Combine all chunks
    df = pd.concat(all_data, ignore_index=True)
    df = df.sort_values("date").reset_index(drop=True)

    return df

In [None]:
# Run the download and, if it succeeded, print a short summary of the result.
print("\nStarting download...")
weather_df = download_weather_data_chunked()

if weather_df is None:
    print("❌ Download failed!")
else:
    rule = "=" * 80
    first_day = weather_df["date"].min().date()
    last_day = weather_df["date"].max().date()
    column_names = weather_df.columns.tolist()

    print("\n" + rule)
    print("✅ DOWNLOAD COMPLETE")
    print(rule)
    print("\nData Summary:")
    print(f"  Total days: {len(weather_df):,}")
    print(f"  Date range: {first_day} to {last_day}")
    print(f"  Columns: {column_names}")
    print("\nFirst few rows:")
    print(weather_df.head(10))

In [None]:
# Save to CSV
import os

output_file = "weather_data_ercot.csv"
weather_df.to_csv(output_file, index=False)
print(f"\n✓ Saved to {output_file}")
# Report the size of the file actually written to disk; the DataFrame's
# in-memory footprint is not the same as the CSV size.
print(f"  File size: {os.path.getsize(output_file) / 1024:.2f} KB")

In [None]:
# Display statistics
# The download request sends no unit parameters, so the values are in the
# Open-Meteo defaults (°C, mm, km/h) rather than °F / inches / mph.
print("\n" + "="*80)
print("WEATHER DATA STATISTICS")
print("="*80)

stat_sections = [
    ("Temperature (°C)", ["temp_max", "temp_min", "temp_mean"]),
    ("Precipitation (mm)", ["precipitation"]),
    ("Wind Speed (km/h)", ["wind_speed_max"]),
    ("Humidity (%)", ["humidity_max", "humidity_min"]),
]
for section_title, section_columns in stat_sections:
    print(f"\n{section_title}:")
    print(weather_df[section_columns].describe())

In [None]:
# Send the saved CSV to the browser (Colab only), then print follow-up steps.
from google.colab import files

print(f"\nDownloading {output_file} to your computer...")
files.download(output_file)
print("✓ File downloaded!")
print(
    "\nNext steps:\n"
    "1. Upload this file to Google Drive (Capstone_Data/ folder)\n"
    "2. Use in your main training notebook with:\n"
    "   weather_df = pd.read_csv('/content/drive/MyDrive/Capstone_Data/weather_data_ercot.csv')"
)