# 01 - OpenAQ Data Exploration

This notebook explores the OpenAQ dataset for air quality prediction.

We'll cover:
1. Connecting to OpenAQ API v3
2. Exploring available cities and parameters
3. Visualizing pollution patterns
4. Preparing data for continual learning tasks

In [None]:
import os
import sys
sys.path.insert(0, '..')

import asyncio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.openaq_client import OpenAQClient, OpenAQDataFetcher

# API key: the commented line below shows how to set OPENAQ_API_KEY for this
# session. Presumably the OpenAQ client reads it from the environment (TODO
# confirm in src/data/openaq_client.py). Prefer exporting the variable in your
# shell so that a real key is never saved into the notebook.
# os.environ['OPENAQ_API_KEY'] = 'your-api-key-here'

# Apply matplotlib's seaborn-style white grid theme and render figures inline.
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

## 1. Connect to OpenAQ API

In [None]:
async def explore_api():
    """Print an overview of OpenAQ parameters and countries, then return both.

    Opens an ``OpenAQClient`` session, lists every parameter with its ID and
    description, reports how many countries were returned, and shows the ten
    countries with the largest ``locations`` value.

    Returns:
        tuple: ``(params, countries)`` as returned by the client.
    """
    def location_count(country):
        # Entries without a 'locations' key are ranked as if they had zero.
        return country.get('locations', 0)

    async with OpenAQClient() as client:
        # Measured parameters
        params = await client.get_parameters()
        print("Available Parameters:")
        for p in params:
            print(f"  - {p['name']} (ID: {p['id']}): {p.get('description', 'N/A')}")

        # Countries known to the API
        countries = await client.get_countries()
        print(f"\nTotal countries: {len(countries)}")

        # Rank all countries by location count and keep the first ten
        ranked = sorted(countries, key=location_count, reverse=True)
        top_countries = ranked[:10]
        print("\nTop 10 countries by monitoring locations:")
        for c in top_countries:
            print(f"  - {c['name']}: {c.get('locations', 0)} locations")

        return params, countries

params, countries = asyncio.get_event_loop().run_until_complete(explore_api())

## 2. Explore a Specific City (Delhi)

In [None]:
async def explore_city(city="Delhi"):
    """List the monitoring locations of a city and preview the first five.

    Queries the client for locations in ``city`` that report PM2.5, PM10, NO2
    or O3, prints how many were found, and for up to five of them prints the
    name, ID, coordinates and the parameter names of the attached sensors.

    Args:
        city: City name passed to the client's location query.

    Returns:
        The full list of locations returned by the client.
    """
    pollutants = ["pm25", "pm10", "no2", "o3"]
    preview_count = 5

    async with OpenAQClient() as client:
        locations = await client.get_locations(city=city, parameters=pollutants)

        print(f"Found {len(locations)} monitoring locations in {city}")

        # Only the first few locations are printed to keep the output short.
        preview = locations[:preview_count]
        for loc in preview:
            print(f"\nLocation: {loc['name']} (ID: {loc['id']})")
            coords = loc.get('coordinates', {})
            print(f"  Coordinates: {coords.get('latitude')}, {coords.get('longitude')}")
            print(f"  Parameters: {[s.get('parameter', {}).get('name') for s in loc.get('sensors', [])]}")

        return locations

delhi_locations = asyncio.get_event_loop().run_until_complete(explore_city("Delhi"))

## 3. Fetch Historical Data

In [None]:
# Fetch 7 days of data
fetcher = OpenAQDataFetcher()

df = asyncio.get_event_loop().run_until_complete(
    fetcher.fetch_city_data(
        city="Delhi",
        parameters=["pm25", "pm10", "no2", "o3"],
        days=7
    )
)

print(f"Shape: {df.shape}")
df.head()

## 4. Visualize Pollution Patterns

In [None]:
# Time series plot: one panel per pollutant, averaged across locations
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# Kept under its own name so the `params` list returned by the API exploration
# cell is not overwritten with an unrelated value.
pollutant_panels = [
    ('pm25', 'PM2.5 (μg/m³)'),
    ('pm10', 'PM10 (μg/m³)'),
    ('no2', 'NO₂ (ppb)'),
    ('o3', 'O₃ (ppb)'),
]

for ax, (pollutant, title) in zip(axes.flat, pollutant_panels):
    if pollutant not in df.columns:
        # Hide the panel instead of leaving blank, untitled axes in the figure.
        ax.set_visible(False)
        continue

    # Group by datetime and average across locations
    ts = df.groupby('datetime')[pollutant].mean()
    ax.plot(ts.index, ts.values, linewidth=0.8)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
# Added after tight_layout so the title sits just above the laid-out panels.
fig.suptitle('Air Quality in Delhi (7 Days)', y=1.02, fontsize=14)
plt.show()

In [None]:
# Hourly patterns (important for understanding temporal dynamics)
# The hour is kept as a standalone Series so that this cell does not add a
# column to the shared `df` that later cells would silently inherit.
hour_of_day = pd.to_datetime(df['datetime']).dt.hour.rename('hour')

# (column, bar color, y-axis label, panel title) for each panel, left to right.
# NO2 is included because it typically shows traffic patterns.
hourly_panels = [
    ('pm25', 'steelblue', 'PM2.5 (μg/m³)', 'Average PM2.5 by Hour'),
    ('no2', 'coral', 'NO₂ (ppb)', 'Average NO₂ by Hour (Traffic Pattern)'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for ax, (pollutant, color, ylabel, title) in zip(axes, hourly_panels):
    if pollutant not in df.columns:
        continue
    # Mean concentration for each hour of the day across all rows
    hourly_mean = df.groupby(hour_of_day)[pollutant].mean()
    ax.bar(hourly_mean.index, hourly_mean.values, color=color)
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel(ylabel)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlation of whichever pollutant columns are present in df
candidate_cols = ['pm25', 'pm10', 'no2', 'o3', 'co', 'so2']
pollutant_cols = [col for col in candidate_cols if col in df.columns]
corr = df[pollutant_cols].corr()

# Diverging palette centered on zero, with the color scale fixed to [-1, 1]
heatmap_style = dict(annot=True, cmap='RdYlBu_r', center=0, vmin=-1, vmax=1)

plt.figure(figsize=(8, 6))
sns.heatmap(corr, **heatmap_style)
plt.title('Pollutant Correlation Matrix')
plt.tight_layout()
plt.show()

## 6. Prepare Data for Continual Learning

For Nested Learning, we treat sequential time periods as separate "tasks".

In [None]:
def create_sequences(df, history_window=168, forecast_horizon=24):
    """
    Build sliding-window (input, target) pairs for time series prediction.

    Rows are grouped by 'location_id' and ordered by 'datetime'. Every window
    start yields ``history_window`` consecutive rows as input and the
    ``forecast_horizon`` rows that follow as target. Only the columns among
    pm25, pm10, no2 and o3 that exist in ``df`` are used.

    - history_window: number of input rows (default 168, i.e. 7 days if hourly)
    - forecast_horizon: number of rows to predict (default 24)

    Returns a list of ``(x, y)`` array tuples; it is empty when no location
    has at least ``history_window + forecast_horizon`` rows.
    """
    candidate_cols = ['pm25', 'pm10', 'no2', 'o3']
    pollutant_cols = [col for col in candidate_cols if col in df.columns]
    window_span = history_window + forecast_horizon

    sequences = []
    for _, group in df.groupby('location_id'):
        values = group.sort_values('datetime')[pollutant_cols].values

        # A non-positive count produces an empty range, so short groups are skipped.
        n_windows = len(values) - window_span + 1
        for start in range(n_windows):
            split = start + history_window
            history = values[start:split]
            target = values[split:split + forecast_horizon]
            sequences.append((history, target))

    return sequences

sequences = create_sequences(df)
print(f"Created {len(sequences)} training sequences")

if sequences:
    x, y = sequences[0]
    print(f"Input shape: {x.shape} (7 days hourly)")
    print(f"Output shape: {y.shape} (24 hour forecast)")
else:
    # The defaults need 168 + 24 = 192 rows per location, so a short sample
    # yields nothing; say so rather than leaving the reader with a bare zero.
    print("No sequences created: each location needs at least 192 rows "
          "(168 history + 24 forecast). Fetch a longer period or pass "
          "smaller history_window / forecast_horizon values.")

## 7. Task Creation for Continual Learning

Split data into sequential tasks to simulate streaming data.

In [None]:
def create_tasks(df, n_tasks=4):
    """
    Split data chronologically into ``n_tasks`` contiguous, equal-length
    time windows. Each task represents a different time period.

    The span between the earliest and latest 'datetime' is divided evenly
    (at sub-day resolution), and the final window includes the latest
    timestamp, so every row with a valid timestamp lands in exactly one task.
    This also works when the data covers fewer days than ``n_tasks``.

    Args:
        df: DataFrame with a 'datetime' column (datetime values or strings
            parseable by ``pd.to_datetime``).
        n_tasks: Number of tasks to create; must be at least 1.

    Returns:
        List of ``n_tasks`` DataFrames in chronological order. Tasks may be
        empty if no rows fall inside their window.

    Raises:
        ValueError: If ``n_tasks`` is smaller than 1.
    """
    if n_tasks < 1:
        raise ValueError(f"n_tasks must be >= 1, got {n_tasks}")

    # Sort and compare on parsed timestamps so string-typed columns also work;
    # the returned frames keep the original 'datetime' values untouched.
    df = df.sort_values('datetime', key=pd.to_datetime)
    timestamps = pd.to_datetime(df['datetime'])

    first, last = timestamps.min(), timestamps.max()
    # min() is NaT for an empty frame (or all-missing timestamps): no windows.
    step = None if pd.isna(first) else (last - first) / n_tasks

    tasks = []
    for i in range(n_tasks):
        if step is None:
            task_df = df.iloc[0:0]
        else:
            lower = first + step * i
            if i == n_tasks - 1:
                # Closed upper bound so the latest observation is not dropped.
                in_window = (timestamps >= lower) & (timestamps <= last)
            else:
                upper = first + step * (i + 1)
                in_window = (timestamps >= lower) & (timestamps < upper)
            # Positional mask avoids index-alignment issues with duplicate labels.
            task_df = df[in_window.to_numpy()]
        tasks.append(task_df)
        print(f"Task {i+1}: {len(task_df)} samples")

    return tasks

# The task split is left disabled here: the notebook only loads 7 days of data,
# which the note printed below describes as too limited for task creation.
# Uncomment once a longer history has been fetched into `df`.
# tasks = create_tasks(df, n_tasks=4)
print("\n[Note: With only 7 days of data, task creation is limited.")
print("For real experiments, fetch 90+ days of data.]")

## Next Steps

1. **Fetch more data**: Use `--days 90` in the fetch script for meaningful continual learning
2. **Preprocess**: Handle missing values, normalize, create temporal features
3. **Train HOPE-Air**: Run the training script with continual learning
4. **Evaluate forgetting**: Measure BWT/FWT metrics across tasks

See `notebooks/04_hope_training.ipynb` for the training walkthrough.