In [10]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
make_predictions.py

Produce one line of predictions in the required format:

"YYYY-MM-DD", L1_00, ..., L29_23, PH_1, ..., PH_29, PD_1, ..., PD_29

- "YYYY-MM-DD" is the *run date* (target_date - 1 day), matching the project spec.
- L values are hourly loads (MW), rounded to integers.
- PH values are the predicted peak hour (0..23) via a softmax over the 24 hourly loads.
- PD values are 0/1 indicating a predicted "peak day" (heuristic; see Task 3).

References: course spec for the format and schedule.  # see slides
"""

from __future__ import annotations

import argparse
import os
from datetime import datetime, timedelta
from typing import Dict, List

import numpy as np
import pandas as pd

# -------------------------------
# Canonical 29-zone order (L1..L29)
# RTO is excluded; alias USI->UGI, AE->AECO handled in loader.
# -------------------------------
ZONES_29: List[str] = [
    "AECO","AEPAPT","AEPIMP","AEPKPT","AEPOPT",
    "AP","BC","CE","DAY","DEOK","DOM","DPLCO","DUQ","EASTON",
    "EKPC","JC","ME","OE","OVEC","PAPWR","PE","PEPCO","PLCO",
    "PN","PS","RECO","SMECO","UGI","VMEU"
]

ALIAS_MAP = {"USI": "UGI", "AE": "AECO"}

# -------------------------------
# Utilities
# -------------------------------
def _iter_hrl_files(data_dir: str):
    # Preferred flat naming
    found = False
    for y in range(2016, 2026):
        fp = os.path.join(data_dir, f"hrl_load_metered_{y}.csv")
        if os.path.exists(fp):
            found = True
            yield fp
    if found:
        return
    # Fallback: recursive search under data_dir
    for root, _, files in os.walk(data_dir):
        for fn in files:
            if fn.startswith("hrl_load_metered_") and fn.endswith(".csv"):
                yield os.path.join(root, fn)

def _default_target_date_str() -> str:
    # Tomorrow in UTC (sufficient for this script)
    return (datetime.utcnow() + timedelta(days=1)).date().isoformat()

# -------------------------------
# Load all *prior* data
# -------------------------------
def load_prior_hist(target_date_str: str, data_dir: str) -> pd.DataFrame:
    """
    Load rows strictly before target_date across all yearly CSVs.
    Harmonize zone names, drop RTO aggregate.
    """
    target_date = pd.to_datetime(target_date_str).date()
    frames = []
    usecols = ["datetime_beginning_ept", "load_area", "mw"]

    for fp in _iter_hrl_files(data_dir):
        df = pd.read_csv(fp, usecols=usecols, parse_dates=["datetime_beginning_ept"])
        df["load_area"] = df["load_area"].replace(ALIAS_MAP)
        df = df[df["load_area"] != "RTO"]
        df["date_ept"] = df["datetime_beginning_ept"].dt.date
        df = df[df["date_ept"] < target_date]
        frames.append(df[["datetime_beginning_ept", "load_area", "mw"]])

    if not frames:
        raise FileNotFoundError(
            f"No HRL CSVs found under '{data_dir}'. Expected files like hrl_load_metered_2016.csv"
        )

    hist = pd.concat(frames, ignore_index=True)
    hist = hist.sort_values(["load_area", "datetime_beginning_ept"]).rename(
        columns={"datetime_beginning_ept": "ts"}
    )
    return hist.reset_index(drop=True)

# -------------------------------
# Task 1: hourly loads (baseline)
# -------------------------------
def forecast_hourly(hist: pd.DataFrame,
                    zones: List[str],
                    lookback_days: int | None = None) -> Dict[str, np.ndarray]:
    """
    Baseline: for each (zone, hour), use the mean of that hour over prior data.
    If lookback_days is set, restrict to that recent window.
    """
    if hist.empty:
        return {z: np.zeros(24, float) for z in zones}

    h = hist.copy()
    h["hour"] = h["ts"].dt.hour

    if lookback_days:
        cutoff = h["ts"].max().normalize() - pd.Timedelta(days=int(lookback_days))
        h = h[h["ts"] >= cutoff]

    by_zh = h.groupby(["load_area", "hour"], observed=True)["mw"].mean()

    out: Dict[str, np.ndarray] = {}
    for z in zones:
        z_mean = float(h.loc[h["load_area"] == z, "mw"].mean()) if (h["load_area"] == z).any() else 0.0
        out[z] = np.array([by_zh.get((z, hr), z_mean) for hr in range(24)], dtype=float)
    return out

# -------------------------------
# Task 2: softmax + peak hour
# -------------------------------
def softmax(vec: np.ndarray) -> np.ndarray:
    x = np.asarray(vec, dtype=float)
    T = max(float(x.std()), 1.0)  # stabilize; avoid one-hot
    z = (x - x.max()) / T
    e = np.exp(z)
    s = e.sum()
    return e / s if s > 0 else np.ones_like(x) / len(x)

def peak_hour_from_softmax(zone_hourly: Dict[str, np.ndarray]) -> Dict[str, int]:
    return {z: int(np.argmax(softmax(v))) for z, v in zone_hourly.items()}

# -------------------------------
# Task 3: peak-day flag (0/1)
# -------------------------------
def peakday_flags(hist: pd.DataFrame,
                  zones: List[str],
                  zone_hourly: Dict[str, np.ndarray],
                  trailing_days: int = 180,
                  q: float = 0.80) -> Dict[str, int]:
    """
    PD=1 if the predicted daily peak (max of 24 hourly forecasts) >= q-quantile
    of trailing daily peaks from prior data; else PD=0.
    """
    if hist.empty:
        return {z: 0 for z in zones}

    d = hist.copy()
    d["date"] = d["ts"].dt.normalize()
    # Use trailing window for stability
    end_date = d["date"].max()
    start_date = end_date - pd.Timedelta(days=int(trailing_days))
    d = d[(d["date"] >= start_date) & (d["date"] <= end_date)]

    daily = d.groupby(["load_area", "date"], observed=True)["mw"].max().reset_index()

    out: Dict[str, int] = {}
    for z in zones:
        vals = daily.loc[daily["load_area"] == z, "mw"].to_numpy()
        thr = (vals.max() if vals.size < 5 else float(np.quantile(vals, q)))
        pred_peak = float(np.max(zone_hourly[z]))
        out[z] = int(pred_peak >= thr)
    return out

# -------------------------------
# Build the one-line output
# -------------------------------
def build_single_line(run_date_str: str,
                      zones: List[str],
                      zone_hourly: Dict[str, np.ndarray],
                      peak_hour: Dict[str, int],
                      peak_day: Dict[str, int]) -> str:
    fields: List[str] = [f'"{run_date_str}"']
    # 29Ã—24 hourly loads (rounded ints), zone-major, hour-minor
    for z in zones:
        fields.extend(str(int(round(v))) for v in zone_hourly[z])
    # 29 PH values
    fields.extend(str(int(peak_hour[z])) for z in zones)
    # 29 PD flags
    fields.extend(str(int(peak_day[z])) for z in zones)
    return ", ".join(fields)

# -------------------------------
# main()
# -------------------------------
def main(argv: List[str] | None = None) -> None:
    ap = argparse.ArgumentParser(description="Make one-line PJM predictions in class format.")
    ap.add_argument("--target", default=None,
                    help="Target forecast date (YYYY-MM-DD). Default: tomorrow (UTC).")
    ap.add_argument("--data-dir", default=".",
                    help="Directory containing hrl_load_metered_YYYY.csv files.")
    ap.add_argument("--out", default=None,
                    help="Output CSV path. Default: predictions_<run_date>.csv (run_date = target-1).")
    ap.add_argument("--lookback-days", type=int, default=None,
                    help="Optional recent window for Task 1; if omitted, use all prior data.")
    ap.add_argument("--stdout", action="store_true",
                    help="If set, print ONLY the required one-line output to stdout.")
    args = ap.parse_args(argv)

    target_date_str = args.target or _default_target_date_str()
    target_date = pd.to_datetime(target_date_str).normalize()
    run_date_str = (target_date - pd.Timedelta(days=1)).strftime("%Y-%m-%d")
    out_path = args.out or f"predictions_{run_date_str}.csv"

    # 1) Load all prior data
    hist = load_prior_hist(target_date_str, args.data_dir)

    # 2) Task 1
    zone_hourly = forecast_hourly(hist, ZONES_29, lookback_days=args.lookback_days)

    # 3) Task 2
    peak_hour = peak_hour_from_softmax(zone_hourly)

    # 4) Task 3
    peak_day = peakday_flags(hist, ZONES_29, zone_hourly, trailing_days=180, q=0.80)

    # 5) Compose one line and write
    line = build_single_line(run_date_str, ZONES_29, zone_hourly, peak_hour, peak_day)

    if args.stdout:
        # Print ONLY the one required line (no header, no extra text)
        print(line)
    else:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(line + "\n")

if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--target TARGET] [--data-dir DATA_DIR]
                             [--out OUT] [--lookback-days LOOKBACK_DAYS]
                             [--stdout]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/yueuy/Library/Jupyter/runtime/kernel-c4542292-b548-4623-8811-000912c0f3a0.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
