In [None]:
import pandas as pd

# Raw traffic export to scan; only the DATE_COL and TIME_COL columns are read.
INPUT_FILE = "/Users/tenzinchoedhen/Desktop/All Recorded Traffic.csv"
# Destination CSV that receives one row per unique date + time combination.
OUTPUT_FILE = "/Users/tenzinchoedhen/Desktop/weather_data.csv"

# Names of the date and time columns in the input file.
DATE_COL = "DATE"
TIME_COL = "TIME"
# Number of rows read per chunk while streaming the input file.
CHUNKSIZE = 300_000


def hhmm_to_hh_colon_mm(s: pd.Series) -> pd.Series:
    """
    Convert HHMM integers/strings (e.g., 0, 15, 945, 1300, 2359) -> 'HH:MM'.

    Args:
        s: Series of HHMM values. Integers, integer-valued floats (945.0) and
            numeric strings ("0945") are accepted.

    Returns:
        An object-dtype Series of 'HH:MM' strings aligned with ``s``. Missing
        inputs stay missing instead of being rendered as text.

    Raises:
        ValueError: if a value cannot be parsed as a number.
        TypeError: if a value is numeric but not a whole number (e.g. 945.5).
    """
    # Parse first so that string input such as "0945" is handled as well.
    numeric = pd.to_numeric(s, errors="raise")

    # Int64 -> "string" keeps missing entries as <NA>; casting straight to
    # ``str`` would turn them into the text "<NA>" and produce "<N:A>".
    digits = numeric.astype("Int64").astype("string").str.zfill(4)  # 945 -> '0945'
    formatted = digits.str[:2] + ":" + digits.str[2:4]

    # Hand back plain Python strings, the same dtype callers received before.
    return formatted.astype(object)


def build_weather_dataset_big(input_file: str, output_file: str) -> None:
    """
    Stream a large CSV and write its unique (date, time) pairs to a new CSV.

    The input is read in chunks of CHUNKSIZE rows using only DATE_COL and
    TIME_COL. Dates are rewritten as mm/dd/yyyy and times as HH:MM. Rows whose
    date cannot be parsed or whose time is missing are skipped. The first
    occurrence of each pair is kept, in file order.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the CSV to write (DATE_COL and TIME_COL columns).
    """
    seen = set()
    out_chunks = []

    for chunk in pd.read_csv(
        input_file,
        usecols=[DATE_COL, TIME_COL],
        chunksize=CHUNKSIZE,
    ):
        # --- Date to mm/dd/yyyy ---
        parsed_dates = pd.to_datetime(chunk[DATE_COL], errors="coerce")
        usable = parsed_dates.notna() & chunk[TIME_COL].notna()
        if not usable.any():
            continue

        # Build a fresh frame instead of assigning into a filtered slice of
        # ``chunk``; that avoids pandas' chained-assignment warning.
        rows = pd.DataFrame(
            {
                DATE_COL: parsed_dates[usable].dt.strftime("%m/%d/%Y"),
                # --- Time HHMM -> HH:MM ---
                TIME_COL: hhmm_to_hh_colon_mm(chunk.loc[usable, TIME_COL]),
            }
        )

        # --- Primary Key (Date + Time) ---
        keys = rows[DATE_COL] + "_" + rows[TIME_COL]

        # Keep a row only if its key is new for the whole file AND it is the
        # first occurrence inside this chunk, so duplicates are never buffered.
        fresh = ~(keys.isin(seen) | keys.duplicated())

        seen.update(keys[fresh].tolist())
        out_chunks.append(rows.loc[fresh])

    if out_chunks:
        result = pd.concat(out_chunks, ignore_index=True)
    else:
        # pd.concat([]) raises; still emit a header-only file in that case.
        result = pd.DataFrame(columns=[DATE_COL, TIME_COL])
    result.to_csv(output_file, index=False)

    print("Saved:", output_file)
    print("Unique Date+Time rows:", len(result))


# Entry point: build the unique date/time file from the configured paths.
if __name__ == "__main__":
    build_weather_dataset_big(INPUT_FILE, OUTPUT_FILE)


In [None]:
import pandas as pd
import requests

# This step enriches the unique Date+Time file (DATE as mm/dd/yyyy, TIME as
# HH:MM), not the raw traffic export: the raw file stores TIME as HHMM, which
# would not parse here, and writing the result back would overwrite the raw data.
WEATHER_INPUT = "/Users/tenzinchoedhen/Desktop/weather_data.csv"
WEATHER_OUTPUT = "/Users/tenzinchoedhen/Desktop/weather_data.csv"  # overwrite (or change name)

# ---- Choose ONE location (example: Manhattan / Midtown-ish) ----
LAT = 40.7580
LON = -73.9855

# Use a real timezone for correct local-hour alignment
TIMEZONE = "America/New_York"

# Pick the hourly fields you want from Open-Meteo
HOURLY_VARS = [
    "temperature_2m",
    "relative_humidity_2m",
    "precipitation",
    "rain",
    "snowfall",
    "cloud_cover",
    "wind_speed_10m",
    "wind_direction_10m",
    "weather_code",
]

def fetch_open_meteo_hourly(lat, lon, start_date, end_date, timezone):
    """
    Query the Open-Meteo archive endpoint (/v1/archive) for hourly weather.

    Args:
        lat: Latitude sent as ``latitude``.
        lon: Longitude sent as ``longitude``.
        start_date: First day to fetch, as a YYYY-MM-DD string.
        end_date: Last day to fetch, as a YYYY-MM-DD string.
        timezone: Timezone name sent as ``timezone``.

    Returns:
        DataFrame with one column per returned hourly field plus a parsed
        ``datetime`` column; the raw ``time`` column is removed.

    Raises:
        requests.HTTPError: if the response status indicates an error.
        ValueError: if the response has no hourly block or no ``time`` list.
    """
    response = requests.get(
        "https://archive-api.open-meteo.com/v1/archive",
        params={
            "latitude": lat,
            "longitude": lon,
            "start_date": start_date,  # YYYY-MM-DD
            "end_date": end_date,      # YYYY-MM-DD
            "hourly": ",".join(HOURLY_VARS),
            "timezone": timezone,
        },
        timeout=60,
    )
    response.raise_for_status()

    hourly_payload = response.json().get("hourly", {})
    if not (hourly_payload and "time" in hourly_payload):
        raise ValueError("No hourly data returned. Check dates/lat/lon/timezone.")

    frame = pd.DataFrame(hourly_payload)
    # 'time' arrives as 'YYYY-MM-DDTHH:MM' text; expose it as real timestamps.
    frame["datetime"] = pd.to_datetime(frame["time"])
    return frame.drop(columns=["time"])

def integrate_weather():
    """
    Attach hourly Open-Meteo weather to every row of the Date+Time dataset.

    Reads WEATHER_INPUT, builds a ``datetime`` column from DATE (mm/dd/yyyy)
    and TIME (HH:MM), fetches hourly weather for the covered date range and
    left-joins it on the hour each row falls in. Writes WEATHER_OUTPUT.

    Raises:
        ValueError: if no row has a parseable date and time.
    """
    # Load your unique Date+Time dataset
    df = pd.read_csv(WEATHER_INPUT, dtype=str)

    # Build a datetime column from Date (mm/dd/yyyy) + Time (HH:MM)
    df["datetime"] = pd.to_datetime(df["DATE"] + " " + df["TIME"], format="%m/%d/%Y %H:%M", errors="coerce")
    df = df.dropna(subset=["datetime"])
    if df.empty:
        # min()/max() would be NaT and the API request would be meaningless.
        raise ValueError(f"No rows with a parseable DATE/TIME in {WEATHER_INPUT}")

    # Determine weather date range (Open-Meteo wants YYYY-MM-DD)
    start_date = df["datetime"].min().date().isoformat()
    end_date = df["datetime"].max().date().isoformat()

    # Fetch weather for full range (hourly)
    weather_df = fetch_open_meteo_hourly(LAT, LON, start_date, end_date, TIMEZONE)

    # Weather is hourly while rows may fall at any minute, so join on the hour
    # bucket rather than on the exact timestamp (09:45 would never match).
    hour_key = "_weather_hour"
    weather_df = weather_df.rename(columns={"datetime": hour_key})
    # Guard against repeated local hours so the left join cannot multiply rows.
    weather_df = weather_df.drop_duplicates(subset=hour_key)
    weather_cols = [c for c in weather_df.columns if c != hour_key]

    # On a re-run the file already has weather columns; replace them instead of
    # letting merge create *_x / *_y copies.
    df = df.drop(columns=[c for c in weather_cols if c in df.columns])
    df = df.assign(**{hour_key: df["datetime"].dt.floor("h")})

    # Left join keeps all your Date+Time rows
    merged = df.merge(weather_df, on=hour_key, how="left").drop(columns=[hour_key])

    # Optional: drop datetime column if you don't want it in final CSV
    # merged = merged.drop(columns=["datetime"])

    merged.to_csv(WEATHER_OUTPUT, index=False)
    print("Saved with weather:", WEATHER_OUTPUT)
    print("Rows:", len(merged))
    print("Weather columns added:", weather_cols)

# Entry point: fetch weather and merge it into the configured CSV.
if __name__ == "__main__":
    integrate_weather()


In [None]:
import pandas as pd

# -----------------------------
# CONFIG
# -----------------------------
# Both paths name the same CSV, so this step rewrites the file in place.
INPUT_FILE  = "/Users/tenzinchoedhen/Desktop/weather_data.csv"   # your current file (already has weather_code)
OUTPUT_FILE = "/Users/tenzinchoedhen/Desktop/weather_data.csv"   # overwrite; change if you want a new file

# -----------------------------
# WEATHER CODE -> DESCRIPTION
# -----------------------------
# Built once at import time; the lookup below is called once per data row.
_WMO_CODE_DESCRIPTIONS = {
    0: "Clear sky",
    1: "Mainly clear",
    2: "Partly cloudy",
    3: "Overcast",
    45: "Fog",
    48: "Depositing rime fog",
    51: "Light drizzle",
    53: "Moderate drizzle",
    55: "Dense drizzle",
    56: "Light freezing drizzle",
    57: "Dense freezing drizzle",
    61: "Slight rain",
    63: "Moderate rain",
    65: "Heavy rain",
    66: "Light freezing rain",
    67: "Heavy freezing rain",
    71: "Slight snowfall",
    73: "Moderate snowfall",
    75: "Heavy snowfall",
    77: "Snow grains",
    80: "Slight rain showers",
    81: "Moderate rain showers",
    82: "Violent rain showers",
    85: "Slight snow showers",
    86: "Heavy snow showers",
    95: "Thunderstorm",
    96: "Thunderstorm with slight hail",
    99: "Thunderstorm with heavy hail",
}


def weather_code_to_description(code):
    """
    Translate a weather code into its text description.

    Args:
        code: The code as an int, an integer-valued float (3.0) or a numeric
            string ("3").

    Returns:
        The description from the table above, or "Unknown" when the code is
        missing (None/NaN), not a whole number, or not in the table.
    """
    try:
        numeric = float(code)
    except (TypeError, ValueError, OverflowError):
        return "Unknown"

    # is_integer() is False for NaN and infinities as well as for 3.7, so a
    # fractional code is never silently truncated to a neighbouring entry.
    if not numeric.is_integer():
        return "Unknown"
    return _WMO_CODE_DESCRIPTIONS.get(int(numeric), "Unknown")

def add_weather_description(input_file: str, output_file: str) -> None:
    """
    Add a ``weather_description`` column derived from ``weather_code``.

    Args:
        input_file: CSV to read; must contain a ``weather_code`` column.
        output_file: CSV to write, with ``weather_code`` coerced to numeric
            and the new description column.

    Raises:
        ValueError: if ``weather_code`` is not among the input columns.
    """
    frame = pd.read_csv(input_file)

    column_names = list(frame.columns)
    if "weather_code" not in column_names:
        raise ValueError(f"'weather_code' column not found. Columns are: {column_names}")

    def describe(value):
        # Unparseable codes were coerced to NaN and are labelled "Unknown".
        if pd.isna(value):
            return "Unknown"
        return weather_code_to_description(int(value))

    codes = pd.to_numeric(frame["weather_code"], errors="coerce")
    frame["weather_code"] = codes
    frame["weather_description"] = codes.map(describe)

    frame.to_csv(output_file, index=False)

    missing_count = frame["weather_code"].isna().sum()
    preview = frame[["weather_code", "weather_description"]].dropna().head(10)
    print("Saved:", output_file)
    print("Rows:", len(frame))
    print("Missing weather_code:", missing_count)
    print("Sample:")
    print(preview)

# Entry point: add the description column using the configured paths.
if __name__ == "__main__":
    add_weather_description(INPUT_FILE, OUTPUT_FILE)



In [None]:
import pandas as pd
import requests
import holidays

# -----------------------------
# CONFIG
# -----------------------------
# Both paths name the same CSV, so this step rewrites the file in place.
INPUT_FILE  = "/Users/tenzinchoedhen/Desktop/weather_data.csv"
OUTPUT_FILE = "/Users/tenzinchoedhen/Desktop/weather_data.csv"  # overwrite if you want

DATE_COL = "DATE"   # change to "Date" if your file uses that
TIME_COL = "TIME"   # change to "Time" if your file uses that

# NYC Open Data: NYC Permitted Event Information
NYC_EVENTS_DATASET = "tvpp-9vvx"  # dataset id placed in the resource URL
NYC_APP_TOKEN = None  # optional; sent as the X-App-Token header when set
NYC_LIMIT = 50000  # value sent as the $limit query parameter

# -----------------------------
# HELPERS
# -----------------------------
def build_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a parsed ``datetime`` column.

    DATE_COL and TIME_COL are stringified, stripped and parsed together as
    'mm/dd/yyyy HH:MM'. Rows that fail to parse are dropped; the input frame
    is not modified.
    """
    date_text = df[DATE_COL].astype(str).str.strip()
    time_text = df[TIME_COL].astype(str).str.strip()
    parsed = pd.to_datetime(
        date_text + " " + time_text,
        format="%m/%d/%Y %H:%M",
        errors="coerce",
    )
    stamped = df.assign(datetime=parsed)
    return stamped.dropna(subset=["datetime"])


def add_holiday_flags(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add NY/NJ holiday names, holiday flags and a weekend flag to ``df``.

    Reads the ``datetime`` column. The columns are added to the passed frame
    itself, which is also returned.
    """
    stamps = df["datetime"]
    covered_years = sorted(stamps.dt.year.unique().tolist())
    ny_calendar = holidays.US(state="NY", years=covered_years)
    nj_calendar = holidays.US(state="NJ", years=covered_years)

    # Holiday lookups are keyed by calendar date, not by timestamp.
    days = stamps.dt.date
    df["holiday_ny_name"] = [ny_calendar.get(day) for day in days]
    df["holiday_nj_name"] = [nj_calendar.get(day) for day in days]

    # A missing name means the day is not a holiday in that state.
    in_ny = df["holiday_ny_name"].notna()
    in_nj = df["holiday_nj_name"].notna()
    df["is_holiday_ny"] = in_ny
    df["is_holiday_nj"] = in_nj
    df["is_holiday_any"] = in_ny | in_nj
    # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    df["is_weekend"] = stamps.dt.dayofweek >= 5
    return df


def soda_get(url: str, params: dict) -> list:
    """
    GET ``url`` with ``params`` and return the decoded JSON body.

    The X-App-Token header is sent only when NYC_APP_TOKEN is set. Raises
    requests.HTTPError on an error status.
    """
    request_headers = {"X-App-Token": NYC_APP_TOKEN} if NYC_APP_TOKEN else {}
    response = requests.get(url, params=params, headers=request_headers, timeout=60)
    response.raise_for_status()
    return response.json()


def fetch_nyc_events(start_dt: pd.Timestamp, end_dt: pd.Timestamp) -> pd.DataFrame:
    """
    Pull NYC permitted events and keep only events overlapping your datetime window.
    Dataset reference: tvpp-9vvx.

    The window is sent to the server as a ``$where`` filter and the result is
    read page by page, so events beyond the first NYC_LIMIT rows are not
    silently lost. The overlap test is repeated locally on the parsed values.

    NOTE(review): the server-side filter assumes the dataset exposes both
    ``start_date_time`` and ``end_date_time`` as timestamp fields - confirm
    against the dataset schema.

    Args:
        start_dt: Earliest timestamp of interest.
        end_dt: Latest timestamp of interest.

    Returns:
        DataFrame of overlapping events with parsed ``event_start`` and
        ``event_end`` columns; empty when nothing matches or a bound is NaT.

    Raises:
        ValueError: if the returned rows have no start field.
        requests.HTTPError: if a request fails.
    """
    # NaT bounds (e.g. an empty input file) cannot overlap anything.
    if pd.isna(start_dt) or pd.isna(end_dt):
        return pd.DataFrame()

    url = f"https://data.cityofnewyork.us/resource/{NYC_EVENTS_DATASET}.json"

    # fields in this dataset commonly include start_date_time / end_date_time
    start_field = "start_date_time"
    end_field = "end_date_time"

    window_start = start_dt.strftime("%Y-%m-%dT%H:%M:%S")
    window_end = end_dt.strftime("%Y-%m-%dT%H:%M:%S")
    # Rows without an end time are kept here and resolved locally below, where
    # a missing end is treated as equal to the start.
    where = (
        f"{start_field} <= '{window_end}' "
        f"AND ({end_field} >= '{window_start}' OR {end_field} IS NULL)"
    )

    rows = []
    offset = 0
    while True:
        page = soda_get(
            url,
            {
                "$where": where,
                "$order": ":id",  # stable ordering so pages do not overlap
                "$limit": NYC_LIMIT,
                "$offset": offset,
            },
        )
        rows.extend(page)
        # A short (or empty) page means the result set is exhausted.
        if not page or len(page) < NYC_LIMIT:
            break
        offset += NYC_LIMIT

    ev = pd.DataFrame(rows)
    if ev.empty:
        return ev

    if start_field not in ev.columns:
        raise ValueError(f"NYC events start field not found. Columns: {list(ev.columns)}")

    ev["event_start"] = pd.to_datetime(ev[start_field], errors="coerce")
    ev["event_end"] = pd.to_datetime(ev[end_field], errors="coerce") if end_field in ev.columns else ev["event_start"]
    ev = ev.dropna(subset=["event_start"])
    ev["event_end"] = ev["event_end"].fillna(ev["event_start"])

    # overlap filter
    ev = ev[(ev["event_end"] >= start_dt) & (ev["event_start"] <= end_dt)].copy()
    return ev


def make_event_hour_index(events_df: pd.DataFrame) -> pd.DatetimeIndex:
    """
    Expand each event window into the on-the-hour timestamps it touches.

    Reads ``event_start`` and ``event_end``. The start is rounded down and the
    end rounded up to the hour; rows missing either bound are skipped. Returns
    the sorted, de-duplicated stamps. Uses 'h' (lowercase) to avoid pandas
    FutureWarning.
    """
    stamps = set()
    windows = zip(events_df["event_start"], events_df["event_end"])
    for begin, finish in windows:
        if pd.isna(begin) or pd.isna(finish):
            continue
        first_hour = pd.to_datetime(begin).floor("h")
        last_hour = pd.to_datetime(finish).ceil("h")
        stamps.update(pd.date_range(first_hour, last_hour, freq="h"))
    return pd.DatetimeIndex(sorted(stamps))


# -----------------------------
# MAIN
# -----------------------------
def integrate_holidays_and_nyc_events():
    """
    Add holiday, weekend and NYC-permitted-event flags to the dataset.

    Reads INPUT_FILE, builds the ``datetime`` column, adds the holiday and
    weekend columns, flags every row whose hour overlaps a permitted event,
    and writes OUTPUT_FILE.
    """
    df = pd.read_csv(INPUT_FILE, dtype=str)
    df = build_datetime(df)

    start_dt = df["datetime"].min()
    end_dt = df["datetime"].max()

    # 1) Holidays
    df = add_holiday_flags(df)

    # 2) NYC permitted events (hour-level flag)
    nyc_events = fetch_nyc_events(start_dt, end_dt)
    if not nyc_events.empty:
        nyc_event_hours = make_event_hour_index(nyc_events)
        # Event hours are on-the-hour stamps, so compare each row's hour
        # bucket; comparing the raw timestamp would only flag rows at :00.
        # Because event ends are rounded up, rows in the hour after an event
        # ends are flagged too.
        row_hours = df["datetime"].dt.floor("h")
        df["is_nyc_permitted_event_hour"] = row_hours.isin(nyc_event_hours)
    else:
        df["is_nyc_permitted_event_hour"] = False

    df.to_csv(OUTPUT_FILE, index=False)
    print("Saved:", OUTPUT_FILE)
    print("Rows:", len(df))
    print("Holiday(any):", int(df["is_holiday_any"].sum()))
    print("NYC event hours:", int(df["is_nyc_permitted_event_hour"].sum()))

# Entry point: add holiday and NYC event flags to the configured CSV.
if __name__ == "__main__":
    integrate_holidays_and_nyc_events()


In [None]:
import os
import re
import pandas as pd
import requests

# -----------------------------
# CONFIG
# -----------------------------
# CSV that receives the game-hour flag columns; it is read and written in place.
WEATHER_FILE = "/Users/tenzinchoedhen/Desktop/weather_data.csv"  # will be overwritten
DATE_COL = "DATE"
TIME_COL = "TIME"

# Directory for downloaded schedule files; created when this cell runs.
CACHE_DIR = "/Users/tenzinchoedhen/Desktop/schedule_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Timezone that UTC schedule times are converted to before matching.
LOCAL_TZ = "America/New_York"

# -----------------------------
# PRE + POST WINDOWS (hours)
# -----------------------------
# Traffic typically increases BEFORE (arrivals) and AFTER (exit).
# Each pair is (hours before the start time, hours after the start time).
NFL_PRE, NFL_POST = 2, 4
MLS_PRE, MLS_POST = 1, 3
MLB_PRE, MLB_POST = 1, 4
NBA_PRE, NBA_POST = 1, 3
NHL_PRE, NHL_POST = 1, 3

# -----------------------------
# SOURCES (download once)
# -----------------------------
# NFL: all games CSV (filter by stadium MetLife/Giants Stadium)
NFL_GAMES_CSV_URL = "https://raw.githubusercontent.com/nflverse/nfldata/master/data/games.csv"
NFL_GAMES_CSV_PATH = os.path.join(CACHE_DIR, "nfl_games.csv")

# MLS team calendars (ICS)
RBNY_ICS_URL = "https://mlscalendar.jeffsoftware.com/ical/new-york-red-bulls.ics"
NYCFC_ICS_URL = "https://mlscalendar.jeffsoftware.com/ical/new-york-city-fc.ics"
RBNY_ICS_PATH = os.path.join(CACHE_DIR, "mls_rbny.ics")
NYCFC_ICS_PATH = os.path.join(CACHE_DIR, "mls_nycfc.ics")

# MLB "downloadable schedule" pages (extract CSV link)
YANKEES_CSV_PAGE = "https://www.mlb.com/yankees/schedule/downloadable-schedule"
METS_CSV_PAGE = "https://www.mlb.com/mets/schedule/downloadable-schedule"
YANKEES_CSV_PATH = os.path.join(CACHE_DIR, "mlb_yankees.csv")
METS_CSV_PATH = os.path.join(CACHE_DIR, "mlb_mets.csv")

# NBA via FixtureDownload (Knicks + Nets)
NBA_TEAMS = ["new-york-knicks", "brooklyn-nets"]
# Only games whose Location contains one of these names are counted.
NBA_LOCAL_VENUES = ["Madison Square Garden", "Barclays Center"]

# NHL official team CSV feeds (Rangers/Islanders/Devils)
NHL_TEAMS = ["nyr", "nyi", "njd"]
NHL_FEED_TEMPLATE = "https://www.nhl.com/feed/nhl/ics/schedule.csv?format=csv&homeaway=full&team={team}"
# Only games whose LOCATION contains one of these names are counted.
NHL_LOCAL_VENUES = ["Madison Square Garden", "UBS Arena", "Prudential Center"]


# -----------------------------
# CORE HELPERS
# -----------------------------
def build_weather_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a parsed ``datetime`` column.

    DATE_COL and TIME_COL are stringified, stripped and parsed together as
    'mm/dd/yyyy HH:MM'. Rows that fail to parse are dropped; the input frame
    is not modified.
    """
    date_text = df[DATE_COL].astype(str).str.strip()
    time_text = df[TIME_COL].astype(str).str.strip()
    parsed = pd.to_datetime(
        date_text + " " + time_text,
        format="%m/%d/%Y %H:%M",
        errors="coerce",
    )
    stamped = df.assign(datetime=parsed)
    return stamped.dropna(subset=["datetime"])


def window_hours(start_local_naive: pd.Timestamp, pre_hours: int, post_hours: int) -> pd.DatetimeIndex:
    """
    Hourly stamps covering [start - pre_hours, start + post_hours].

    The lower bound is rounded down and the upper bound rounded up to the
    hour, so partially covered hours are included.
    """
    lead = pd.Timedelta(hours=pre_hours)
    tail = pd.Timedelta(hours=post_hours)
    first_hour = (start_local_naive - lead).floor("h")
    last_hour = (start_local_naive + tail).ceil("h")
    return pd.date_range(start=first_hour, end=last_hour, freq="h")


def utc_to_local_naive(ts_utc: pd.Timestamp) -> pd.Timestamp:
    """Convert an aware timestamp to LOCAL_TZ and drop the timezone info."""
    # schedule side conversion only (safe)
    local_aware = ts_utc.tz_convert(LOCAL_TZ)
    return local_aware.tz_localize(None)


def download_bytes(url: str) -> tuple[bytes, str]:
    """
    Fetch ``url`` (following redirects) and return (body, content type).

    The content type is lower-cased and is "" when the header is absent.
    Raises requests.HTTPError on an error status.
    """
    response = requests.get(url, timeout=60, allow_redirects=True)
    response.raise_for_status()
    body = response.content
    content_type = response.headers.get("Content-Type", "").lower()
    return body, content_type


def save_file(path: str, content: bytes) -> None:
    """
    Write ``content`` to ``path`` atomically.

    The bytes go to a sibling ``<path>.part`` file that is then renamed over
    ``path``. An interrupted write therefore never leaves a truncated file at
    ``path``, which the cache check would otherwise accept as complete.

    Args:
        path: Destination file path.
        content: Bytes to store.
    """
    partial_path = f"{path}.part"
    try:
        with open(partial_path, "wb") as f:
            f.write(content)
        os.replace(partial_path, path)
    finally:
        # Only still present if the write or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def download_if_missing(url: str, path: str) -> None:
    """Download ``url`` to ``path`` unless a non-empty file already exists there."""
    already_cached = os.path.exists(path) and os.path.getsize(path) > 0
    if not already_cached:
        payload, _content_type = download_bytes(url)
        save_file(path, payload)


# -----------------------------
# NFL (MetLife) from nflverse games.csv
# -----------------------------
def load_nfl_metlife_hours(min_dt: pd.Timestamp, max_dt: pd.Timestamp) -> set[pd.Timestamp]:
    """
    Hourly stamps around NFL games played at MetLife Stadium / Giants Stadium.

    Downloads the games CSV into the cache if needed, combines ``gameday`` and
    ``gametime`` into a start time, keeps games within two days of the
    [min_dt, max_dt] range, and expands each into its NFL_PRE/NFL_POST window.

    Args:
        min_dt: Earliest timestamp in the dataset.
        max_dt: Latest timestamp in the dataset.

    Returns:
        Set of on-the-hour timestamps covered by the selected games.

    Raises:
        ValueError: if the games file lacks a required column.
    """
    download_if_missing(NFL_GAMES_CSV_URL, NFL_GAMES_CSV_PATH)
    games = pd.read_csv(NFL_GAMES_CSV_PATH)

    # Fail with a clear message; ``games.get(col, "")`` would hand back a
    # plain string and die on ``.astype`` with an unrelated-looking error.
    missing = [c for c in ("gameday", "gametime", "stadium") if c not in games.columns]
    if missing:
        raise ValueError(f"NFL games file is missing columns: {missing}")

    games["gameday"] = pd.to_datetime(games["gameday"], errors="coerce")
    games["gametime"] = games["gametime"].astype(str).str.strip()
    games["stadium"] = games["stadium"].astype(str)

    games = games[games["gameday"].notna() & (games["gametime"].str.len() > 0)].copy()

    games["game_dt_local_naive"] = pd.to_datetime(
        games["gameday"].dt.strftime("%Y-%m-%d") + " " + games["gametime"],
        errors="coerce",
    )
    games = games.dropna(subset=["game_dt_local_naive"])

    # filter to your range (+2 days buffer because we have pre-window)
    start = min_dt.normalize() - pd.Timedelta(days=2)
    end = max_dt.normalize() + pd.Timedelta(days=2)
    games = games[(games["game_dt_local_naive"] >= start) & (games["game_dt_local_naive"] <= end)]

    metlife = games[games["stadium"].str.contains("MetLife Stadium|Giants Stadium", case=False, na=False)]

    hours = set()
    for dt in metlife["game_dt_local_naive"]:
        hours.update(window_hours(dt, NFL_PRE, NFL_POST))
    return hours


# -----------------------------
# Minimal ICS parsing (MLS)
# -----------------------------
# DTSTART must begin a content line. An unanchored search could pick up the
# text "DTSTART:" from inside another property such as SUMMARY or DESCRIPTION.
_ICS_DTSTART_RE = re.compile(r"^DTSTART(?:;[^:\r\n]*)?:([^\r\n]+)", flags=re.MULTILINE)


def parse_ics_events(text: str) -> list[dict]:
    """
    Extract the raw DTSTART value of every VEVENT in an ICS document.

    Folded lines (a line break followed by a space or tab) are joined first.
    Property parameters such as ``;TZID=...`` are skipped and not returned.

    Args:
        text: Full text of the ICS file.

    Returns:
        One ``{"dtstart": <raw value>}`` dict per event that has a DTSTART, in
        document order.
    """
    unfolded = re.sub(r"\r?\n[ \t]", "", text)
    out = []
    # The piece before the first BEGIN:VEVENT is calendar preamble, not an event.
    for part in unfolded.split("BEGIN:VEVENT")[1:]:
        if "END:VEVENT" not in part:
            continue
        block = part.split("END:VEVENT")[0]

        match = _ICS_DTSTART_RE.search(block)
        if match:
            out.append({"dtstart": match.group(1).strip()})
    return out


def ics_dt_to_local_naive(dt_raw: str) -> pd.Timestamp | None:
    dt_raw = dt_raw.strip()

    # UTC format: 20260221T193000Z
    if dt_raw.endswith("Z"):
        ts = pd.to_datetime(dt_raw, format="%Y%m%dT%H%M%SZ", utc=True, errors="coerce")
        if pd.isna(ts):
            return None
        return utc_to_local_naive(ts)

    # floating local: 20260221T193000 or 20260221T1930
    ts = pd.to_datetime(dt_raw, format="%Y%m%dT%H%M%S", errors="coerce")
    if pd.isna(ts):
        ts = pd.to_datetime(dt_raw, format="%Y%m%dT%H%M", errors="coerce")
    return None if pd.isna(ts) else ts


def load_mls_hours(ics_url: str, ics_path: str, min_dt: pd.Timestamp, max_dt: pd.Timestamp, pre_h: int, post_h: int) -> set[pd.Timestamp]:
    """
    Hourly stamps around the events of one MLS team calendar.

    The ICS file is downloaded to ``ics_path`` if not cached. Every event
    start within two days of [min_dt, max_dt] is expanded into its pre/post
    window.

    NOTE(review): no venue filter is applied here, so every event in the
    calendar is counted - confirm whether away matches should be excluded.
    """
    download_if_missing(ics_url, ics_path)
    with open(ics_path, "r", encoding="utf-8", errors="ignore") as handle:
        calendar_text = handle.read()

    two_days = pd.Timedelta(days=2)
    window_start = min_dt.normalize() - two_days
    window_end = max_dt.normalize() + two_days

    kickoffs = (
        ics_dt_to_local_naive(event["dtstart"])
        for event in parse_ics_events(calendar_text)
    )

    hours = set()
    for kickoff in kickoffs:
        if kickoff is not None and window_start <= kickoff <= window_end:
            hours.update(window_hours(kickoff, pre_h, post_h))
    return hours


# -----------------------------
# MLB download page -> CSV -> parse
# -----------------------------
def download_mlb_csv_from_page(page_url: str, out_csv_path: str) -> bool:
    """
    Save the schedule CSV reachable from ``page_url`` to ``out_csv_path``.

    If the response already is a CSV it is saved directly; otherwise the first
    ``href`` containing ".csv" is followed. Best effort: returns False when no
    link is found or when anything raises, True after a successful save.
    """
    try:
        payload, content_type = download_bytes(page_url)

        # The response itself may be the CSV: by header or by its first bytes.
        preview = payload[:80].decode("utf-8", errors="ignore").lower()
        is_csv = "text/csv" in content_type or preview.startswith("date")
        if is_csv:
            save_file(out_csv_path, payload)
            return True

        page_html = payload.decode("utf-8", errors="ignore")
        link = re.search(r'href="([^"]+\.csv[^"]*)"', page_html, flags=re.IGNORECASE)
        if link is None:
            return False

        csv_url = link.group(1)
        # Site-relative links need the host prepended.
        if csv_url.startswith("/"):
            csv_url = "https://www.mlb.com" + csv_url

        save_file(out_csv_path, download_bytes(csv_url)[0])
        return True
    except Exception:
        return False


def load_mlb_hours(csv_path: str, min_dt: pd.Timestamp, max_dt: pd.Timestamp, pre_h: int, post_h: int) -> set[pd.Timestamp]:
    """
    Hourly stamps around the games listed in one MLB schedule CSV.

    Column names are matched case-insensitively. Besides the original
    spellings, "START DATE"/"START_DATE" are accepted for the date column.

    Args:
        csv_path: Path of the schedule CSV.
        min_dt: Earliest timestamp in the dataset.
        max_dt: Latest timestamp in the dataset.
        pre_h: Hours before the start time to include.
        post_h: Hours after the start time to include.

    Returns:
        Set of on-the-hour timestamps; empty when no date or time column is
        recognised.

    NOTE(review): no venue filter is applied, so every listed game is counted
    - confirm whether away games should be excluded.
    """
    sch = pd.read_csv(csv_path)

    by_lower = {str(c).strip().lower(): c for c in sch.columns}

    def first_present(candidates):
        # Earlier names win, so files that matched before resolve identically.
        return next((by_lower[name] for name in candidates if name in by_lower), None)

    date_col = first_present(("date", "game date", "gamedate", "start date", "start_date"))
    time_col = first_present(("time", "start time", "start_time"))
    if not date_col or not time_col:
        return set()

    sch["_date"] = pd.to_datetime(sch[date_col], errors="coerce")
    sch["_time"] = sch[time_col].astype(str).str.strip()
    sch["_dt"] = pd.to_datetime(sch["_date"].dt.strftime("%Y-%m-%d") + " " + sch["_time"], errors="coerce")
    sch = sch.dropna(subset=["_dt"])

    # Two-day buffer so windows of games just outside the range still count.
    start = min_dt.normalize() - pd.Timedelta(days=2)
    end = max_dt.normalize() + pd.Timedelta(days=2)
    sch = sch[(sch["_dt"] >= start) & (sch["_dt"] <= end)]

    hours = set()
    for dt in sch["_dt"]:
        hours.update(window_hours(dt, pre_h, post_h))
    return hours


# -----------------------------
# NBA (Knicks/Nets) from FixtureDownload
# -----------------------------
def resolve_fixturedownload_csv(team_slug: str, season_slug: str) -> str | None:
    """
    Download one team's season CSV from FixtureDownload into the cache.

    If the download URL returns HTML instead of a CSV, the first direct
    ``.csv`` link found in the page is fetched. Best effort: returns the
    cached file path, or None when no link is found or anything raises.
    """
    landing_url = f"https://fixturedownload.com/download/csv/{season_slug}/{team_slug}"
    try:
        payload, content_type = download_bytes(landing_url)

        if "text/csv" not in content_type:
            # Not a CSV yet: look for the direct file link inside the page.
            page = payload.decode("utf-8", errors="ignore")
            link = re.search(r"https://fixturedownload\.com/download/[^\s\"']+\.csv", page, flags=re.IGNORECASE)
            if link is None:
                return None
            payload, _ = download_bytes(link.group(0))

        cache_path = os.path.join(CACHE_DIR, f"nba_{season_slug}_{team_slug}.csv")
        save_file(cache_path, payload)
        return cache_path
    except Exception:
        return None


def nba_season_slugs_from_range(min_dt: pd.Timestamp, max_dt: pd.Timestamp) -> list[str]:
    """Return "nba-<year>" slugs from the year before ``min_dt`` through the year of ``max_dt``."""
    season_years = range(min_dt.year - 1, max_dt.year + 1)
    return [f"nba-{year}" for year in season_years]


def load_nba_hours(min_dt: pd.Timestamp, max_dt: pd.Timestamp, pre_h: int, post_h: int) -> set[pd.Timestamp]:
    """
    Hourly stamps around NBA games played at the configured local venues.

    For every season slug derived from the range and every team in NBA_TEAMS,
    the schedule CSV is fetched. Files that cannot be fetched or read, or that
    lack the ``Date``/``Location`` columns, are skipped. Games are kept when
    ``Location`` contains one of NBA_LOCAL_VENUES and the local start falls
    within two days of [min_dt, max_dt].
    """
    two_days = pd.Timedelta(days=2)
    window_start = min_dt.normalize() - two_days
    window_end = max_dt.normalize() + two_days

    hours = set()
    for season_slug in nba_season_slugs_from_range(min_dt, max_dt):
        for team_slug in NBA_TEAMS:
            csv_path = resolve_fixturedownload_csv(team_slug, season_slug)
            if not csv_path:
                continue

            try:
                schedule = pd.read_csv(csv_path)
            except Exception:
                continue

            if "Date" not in schedule.columns or "Location" not in schedule.columns:
                continue

            # only NY/NJ venues
            venue_text = schedule["Location"].astype(str)
            is_local = False
            for venue in NBA_LOCAL_VENUES:
                is_local = is_local | venue_text.str.contains(venue, case=False, na=False)
            local_games = schedule[is_local]

            # FixtureDownload commonly uses UTC in "DD/MM/YYYY HH:MM"
            tipoff_utc = pd.to_datetime(
                local_games["Date"], format="%d/%m/%Y %H:%M", errors="coerce", utc=True
            ).dropna()
            tipoff_local = tipoff_utc.dt.tz_convert(LOCAL_TZ).dt.tz_localize(None)

            in_window = tipoff_local[(tipoff_local >= window_start) & (tipoff_local <= window_end)]
            for tipoff in in_window:
                hours.update(window_hours(tipoff, pre_h, post_h))

    return hours


# -----------------------------
# NHL from official team CSV feed
# -----------------------------
def load_nhl_hours(min_dt: pd.Timestamp, max_dt: pd.Timestamp, pre_h: int, post_h: int) -> set[pd.Timestamp]:
    """
    Hourly stamps around NHL games played at the configured local venues.

    For each team in NHL_TEAMS the CSV feed is downloaded to the cache if
    missing. Feeds that cannot be fetched or read, or that lack START_DATE,
    START_TIME or LOCATION, are skipped. Games are kept when LOCATION contains
    one of NHL_LOCAL_VENUES and the start falls within two days of
    [min_dt, max_dt].
    """
    two_days = pd.Timedelta(days=2)
    window_start = min_dt.normalize() - two_days
    window_end = max_dt.normalize() + two_days
    required_columns = {"START_DATE", "START_TIME", "LOCATION"}

    hours = set()

    for team in NHL_TEAMS:
        feed_url = NHL_FEED_TEMPLATE.format(team=team)
        cache_path = os.path.join(CACHE_DIR, f"nhl_{team}.csv")
        try:
            download_if_missing(feed_url, cache_path)
            schedule = pd.read_csv(cache_path)
        except Exception:
            continue

        if not required_columns <= set(schedule.columns):
            continue

        venue_text = schedule["LOCATION"].astype(str)
        is_local = False
        for venue in NHL_LOCAL_VENUES:
            is_local = is_local | venue_text.str.contains(venue, case=False, na=False)
        local_games = schedule[is_local]

        date_text = local_games["START_DATE"].astype(str).str.strip()
        time_text = local_games["START_TIME"].astype(str).str.strip()
        puck_drop = pd.to_datetime(
            date_text + " " + time_text,
            format="%m/%d/%Y %I:%M %p",
            errors="coerce",
        ).dropna()

        in_window = puck_drop[(puck_drop >= window_start) & (puck_drop <= window_end)]
        for start_time in in_window:
            hours.update(window_hours(start_time, pre_h, post_h))

    return hours


# -----------------------------
# MAIN: combine everything + write flags
# -----------------------------
def add_all_nynj_game_flags_pre_post():
    """
    Add per-league and combined game-hour flags to WEATHER_FILE.

    Reads the file, builds the ``datetime`` column, loads the hour set of each
    league (NFL, MLS, MLB, NBA, NHL) and flags every row whose hour is in that
    set. A league whose loader raises gets an all-False column and a FAILED
    entry in the printed report. The file is overwritten in place.
    """
    df = pd.read_csv(WEATHER_FILE, dtype=str)
    df = build_weather_datetime(df)

    min_dt = df["datetime"].min()
    max_dt = df["datetime"].max()

    # The hour sets hold on-the-hour stamps, so match each row by the hour it
    # falls in; matching the raw timestamp would only flag rows at :00.
    # Window ends are rounded up, so the hour after a window is flagged too.
    row_hours = df["datetime"].dt.floor("h")

    def load_mls():
        hours = set()
        hours |= load_mls_hours(RBNY_ICS_URL, RBNY_ICS_PATH, min_dt, max_dt, MLS_PRE, MLS_POST)
        hours |= load_mls_hours(NYCFC_ICS_URL, NYCFC_ICS_PATH, min_dt, max_dt, MLS_PRE, MLS_POST)
        return hours

    def load_mlb():
        hours = set()
        # The download helper reports failure via its return value, so a team
        # whose schedule could not be fetched is simply left out.
        yank_ok = download_mlb_csv_from_page(YANKEES_CSV_PAGE, YANKEES_CSV_PATH)
        mets_ok = download_mlb_csv_from_page(METS_CSV_PAGE, METS_CSV_PATH)
        if yank_ok:
            hours |= load_mlb_hours(YANKEES_CSV_PATH, min_dt, max_dt, MLB_PRE, MLB_POST)
        if mets_ok:
            hours |= load_mlb_hours(METS_CSV_PATH, min_dt, max_dt, MLB_PRE, MLB_POST)
        return hours

    # (report label, output column, loader) in the order the columns are added.
    leagues = [
        ("NFL", "is_nfl_game_hour", lambda: load_nfl_metlife_hours(min_dt, max_dt)),
        ("MLS", "is_mls_game_hour", load_mls),
        ("MLB", "is_mlb_game_hour", load_mlb),
        ("NBA", "is_nba_game_hour", lambda: load_nba_hours(min_dt, max_dt, NBA_PRE, NBA_POST)),
        ("NHL", "is_nhl_game_hour", lambda: load_nhl_hours(min_dt, max_dt, NHL_PRE, NHL_POST)),
    ]

    report = {}
    for label, column, loader in leagues:
        # Best effort per league: one broken source must not lose the others.
        try:
            league_hours = loader()
            df[column] = row_hours.isin(league_hours)
            report[label] = len(league_hours)
        except Exception as e:
            df[column] = False
            report[label] = f"FAILED ({e})"

    # Any game
    flag_columns = [column for _, column, _ in leagues]
    df["is_any_nynj_game_hour"] = df[flag_columns].any(axis=1)

    df.to_csv(WEATHER_FILE, index=False)

    print("Saved:", WEATHER_FILE)
    print("Cache dir:", CACHE_DIR)
    print("\n=== Build report (hour-set sizes) ===")
    for k, v in report.items():
        print(f"{k}: {v}")

    print("\n=== Flag counts in your dataset ===")
    for col in flag_columns + ["is_any_nynj_game_hour"]:
        print(col, "=", int(df[col].sum()))

# Entry point: add all game-hour flags to the configured CSV.
if __name__ == "__main__":
    add_all_nynj_game_flags_pre_post()
