In [None]:
import sys
from pathlib import Path

# Locate the project root relative to the current working directory.
# NOTE(review): this assumes the kernel was started inside notebooks/ — confirm.
PROJECT_ROOT = Path.cwd().parent  # notebooks/ is one level down
SRC = PROJECT_ROOT / "src"

# Guard the insert so re-running this cell does not stack duplicate entries
# on sys.path.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

print("Project root:", PROJECT_ROOT)
print("Src path added:", SRC)

# Download the 24-hour dump: set the date in the next cell, then run it.

In [None]:
from pathlib import Path

# Day to fetch, written as YYYY-MM-DD; edit this to pick any other day.
DAY = "2025-06-25"

# Output directory under the project root; created if it does not exist yet.
out_dir = PROJECT_ROOT.joinpath("data", "processed")
out_dir.mkdir(parents=True, exist_ok=True)

# Target file for the dump of the selected day.
OUT_PATH = out_dir.joinpath(f"dump24h_europe_{DAY}.csv.gz")

print("Will write:", OUT_PATH)

In [None]:
from contrail_mvp.ingest_opensky import dump24h_to_csv, DEFAULT_DUMP_PREFIXES

# Gather the call options in one mapping so they are easy to inspect or tweak.
# NOTE(review): the meaning of each option is defined by
# contrail_mvp.ingest_opensky.dump24h_to_csv — confirm there.
dump_kwargs = {
    "out_csv": str(OUT_PATH),
    "day": DAY,
    "prefixes": DEFAULT_DUMP_PREFIXES,
    "europe_only": True,
    "gzip_out": True,
}
dump24h_to_csv(**dump_kwargs)

print("Done:", OUT_PATH)


In [None]:
import io, gzip, time
from datetime import datetime, timezone
import requests
import pandas as pd
from tqdm import tqdm

# Base URL under which the hourly state files are looked up
# (same place as referenced in the README).
BASE = "https://s3.opensky-network.org/data-samples/states/"

# Latitude/longitude bounds applied when filtering rows to Europe.
EUROPE_BBOX = {
    "lamin": 34.0,
    "lomin": -25.0,
    "lamax": 72.0,
    "lomax": 45.0,
}

def _exists(url: str, sess: requests.Session) -> bool:
    """Report whether ``url`` can be fetched (answers with HTTP 200).

    A HEAD request is tried first: status 200 means the file exists, 403 or
    404 means it does not. Any other status, or a HEAD request that fails,
    falls back to a streamed GET whose body is never read.

    Args:
        url: Full URL to probe.
        sess: Session used for both requests.

    Returns:
        True if HEAD or the fallback GET returned status 200, False otherwise
        (including when the fallback GET fails with a requests error).
    """
    try:
        head = sess.head(url, allow_redirects=True, timeout=30)
    except requests.RequestException:
        head = None  # HEAD failed; fall through to the GET probe

    if head is not None:
        if head.status_code == 200:
            return True
        if head.status_code in (403, 404):
            return False

    try:
        # stream=True avoids downloading the body; the context manager closes
        # the response so the connection goes back to the pool instead of
        # staying checked out.
        with sess.get(url, stream=True, timeout=30) as resp:
            return resp.status_code == 200
    except requests.RequestException:
        return False

def _read_csv_bytes(content: bytes) -> pd.DataFrame:
    # gzip magic bytes check
    if len(content) >= 2 and content[0] == 0x1F and content[1] == 0x8B:
        with gzip.GzipFile(fileobj=io.BytesIO(content), mode="rb") as gz:
            return pd.read_csv(gz)
    return pd.read_csv(io.BytesIO(content))

def _discover_hour_url(day: str, hour: int, sess: requests.Session) -> str:
    """
    Return the first candidate URL for the given day and hour that exists.

    day:  'YYYY-MM-DD'
    hour: 0..23

    Candidates are probed in a fixed order: every ``.csv.gz`` layout first,
    then the same layouts with an uncompressed ``.csv`` suffix.
    Raises FileNotFoundError when none of them exists.
    """
    hh = f"{hour:02d}"
    compact_day = day.replace("-", "")

    # Several likely layouts are tried because OpenSky's public dumps vary by dataset.
    relative_names = (
        # Flat naming
        f"states_{day}-{hh}",
        f"states_{compact_day}-{hh}",
        f"{day}-{hh}",
        f"{compact_day}-{hh}",
        f"{day}_{hh}",
        f"{compact_day}_{hh}",
        # Day folder naming
        f"{day}/states_{day}-{hh}",
        f"{compact_day}/states_{compact_day}-{hh}",
        f"{day}/{hh}",
        f"{compact_day}/{hh}",
        # Some datasets store by format folder
        f"csv/states_{day}-{hh}",
        f"csv/states_{compact_day}-{hh}",
        f"csv/{day}/states_{day}-{hh}",
        f"csv/{compact_day}/states_{compact_day}-{hh}",
    )
    gz_urls = [f"{BASE}{name}.csv.gz" for name in relative_names]
    # Uncompressed variants are derived from the compressed URLs.
    csv_urls = [gz_url.replace(".csv.gz", ".csv") for gz_url in gz_urls]

    found = next(
        (candidate for candidate in gz_urls + csv_urls if _exists(candidate, sess)),
        None,
    )
    if found is None:
        raise FileNotFoundError(
            f"Could not find hour {hh} for day {day} under {BASE}. "
            f"Try a different date (these samples are produced periodically)."
        )
    return found

def download_24h_states_day(day: str, out_csv_gz: str, europe_only: bool = True) -> str:
    """
    Download the 24 hourly state-vector files of one day and merge them.

    Each hour's file is located with ``_discover_hour_url``, downloaded,
    parsed, optionally filtered to ``EUROPE_BBOX`` and appended to a single
    gzip-compressed CSV. The header row is written once, taken from the
    first hour's columns.

    Args:
        day: Day to download, 'YYYY-MM-DD'.
        out_csv_gz: Path of the CSV.GZ file to write (UTF-8 text).
        europe_only: When True, keep only rows whose latitude/longitude fall
            inside ``EUROPE_BBOX`` (bounds inclusive; rows with a missing
            coordinate are dropped). If an hourly file has neither
            ``lat``/``lon`` nor ``latitude``/``longitude`` columns, its rows
            are written unfiltered and a warning is emitted.

    Returns:
        ``out_csv_gz``, unchanged.

    Raises:
        FileNotFoundError: No file could be found for one of the hours.
        requests.HTTPError: A download answered with an error status.
    """
    import warnings  # local import: only needed for the unfiltered-data notice

    rows_written = 0
    wrote_header = False

    # The session is closed on exit so pooled connections are released, and the
    # output encoding is pinned so the file does not depend on the platform default.
    with requests.Session() as sess, gzip.open(
        out_csv_gz, "wt", newline="", encoding="utf-8"
    ) as f_out:
        for hour in tqdm(range(24), desc=f"Downloading {day} (24h)"):
            url = _discover_hour_url(day, hour, sess)

            r = sess.get(url, timeout=120)
            r.raise_for_status()
            df = _read_csv_bytes(r.content)

            if europe_only:
                # Accept either the short or the long coordinate column names.
                lat_col = next((c for c in ("lat", "latitude") if c in df.columns), None)
                lon_col = next((c for c in ("lon", "longitude") if c in df.columns), None)
                if lat_col and lon_col:
                    # between() is inclusive and False for NaN, so rows with a
                    # missing coordinate are dropped as well.
                    inside = (
                        df[lat_col].between(EUROPE_BBOX["lamin"], EUROPE_BBOX["lamax"])
                        & df[lon_col].between(EUROPE_BBOX["lomin"], EUROPE_BBOX["lomax"])
                    )
                    df = df[inside]
                else:
                    # Make the silent fallback visible: the output would
                    # otherwise contain unfiltered rows without any notice.
                    warnings.warn(
                        f"No lat/lon columns found in {url}; "
                        "rows from this file are written unfiltered.",
                        stacklevel=2,
                    )

            df.to_csv(f_out, index=False, header=not wrote_header)
            wrote_header = True
            rows_written += len(df)

            # tiny pause to be polite
            time.sleep(0.2)

    print("Saved:", out_csv_gz)
    print("Rows written:", rows_written)
    return out_csv_gz


In [None]:

# Name the output after the current UTC time.
run_stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%SZ")
out_path = f"{PROJECT_ROOT}/data/processed/opensky_states_24h_europe_{run_stamp}.csv.gz"
download_24h_states_day(day="2016-12-03", out_csv_gz=out_path, europe_only=True)


In [1]:
from pyopensky.config import opensky_config_dir
# Bare last expression: the notebook displays the value (a WindowsPath in the
# recorded output).
# NOTE(review): presumably the directory pyopensky reads its settings from —
# confirm against the pyopensky documentation.
opensky_config_dir


WindowsPath('C:/Users/HiWi/AppData/Local/pyopensky/pyopensky')