# Fetch the data from trino db OPENSKY

### Connection Code

In [8]:
from trino.dbapi import connect
from trino.auth import OAuth2Authentication, ConsoleRedirectHandler

conn = connect(
    host="trino.opensky-network.org",
    port=443,
    http_scheme="https",
    user="terriljoel98",  # must be lowercase :contentReference[oaicite:3]{index=3}
    auth=OAuth2Authentication(
        redirect_auth_url_handler=ConsoleRedirectHandler()
    ),
    catalog="minio",
    schema="osky",
)


### Test Code

In [17]:
import pandas as pd
query ="""DESCRIBE minio.osky.flights_data4

"""
# query = "SHOW TABLES FROM minio.osky"
df_tables = pd.read_sql(query, conn)
df_tables.head(20)


  df_tables = pd.read_sql(query, conn)


Unnamed: 0,Column,Type,Extra,Comment
0,icao24,varchar,,
1,firstseen,integer,,
2,estdepartureairport,varchar,,
3,lastseen,integer,,
4,estarrivalairport,varchar,,
5,callsign,varchar,,
6,track,"array(row(time integer, latitude double, longi...",,
7,estdepartureairporthorizdistance,integer,,
8,estdepartureairportvertdistance,integer,,
9,estarrivalairporthorizdistance,integer,,


In [None]:
# Table
# 0	acas_data4
# 1	adsc
# 2	allcall_replies_data4
# 3	flarm_raw
# 4	flights_data4
# 5	flights_data5
# 6	identification_data4
# 7	operational_status_data4
# 8	position_data4
# 9	rollcall_replies_data4
# 10	state_vectors_data4
# 11	velocity_data4

### Path settings

In [3]:
from pathlib import Path

def find_project_root(start: Path) -> Path:
    p = start.resolve()
    for parent in [p] + list(p.parents):
        if (parent / "data").exists():
            return parent
    return p

ROOT = find_project_root(Path.cwd())
RAW_DIR = ROOT / "data" / "raw_opensky"
RAW_DIR.mkdir(parents=True, exist_ok=True)

ROOT, RAW_DIR


(WindowsPath('C:/Users/HiWi/Desktop/Terril/01_nextcloud/Germany/DATA SCIENCE/Semesters/05/02 Sustainability in aviation/03 contrail-mvp'),
 WindowsPath('C:/Users/HiWi/Desktop/Terril/01_nextcloud/Germany/DATA SCIENCE/Semesters/05/02 Sustainability in aviation/03 contrail-mvp/data/raw_opensky'))

###  Configuration (region, altitude band and Time window)

In [4]:
import pandas as pd

# Germany bounding box (tweak if needed)
# BBOX = dict(lat_min=47.0, lat_max=55.1, lon_min=5.5, lon_max=15.5)

BBOX = dict(lat_min=35.0, lat_max=72.0, lon_min=-15.0, lon_max=35.0)
PREFIX = "states_europe_winter"

# PREFIX = "states_germany_winter"  # change name if you like


# Cruise altitude band in meters (set to None to disable)
ALT_MIN_M = 8000
ALT_MAX_M = 13000

# Time window in UTC (end is exclusive)
START_UTC = "2025-01-13"
END_UTC   = "2025-01-14"

# Safety: start with a smoke test LIMIT before downloading full data
SMOKE_TEST = False
SMOKE_LIMIT = 5000  # rows per hour during smoke test


### Hour Partition to adhere to access guidelines from Trino

In [5]:
from tqdm.auto import tqdm

def hour_partitions(start_utc: str, end_utc: str):
    start = pd.Timestamp(start_utc, tz="UTC")
    end = pd.Timestamp(end_utc, tz="UTC")
    hours = pd.date_range(start.floor("H"), end.floor("H"), freq="H", inclusive="left", tz="UTC")
    return [(h, int(h.timestamp())) for h in hours]

def fetch_states_one_hour(hour_unix: int, bbox=BBOX, alt_min=ALT_MIN_M, alt_max=ALT_MAX_M,
                          smoke_test=SMOKE_TEST, smoke_limit=SMOKE_LIMIT) -> pd.DataFrame:
    where = f"""
      hour = {hour_unix}
      AND onground = false
      AND time - lastcontact <= 15
      AND lat BETWEEN {bbox['lat_min']} AND {bbox['lat_max']}
      AND lon BETWEEN {bbox['lon_min']} AND {bbox['lon_max']}
    """
    if alt_min is not None and alt_max is not None:
        where += f"\n  AND baroaltitude BETWEEN {alt_min} AND {alt_max}"

    limit = f"\nLIMIT {int(smoke_limit)}" if smoke_test else ""

    q = f"""
    SELECT
      hour,
      time, lastcontact,
      icao24, callsign,
      lat, lon,
      baroaltitude, geoaltitude,
      velocity, heading, vertrate
    FROM minio.osky.state_vectors_data4
    WHERE {where}
    {limit}
    """
    return pd.read_sql(q, conn)

HOURS = hour_partitions(START_UTC, END_UTC)
print("Hours:", len(HOURS), "| First:", HOURS[0][0], "| Last:", HOURS[-1][0])


Hours: 24 | First: 2025-01-13 00:00:00+00:00 | Last: 2025-01-13 23:00:00+00:00


  from .autonotebook import tqdm as notebook_tqdm
  hours = pd.date_range(start.floor("H"), end.floor("H"), freq="H", inclusive="left", tz="UTC")


In [6]:
import time


def out_path_for_hour(prefix: str, hour_dt: pd.Timestamp):
    return RAW_DIR / f"{prefix}_{hour_dt.strftime('%Y%m%d_%H%M%SZ')}.parquet"

max_retries = 2
base_sleep_s = 3

written = 0
total_rows = 0

for hour_dt, hour_unix in tqdm(HOURS):
    out_path = out_path_for_hour(PREFIX, hour_dt)
    if out_path.exists():
        continue

    for attempt in range(max_retries + 1):
        try:
            df = fetch_states_one_hour(hour_unix)
            df.to_parquet(out_path, index=False)
            written += 1
            total_rows += len(df)
            break
        except Exception as e:
            if attempt == max_retries:
                print(f"[FAILED] {hour_dt} hour={hour_unix} -> {e}")
            else:
                time.sleep(base_sleep_s * (attempt + 1))

print(f"Done. Files written: {written}, total rows: {total_rows:,}")
print("Output folder:", RAW_DIR)


  0%|          | 0/24 [00:00<?, ?it/s]

  return pd.read_sql(q, conn)


Open the following URL in browser for the external authentication:
https://trino.opensky-network.org/oauth2/token/initiate/5865d810b49b6fb4f97b1d002b9fb67a447d6f6bb845b182683e28eb8aee4611


 67%|██████▋   | 16/24 [1:01:29<43:01, 322.71s/it]

Open the following URL in browser for the external authentication:
https://trino.opensky-network.org/oauth2/token/initiate/110581aefdac237ba4d817aa41610cc5f3f8ea503672178115f188520cdef876
Open the following URL in browser for the external authentication:
https://trino.opensky-network.org/oauth2/token/initiate/1319b6bf45757072b4fd67111df57fe43097c890ecd0f699d65f48517b86f65a


100%|██████████| 24/24 [1:25:59<00:00, 214.97s/it]

Done. Files written: 24, total rows: 79,261,352
Output folder: C:\Users\HiWi\Desktop\Terril\01_nextcloud\Germany\DATA SCIENCE\Semesters\05\02 Sustainability in aviation\03 contrail-mvp\data\raw_opensky





In [7]:
files = sorted(RAW_DIR.glob(f"{PREFIX}_*.parquet"))
print("Files:", len(files))
files[:3]


Files: 24


[WindowsPath('C:/Users/HiWi/Desktop/Terril/01_nextcloud/Germany/DATA SCIENCE/Semesters/05/02 Sustainability in aviation/03 contrail-mvp/data/raw_opensky/states_europe_winter_20250113_000000Z.parquet'),
 WindowsPath('C:/Users/HiWi/Desktop/Terril/01_nextcloud/Germany/DATA SCIENCE/Semesters/05/02 Sustainability in aviation/03 contrail-mvp/data/raw_opensky/states_europe_winter_20250113_010000Z.parquet'),
 WindowsPath('C:/Users/HiWi/Desktop/Terril/01_nextcloud/Germany/DATA SCIENCE/Semesters/05/02 Sustainability in aviation/03 contrail-mvp/data/raw_opensky/states_europe_winter_20250113_020000Z.parquet')]

## Verify the required data

### Point to the downloaded folder

In [8]:
import pyarrow.parquet as pq
files = sorted(RAW_DIR.glob(f"{PREFIX}_*.parquet"))
len(files), files[:3]

(24,
 [WindowsPath('C:/Users/HiWi/Desktop/Terril/01_nextcloud/Germany/DATA SCIENCE/Semesters/05/02 Sustainability in aviation/03 contrail-mvp/data/raw_opensky/states_europe_winter_20250113_000000Z.parquet'),
  WindowsPath('C:/Users/HiWi/Desktop/Terril/01_nextcloud/Germany/DATA SCIENCE/Semesters/05/02 Sustainability in aviation/03 contrail-mvp/data/raw_opensky/states_europe_winter_20250113_010000Z.parquet'),
  WindowsPath('C:/Users/HiWi/Desktop/Terril/01_nextcloud/Germany/DATA SCIENCE/Semesters/05/02 Sustainability in aviation/03 contrail-mvp/data/raw_opensky/states_europe_winter_20250113_020000Z.parquet')])

In [9]:
total_rows = 0
min_time, max_time = None, None
lat_min, lat_max = None, None
lon_min, lon_max = None, None
pairs = set()

for f in files:
    pf = pq.ParquetFile(f)
    total_rows += pf.metadata.num_rows
    
    # Read only what we need (small)
    t = pf.read(columns=["time"]).column("time").to_numpy()
    if len(t):
        t0, t1 = int(t.min()), int(t.max())
        min_time = t0 if min_time is None else min(min_time, t0)
        max_time = t1 if max_time is None else max(max_time, t1)

    ll = pf.read(columns=["lat","lon"]).to_pandas()
    lat_min = ll["lat"].min() if lat_min is None else min(lat_min, ll["lat"].min())
    lat_max = ll["lat"].max() if lat_max is None else max(lat_max, ll["lat"].max())
    lon_min = ll["lon"].min() if lon_min is None else min(lon_min, ll["lon"].min())
    lon_max = ll["lon"].max() if lon_max is None else max(lon_max, ll["lon"].max())

    ids = pf.read(columns=["icao24","callsign"]).to_pandas().dropna()
    pairs.update(map(tuple, ids[["icao24","callsign"]].drop_duplicates().to_numpy()))

print("total_points(rows):", f"{total_rows:,}")
print("time_range_utc:", pd.to_datetime(min_time, unit="s", utc=True), "->", pd.to_datetime(max_time, unit="s", utc=True))
print("lat_range:", (lat_min, lat_max))
print("lon_range:", (lon_min, lon_max))
print("unique_(icao24,callsign):", f"{len(pairs):,}")

# quick "top 2%" sanity:
n_proxy = len(pairs)
print("top_2%_count (proxy):", max(1, int(round(n_proxy * 0.02))))

total_points(rows): 79,261,352
time_range_utc: 2025-01-13 00:00:00+00:00 -> 2025-01-13 23:59:59+00:00
lat_range: (np.float64(35.00001525878906), np.float64(71.39628264863612))
lon_range: (np.float64(-14.999892290900732), np.float64(34.99998092651367))
unique_(icao24,callsign): 19,421
top_2%_count (proxy): 388


In [2]:
import pandas as pd
df = pd.read_parquet("C:\Users\HiWi\Desktop\Terril\01_nextcloud\Germany\DATA SCIENCE\Semesters\05\02 Sustainability in aviation\03 contrail-mvp\data\raw\flight_data\states_europe\2025-01-13_2025-01-14\states_europe_winter_20250113_230000Z.parquet")

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (3352900793.py, line 2)

In [7]:
from pathlib import Path
ROOT = Path.cwd().parent
import pandas as pd 
pd.read_parquet(ROOT / "data" / "raw" / "flight_data" / "states_europe" / "2025-01-13_2025-01-14" / "states_europe_winter_20250113_230000Z.parquet").head()


Unnamed: 0,hour,time,lastcontact,icao24,callsign,lat,lon,baroaltitude,geoaltitude,velocity,heading,vertrate
0,1736809200,1736809201,1736809000.0,800739,AIC148,47.202061,17.799401,11269.98,11353.8,213.930781,110.701179,-0.32512
1,1736809200,1736809202,1736809000.0,800739,AIC148,47.199873,17.807782,11269.98,11353.8,213.930781,110.701179,-0.32512
2,1736809200,1736809203,1736809000.0,800739,AIC148,47.1995,17.809401,11269.98,11353.8,213.930781,110.701179,-0.32512
3,1736809200,1736809204,1736809000.0,800739,AIC148,47.198593,17.81282,11277.6,11361.42,213.930781,110.701179,-0.32512
4,1736809200,1736809205,1736809000.0,800739,AIC148,47.197769,17.815979,11277.6,11361.42,213.930781,110.701179,0.0
