In [16]:
# import pandas as pd
# import glob

# # Define the folder path
# computer_villa = 'C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General'
# file_paths = glob.glob(computer_villa + '/Salt_Lake/delivery/Salt_Lake-Mar-2020/*.snappy.parquet')
# df_list = [pd.read_parquet(file, engine='pyarrow') for file in file_paths]
# # Combine all loaded files into a single DataFrame
# combined_df = pd.concat(df_list, ignore_index=True)

workflow
1. raw data
2. linked trip aggregation, sort by local_datetime_start
3. select OD (airport to center city) [census tract level]
4. linked trip filter (the first trip O in the airport, the last trip D in the center city)
5. attach geometry
6. export csv (select linked trip with multi-modes or other filter method)
7. export json

In [17]:
# -------------------------------------------------------------------
# Notebook configuration: input locations, OD tract pair, outputs
# -------------------------------------------------------------------

# Root folder; the parquet deliveries, the tract shapefile and the
# network link tables are all resolved relative to it.
BASE_DIR = "C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General"
PARQUET_DIR = BASE_DIR + "/Salt_Lake/delivery"
TRACT_SHP = (
    BASE_DIR
    + "/Manuscript/Figure/Visualization-RL/2-OD patterns by census track/six_counties_track.shp"
)

# Origin / destination tract identifiers (example pair: airport -> central city).
# Alternative origin noted by the author: 49057201900 (weber).
ORIG_TRACT = "49035980000"
DEST_TRACT = "49035114000"

# Months to load; extend the list to process more of the year.
MONTHS = [
    "Jan",
    # "Feb", "Mar", "Apr", "May", "Jun",
    # "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]

# Output file names (written relative to the working directory).
OUTPUT_CSV = "air2central_complete_trip_samples.csv"
OUTPUT_JSON = "samples_air2central.json"


#### 1. Load raw parquet (minimal columns)
#### 2. Clean + normalize (time, mode, distance)

In [18]:
import glob
import json
import numbers
from datetime import datetime, timezone

import geopandas as gpd
import pandas as pd
import pygeohash as pgh
from shapely import wkt
from shapely.geometry import Point, LineString

# Columns requested from each parquet file; everything else is left unread.
USE_COLS = [
    "linked_trip_id", "trip_id", "tour_id",
    "travel_mode", "local_datetime_start", "local_datetime_end",
    "network_distance", "route_distance",
    "geohash7_orig", "geohash7_dest",
    "access_stop", "access_stop_id",
    "egress_stop", "egress_stop_id",
    "trip_purpose", "trip_weight",
    "route_taken"
]

# Read every parquet part of every configured month.
dfs = []
for m in MONTHS:
    # glob gives no ordering guarantee; sorting makes the row order reproducible.
    files = sorted(glob.glob(f"{PARQUET_DIR}/Salt_Lake-{m}-2020/*.snappy.parquet"))
    for f in files:
        dfs.append(pd.read_parquet(f, columns=USE_COLS))

# pd.concat([]) fails with an opaque "No objects to concatenate"; report the
# actual cause (wrong path / month selection) instead.
if not dfs:
    raise FileNotFoundError(
        f"No '*.snappy.parquet' files found under {PARQUET_DIR} for months {MONTHS}"
    )

df = pd.concat(dfs, ignore_index=True)

# basic clean
# Unparseable timestamps become NaT (errors="coerce").
df["local_datetime_start"] = pd.to_datetime(df["local_datetime_start"], errors="coerce")
df["local_datetime_end"] = pd.to_datetime(df["local_datetime_end"], errors="coerce")

# Keep only rows whose end is strictly after their start; comparisons with NaT
# are False, so rows with a missing timestamp are dropped as well.
# .copy() yields an independent frame so the column added below does not
# trigger pandas' SettingWithCopy warning.
df = df[df["local_datetime_end"] > df["local_datetime_start"]].copy()

# Leg duration in minutes.
df["duration_min"] = (
    df["local_datetime_end"] - df["local_datetime_start"]
).dt.total_seconds() / 60


#### 3. Geohash → census tract (orig / dest)

In [19]:
# Tract polygons, reprojected to EPSG:4326 to match the CRS assigned to the
# geohash-derived points below.
tracts = gpd.read_file(TRACT_SHP).to_crs("EPSG:4326")

def gh_to_point(gh):
    """Decode a geohash into a shapely Point built as (lon, lat)."""
    lat, lon = pgh.decode(gh)
    return Point(lon, lat)

def _geohash_points(column):
    """Return a point GeoDataFrame (EPSG:4326) for one geohash column of ``df``."""
    return gpd.GeoDataFrame(
        df[[column]],
        geometry=df[column].apply(gh_to_point),
        crs="EPSG:4326"
    )

def _tract_geoid(points):
    """Return one tract GEOID per point, indexed like ``points``.

    A left spatial join emits one row per (point, matching polygon) pair, so a
    point inside overlapping polygons would appear several times. Only the
    first match per point is kept so the result never has more rows than the
    input. Points outside every tract keep a missing GEOID.
    """
    joined = gpd.sjoin(points, tracts, how="left", predicate="within")["GEOID"]
    return joined[~joined.index.duplicated(keep="first")]

gdf_o = _geohash_points("geohash7_orig")
gdf_d = _geohash_points("geohash7_dest")

# Assign by index alignment rather than by position (.values), so the result
# does not depend on the row order returned by the join.
# NOTE(review): relies on df having a unique index, which holds when df comes
# from pd.concat(..., ignore_index=True) followed only by row filtering.
df["GEOID_orig"] = _tract_geoid(gdf_o)
df["GEOID_dest"] = _tract_geoid(gdf_d)


#### 4. Sort by linked_trip_id + time
#### 5. Identify linked trips with:
   - first.orig == airport tract
   - last.dest == central tract


In [20]:
# Order legs chronologically inside each linked trip.
df = df.sort_values(["linked_trip_id", "local_datetime_start"])

# groupby().first()/.last() return the first/last NON-NULL value of each
# column, not the first/last row. A leg with a missing tract would therefore be
# skipped and the neighbouring leg's tract used in its place. Taking the actual
# first/last row per linked trip avoids that.
# Rows without a linked_trip_id are excluded, as groupby does by default.
legs = df.dropna(subset=["linked_trip_id"])
first = legs.drop_duplicates("linked_trip_id", keep="first").set_index("linked_trip_id")
last = legs.drop_duplicates("linked_trip_id", keep="last").set_index("linked_trip_id")

# Both series are indexed by linked_trip_id, so "&" combines them per trip.
starts_at_origin = first["GEOID_orig"] == ORIG_TRACT
ends_at_destination = last["GEOID_dest"] == DEST_TRACT
od_match = starts_at_origin & ends_at_destination

valid_linked = od_match[od_match].index

# .copy() so later column assignments act on an independent frame.
df = df[df["linked_trip_id"].isin(valid_linked)].copy()


#### 7. Attach network geometry (mode-aware)

In [21]:
# Link tables of the three networks (auto, walk, transit).
auto_links = pd.read_csv(f"{BASE_DIR}/Salt_Lake/supplementInputs/network/auto-biggest-connected-graph/link.csv")
walk_links = pd.read_csv(f"{BASE_DIR}/Salt_Lake/supplementInputs/network/walk-biggest-connected-graph/link.csv")
transit_links = pd.read_csv(f"{BASE_DIR}/Salt_Lake/supplementInputs/network/UTA/link with flow.csv")


def _link_lookup(links, from_col, to_col):
    """Map each (from_node, to_node) integer pair to that link's geometry value.

    When a pair occurs more than once, the last row wins.
    """
    lookup = {}
    for start, end, geometry in zip(links[from_col], links[to_col], links["geometry"]):
        lookup[(int(start), int(end))] = geometry
    return lookup


# The auto and walk tables key links by OSM node ids; the transit table uses
# its own from/to node id columns.
auto_dict = _link_lookup(auto_links, "from_osm_node_id", "to_osm_node_id")
transit_dict = _link_lookup(transit_links, "from_node_id", "to_node_id")
walk_dict = _link_lookup(walk_links, "from_osm_node_id", "to_osm_node_id")
def build_geometry(row):
    """Assemble a LineString for one leg from its node sequence.

    ``row.route_taken`` is read as a comma-separated list of node ids; tokens
    that are not purely digits are ignored. Consecutive node pairs are looked
    up in the link table matching ``row.travel_mode`` ("car", "walk/bike",
    "bus"/"rail") and the coordinates of each found link are concatenated.

    Returns:
        A shapely LineString, or None when there are fewer than two nodes, the
        travel mode has no associated network, or fewer than two coordinates
        could be collected.
    """
    nodes = [int(tok) for tok in str(row.route_taken).split(",") if tok.strip().isdigit()]
    if len(nodes) < 2:
        return None

    mode = row.travel_mode
    if mode == "car":
        link_dict = auto_dict
    elif mode == "walk/bike":
        link_dict = walk_dict
    elif mode in ("bus", "rail"):
        link_dict = transit_dict
    else:
        # No network for this mode. Previously link_dict was None here and the
        # membership test below raised TypeError.
        return None

    coords = []
    for pair in zip(nodes[:-1], nodes[1:]):
        link_wkt = link_dict.get(pair)
        if link_wkt is None:
            continue
        try:
            coords.extend(wkt.loads(link_wkt).coords)
        except Exception:
            # Best effort: skip links whose geometry cannot be parsed or has no
            # simple coordinate sequence. "except Exception" (not a bare
            # except) lets KeyboardInterrupt/SystemExit propagate.
            continue
    return LineString(coords) if len(coords) > 1 else None

# Build one geometry per leg.
# df.apply(..., axis=1) returns a DataFrame (not a Series) when df is empty,
# which breaks the column assignment. Iterating rows directly works for any
# row count and avoids building a Series per row.
# The explicit object-dtype Series keeps pandas/numpy from trying to interpret
# the geometries as nested sequences.
df["geometry"] = pd.Series(
    [build_geometry(row) for row in df.itertuples(index=False)],
    index=df.index,
    dtype=object,
)
# Drop legs for which no geometry could be built; copy so later column
# assignments act on an independent frame.
df = df[df["geometry"].notna()].copy()


#### 8. Aggregate per segment (trip_id)

#### 9. Export:
   - CSV (debug / archive)
   - JSON (dashboard)

In [22]:
# Write the filtered legs to CSV, without the index column.
df.to_csv(path_or_buf=OUTPUT_CSV, index=False)

#### Select samples

In [27]:

def normalize_mode(m):
    """Return the canonical travel-mode label for ``m``, or None.

    The input is lower-cased and stripped. It is returned only if it is one of
    "car", "bus", "rail" or "walk/bike"; non-string input and any other value
    give None.

    The previous version compared the string against the lists ``["car"]`` and
    ``["rail"]``, which is never equal, so car and rail legs were normalised
    to None.
    """
    if not isinstance(m, str):
        return None
    label = m.lower().strip()
    if label in ("car", "bus", "rail", "walk/bike"):
        return label
    return None

# Canonical mode label per leg (None where the mode is not recognised).
df["mode_norm"] = df["travel_mode"].map(normalize_mode)

def is_valid_linked_trip(group):
    """Return True when a linked trip uses at least two distinct modes.

    ``group`` holds the legs of one linked trip; missing ``mode_norm`` values
    are ignored.
    """
    distinct_modes = set(group["mode_norm"].dropna())

    # Condition 1: at least two modes.
    # Condition 2: must not be car-only. A car-only set has a single member,
    # so the size check already covers it.
    return len(distinct_modes) >= 2
# Keep only the legs of linked trips that pass is_valid_linked_trip.
legs_by_linked_trip = df.groupby("linked_trip_id")
multimode_legs = legs_by_linked_trip.filter(is_valid_linked_trip)
valid_linked_ids = multimode_legs["linked_trip_id"].unique()

keep_mask = df["linked_trip_id"].isin(valid_linked_ids)
df = df[keep_mask]


In [28]:
import math

def is_finite(x):
    """Return True when ``x`` is a real number that is neither NaN nor infinite.

    Accepts any ``numbers.Real`` rather than only the built-in ``int`` and
    ``float``, so real-number types that do not subclass them (for example
    ``fractions.Fraction``, or numpy integer scalars, which register as
    ``numbers.Integral``) are no longer rejected. None, strings and other
    non-numeric values give False.
    """
    return isinstance(x, numbers.Real) and math.isfinite(x)

def clean_num(x):
    """Return ``x`` as a float, or None when ``is_finite(x)`` is false."""
    if not is_finite(x):
        return None
    return float(x)

def safe_decode_geohash(gh):
    """Decode a geohash to ``(lon, lat)``; return ``(None, None)`` on failure.

    Failure means either that decoding raised an exception or that one of the
    decoded values is not finite.
    """
    decoded = (None, None)
    try:
        latitude, longitude = pgh.decode(gh)
        if all(is_finite(value) for value in (latitude, longitude)):
            decoded = (longitude, latitude)
    except Exception:
        # Best effort: any decoding problem falls through to (None, None).
        pass
    return decoded

def build_route(geom, max_points=400, step=3):
    """Convert a geometry into a list of ``[lat, lon]`` pairs.

    Args:
        geom: Object exposing ``.coords`` as (lon, lat) pairs, or None.
        max_points: Routes with more points than this are thinned.
        step: Thinning keeps every ``step``-th point (must be >= 1).

    Returns:
        A list of ``[lat, lon]`` float pairs, or None when ``geom`` is None or
        fewer than two finite points remain.

    Non-finite coordinates are skipped. When thinning is applied, the final
    point is always kept; plain ``coords[::step]`` slicing could drop it and
    cut the route short of its end.
    """
    if geom is None:
        return None

    coords = [
        [float(lat), float(lon)]
        for lon, lat in geom.coords
        if is_finite(lat) and is_finite(lon)
    ]

    if len(coords) < 2:
        return None

    # Thin long routes (demo-sized output).
    if len(coords) > max_points:
        end_point = coords[-1]
        # The slice keeps indices 0, step, 2*step, ...; the last index is kept
        # only when it is a multiple of step.
        end_dropped = (len(coords) - 1) % step != 0
        coords = coords[::step]
        if end_dropped:
            coords.append(end_point)

    return coords

In [29]:
def _json_safe(value):
    """Return None for missing values (None, NaN, NaT, pd.NA), else ``value``.

    Pass-through fields such as stop names or trip purpose hold NaN when a leg
    has no value. A NaN reaching ``json.dump(..., allow_nan=False)`` raises
    ValueError, so it is mapped to None (JSON null) here.
    """
    if value is None:
        return None
    try:
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        # pd.isna on a list-like returns an array whose truth value is
        # ambiguous; such values are not "missing", so keep them.
        pass
    return value


# One JSON-ready record per leg that has a usable route.
samples = []

for r in df.itertuples():
    route = build_route(r.geometry)
    if route is None:
        continue  # discard legs without a valid route

    o_lon, o_lat = safe_decode_geohash(r.geohash7_orig)
    d_lon, d_lat = safe_decode_geohash(r.geohash7_dest)

    sample = {
        "id": str(r.trip_id),
        "mode": str(r.travel_mode).lower().strip(),

        "route": route,

        "duration_min": clean_num(r.duration_min),
        "network_distance_km": clean_num(r.network_distance),
        "route_distance_km": clean_num(r.route_distance),

        "origin": {
            "lon": o_lon,
            "lat": o_lat,
            "geohash": _json_safe(r.geohash7_orig)
        },
        "destination": {
            "lon": d_lon,
            "lat": d_lat,
            "geohash": _json_safe(r.geohash7_dest)
        },

        "access": {
            "stop_id": clean_num(r.access_stop_id),
            "stop_name": _json_safe(r.access_stop)
        },
        "egress": {
            "stop_id": clean_num(r.egress_stop_id),
            "stop_name": _json_safe(r.egress_stop)
        },

        "meta": {
            "linked_trip_id": _json_safe(r.linked_trip_id),
            "tour_id": _json_safe(r.tour_id),
            "purpose": _json_safe(r.trip_purpose),
            "weight": clean_num(r.trip_weight)
        }
    }

    samples.append(sample)


In [30]:
# Top-level JSON document: schema tag, generation time, sample count, samples.
out = {
    "schema": "nova.complete_trip.sample.v1",
    # datetime.utcnow() is deprecated (Python 3.12+). An aware UTC timestamp is
    # used instead, with its "+00:00" offset rewritten to "Z" so the emitted
    # format is unchanged.
    "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "count": len(samples),
    "samples": samples
}

# allow_nan=False makes the dump fail loudly rather than emit NaN/Infinity,
# which are not valid JSON.
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(out, f, indent=2, allow_nan=False)
