In [221]:
# import pandas as pd
# import glob

# # Define the folder path
# computer_villa = 'C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General'
# file_paths = glob.glob(computer_villa + '/Salt_Lake/delivery/Salt_Lake-Mar-2020/*.snappy.parquet')
# df_list = [pd.read_parquet(file, engine='pyarrow') for file in file_paths]
# # Load the first file
# combined_df = pd.concat(df_list, ignore_index=True)

Workflow
1. Raw data
2. Linked-trip aggregation, sorted by local_datetime_start
3. Select the OD pair (airport to city center) [census-tract level]
4. Linked-trip filter (the first trip's origin is in the airport tract; the last trip's destination is in the city-center tract)
5. Attach geometry
6. Export CSV (select linked trips with multiple modes, or apply another filter method)
7. Export JSON

In [231]:
# =========================
# CONFIG
# =========================
import os

# Root of the synced project folder. The default is the original workstation
# path; set the COMPLETE_TRIP_BASE_DIR environment variable to run the
# notebook on another machine without editing this cell.
BASE_DIR = (
    os.environ.get("COMPLETE_TRIP_BASE_DIR")
    or "C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General"
)
PARQUET_DIR = f"{BASE_DIR}/Salt_Lake/delivery"
TRACT_SHP = f"{BASE_DIR}/Manuscript/Figure/Visualization-RL/2-OD patterns by census track/six_counties_track.shp"

# Origin / destination census tracts (GEOID strings).
# The cell was originally an "airport -> central city" example; reference
# tract IDs noted by the author:
#   49035114000 (center)
#   49035980000 (airport)
#   49035110106 (Canyon)
#   49035101402 (U of U)
#   49057201900 (weber)
# According to that list the current selection is U of U -> Canyon.
ORIG_TRACT = "49035101402"
DEST_TRACT = "49035110106"

# Months to load; extend the list to process more of the year.
MONTHS = ['Jan']
        #   'Feb', 'Mar', 'Apr', 'May', 'Jun',
        #   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Output files are named after the OD pair and written to the working directory.
OUTPUT_CSV = f"{ORIG_TRACT}_to_{DEST_TRACT}.csv"
OUTPUT_JSON = f"{ORIG_TRACT}_to_{DEST_TRACT}.json"

# KEEP_FACTOR = 2.5   # OD < 2.5 * max(route_dist) -> DROP

# MAX_SAMPLES = 3




#### 1. Load raw parquet (minimal columns)
#### 2. Clean + normalize (time, mode, distance)

In [232]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pygeohash as pgh
from shapely.geometry import Point, LineString
from shapely import wkt
import glob
import json
from datetime import datetime


# =========================
# UTILS
# =========================
def haversine_miles(lon1, lat1, lon2, lat2):
    """Great-circle (haversine) distance in miles between two lon/lat points.

    Inputs are in decimal degrees and may be scalars or NumPy-broadcastable
    arrays; the result has the broadcast shape of the inputs.
    """
    R = 3958.8  # Earth radius in miles used by this notebook
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    # Floating-point round-off can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arcsin(np.sqrt(a))

def decode_gh_series(s):
    """Decode a Series of geohash strings into parallel coordinate arrays.

    Returns a ``(lat, lon)`` tuple of NumPy arrays, one element per entry of
    ``s``. An empty Series yields two empty arrays instead of failing on the
    tuple unpacking.
    """
    if len(s) == 0:
        return np.array([]), np.array([])
    # pgh.decode returns a (lat, lon) pair per geohash; zip(*) transposes them.
    lat, lon = zip(*s.map(pgh.decode))
    return np.array(lat), np.array(lon)

# Only these columns are read from each parquet file.
USE_COLS = [
    "linked_trip_id", "trip_id", "tour_id",
    "travel_mode", "local_datetime_start", "local_datetime_end",
    "network_distance", "route_distance",
    "geohash7_orig", "geohash7_dest",
    "access_stop", "access_stop_id",
    "egress_stop", "egress_stop_id",
    "trip_purpose", "trip_weight",
    "route_taken"
]

MONTHLY_DFS = []
# =========================
# 1️⃣ MONTHLY LOAD + BASIC FILTER
# =========================
for m in MONTHS:
    print(f"\n=== Processing month {m} ===")

    files = glob.glob(f"{PARQUET_DIR}/Salt_Lake-{m}-2020/*.snappy.parquet")
    if not files:
        # Report the gap instead of skipping silently, so a wrong path or a
        # missing month is visible in the cell output.
        print(f"WARNING: no parquet files found for month {m}; skipping")
        continue

    dfs = [pd.read_parquet(f, columns=USE_COLS) for f in files]
    df_month = pd.concat(dfs, ignore_index=True)

    # Unparseable timestamps become NaT and are removed by the filter below
    # (comparisons involving NaT evaluate to False).
    df_month["local_datetime_start"] = pd.to_datetime(df_month["local_datetime_start"], errors="coerce")
    df_month["local_datetime_end"] = pd.to_datetime(df_month["local_datetime_end"], errors="coerce")

    # Keep only legs that end strictly after they start. `.copy()` makes the
    # filtered frame independent, so the column assignment below does not
    # trigger SettingWithCopyWarning.
    df_month = df_month[df_month["local_datetime_end"] > df_month["local_datetime_start"]].copy()

    # Multi-mode filter (currently disabled). The per-linked-trip mode sets
    # are only needed by this filter, so the groupby is disabled with it.
    # mode_sets = (
    #     df_month
    #     .groupby("linked_trip_id")["travel_mode"]
    #     .agg(set)
    # )
    # valid_linked_ids = mode_sets[
    #     (mode_sets.apply(len) >= 2) &
    #     (mode_sets != {"car"})
    # ].index

    # df_month = df_month[df_month["linked_trip_id"].isin(valid_linked_ids)]

    # Leg duration in minutes (row level, computed per month).
    df_month["duration_min"] = (
        df_month["local_datetime_end"] - df_month["local_datetime_start"]
    ).dt.total_seconds() / 60

    MONTHLY_DFS.append(df_month)

# =========================
# 2️⃣ YEAR CONCAT
# =========================
if not MONTHLY_DFS:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise ValueError(
        f"No parquet files were loaded for months {MONTHS} under {PARQUET_DIR}"
    )

df = pd.concat(MONTHLY_DFS, ignore_index=True)
print("Before OD filter, linked trips:", df["linked_trip_id"].nunique())
# =========================
# FIXED OD DISTANCE (TRACT CENTROID → CENTROID)
# =========================
tracts = gpd.read_file(TRACT_SHP).to_crs("EPSG:4326")
# Compare GEOIDs as strings, matching the cast done in the JSON export cell.
tracts["GEOID"] = tracts["GEOID"].astype(str)


def _tract_geometry(tract_gdf, geoid):
    """Return the geometry of the tract whose GEOID equals ``geoid``.

    Raises ValueError with the offending id when the tract is not present.
    """
    match = tract_gdf.loc[tract_gdf["GEOID"] == geoid, "geometry"]
    if match.empty:
        raise ValueError(f"Census tract {geoid} not found in {TRACT_SHP}")
    return match.iloc[0]


orig_geom = _tract_geometry(tracts, ORIG_TRACT)
dest_geom = _tract_geometry(tracts, DEST_TRACT)

# Centroids are taken on the lon/lat (EPSG:4326) geometries.
orig_centroid = orig_geom.centroid
dest_centroid = dest_geom.centroid

# haversine_miles expects (lon, lat) pairs: x is longitude, y is latitude.
OD_DIST_MILE = haversine_miles(
    orig_centroid.x, orig_centroid.y,
    dest_centroid.x, dest_centroid.y
)

print(f"Fixed OD distance (mile): {OD_DIST_MILE:.2f}")

# =========================
# 3️⃣ OD vs ROUTE DISTANCE FILTER (core addition)
# =========================
# Order legs chronologically within each linked trip.
df = df.sort_values(["linked_trip_id", "local_datetime_start"])





=== Processing month Jan ===
Before OD filter, linked trips: 2751633
Fixed OD distance (mile): 13.56


In [224]:

# Use network_distance as the length of each individual trip (leg);
# values that cannot be parsed as numbers become NaN.
df["network_distance"] = pd.to_numeric(df["network_distance"], errors="coerce")

# For every linked trip, take the longest leg (rows with a negative or
# missing distance are left out of the maximum).
has_valid_distance = df["network_distance"] >= 0
network_max = (
    df[has_valid_distance]
    .groupby("linked_trip_id")["network_distance"]
    .max()
    .reset_index(name="max_network_dist")
)

# Drop rule (currently disabled): remove a linked trip as soon as one of its
# legs is longer than 2.5 * OD_DIST_MILE.
# valid_linked_ids = network_max.loc[
#     network_max["max_network_dist"] <= KEEP_FACTOR * OD_DIST_MILE,
#     "linked_trip_id"
# ]

# df = df[df["linked_trip_id"].isin(valid_linked_ids)]

# With the rule above commented out, this count equals the pre-filter count.
print("After OD filter, linked trips:", df["linked_trip_id"].nunique())


After OD filter, linked trips: 2751633


#### 3. Geohash → census tract (orig / dest)

In [225]:
# =========================
# 4️⃣ TRACT JOIN (same tract assignment as the original row-wise logic)
# =========================
tracts = gpd.read_file(TRACT_SHP).to_crs("EPSG:4326")

def gh_to_point(gh):
    """Decode a geohash string into a shapely Point in (lon, lat) order."""
    lat, lon = pgh.decode(gh)
    return Point(lon, lat)


def _geohash_to_geoid(geohashes, tract_gdf):
    """Build a {geohash: GEOID} lookup for the distinct values in ``geohashes``.

    Each distinct geohash is decoded and joined to the tracts once, instead
    of once per leg. Geohashes that fall in no tract map to NaN. If a point
    lies within more than one tract polygon, the first match is kept so the
    lookup stays one-to-one.
    """
    unique_gh = pd.Series(geohashes.dropna().unique())
    if unique_gh.empty:
        return {}

    points = gpd.GeoDataFrame(
        {"geohash": unique_gh},
        geometry=unique_gh.apply(gh_to_point),
        crs="EPSG:4326"
    )
    joined = gpd.sjoin(
        points, tract_gdf[["GEOID", "geometry"]],
        how="left", predicate="within"
    )
    joined = joined.drop_duplicates(subset="geohash", keep="first")
    return dict(zip(joined["geohash"], joined["GEOID"]))


# Mapping by value (not by position) keeps every leg aligned with its own
# geohash even if the spatial join returns extra or reordered rows.
df["GEOID_orig"] = df["geohash7_orig"].map(_geohash_to_geoid(df["geohash7_orig"], tracts))
df["GEOID_dest"] = df["geohash7_dest"].map(_geohash_to_geoid(df["geohash7_dest"], tracts))

#### 4. Sort by linked_trip_id + time
#### 5. Identify linked trips with:
   - first.orig == airport tract
   - last.dest == central tract


In [226]:
# Order legs chronologically within each linked trip.
df = df.sort_values(["linked_trip_id", "local_datetime_start"])

# Take the actual first / last leg (row) of every linked trip.
# GroupBy.first()/last() are deliberately avoided: they return the first/last
# *non-null* value of each column, so a leg whose tract is missing would be
# silently replaced by a neighbouring leg's tract and the linked trip could
# pass the OD test by mistake.
legs_with_id = df[df["linked_trip_id"].notna()]
first = legs_with_id.drop_duplicates("linked_trip_id", keep="first").set_index("linked_trip_id")
last = legs_with_id.drop_duplicates("linked_trip_id", keep="last").set_index("linked_trip_id")

# Both frames are indexed by linked_trip_id, so the comparison aligns per trip.
is_od_match = (
    (first["GEOID_orig"] == ORIG_TRACT) &
    (last["GEOID_dest"] == DEST_TRACT)
)
valid_linked = is_od_match[is_od_match].index

# .copy() so later column assignments operate on an independent frame.
df = df[df["linked_trip_id"].isin(valid_linked)].copy()


#### 7. Attach network geometry (mode-aware)

In [227]:
# Load the link tables of the three networks (auto, walk, transit).
auto_links = pd.read_csv(f"{BASE_DIR}/Salt_Lake/supplementInputs/network/auto-biggest-connected-graph/link.csv")
walk_links = pd.read_csv(f"{BASE_DIR}/Salt_Lake/supplementInputs/network/walk-biggest-connected-graph/link.csv")
transit_links = pd.read_csv(f"{BASE_DIR}/Salt_Lake/supplementInputs/network/UTA/link with flow.csv")


def _link_lookup(links, from_col, to_col):
    """Map (from_node, to_node) integer pairs to the link's geometry value.

    When the same node pair occurs more than once, the last row wins.
    """
    lookup = {}
    for link in links.itertuples():
        node_pair = (int(getattr(link, from_col)), int(getattr(link, to_col)))
        lookup[node_pair] = link.geometry
    return lookup


# The auto and walk tables key links by OSM node ids; the transit table uses
# its own from_node_id / to_node_id columns.
auto_dict = _link_lookup(auto_links, "from_osm_node_id", "to_osm_node_id")
transit_dict = _link_lookup(transit_links, "from_node_id", "to_node_id")
walk_dict = _link_lookup(walk_links, "from_osm_node_id", "to_osm_node_id")
def build_geometry(row):
    """Assemble one LineString for a leg from its node sequence.

    ``row.route_taken`` is read as a comma-separated list of node ids and
    ``row.travel_mode`` selects the link lookup (auto, walk or transit).
    The WKT geometry of every consecutive node pair found in that lookup is
    parsed and its coordinates are appended in order.

    Returns None when fewer than two node ids are present, when the travel
    mode has no associated network, or when fewer than two coordinates could
    be collected.
    """
    nodes = [int(x) for x in str(row.route_taken).split(",") if x.strip().isdigit()]
    if len(nodes) < 2:
        return None

    mode = row.travel_mode
    if mode == "car":
        link_dict = auto_dict
    elif mode == "walk/bike":
        link_dict = walk_dict
    elif mode in ("bus", "rail"):
        link_dict = transit_dict
    else:
        # Unknown mode: there is no network to look links up in.
        return None

    coords = []
    for a, b in zip(nodes[:-1], nodes[1:]):
        link_wkt = link_dict.get((a, b))
        if link_wkt is None:
            continue
        try:
            geom = wkt.loads(link_wkt)
            coords.extend(geom.coords)
        except Exception:
            # Best effort: skip links whose geometry is missing or unparseable.
            continue
    return LineString(coords) if len(coords) > 1 else None

# Work on an independent copy: df may be a filtered slice of a larger frame,
# and assigning a column to such a slice triggers SettingWithCopyWarning.
df = df.copy()

if df.empty:
    # DataFrame.apply(axis=1) on an empty frame can return a DataFrame rather
    # than a Series, which cannot be assigned to a single column.
    df["geometry"] = None
else:
    df["geometry"] = df.apply(build_geometry, axis=1)

# Keep only legs for which a geometry could be built.
df = df[df["geometry"].notnull()]


#### 8. Aggregate per segment (trip_id)

#### 9. Export:
   - CSV (debug / archive)
   - JSON (dashboard)

In [228]:
# Write the remaining legs (including the geometry column) to OUTPUT_CSV,
# without the DataFrame index.
df.to_csv(OUTPUT_CSV, index=False)

In [229]:
import math

def is_finite(x):
    """Return True when ``x`` is a real number that is neither NaN nor infinite.

    Accepts Python ints/floats and NumPy integer/floating scalars (e.g.
    ``np.int64``, ``np.float32``), which are not subclasses of the built-in
    types. Anything else, including None and strings, gives False.
    """
    return isinstance(x, (int, float, np.integer, np.floating)) and math.isfinite(x)

def clean_num(x):
    """Convert ``x`` to a float, or return None when is_finite rejects it."""
    if not is_finite(x):
        return None
    return float(x)

def safe_decode_geohash(gh):
    """Decode a geohash into ``(lon, lat)`` without ever raising.

    Returns ``(None, None)`` when decoding fails for any reason or when
    either decoded coordinate is not a finite number.
    """
    try:
        lat, lon = pgh.decode(gh)
        decoded_ok = is_finite(lat) and is_finite(lon)
    except Exception:
        return None, None
    return (lon, lat) if decoded_ok else (None, None)

def build_route(geom, max_points=400, step=3):
    """Convert a geometry's coordinates into a list of ``[lat, lon]`` pairs.

    Parameters
    ----------
    geom : object exposing ``.coords`` as (lon, lat[, z]) tuples, or None.
    max_points : routes with more valid points than this are thinned.
    step : when thinning, every ``step``-th point is kept.

    Returns None when ``geom`` is None or fewer than two finite points
    remain. When a route is thinned, its final point is always retained so
    the drawn line still ends where the leg ends.
    """
    if geom is None:
        return None

    coords = []
    for pt in geom.coords:
        # Index instead of unpacking so 3D coordinates (lon, lat, z) also work.
        lon, lat = pt[0], pt[1]
        if not is_finite(lat) or not is_finite(lon):
            continue
        coords.append([float(lat), float(lon)])

    if len(coords) < 2:
        return None

    # Thinning for the demo: keep every `step`-th point of long routes.
    if len(coords) > max_points:
        last_index = len(coords) - 1
        end_point = coords[-1]
        coords = coords[::step]
        if last_index % step != 0:
            # The stride skipped the final vertex; put it back.
            coords.append(end_point)

    return coords
def to_iso(t):
    """Format a timestamp-like value as an ISO-8601 string.

    None and ``pd.NaT`` give None (so the JSON carries null rather than the
    literal text "NaT"). Objects with an ``isoformat`` method use it; any
    other value is converted with ``str``.
    """
    if t is None or t is pd.NaT:
        return None
    if hasattr(t, "isoformat"):
        return t.isoformat()
    return str(t)


In [230]:
import json
from datetime import datetime, timedelta
from collections import defaultdict
from shapely.geometry import mapping

MAX_SAMPLES = 5  # NOTE(review): defined here but not referenced by the visible cells


def _json_safe(value):
    """Return ``value`` in a form that ``json.dump(..., allow_nan=False)`` accepts.

    None, pd.NaT, pd.NA and non-finite floats become None; NumPy scalars are
    converted to the matching Python scalar; everything else is unchanged.
    """
    if value is None or value is pd.NaT or value is pd.NA:
        return None
    if isinstance(value, np.generic):
        value = value.item()
    if isinstance(value, float) and not math.isfinite(value):
        return None
    return value


# =========================
# 1️⃣ BUILD ALL LEG SAMPLES (only end_time is added)
# =========================

samples = []

for r in df.itertuples():
    route = build_route(r.geometry)
    if route is None:
        # No drawable route for this leg: leave it out of the export.
        continue

    o_lon, o_lat = safe_decode_geohash(r.geohash7_orig)
    d_lon, d_lat = safe_decode_geohash(r.geohash7_dest)

    start_dt = r.local_datetime_start
    duration = clean_num(r.duration_min)

    # End time is reconstructed as start + duration.
    end_dt = (
        start_dt + timedelta(minutes=duration)
        if start_dt is not None and duration is not None
        else None
    )

    samples.append({
        "id": str(r.trip_id),
        "mode": str(r.travel_mode).lower().strip(),
        "route": route,
        "start_time": to_iso(start_dt),
        "end_time": to_iso(end_dt),              # newly added field
        "duration_min": duration,
        # NOTE(review): the keys say "km", but the unit of these columns is
        # not established in this notebook (the fixed OD distance above is
        # in miles) - confirm against the data dictionary.
        "network_distance_km": clean_num(r.network_distance),
        "route_distance_km": clean_num(r.route_distance),
        "origin": {
            "lon": o_lon,
            "lat": o_lat,
            "geohash": _json_safe(r.geohash7_orig)
        },
        "destination": {
            "lon": d_lon,
            "lat": d_lat,
            "geohash": _json_safe(r.geohash7_dest)
        },
        # NOTE(review): clean_num yields None for non-numeric stop ids -
        # confirm that the stop id columns are numeric.
        "access": {
            "stop_id": clean_num(r.access_stop_id),
            "stop_name": _json_safe(r.access_stop)
        },
        "egress": {
            "stop_id": clean_num(r.egress_stop_id),
            "stop_name": _json_safe(r.egress_stop)
        },
        # Pass-through fields are sanitised because a single NaN here would
        # make the final json.dump(allow_nan=False) raise.
        "meta": {
            "linked_trip_id": _json_safe(r.linked_trip_id),
            "tour_id": _json_safe(r.tour_id),
            "purpose": _json_safe(r.trip_purpose),
            "weight": clean_num(r.trip_weight)
        }
    })

# =========================
# 2️⃣ GROUP BY linked_trip (unchanged)
# =========================

linked_groups = defaultdict(list)
for leg in samples:
    linked_groups[leg["meta"]["linked_trip_id"]].append(leg)

# =========================
# 3️⃣ BUILD FULL linked_trip STRUCTURE (only the end_time semantics were fixed)
# =========================

linked_trips_full = []

for linked_id, legs in linked_groups.items():

    # Chronological order of the legs (ISO strings sort lexicographically).
    ordered_legs = sorted(legs, key=lambda leg: leg["start_time"])

    # Give every leg its position within the linked trip.
    for position, leg in enumerate(ordered_legs):
        leg["leg_index"] = position

    first_leg = ordered_legs[0]
    last_leg = ordered_legs[-1]

    # Trip origin: origin of the first leg plus its start time.
    trip_origin = dict(first_leg["origin"], start_time=first_leg["start_time"])

    # Trip destination: destination of the last leg plus that leg's end time.
    trip_destination = dict(last_leg["destination"], end_time=last_leg.get("end_time"))

    # A transfer point is the destination of every leg except the last one,
    # provided both of its coordinates are known.
    transfer_points = [
        {
            "lat": leg["destination"]["lat"],
            "lon": leg["destination"]["lon"],
            "geohash": leg["destination"]["geohash"]
        }
        for leg in ordered_legs[:-1]
        if leg["destination"]["lat"] is not None
        and leg["destination"]["lon"] is not None
    ]

    # The linked trip carries the largest leg weight (missing weights count as 0).
    trip_weight = max((leg["meta"]["weight"] or 0) for leg in ordered_legs)

    linked_trips_full.append({
        "linked_trip_id": linked_id,
        "origin": trip_origin,
        "destination": trip_destination,
        "transfers": transfer_points,
        "legs": ordered_legs,
        "weight": trip_weight
    })

# =========================
# 4️⃣ SAMPLE linked_trip (unchanged)
# =========================

# Heaviest linked trips first; ties keep their original order.
linked_trips_sorted = sorted(
    linked_trips_full,
    key=lambda lt: lt["weight"],
    reverse=True
)

# No sub-sampling is applied: every linked trip is exported.
linked_trips_final = linked_trips_sorted

# =========================
# 5️⃣ OD TRACT → GEOJSON (unchanged)
# =========================

# Compare GEOIDs as strings.
tracts["GEOID"] = tracts["GEOID"].astype(str)

origin_tract = tracts[tracts["GEOID"] == ORIG_TRACT]
dest_tract = tracts[tracts["GEOID"] == DEST_TRACT]

# Each side of the OD pair must resolve to exactly one tract row.
if not (len(origin_tract) == 1 and len(dest_tract) == 1):
    raise ValueError("OD tract not uniquely identified")

def geom_to_geojson(gdf):
    """Return the first geometry of ``gdf`` as a GeoJSON-style mapping."""
    first_geometry = gdf.geometry.iloc[0]
    return mapping(first_geometry)

od_info = {
    role: {
        "tract_id": tract_id,
        "geometry": geom_to_geojson(selection)
    }
    for role, tract_id, selection in (
        ("origin", ORIG_TRACT, origin_tract),
        ("destination", DEST_TRACT, dest_tract),
    )
}

# =========================
# 6️⃣ FINAL OUTPUT (unchanged)
# =========================
from datetime import timezone

out = {
    "schema": "nova.complete_trip.sample.v2",
    # datetime.utcnow() is deprecated; take the aware UTC time and drop the
    # tzinfo so the text keeps the "<naive ISO timestamp>Z" format.
    "generated_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "od": od_info,
    "count": len(linked_trips_final),
    "linked_trips": linked_trips_final
}

# allow_nan=False makes the dump fail loudly rather than emit NaN/Infinity,
# which are not valid JSON.
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(out, f, indent=2, allow_nan=False)
