In [None]:
# import pandas as pd
# import glob

# # Define the folder path
# computer_villa = 'C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General'
# file_paths = glob.glob(computer_villa + '/Salt_Lake/delivery/Salt_Lake-Mar-2020/*.snappy.parquet')
# df_list = [pd.read_parquet(file, engine='pyarrow') for file in file_paths]
# # Load the first file
# combined_df = pd.concat(df_list, ignore_index=True)

Workflow
1. Raw data
2. Linked-trip aggregation, sorted by local_datetime_start
3. Select OD (airport to central city) [census tract level]
4. Linked-trip filter (the first trip's origin is in the airport tract, the last trip's destination is in the central-city tract)
5. Attach geometry
6. Export CSV (select linked trips with multiple modes, or use another filter method)
7. Export JSON

In [None]:
# =========================
# CONFIG
# =========================
# Root of the shared project folder (machine-specific absolute path; edit when running elsewhere).
BASE_DIR = "C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General"
# Folder holding the monthly parquet deliveries (Salt_Lake-<Mon>-2020 subfolders are globbed from here).
PARQUET_DIR = f"{BASE_DIR}/Salt_Lake/delivery"
# Census-tract shapefile used to assign a GEOID to trip origins and destinations.
TRACT_SHP = f"{BASE_DIR}/Manuscript/Figure/Visualization-RL/2-OD patterns by census track/six_counties_track.shp"

# airport -> central city (example)
# Tract ids are strings and are compared with == against the shapefile's GEOID column
# (assumes that column is read as strings -- TODO confirm).
ORIG_TRACT = "49035980000"  # alternative origin tract: 49057201900 (weber)
DEST_TRACT = "49035114000"

# Months to load; add the commented-out names to the list to process more of 2020.
MONTHS = ['Jan']
        #'Feb', 'Mar', 'Apr', 'May', 'Jun',
        #  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Output file names (relative paths, so they land in the current working directory).
OUTPUT_CSV = "air2central_complete_trip_samples.csv"
OUTPUT_JSON = "samples_air2central.json"


#### 1. Load raw parquet (minimal columns)
#### 2. Clean + normalize (time, mode, distance)

In [7]:
import pandas as pd
import geopandas as gpd
import pygeohash as pgh
from shapely.geometry import Point, LineString
from shapely import wkt
import glob
import json
from datetime import datetime

# Columns read from every parquet file; all other columns are skipped.
USE_COLS = [
    "linked_trip_id", "trip_id", "tour_id",
    "travel_mode", "local_datetime_start", "local_datetime_end",
    "network_distance", "route_distance",
    "geohash7_orig", "geohash7_dest",
    "access_stop", "access_stop_id",
    "egress_stop", "egress_stop_id",
    "trip_purpose", "trip_weight",
    "route_taken"
]

# Read every parquet shard of every month listed in MONTHS.
dfs = []
for m in MONTHS:
    # glob returns files in arbitrary order; sorting makes the row order reproducible.
    files = sorted(glob.glob(f"{PARQUET_DIR}/Salt_Lake-{m}-2020/*.snappy.parquet"))
    for f in files:
        dfs.append(pd.read_parquet(f, columns=USE_COLS))

# Fail with an actionable message instead of pandas' "No objects to concatenate"
# when the path is wrong or the shared folder is not available locally.
if not dfs:
    raise FileNotFoundError(
        f"No '*.snappy.parquet' files found under {PARQUET_DIR} for months {MONTHS}"
    )

df = pd.concat(dfs, ignore_index=True)

# basic clean
# Unparseable timestamps become NaT (errors="coerce").
df["local_datetime_start"] = pd.to_datetime(df["local_datetime_start"], errors="coerce")
df["local_datetime_end"] = pd.to_datetime(df["local_datetime_end"], errors="coerce")
# Keep only legs that end after they start; comparisons with NaT are False, so rows
# with a missing timestamp are dropped here as well. The .copy() makes the result an
# independent frame so the column assignment below does not raise SettingWithCopyWarning.
df = df[df["local_datetime_end"] > df["local_datetime_start"]].copy()

# Leg duration in minutes.
df["duration_min"] = (
    df["local_datetime_end"] - df["local_datetime_start"]
).dt.total_seconds() / 60


#### 3. Geohash → census tract (orig / dest)

In [8]:
# Census-tract polygons from TRACT_SHP, reprojected to EPSG:4326 (the CRS given to the geohash points below).
tracts = gpd.read_file(TRACT_SHP).to_crs("EPSG:4326")

def gh_to_point(gh):
    """Decode a geohash string and return it as a shapely ``Point``.

    ``pgh.decode`` yields ``(latitude, longitude)``; shapely expects ``(x, y)``,
    so the pair is swapped to ``(longitude, latitude)``.
    """
    latitude, longitude = pgh.decode(gh)
    return Point(longitude, latitude)

# Point layers for trip origins and destinations; both keep df's index so the
# join results can be aligned back to df by label.
gdf_o = gpd.GeoDataFrame(
    df[["geohash7_orig"]],
    geometry=df["geohash7_orig"].apply(gh_to_point),
    crs="EPSG:4326"
)
gdf_d = gpd.GeoDataFrame(
    df[["geohash7_dest"]],
    geometry=df["geohash7_dest"].apply(gh_to_point),
    crs="EPSG:4326"
)


def _first_tract_match(points):
    """Return one tract GEOID per input point, indexed like ``points``.

    A left spatial join repeats an input row (and its index label) once per
    matching polygon, so a point lying in overlapping tract polygons would
    yield more rows than ``df`` has. Only the first match per label is kept.
    Points outside every tract keep a missing GEOID.
    """
    joined = gpd.sjoin(points, tracts[["GEOID", "geometry"]], how="left", predicate="within")
    return joined.loc[~joined.index.duplicated(keep="first"), "GEOID"]


# Assigning a Series aligns on the index, so this no longer relies on the join
# returning exactly one row per leg in the original order.
df["GEOID_orig"] = _first_tract_match(gdf_o)
df["GEOID_dest"] = _first_tract_match(gdf_d)


#### 4. Sort by linked_trip_id + time
#### 5. Identify linked trips with:
   - first.orig == airport tract
   - last.dest == central tract


In [9]:
# Order legs chronologically inside each linked trip so that "first" and "last"
# below mean the earliest and the latest leg.
df = df.sort_values(["linked_trip_id", "local_datetime_start"])

# GroupBy.first()/last() return the first/last NON-NULL value of each column, so a
# leg without a tract match would silently borrow the GEOID of a neighbouring leg.
# Taking whole rows keeps the real first and last leg; a missing GEOID there simply
# fails the comparison. Rows without a linked_trip_id are left out, as groupby does.
legs = df.dropna(subset=["linked_trip_id"])
first = legs.drop_duplicates("linked_trip_id", keep="first").set_index("linked_trip_id")
last = legs.drop_duplicates("linked_trip_id", keep="last").set_index("linked_trip_id")

starts_at_origin = first["GEOID_orig"] == ORIG_TRACT
ends_at_destination = last["GEOID_dest"].reindex(first.index) == DEST_TRACT
valid_linked = first.index[(starts_at_origin & ends_at_destination).to_numpy()]

# Keep every leg of the linked trips that start in ORIG_TRACT and end in DEST_TRACT.
df = df[df["linked_trip_id"].isin(valid_linked)]


#### 7. Attach network geometry (mode-aware)

In [10]:
# Load the link tables of the road (auto) network and of the transit (UTA) network.
auto_links = pd.read_csv(f"{BASE_DIR}/Salt_Lake/supplementInputs/network/auto-biggest-connected-graph/link.csv")
transit_links = pd.read_csv(f"{BASE_DIR}/Salt_Lake/supplementInputs/network/UTA/link with flow.csv")

# (from_node, to_node) -> link geometry, with node ids coerced to int.
# If a node pair occurs more than once, the last row wins.
auto_dict = dict(
    zip(
        zip(map(int, auto_links["from_osm_node_id"]), map(int, auto_links["to_osm_node_id"])),
        auto_links["geometry"],
    )
)
transit_dict = dict(
    zip(
        zip(map(int, transit_links["from_node_id"]), map(int, transit_links["to_node_id"])),
        transit_links["geometry"],
    )
)

def build_geometry(row):
    """Assemble a route ``LineString`` for one trip leg from its node sequence.

    ``row.route_taken`` is read as a comma-separated list of node ids; tokens that
    are not plain digit strings are ignored. Each consecutive node pair is looked
    up in ``auto_dict`` for car legs and in ``transit_dict`` for every other mode,
    and the WKT geometries of the links found are concatenated in route order.
    Pairs without a link, or with unreadable WKT, are skipped.

    Returns:
        A ``LineString``, or ``None`` when fewer than two nodes are listed or
        fewer than two coordinates could be collected.
    """
    nodes = [int(tok) for tok in str(row.route_taken).split(",") if tok.strip().isdigit()]
    if len(nodes) < 2:
        return None

    # Normalise the label so values such as "Car" or " car" still select the road network.
    mode = str(row.travel_mode).strip().lower()
    link_dict = auto_dict if mode == "car" else transit_dict

    coords = []
    for pair in zip(nodes[:-1], nodes[1:]):
        link_wkt = link_dict.get(pair)
        if link_wkt is None:
            continue
        try:
            coords.extend(wkt.loads(link_wkt).coords)
        except Exception:
            # Best effort: an unreadable link geometry is skipped. `Exception`
            # (not a bare except) lets KeyboardInterrupt stop a long df.apply run.
            continue
    return LineString(coords) if len(coords) > 1 else None

# Build one route geometry per leg, then keep only the legs where a geometry was produced.
route_geoms = df.apply(build_geometry, axis=1)
df["geometry"] = route_geoms
has_geometry = df["geometry"].notnull()
df = df[has_geometry]


#### 8. Aggregate per segment (trip_id)

#### 9. Export:
   - CSV (debug / archive)
   - JSON (dashboard)

In [11]:
# Write the filtered legs (one row per leg, without the index) to OUTPUT_CSV.
df.to_csv(OUTPUT_CSV, index=False)

In [13]:
import math

def is_finite(x):
    """Return True when ``x`` is a real number that is neither NaN nor infinite.

    Any value ``math.isfinite`` accepts qualifies, which includes NumPy integer
    and float32 scalars (these are not subclasses of ``int``/``float`` and were
    rejected by an ``isinstance`` check). ``None``, strings and other
    non-numeric objects return False, as do integers too large for a float.
    """
    if x is None:
        return False
    try:
        return math.isfinite(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: not a number (e.g. str); OverflowError: int too large for a float.
        return False

def clean_num(x):
    """Convert ``x`` to ``float`` if ``is_finite`` accepts it; otherwise return ``None``."""
    if not is_finite(x):
        return None
    return float(x)

def safe_decode_geohash(gh):
    """Decode ``gh`` into ``(lon, lat)`` without ever raising.

    Returns ``(None, None)`` when decoding raises or when either decoded
    coordinate is not a finite number.
    """
    failure = (None, None)
    try:
        latitude, longitude = pgh.decode(gh)
        if not (is_finite(latitude) and is_finite(longitude)):
            return failure
        return longitude, latitude
    except Exception:
        return failure

def build_route(geom, max_points=400, step=3):
    """Turn a line geometry into a JSON-ready list of ``[lat, lon]`` pairs.

    Args:
        geom: object exposing ``.coords`` as an iterable of ``(lon, lat[, z])``
            tuples, or ``None``.
        max_points: routes with more vertices than this are thinned.
        step: when thinning, every ``step``-th vertex is kept.

    Returns:
        A list of ``[lat, lon]`` float pairs, or ``None`` when ``geom`` is
        ``None`` or fewer than two finite vertices remain. Vertices with a
        non-finite coordinate are skipped.
    """
    if geom is None:
        return None

    coords = []
    for vertex in geom.coords:
        # Index instead of unpacking so 3-D (x, y, z) vertices do not raise.
        lon, lat = vertex[0], vertex[1]
        if not is_finite(lat) or not is_finite(lon):
            continue
        coords.append([float(lat), float(lon)])

    if len(coords) < 2:
        return None

    # Thin long routes for the demo.
    if len(coords) > max_points:
        end_point = coords[-1]
        end_is_kept = (len(coords) - 1) % step == 0
        coords = coords[::step]
        if not end_is_kept:
            # A plain stride can cut the final vertex, leaving the drawn route
            # short of its end; re-append it.
            coords.append(end_point)

    return coords

In [14]:
def _none_if_missing(value):
    """Return ``None`` for missing scalars (None, NaN, NaT, pd.NA); otherwise ``value`` unchanged.

    The JSON export is written with ``allow_nan=False``, so a NaN left in a text
    or id field (e.g. a leg without an access stop) would abort the export.
    """
    try:
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        # pd.isna() on a list-like returns an array whose truth value is
        # ambiguous; such values are not "missing", so pass them through.
        pass
    return value


samples = []

for r in df.itertuples():
    route = build_route(r.geometry)
    if route is None:
        continue  # legs without a usable route are dropped

    o_lon, o_lat = safe_decode_geohash(r.geohash7_orig)
    d_lon, d_lat = safe_decode_geohash(r.geohash7_dest)

    sample = {
        "id": str(r.trip_id),
        "mode": str(r.travel_mode).lower().strip(),

        "route": route,

        "duration_min": clean_num(r.duration_min),
        "network_distance_km": clean_num(r.network_distance),
        "route_distance_km": clean_num(r.route_distance),

        "origin": {
            "lon": o_lon,
            "lat": o_lat,
            "geohash": _none_if_missing(r.geohash7_orig)
        },
        "destination": {
            "lon": d_lon,
            "lat": d_lat,
            "geohash": _none_if_missing(r.geohash7_dest)
        },

        "access": {
            "stop_id": clean_num(r.access_stop_id),
            "stop_name": _none_if_missing(r.access_stop)
        },
        "egress": {
            "stop_id": clean_num(r.egress_stop_id),
            "stop_name": _none_if_missing(r.egress_stop)
        },

        "meta": {
            "linked_trip_id": _none_if_missing(r.linked_trip_id),
            "tour_id": _none_if_missing(r.tour_id),
            "purpose": _none_if_missing(r.trip_purpose),
            "weight": clean_num(r.trip_weight)
        }
    }

    samples.append(sample)


In [15]:
# Imported here because the notebook's import cell only brings in `datetime`.
from datetime import timezone

out = {
    "schema": "nova.complete_trip.sample.v1",
    # datetime.utcnow() is deprecated (Python 3.12+); take an aware UTC time and
    # drop the tzinfo so the text keeps the "<naive ISO timestamp>Z" form.
    "generated_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "count": len(samples),
    "samples": samples
}

# Serialise fully before opening the file: if allow_nan=False rejects a value,
# the error is raised before any existing OUTPUT_JSON is truncated.
payload = json.dumps(out, indent=2, allow_nan=False)
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    f.write(payload)


# old version below

In [None]:
import pandas as pd
import glob
from datetime import timedelta
import pygeohash as pgh
import geopandas as gpd
from shapely.geometry import Point

# Folder path
folder_path = 'C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General'

# Months
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Only load needed columns
use_cols = [
    'linked_trip_id', 'trip_id', 'travel_mode', 'local_datetime_start', 'local_datetime_end',
    'network_distance', 'geohash7_orig', 'geohash7_dest', 
    'route_taken','route_distance', 'route_speed',
    'access_stop_id','egress_stop_id',
    'trip_purpose'
]
# Travel modes of interest (defined here but not applied as a filter in this cell)
modes = {'car', 'bus', 'rail', 'walk/bike'}  # use set for faster membership check

# List to store DataFrames
df_list = []

for month in months:
    # Sorted so the concatenated row order is reproducible (glob order is arbitrary).
    files = sorted(glob.glob(f"{folder_path}/Salt_Lake/delivery/Salt_Lake-{month}-2020/*.snappy.parquet"))
    for file in files:
        df = pd.read_parquet(file, engine='pyarrow', columns=use_cols)
        # Normalise ids and the mode label to plain strings.
        df['linked_trip_id'] = df['linked_trip_id'].astype(str)
        df['trip_id'] = df['trip_id'].astype(str)
        df['travel_mode'] = df['travel_mode'].astype(str).str.lower().str.strip()

        # 3. Convert the time columns to datetime (unparseable values become NaT)
        df['local_datetime_start'] = pd.to_datetime(df['local_datetime_start'], errors='coerce')
        df['local_datetime_end'] = pd.to_datetime(df['local_datetime_end'], errors='coerce')

        # 4. Drop legs with invalid times (end not after start, or a missing timestamp).
        #    .copy() makes the filtered frame independent, so the assignments below
        #    do not raise SettingWithCopyWarning.
        df = df[df['local_datetime_end'] > df['local_datetime_start']].copy()

        # 5. Compute the duration in minutes
        df['duration_minutes'] = (df['local_datetime_end'] - df['local_datetime_start']).dt.total_seconds() / 60

        # 6. Clean the spatial columns (geohash)
        df['geohash7_orig'] = df['geohash7_orig'].astype(str).str.strip()
        df['geohash7_dest'] = df['geohash7_dest'].astype(str).str.strip()

        # 7. Clean the distance field (non-numeric values become NaN and are filtered out)
        df['network_distance'] = pd.to_numeric(df['network_distance'], errors='coerce')
        df = df[df['network_distance'] > 0]
        df_list.append(df)

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not df_list:
    raise FileNotFoundError(
        f"No '*.snappy.parquet' files found under {folder_path}/Salt_Lake/delivery for months {months}"
    )

all_df = pd.concat(df_list, ignore_index=True)



In [None]:
# ============ 2. Load the tract boundaries and match each geohash to a tract ============
tract_path = r"C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General/Manuscript/Figure/Visualization-RL/2-OD patterns by census track/six_counties_track.shp"
tracts = gpd.read_file(tract_path).to_crs("EPSG:4326")


def _geohash_to_point(gh):
    """Decode a geohash once and return it as a shapely Point in (lon, lat) order."""
    lat, lon = pgh.decode(gh)
    return Point(lon, lat)


# Step 1: build the origin and destination point layers
gdf_orig = gpd.GeoDataFrame(
    all_df[['geohash7_orig']],  # origin column only
    geometry=all_df['geohash7_orig'].apply(_geohash_to_point),
    crs="EPSG:4326"
)
gdf_orig.index = all_df.index  # keep the index aligned with all_df

gdf_dest = gpd.GeoDataFrame(
    all_df[['geohash7_dest']],
    geometry=all_df['geohash7_dest'].apply(_geohash_to_point),
    crs="EPSG:4326"
)
gdf_dest.index = all_df.index

# Step 2: spatial join against the tract shapefile
gdf_orig_joined = gpd.sjoin(gdf_orig, tracts[['GEOID', 'geometry']], how="left", predicate="within")
gdf_dest_joined = gpd.sjoin(gdf_dest, tracts[['GEOID', 'geometry']], how="left", predicate="within")

# Step 3: bring GEOID back into all_df by index alignment.
# A point inside more than one polygon appears several times in the join result;
# duplicate index labels would make the aligned assignment fail, so only the
# first match per point is used.
all_df['GEOID_orig'] = gdf_orig_joined.loc[~gdf_orig_joined.index.duplicated(keep='first'), 'GEOID']
all_df['GEOID_dest'] = gdf_dest_joined.loc[~gdf_dest_joined.index.duplicated(keep='first'), 'GEOID']


In [7]:
import pandas as pd
import glob
import pygeohash as pgh
from shapely.geometry import LineString
import geopandas as gpd
from datetime import datetime

# Âä†ËΩΩÂπ∂Ê∏ÖÊ¥óÊï∞ÊçÆÔºöÁï•Ôºà‰ΩøÁî®‰Ω†Â∑≤ÊúâÁöÑ all_dfÔºâ

# === Step 1: match the start and end tract of every linked trip ===
# Relies on `all_df` (with GEOID_orig / GEOID_dest) built by the earlier cells.
sorted_df = all_df.sort_values(['linked_trip_id', 'local_datetime_start'])

# groupby().first()/.last() return the first/last NON-NULL value per column, which
# can take a GEOID from a different leg when the boundary leg has none. Whole rows
# are taken instead: after the sort, the first/last occurrence of each
# linked_trip_id is its earliest/latest leg.
keyed_legs = sorted_df.dropna(subset=['linked_trip_id'])
first_trips = keyed_legs.drop_duplicates('linked_trip_id', keep='first').reset_index(drop=True)
last_trips = keyed_legs.drop_duplicates('linked_trip_id', keep='last').reset_index(drop=True)

merged = first_trips[['linked_trip_id', 'GEOID_orig']] \
    .merge(last_trips[['linked_trip_id', 'GEOID_dest']], on='linked_trip_id')

# Linked trips whose first leg starts in tract 49035114000 and whose last leg
# ends in tract 49035110106.
target_linked = merged[
    (merged['GEOID_orig'] == '49035114000') &
    (merged['GEOID_dest'] == '49035110106')
]['linked_trip_id'].unique()

target_df = sorted_df[sorted_df['linked_trip_id'].isin(target_linked)]

# === Step 2: identify trips that qualify as multimodal ===
def is_multimodal(modes):
    """Return True when the mode labels include both 'car' and 'walk/bike'.

    ``modes`` is any iterable of mode labels; order and repetition are ignored.
    """
    required = {'car', 'walk/bike'}
    return required.issubset(modes)

# Collect the linked trips to keep. The multimodal test is commented out, so
# every linked trip in target_df is currently kept.
filtered_trips = []
for ltid, group in target_df.groupby('linked_trip_id'):
    # Mode sequence in chronological order (only needed by the disabled check below).
    modes = group.sort_values('local_datetime_start')['travel_mode'].tolist()
    # if is_multimodal(modes):
    filtered_trips.append(ltid)

keep_mask = target_df['linked_trip_id'].isin(filtered_trips)
sample_df = target_df[keep_mask]

# === Step 3: build the result table (one row per linked trip) ===
output = []
for ltid, group in sample_df.groupby('linked_trip_id'):
    group = group.sort_values('local_datetime_start')
    mode_seq = '->'.join(group['travel_mode'])

    # Longitude/latitude sequence: origin and destination of every leg, in time order
    coords = []
    times = []
    for _, row in group.iterrows():
        try:
            o_lat, o_lng = pgh.decode(row['geohash7_orig'])
            d_lat, d_lng = pgh.decode(row['geohash7_dest'])
        except Exception:
            # Best effort: a leg whose geohash cannot be decoded is skipped.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
            continue
        coords.append((o_lng, o_lat))  # shapely: (x, y) = (lon, lat)
        coords.append((d_lng, d_lat))
        times.append(row['local_datetime_start'])
        times.append(row['local_datetime_end'])

    if len(coords) < 2:
        continue

    linestring = LineString(coords)
    times_str = '->'.join([t.strftime('%Y-%m-%d %H:%M:%S') for t in times])

    output.append({
        'linked_trip_id': ltid,
        'mode_sequence': mode_seq,
        'geometry_wkt': linestring.wkt,
        'time_sequence': times_str
    })

# === Step 4: save the result ===
output_df = pd.DataFrame(output)
output_df.to_csv('multimodal_trip_sample.csv', index=False)


# specific sample

In [None]:
import pandas as pd
import os
import glob
import pygeohash as pgh
from shapely.geometry import LineString
from shapely import wkt as shapely_wkt
import ast

# === Settings ===
folder_path = "C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General"
# linked_trip_id values to extract; the trailing comments give the month each one is from.
target_ids = [
    "KWzJGrJMlbBXGBdYWd51rRYNrlVBRNYJQLwGZylWa4lZeKq0oP0LQE-rVzWQ05jPj2X9LLBQXpAvdqLrg5QX",  # Jan
    "RLaKOp9YB2WDo44DoYLLKbeKjqzoWAz9J7vM7wEJxEDBwZExB262rO-1B1XElqEwldbAnxYQlwvwAqDx1xJg",  # Feb
]

# === Load all Jan and Feb data ===
months = ['Jan', 'Feb']
# No time column is read here, so later steps see the legs in file order.
use_cols = ['linked_trip_id', 'trip_id', 'geohash7_orig', 'geohash7_dest', 'route_taken', 'travel_mode']
df_list = []

for month in months:
    files = glob.glob(os.path.join(folder_path, f"Salt_Lake/delivery/Salt_Lake-{month}-2020/*.snappy.parquet"))
    for file in files:
        df = pd.read_parquet(file, engine='pyarrow', columns=use_cols)
        # Keep only the legs of the requested linked trips (may be empty for a given file).
        df = df[df['linked_trip_id'].isin(target_ids)]
        df_list.append(df)

# NOTE(review): pd.concat raises if no parquet file was found (df_list empty).
target_df = pd.concat(df_list, ignore_index=True)

# === OD coordinate columns ===
def decode_geohash_pair(row):
    """Decode a row's origin and destination geohashes.

    Returns a 4-element ``pd.Series`` ``[orig_lon, orig_lat, dest_lon, dest_lat]``,
    or four ``None`` values when either geohash cannot be decoded.
    """
    try:
        lat1, lon1 = pgh.decode(row['geohash7_orig'])
        lat2, lon2 = pgh.decode(row['geohash7_dest'])
    except Exception:
        # Best effort for undecodable geohashes; `Exception` (not a bare except)
        # lets KeyboardInterrupt / SystemExit propagate.
        return pd.Series([None, None, None, None])
    return pd.Series([lon1, lat1, lon2, lat2])

# Row-wise decode; the four values of each returned Series become the four new columns.
target_df[['orig_lon', 'orig_lat', 'dest_lon', 'dest_lat']] = target_df.apply(decode_geohash_pair, axis=1)

# === Load the networks and build mode-specific lookup dictionaries ===
# Each dictionary maps (from_node, to_node), coerced to int, to the link's
# geometry value; if a node pair occurs more than once, the last row wins.
link_car = pd.read_csv(os.path.join(folder_path, "Salt_Lake/supplementInputs/network/auto-biggest-connected-graph/link.csv"))
link_car_dict = dict(
    zip(
        zip(map(int, link_car['from_osm_node_id']), map(int, link_car['to_osm_node_id'])),
        link_car['geometry'],
    )
)

link_transit = pd.read_csv(os.path.join(folder_path, "Salt_Lake/supplementInputs/network/UTA/link with flow.csv"))
link_transit_dict = dict(
    zip(
        zip(map(int, link_transit['from_node_id']), map(int, link_transit['to_node_id'])),
        link_transit['geometry'],
    )
)

# === Build the trip-level summary (one geometry per linked trip) ===
output_rows = []
for ltid, group in target_df.groupby('linked_trip_id'):
    # Coordinates of all matched links of all legs, concatenated in the row order
    # of target_df (this cell loads no time column, so legs are not re-sorted by time).
    all_geoms = []
    for _, row in group.iterrows():
        try:
            # Parse the node sequence. isdigit() already rejects "-1", so the
            # explicit != -1 test is only a safeguard.
            nodes = [int(n.strip()) for n in row['route_taken'].split(',') if n.strip().isdigit() and int(n.strip()) != -1]
            if len(nodes) < 2:
                continue

            # Pick the link dictionary that matches the leg's mode
            mode = row['travel_mode'].lower()
            if mode == 'car':
                link_dict = link_car_dict
            elif mode in ['bus', 'rail']:
                link_dict = link_transit_dict
            else:
                continue  # skip walk/bike/air etc.

            # Look up the geometry of every consecutive node pair; pairs without a link are skipped
            for i in range(len(nodes)-1):
                pair = (nodes[i], nodes[i+1])
                if pair in link_dict:
                    try:
                        linestring = shapely_wkt.loads(link_dict[pair])
                        all_geoms.extend(list(linestring.coords))
                    except Exception as e:
                        # A link whose WKT cannot be parsed is reported and skipped.
                        print(f"‚ùå WKT parse error for pair {pair}: {e}")
        except Exception as e:
            # Any other failure on this leg (e.g. a non-string route_taken) is
            # reported and the leg is skipped.
            print(f"‚ùå Error in trip {row['trip_id']}: {e}")
            continue

    # Only linked trips with at least one matched link get an output row.
    if all_geoms:
        full_linestring = LineString(all_geoms)
        output_rows.append({
            'linked_trip_id': ltid,
            'trip_count': len(group),
            'full_geometry_wkt': full_linestring.wkt
        })

# === Merge the results and save ===
if output_rows:
    output_df = pd.DataFrame(output_rows)
    # Left merge: every leg keeps its row; trip_count and full_geometry_wkt are
    # repeated on each leg of the same linked trip.
    target_df = target_df.merge(output_df, on='linked_trip_id', how='left')
else:
    print("‚ö†Ô∏è No link geometries were matched. Adding empty column.")
    # Fallback adds only full_geometry_wkt (no trip_count column in this branch).
    target_df['full_geometry_wkt'] = None

target_df.to_csv("selected_trips_with_geometry.csv", index=False)
print("‚úÖ Done. File saved as selected_trips_with_geometry.csv.")


‚úÖ Done. File saved as selected_trips_with_geometry.csv.


In [None]:
import pandas as pd
import glob
import os

# ËÆæÁΩÆË∑ØÂæÑÂíå ID
folder_path = "C:/Users/rli04/Villanova University/Complete-trip-coordinate - Documents/General"
# linked_trip_id values to extract; the trailing comments give the month each one is from.
target_ids = [
    "KWzJGrJMlbBXGBdYWd51rRYNrlVBRNYJQLwGZylWa4lZeKq0oP0LQE-rVzWQ05jPj2X9LLBQXpAvdqLrg5QX",  # Jan
    "RLaKOp9YB2WDo44DoYLLKbeKjqzoWAz9J7vM7wEJxEDBwZExB262rO-1B1XElqEwldbAnxYQlwvwAqDx1xJg",  # Feb
]
months = ['Jan', 'Feb']

# Read the data and keep only the requested linked trips
df_list = []
for month in months:
    files = glob.glob(os.path.join(folder_path, f"Salt_Lake/delivery/Salt_Lake-{month}-2020/*.snappy.parquet"))
    for file in files:
        # No `columns=` argument: every column of the file is read.
        df = pd.read_parquet(file)
        filtered = df[df['linked_trip_id'].isin(target_ids)]
        # Files without a matching leg contribute nothing.
        if not filtered.empty:
            df_list.append(filtered)

# Concatenate and save (nothing is written when no leg matched)
if df_list:
    result_df = pd.concat(df_list, ignore_index=True)
    result_df.to_csv("selected_linked_trips_jan_feb.csv", index=False)
    print("‚úÖ Saved to selected_linked_trips_jan_feb.csv")
else:
    print("‚ö†Ô∏è No matching linked_trip_id found in Jan or Feb.")


‚úÖ Saved to selected_linked_trips_jan_feb.csv
