In [4]:
import requests
import time
import pandas as pd
from datetime import datetime
from tabulate import tabulate
import concurrent.futures
import threading
import signal
import sys
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

In [5]:
# KMB API configuration: base endpoint and the route tracked by this notebook
kmb_base_url = "https://data.etabus.gov.hk/v1/transport/kmb"
kmb_route = "272A"

# Citybus API configuration: base endpoint, company code and the tracked route
citybus_base_url = "https://rt.data.gov.hk/v2/transport/citybus"
citybus_company_id = "CTB"
citybus_route = "582"

In [6]:
def kmb_route_info(direction, service_type):
  """Request the KMB route record for the configured route.

  Args:
    direction: Direction segment inserted into the request URL.
    service_type: Service-type segment inserted into the request URL.

  Returns:
    The ``data`` field of the JSON reply ("未知路线" when the field is absent),
    or ``None`` when the request or the decoding raises.
  """
  endpoint = f"{kmb_base_url}/route/{kmb_route}/{direction}/{service_type}"
  try:
    resp = requests.get(endpoint, timeout=10)
    resp.raise_for_status()
    payload = resp.json()
    result = payload.get('data', "未知路线")
  except Exception as e:
    # Best effort: report the problem and signal failure with None
    print(f"获取KMB路线信息失败: {e}")
    result = None
  return result

In [7]:
def kmb_stopid_info(stop_id):
  """Look up the name and coordinates of one KMB stop.

  Args:
    stop_id: Stop identifier inserted into the request URL.

  Returns:
    ``[name_tc, name_en, lat, long]`` taken from the reply's ``data`` object,
    with 'Unknown stop' substituted for any missing field, or ``None`` when
    the request or the decoding raises.
  """
  wanted_fields = ('name_tc', 'name_en', 'lat', 'long')
  url = f"{kmb_base_url}/stop/{stop_id}"
  try:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    stop = resp.json().get('data', {})
    return [stop.get(field, 'Unknown stop') for field in wanted_fields]
  except Exception as e:
    print(f"获取KMB站点 {stop_id} 名称失败: {e}")
    return None

In [8]:
def kmb_stop_info(direction, service_type):
  """List the stops of the configured KMB route and fetch their details.

  Args:
    direction: Direction segment inserted into the request URL.
    service_type: Service-type segment inserted into the request URL.

  Returns:
    ``(stop_ids, details)`` where ``stop_ids`` keeps the order of the reply and
    ``details`` maps each stop id to the result of ``kmb_stopid_info``.
    ``(None, None)`` when the request or the processing raises.
  """
  url = f"{kmb_base_url}/route-stop/{kmb_route}/{direction}/{service_type}"
  try:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    entries = resp.json().get('data', [])
    stop_ids = []
    for entry in entries:
      stop_ids.append(entry['stop'])
    # One detail request per listed stop
    details = {}
    for sid in stop_ids:
      details[sid] = kmb_stopid_info(sid)
    return stop_ids, details
  except Exception as e:
    print(f"获取KMB站点ID失败: {e}")
    return None, None

In [9]:
def kmb_eta_info(stop_id, service_type):
  """Fetch the ETA records of the configured KMB route at one stop.

  The ``eta`` and ``data_timestamp`` fields are reformatted independently from
  ISO 8601 text to 'YYYY-MM-DD HH:MM'. A field that is null or missing stays
  ``None``, so one absent value no longer discards the whole response.

  Args:
    stop_id: Stop identifier inserted into the request URL.
    service_type: Service-type segment inserted into the request URL.

  Returns:
    A list of dicts restricted to the keys route, dir, service_type, seq, eta,
    eta_seq and data_timestamp (``None`` for keys absent from the reply), or
    ``None`` when the request or the parsing raises.
  """
  def _to_minute(iso_text):
    # Empty / null values are passed through untouched
    if not iso_text:
      return iso_text
    return datetime.fromisoformat(iso_text).strftime('%Y-%m-%d %H:%M')

  url = f"{kmb_base_url}/eta/{stop_id}/{kmb_route}/{service_type}"
  try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    data = response.json().get('data', [])
    for eta in data:
      eta['eta'] = _to_minute(eta.get('eta'))
      eta['data_timestamp'] = _to_minute(eta.get('data_timestamp'))
    selected_keys = ['route', 'dir', 'service_type', 'seq', 'eta', 'eta_seq', 'data_timestamp']
    return [{k: item.get(k) for k in selected_keys} for item in data]
  except Exception as e:
    print(f"获取KMB站点 {stop_id} 的ETA信息失败: {e}")
    return None

In [10]:
def citybus_route_info():
  """Request the Citybus route record for the configured company and route.

  Returns:
    The ``data`` field of the JSON reply ("未知路线" when the field is absent),
    or ``None`` when the request or the decoding raises.
  """
  endpoint = f"{citybus_base_url}/route/{citybus_company_id}/{citybus_route}"
  try:
    resp = requests.get(endpoint, timeout=10)
    resp.raise_for_status()
    payload = resp.json()
    result = payload.get('data', "未知路线")
  except Exception as e:
    # Best effort: report the problem and signal failure with None
    print(f"获取Citybus路线信息失败: {e}")
    result = None
  return result

In [11]:
def citybus_bus_stop_info(stop_id):
  """Look up the name and coordinates of one Citybus stop.

  Args:
    stop_id: Stop identifier inserted into the request URL.

  Returns:
    ``[name_tc, name_en, lat, long]`` taken from the reply's ``data`` object,
    with 'Unknown stop' substituted for any missing field, or ``None`` when
    the request or the decoding raises.
  """
  wanted_fields = ('name_tc', 'name_en', 'lat', 'long')
  url = f"{citybus_base_url}/stop/{stop_id}"
  try:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    stop = resp.json().get('data', {})
    return [stop.get(field, 'Unknown stop') for field in wanted_fields]
  except Exception as e:
    print(f"获取Citybus站点 {stop_id} 名称失败: {e}")
    return None

In [12]:
def citybus_stop_info(direction):
  """List the stops of the configured Citybus route and fetch their details.

  Args:
    direction: Direction segment inserted into the request URL.

  Returns:
    ``(stop_ids, details)`` where ``stop_ids`` is ordered by each entry's
    ``seq`` value (0 when absent) and ``details`` maps each stop id to the
    result of ``citybus_bus_stop_info``. ``(None, None)`` when the request or
    the processing raises.
  """
  url = f"{citybus_base_url}/route-stop/{citybus_company_id}/{citybus_route}/{direction}"
  try:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    entries = list(resp.json().get('data', []))
    entries.sort(key=lambda entry: entry.get('seq', 0))
    stop_ids = [entry['stop'] for entry in entries]
    # One detail request per listed stop
    details = {}
    for sid in stop_ids:
      details[sid] = citybus_bus_stop_info(sid)
    return stop_ids, details
  except Exception as e:
    print(f"获取Citybus站点ID失败: {e}")
    return None, None

In [13]:
def citybus_eta_info(stop_id):
  """Fetch the ETA records of the configured Citybus route at one stop.

  The ``eta`` and ``data_timestamp`` fields are reformatted independently from
  ISO 8601 text to 'YYYY-MM-DD HH:MM'. A field that is empty, null or missing
  is passed through unchanged, so one absent value no longer discards the
  whole response.

  Args:
    stop_id: Stop identifier inserted into the request URL.

  Returns:
    A list of dicts restricted to the keys route, dir, seq, stop, eta, eta_seq
    and data_timestamp (``None`` for keys absent from the reply), or ``None``
    when the request or the parsing raises.
  """
  def _to_minute(iso_text):
    # Empty / null values are passed through untouched
    if not iso_text:
      return iso_text
    return datetime.fromisoformat(iso_text).strftime('%Y-%m-%d %H:%M')

  url = f"{citybus_base_url}/eta/{citybus_company_id}/{stop_id}/{citybus_route}"
  try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    data = response.json().get('data', [])
    for eta in data:
      eta['eta'] = _to_minute(eta.get('eta'))
      eta['data_timestamp'] = _to_minute(eta.get('data_timestamp'))
    selected_keys = ['route', 'dir', 'seq', 'stop', 'eta', 'eta_seq', 'data_timestamp']
    return [{k: item.get(k) for k in selected_keys} for item in data]
  except Exception as e:
    print(f"获取Citybus站点 {stop_id} 的ETA信息失败: {e}")
    return None

In [14]:
def collect_eta_dataframe(df, stop_id, eta_func, stops_name_id, service_type=None, lock=None, source=''):
  """Fetch the ETAs of one stop and merge them into the shared frame ``df``.

  A fetched record is matched against existing rows with the same route, dir,
  seq and stop id. If the first existing row whose ETA lies within 3 minutes
  is found, that row's ETA is overwritten with the new value; otherwise the
  record is appended as a new row. Records without an ETA are skipped.

  The network call happens outside the lock; every read and write of ``df``
  happens while holding it, because several worker threads share the frame.

  Args:
    df: Frame with the columns route, dir, seq, stop_id, stop_name, eta,
      eta_seq, data_timestamp and source. Modified in place.
    stop_id: Stop to query; also used as the stop id of records that carry no
      'stop' key of their own.
    eta_func: Callable returning a list of ETA dicts. Called as
      ``eta_func(stop_id, service_type)`` when ``service_type`` is truthy,
      else as ``eta_func(stop_id)``.
    stops_name_id: Mapping of stop id to a list whose first item is the name.
    service_type: Optional second argument for ``eta_func``.
    lock: Lock guarding ``df``. When ``None`` a private lock is used, which
      only makes the call work single-threaded.
    source: Label written to the 'source' column and used in messages.

  Returns:
    The same ``df`` object.
  """
  if stop_event.is_set():
    return df

  # print(f"正在获取{source}站点{stop_id}的ETA信息。。。")
  eta_data = eta_func(stop_id, service_type) if service_type else eta_func(stop_id)
  if not eta_data:
    print(f"获取{source}站点{stop_id}的ETA信息失败")
    return df

  if lock is None:
    lock = threading.Lock()

  eta_format = '%Y-%m-%d %H:%M'
  with lock:
    records_to_append = []
    for eta_record in eta_data:
      eta_text = eta_record.get('eta')
      if not eta_text:
        # Record without a prediction: nothing to store or compare
        continue
      record_stop_id = eta_record.get('stop', stop_id)
      # A failed name lookup may have stored None for this stop
      name_entry = stops_name_id.get(record_stop_id) or ['未知站点']
      new_eta = datetime.strptime(eta_text, eta_format)

      matching_rows = df[
        (df['route'] == eta_record['route']) &
        (df['dir'] == eta_record['dir']) &
        (df['seq'] == eta_record['seq']) &
        (df['stop_id'] == record_stop_id)
      ]
      merged = False
      for index, row in matching_rows.iterrows():
        existing_eta = datetime.strptime(row['eta'], eta_format)
        if abs((new_eta - existing_eta).total_seconds()) / 60 <= 3:
          # Same bus, refreshed prediction (a no-op when the value is equal)
          df.loc[index, 'eta'] = eta_text
          merged = True
          break
      if not merged:
        records_to_append.append({
          'route': eta_record['route'],
          'dir': eta_record['dir'],
          'seq': eta_record['seq'],
          'stop_id': record_stop_id,
          'stop_name': name_entry[0],
          'eta': eta_text,
          'eta_seq': eta_record['eta_seq'],
          'data_timestamp': eta_record['data_timestamp'],
          'source': source
        })
    # New rows are added only after the whole batch was compared, so records
    # of one response are never merged into each other.
    for record in records_to_append:
      df.loc[len(df)] = record
  return df

In [15]:
def kmb_fetch_loop():
  """Poll the ETAs of every KMB stop until ``stop_event`` is set.

  Each round submits one ``collect_eta_dataframe`` task per stop id in
  ``kmb_stops_id_list`` to a thread pool, waits for all of them, reports any
  exception a task raised, and then waits ``interval`` seconds. The wait is
  done on ``stop_event`` so a stop request ends the loop without delay. An
  empty stop list skips the pool (a pool cannot be created with 0 workers).
  """
  while not stop_event.is_set():
    stop_ids = list(kmb_stops_id_list)
    if stop_ids:
      with concurrent.futures.ThreadPoolExecutor(max_workers=len(stop_ids)) as executor:
        futures = {
          executor.submit(collect_eta_dataframe, kmb_eta_df, stop_id, kmb_eta_info, kmb_stops_name_id, kmb_service_type, kmb_lock, 'KMB'): stop_id
          for stop_id in stop_ids
        }
        for future in concurrent.futures.as_completed(futures):
          error = future.exception()
          if error is not None:
            # Surface worker failures instead of dropping them with the future
            print(f"处理KMB站点{futures[future]}的ETA数据时出错: {error}")
    stop_event.wait(interval)

In [16]:
def citybus_fetch_loop():
  """Poll the ETAs of every Citybus stop until ``stop_event`` is set.

  Each round submits one ``collect_eta_dataframe`` task per stop id in
  ``citybus_stops_id_list`` to a thread pool, waits for all of them, reports
  any exception a task raised, and then waits ``interval`` seconds. The wait
  is done on ``stop_event`` so a stop request ends the loop without delay. An
  empty stop list skips the pool (a pool cannot be created with 0 workers).
  """
  while not stop_event.is_set():
    stop_ids = list(citybus_stops_id_list)
    if stop_ids:
      with concurrent.futures.ThreadPoolExecutor(max_workers=len(stop_ids)) as executor:
        futures = {
          executor.submit(collect_eta_dataframe, citybus_eta_df, stop_id, citybus_eta_info, citybus_stops_name_id, None, citybus_lock, 'Citybus'): stop_id
          for stop_id in stop_ids
        }
        for future in concurrent.futures.as_completed(futures):
          error = future.exception()
          if error is not None:
            # Surface worker failures instead of dropping them with the future
            print(f"处理Citybus站点{futures[future]}的ETA数据时出错: {error}")
    stop_event.wait(interval)

In [17]:
# Interrupt-signal handling
def signal_handler(sig, frame):
  """Signal callback: announce the shutdown and raise the shared stop flag.

  Both arguments are accepted for the handler signature and are not used.
  """
  print("\n正在关闭程序...")
  # Worker loops check this event and leave once it is set
  stop_event.set()

In [18]:
# Fetch stop metadata for both operators

# KMB: query both directions of every service type and pool the stops.
# A failed request yields None, which is treated as "no stops".
kmb_service_type_num = 4
kmb_stops_name_id = {}
for i in range(1, kmb_service_type_num + 1):
  stops_in, stops_name_in = kmb_stop_info("inbound", i)
  stops_out, stops_name_out = kmb_stop_info("outbound", i)
  kmb_stops_name_id.update(stops_name_in or {})
  kmb_stops_name_id.update(stops_name_out or {})

display(pd.DataFrame(kmb_stops_name_id).T)
print(f"站点数量: {len(kmb_stops_name_id)} \n")
kmb_stops_id_list = list(kmb_stops_name_id.keys())

# Citybus: one request per direction; guard against a failed request (None)
# the same way as for KMB instead of crashing on the dict merge.
citybus_stops_in, citybus_stops_name_in = citybus_stop_info("inbound")
citybus_stops_out, citybus_stops_name_out = citybus_stop_info("outbound")
citybus_stops_name_id = {**(citybus_stops_name_in or {}), **(citybus_stops_name_out or {})}

display(pd.DataFrame(citybus_stops_name_id).T)
print(f"站点数量: {len(citybus_stops_name_id)} \n")
citybus_stops_id_list = list(citybus_stops_name_id.keys())

Unnamed: 0,0,1,2,3
9F542D4B6CF41651,大學站 (ST905),UNIVERSITY STATION (ST905),22.412678,114.210665
739A5DDE0CF1970C,馬料水公眾碼頭 (PA100),MA LIU SHUI PUBLIC PIER (PA100),22.415908,114.213778
9E95343C77BB85E2,水上活動中心 (PA104),WATER SPORTS CENTRE (PA104),22.420023,114.21425
F808AF482CCA028E,香港生物科技研究院 (PA105),HONG KONG INSTITUTE OF BIOTECHNOLOGY (PA105),22.422518,114.213925
BE940A2F1B154D6E,科學園(第一期) (PA120),SCIENCE PARK (PHASE 1) (PA120),22.424247,114.211432
E8F5E085BAEFE100,科研路 (PA124),FO YIN ROAD (PA124),22.426628,114.208027
3313B37BF82AFF18,雲滙 (PA125),St MARTIN (PA125),22.429561,114.203338
F8F0E91589F099CD,白石角 (PA141),PAK SHEK KOK (PA141),22.431368,114.201986
5747450359C7907C,逸瓏灣 (PA150),Mayfair by the Sea (PA150),22.431675,114.203753
D31420E8C08168FC,博研路 (PA161),POK YIN ROAD (PA161),22.42784,114.202526


站点数量: 20 



Unnamed: 0,0,1,2,3
3831,"西沙GO PARK, 海映路","GO PARK Sai Sha, Hoi Ying Road",22.427089812091,114.26587060053
3832,"田寮, 西沙路","Tin Liu, Sai Sha Road",22.425568932091,114.26271681053
3833,"官坑, 西沙路","Kwun Hang, Sai Sha Road",22.428665132091,114.26088640053
3834,"西澳, 西沙路","Sai O, Sai Sha Road",22.430695922091,114.25409589053
3457,"樟木頭帝琴灣, 西沙路","Symphony Bay Cheung Muk Tau, Sai Sha Road",22.430093422091,114.25115679053
3841,大學站,University Station,22.412433952091,114.21104821053
3743,"馬料水公眾碼頭, 科學園路","Ma Liu Shui Public Pier, Science Park Road",22.415903682091,114.21369382053
3744,"水上活動中心, 科學園路","Water Sports Centre, Science Park Road",22.419991972091,114.21418574053
3745,"香港生物科技研究院, 科學園路","Hong Kong Institute of Biotechnology, Science ...",22.422587832091,114.21398385053
3839,"香港科學園, 科技大道西","Hong Kong Science Park, Science Park West Avenue",22.424808412091,114.21211687053


站点数量: 25 



In [19]:
# Main program: poll both operators in background threads and merge the
# collected rows into combined_df every `interval` seconds.

stop_event = threading.Event()
# Keep the previous SIGINT handler so it can be put back when this cell ends;
# otherwise later kernel interrupts would only set stop_event.
previous_sigint_handler = signal.signal(signal.SIGINT, signal_handler)

# Shared frames and the locks that guard them
columns = ['route', 'dir', 'seq', 'stop_id', 'stop_name', 'eta', 'eta_seq', 'data_timestamp', 'source']
kmb_eta_df = pd.DataFrame(columns=columns)
citybus_eta_df = pd.DataFrame(columns=columns)
kmb_lock = threading.Lock()
citybus_lock = threading.Lock()

# Manual parameters
interval = 10  # polling interval in seconds
kmb_service_type = 1  # service type passed to the KMB ETA requests
run_duration = 20  # run time in seconds (e.g. 3 * 3600 for three hours)

combined_df = pd.DataFrame(columns=columns)

# Start the worker threads
kmb_thread = threading.Thread(target=kmb_fetch_loop, daemon=True)
citybus_thread = threading.Thread(target=citybus_fetch_loop, daemon=True)
kmb_thread.start()
citybus_thread.start()

# Timer that requests the stop after run_duration seconds
timer = threading.Timer(run_duration, stop_event.set)
timer.start()

try:
  # Event.wait returns True as soon as the stop is requested, so the loop
  # ends immediately instead of finishing a full sleep first.
  while not stop_event.wait(interval):
    with kmb_lock:
      kmb_copy = kmb_eta_df.copy()
    with citybus_lock:
      citybus_copy = citybus_eta_df.copy()
    combined_df = pd.concat([kmb_copy, citybus_copy], axis=0, ignore_index=True)
    print("\n合并后的DataFrame内容：")
    print(tabulate(combined_df, headers='keys', tablefmt='psql', showindex=False))
    print(f"\n 数据数量：{len(combined_df)}")
    print(f"\n下一次合并等待 {interval} 秒...")

except KeyboardInterrupt:
  print("\nInterrupted")
  stop_event.set()  # Signal threads to stop

finally:
  # Cancel the timer (relevant when the run was interrupted manually)
  timer.cancel()

  # Give the daemon threads a moment to finish their current round
  kmb_thread.join(timeout= 5)
  citybus_thread.join(timeout= 5)

  # Put the original interrupt handler back
  if previous_sigint_handler is not None:
    signal.signal(signal.SIGINT, previous_sigint_handler)

# Final snapshot so rows collected after the last periodic merge are kept
with kmb_lock:
  kmb_copy = kmb_eta_df.copy()
with citybus_lock:
  citybus_copy = citybus_eta_df.copy()
combined_df = pd.concat([kmb_copy, citybus_copy], axis=0, ignore_index=True)

print("程序已关闭。")

  matching_rows = df[match_condition]


获取KMB站点730AEBA1D2D8B20E的ETA信息失败
获取KMB站点A9459D38A4A41F36的ETA信息失败

合并后的DataFrame内容：
+---------+-------+-------+------------------+------------------------------+------------------+-----------+------------------+----------+
| route   | dir   |   seq | stop_id          | stop_name                    | eta              |   eta_seq | data_timestamp   | source   |
|---------+-------+-------+------------------+------------------------------+------------------+-----------+------------------+----------|
| 272A    | O     |     5 | BE940A2F1B154D6E | 科學園(第一期) (PA120)       | 2025-09-01 17:48 |         1 | 2025-09-01 17:41 | KMB      |
| 272A    | O     |     5 | BE940A2F1B154D6E | 科學園(第一期) (PA120)       | 2025-09-01 18:03 |         2 | 2025-09-01 17:41 | KMB      |
| 272A    | O     |     5 | BE940A2F1B154D6E | 科學園(第一期) (PA120)       | 2025-09-01 18:18 |         3 | 2025-09-01 17:41 | KMB      |
| 272A    | O     |     3 | 9E95343C77BB85E2 | 水上活動中心 (PA104)         | 2025-09-01 17:47 |         1 |

In [21]:
# Save the combined ETA table to a CSV file (no index column, UTF-8 with BOM)
csv_path = 'current_eta_data.csv'
combined_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
print(f"数据已保存到 {csv_path}")

数据已保存到 current_eta_data.csv


In [None]:
# Reload the saved table. route and stop_id are identifiers, not numbers: read
# them as text so values such as '003838' keep their leading zeros and the
# string comparisons in later cells ('582', '003841', ...) keep matching.
combined_df = pd.read_csv('current_eta_data.csv', dtype={'route': str, 'stop_id': str})

In [None]:
# Maps a KMB stop_id to the Citybus stop_id treated as the same physical stop.
# The dict order is relied on by later cells to order the stops in tables.
stop_id_mapping = {
    '9F542D4B6CF41651':'003841',  # University Station (ST905, outward trip) - University Station (inbound)
    '739A5DDE0CF1970C':'003743',  # Ma Liu Shui Public Pier (PA100)
    '9E95343C77BB85E2':'003744',  # Water Sports Centre (PA104)
    'F808AF482CCA028E':'003745',  # Hong Kong Institute of Biotechnology (PA105)
    '3313B37BF82AFF18':'003748',  # St Martin (PA125)
    'F8F0E91589F099CD':'003838',  # Pak Shek Kok (PA141)
    # '730AEBA1D2D8B20E':'003839',  # Hong Kong Science Park (PA112)
    # 'A9459D38A4A41F36':'003840',  # Hong Kong Science Park Phase 3 (PA115)
    '3F24CFF9046300D9':'003736',  # St Martin (PA206, return trip) - St Martin, Chong San Road (outbound)
    'B34F59A0270AEDA4':'003737',  # Chong San Road (PA212) - 海日灣II, Chong San Road (outbound)
    '39E7051B17D302DA':'003738',  # Fo Yin Road (PA214)
    '1C6EAAF5F48167F9':'003739',  # Science Park (Phase 1) (PA219)
    'DA9490C24D6E6026':'003740',  # Hong Kong Institute of Biotechnology (PA230)
    '64101F297D3C1C55':'003741',  # Water Sports Centre (PA234)
    '5B39CC28607910E4':'003742',  # Ma Liu Shui Public Pier (PA237)
    'EC5018363D5C45EB':'003841O'  # University Station (ST900)
}

# KMB stop ids whose rows get their 'dir' relabelled to 'I' in a later cell
inbound_kmb_ids = [
    '9F542D4B6CF41651',
    '739A5DDE0CF1970C',
    '9E95343C77BB85E2',
    'F808AF482CCA028E',
    'BE940A2F1B154D6E',
    'E8F5E085BAEFE100',
    '3313B37BF82AFF18',
    '730AEBA1D2D8B20E',
    'A9459D38A4A41F36'
]

In [None]:
# Order the rows by route, stop sequence, stop and predicted arrival
sort_keys = ['route', 'seq', 'stop_id', 'eta']
combined_df = combined_df.sort_values(by=sort_keys)

print(len(combined_df))

980


In [None]:
# Collapse repeated predictions of the same arrival for KMB and Citybus.
# Per-route tolerance in minutes: two ETAs of one stop that differ by no more
# than this are treated as the same bus.
ETA_MERGE_THRESHOLD_MIN = {'272A': 4, '582': 13}


def merge_close_etas(eta_df, thresholds, default_threshold=0):
    """Merge rows of one stop whose ETAs lie within the route's tolerance.

    Rows are processed in their current order. A row is compared with the rows
    already kept for the same (route, dir, seq, stop_id); the first kept row
    within the tolerance has its ETA replaced by the new one, otherwise the
    row is kept as a new entry.

    Args:
        eta_df: Frame with at least route, dir, seq, stop_id and eta columns.
            Not modified.
        thresholds: Mapping of route (as text) to tolerance in minutes.
        default_threshold: Tolerance for routes missing from ``thresholds``;
            0 merges exact duplicates only.

    Returns:
        A new frame with a fresh integer index and ``eta`` formatted as
        'YYYY-MM-DD HH:MM'.
    """
    frame = eta_df.copy()
    frame['eta'] = pd.to_datetime(frame['eta'])

    kept_rows = []   # output rows, in insertion order
    by_stop = {}     # (route, dir, seq, stop_id) -> kept rows of that stop
    for row in frame.to_dict('records'):
        key = (row['route'], row['dir'], row['seq'], row['stop_id'])
        threshold = thresholds.get(str(row['route']), default_threshold)
        bucket = by_stop.setdefault(key, [])
        for kept in bucket:
            minutes_apart = abs((row['eta'] - kept['eta']).total_seconds()) / 60
            if minutes_apart <= threshold:
                # Within tolerance: refresh the stored prediction
                kept['eta'] = row['eta']
                break
        else:
            # Outside tolerance for every kept row: this is a new arrival
            bucket.append(row)
            kept_rows.append(row)

    # Build the frame once instead of concatenating inside the loop
    merged = pd.DataFrame(kept_rows, columns=frame.columns)
    merged['eta'] = pd.to_datetime(merged['eta']).dt.strftime('%Y-%m-%d %H:%M')
    return merged


combined_df = merge_close_etas(combined_df, ETA_MERGE_THRESHOLD_MIN)
display(combined_df)

  update_df = pd.concat([update_df, pd.DataFrame([row])], ignore_index=True)


Unnamed: 0,route,dir,seq,stop_id,stop_name,eta,eta_seq,data_timestamp,source
0,272A,O,1,9F542D4B6CF41651,大學站 (ST905),2025-08-19 15:00,1,2025-08-19 14:58,KMB
1,272A,O,1,9F542D4B6CF41651,大學站 (ST905),2025-08-19 15:15,2,2025-08-19 14:58,KMB
2,272A,O,1,9F542D4B6CF41651,大學站 (ST905),2025-08-19 15:30,3,2025-08-19 14:58,KMB
3,272A,O,1,9F542D4B6CF41651,大學站 (ST905),2025-08-19 15:45,3,2025-08-19 15:08,KMB
4,272A,O,1,9F542D4B6CF41651,大學站 (ST905),2025-08-19 16:00,3,2025-08-19 15:18,KMB
...,...,...,...,...,...,...,...,...,...
928,582,I,29,003838,白石角,2025-08-19 19:22,3,2025-08-19 18:28,Citybus
929,582,I,29,003838,白石角,2025-08-19 19:52,3,2025-08-19 18:58,Citybus
930,582,I,29,003838,白石角,2025-08-19 20:23,2,2025-08-19 19:28,Citybus
931,582,I,29,003838,白石角,2025-08-19 20:50,2,2025-08-19 19:58,Citybus


In [None]:
# Relabel the direction of KMB rows whose stop is on the inbound list
combined_df_copy = combined_df.copy()
combined_df_copy.loc[(combined_df_copy['source']=='KMB') & (combined_df_copy['stop_id'].isin(inbound_kmb_ids)),'dir'] = 'I'

# Keep only peak-period rows (08:00-09:59 and 17:00-19:59)
def is_peak_hour(eta):
  """Return True when the 'YYYY-MM-DD HH:MM' text falls in a peak hour."""
  eta = datetime.strptime(eta, '%Y-%m-%d %H:%M')
  return eta.hour in (8, 9, 17, 18, 19)
# .copy() so the assignments below write to an independent frame
combined_df_copy = combined_df_copy[combined_df_copy['eta'].apply(is_peak_hour)].copy()

# Give the Citybus outbound University Station rows their own stop_id
combined_df_copy.loc[(combined_df_copy['source']=='Citybus') & (combined_df_copy['dir']=='O') & (combined_df_copy['stop_id']=='003841'),'stop_id'] = '003841O'

# Give the Citybus inbound Pak Shek Kok rows their own stop_id
combined_df_copy.loc[(combined_df_copy['source']=='Citybus') & (combined_df_copy['dir']=='I') & (combined_df_copy['stop_id']=='003838'),'stop_id'] = '003838I'

# Reduce eta to 'HH:MM'. The date is dropped here, so arrivals are compared by
# time of day only from this point on.
combined_df_copy['eta'] = combined_df_copy['eta'].apply(
    lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M').strftime('%H:%M'))

# Unify stop ids: KMB ids are translated through stop_id_mapping
def map_stop_id(row):
  """Return the shared stop id of a row (KMB ids mapped, others unchanged)."""
  if row['source']=='KMB':
    return stop_id_mapping.get(row['stop_id'],row['stop_id'])
  return row['stop_id']
combined_df_copy['mapped_stop_id'] = combined_df_copy.apply(map_stop_id, axis=1)

# Simultaneous arrivals: same mapped stop, same direction, at most 2 minutes apart
simul_columns = [
    'mapped_stop_id', 'kmb_route', 'citybus_route', 'kmb_dir', 'citybus_dir',
    'kmb_stop_id', 'citybus_stop_id', 'kmb_stop_name', 'citybus_stop_name',
    'kmb_eta', 'citybus_eta', 'time_different(min)'
]
simul_arrival = []
for map_id in combined_df_copy['mapped_stop_id'].unique():

  group = combined_df_copy[combined_df_copy['mapped_stop_id'] == map_id]
  kmb_group = group[group['source']=='KMB']
  citybus_group = group[group['source']=='Citybus']

  for kmb_index, kmb_row in kmb_group.iterrows():
    kmb_eta = datetime.strptime(kmb_row['eta'], '%H:%M')
    for citybus_index, citybus_row in citybus_group.iterrows():
      if kmb_row['dir'] == citybus_row['dir']:
        citybus_eta = datetime.strptime(citybus_row['eta'], '%H:%M')
        time_diff = abs((kmb_eta - citybus_eta).total_seconds() / 60)
        if time_diff <= 2:
          simul_arrival.append({
              'mapped_stop_id': map_id,
              'kmb_route': kmb_row['route'],
              'citybus_route': citybus_row['route'],
              'kmb_dir': kmb_row['dir'],
              'citybus_dir': citybus_row['dir'],
              'kmb_stop_id': kmb_row['stop_id'],
              'citybus_stop_id': citybus_row['stop_id'],
              'kmb_stop_name': kmb_row['stop_name'],
              'citybus_stop_name': citybus_row['stop_name'],
              'kmb_eta': kmb_row['eta'],
              'citybus_eta': citybus_row['eta'],
              'time_different(min)': time_diff
          })

# Explicit columns keep the frame sortable even when no pair was found
simul_arrival_df = pd.DataFrame(simul_arrival, columns=simul_columns)
simul_arrival_df = simul_arrival_df.sort_values(by=['citybus_dir','citybus_eta'])
print(f"\nKMB {kmb_route} 和 {citybus_route} 同时进站表")
print(f"数据数量: {len(simul_arrival_df)}")
print(tabulate(simul_arrival_df, headers='keys', tablefmt='psql', showindex=False))




KMB 272A 和 582 同时进站表
数据数量: 38
+------------------+-------------+-----------------+-----------+---------------+------------------+-------------------+----------------------------+------------------------------+-----------+---------------+-----------------------+
| mapped_stop_id   | kmb_route   |   citybus_route | kmb_dir   | citybus_dir   | kmb_stop_id      | citybus_stop_id   | kmb_stop_name              | citybus_stop_name            | kmb_eta   | citybus_eta   |   time_different(min) |
|------------------+-------------+-----------------+-----------+---------------+------------------+-------------------+----------------------------+------------------------------+-----------+---------------+-----------------------|
| 003841           | 272A        |             582 | I         | I             | 9F542D4B6CF41651 | 003841            | 大學站 (ST905)             | 大學站                       | 17:15     | 17:16         |                     1 |
| 003743           | 272A        |             

In [None]:
# Keep only the rows of stops that both operators serve
overlap_stops = list(set(stop_id_mapping.values()))
is_shared = combined_df_copy['mapped_stop_id'].isin(overlap_stops)
overlap_data = combined_df_copy[is_shared]

# Count rows per shared stop and operator (one column per source)
per_stop_and_source = overlap_data.groupby(['mapped_stop_id', 'source']).size()
record_counts = per_stop_and_source.unstack(fill_value=0)

# Attach a stop name: the name of the first row seen for each mapped_stop_id
first_rows = overlap_data.drop_duplicates(subset=['mapped_stop_id'])
stop_name_map = first_rows.set_index('mapped_stop_id')['stop_name'].to_dict()

record_counts['stop_name'] = record_counts.index.map(stop_name_map)

# Reorder the stops to follow the order of stop_id_mapping
exist_ids = record_counts.index.tolist()

new_index = [mapped_id for mapped_id in stop_id_mapping.values() if mapped_id in exist_ids]
record_counts = record_counts.reindex(new_index)
record_counts = record_counts[['stop_name', 'KMB', 'Citybus']]

print("共享站点 KMB 和 Citybus 记录数：")
display(record_counts)

共享站点 KMB 和 Citybus 记录数：


source,stop_name,KMB,Citybus
mapped_stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
003841,大學站 (ST905),13,6
003743,馬料水公眾碼頭 (PA100),13,6
003744,水上活動中心 (PA104),13,6
003745,香港生物科技研究院 (PA105),13,6
003748,雲滙 (PA125),13,6
003838,白石角 (PA141),14,6
003736,雲滙 (PA206),14,6
003737,創新路 (PA212),14,6
003738,科研路 (PA214),13,6
003739,科學園(第一期) (PA219),15,6


In [None]:
# All Citybus rows at the shared stops
is_citybus = combined_df_copy['source'] == 'Citybus'
on_shared_stop = combined_df_copy['mapped_stop_id'].isin(overlap_stops)
citybus_overlap_data = combined_df_copy[is_citybus & on_shared_stop]
print(f"数据数量: {len(citybus_overlap_data)} \n ")
display(citybus_overlap_data.head())

数据数量: 84 
 


Unnamed: 0,route,dir,seq,stop_id,stop_name,eta,eta_seq,data_timestamp,source,mapped_stop_id
576,582,O,1,3838,白石角,17:00,2,2025-08-19 16:08,Citybus,3838
577,582,O,1,3838,白石角,17:30,2,2025-08-19 16:38,Citybus,3838
578,582,O,1,3838,白石角,18:00,2,2025-08-19 17:08,Citybus,3838
579,582,O,1,3838,白石角,18:30,2,2025-08-19 17:38,Citybus,3838
580,582,O,1,3838,白石角,19:00,2,2025-08-19 18:08,Citybus,3838


In [None]:
# Group the Citybus rows per stop, keeping the stop name and the list of ETAs
citybus_group = citybus_overlap_data.groupby('mapped_stop_id').agg(
    stop_name=('stop_name', 'first'),
    etas=('eta', list))
exist_ids = citybus_group.index.tolist()
# Stops follow the order of stop_id_mapping
new_index = [mapped_id for mapped_id in stop_id_mapping.values() if mapped_id in exist_ids]

citybus_group = citybus_group.reindex(new_index)
citybus_group = citybus_group.reset_index()

# The first five mapped ids form the inbound table, the rest the outbound one.
# Splitting by id rather than by row position keeps the tables correct when a
# stop has no data, and boolean selection yields independent frames.
CITYBUS_INBOUND_STOP_COUNT = 5
inbound_ids = list(stop_id_mapping.values())[:CITYBUS_INBOUND_STOP_COUNT]
is_inbound = citybus_group['mapped_stop_id'].isin(inbound_ids)
citybus_group_inbound = citybus_group[is_inbound].reset_index(drop=True)
citybus_group_outbound = citybus_group[~is_inbound].reset_index(drop=True)
display(citybus_group_outbound)

Unnamed: 0,mapped_stop_id,stop_name,etas
0,003838,白石角,"[17:00, 17:30, 18:00, 18:30, 19:00, 19:30]"
1,003736,"雲滙, 創新路","[17:01, 17:31, 18:01, 18:31, 19:01, 19:31]"
2,003737,"海日灣II, 創新路","[17:02, 17:32, 18:02, 18:32, 19:02, 19:32]"
3,003738,"科研路, 創新路","[17:09, 17:39, 18:11, 18:39, 19:08, 19:40]"
4,003739,"香港科學園第一期, 創新路","[17:10, 17:39, 18:11, 18:40, 19:08, 19:41]"
5,003740,"香港生物科技研究院, 科學園路","[17:11, 17:40, 18:18, 18:42, 19:10, 19:42]"
6,003741,"水上活動中心, 科學園路","[17:12, 17:41, 18:19, 18:42, 19:10, 19:43]"
7,003742,"馬料水公眾碼頭, 科學園路","[17:12, 17:41, 18:19, 18:43, 19:11, 19:43]"
8,003841O,大學站,"[17:15, 17:45, 18:22, 18:47, 19:15, 19:46]"


In [None]:
def citybus_frequency(citybus_group):
  """Turn per-stop ETA lists into one column per trip.

  Each ETA of the first row starts a trip. For every following row the trip
  continues with the earliest ETA of that row that is not before the trip's
  previous time; a trip ends early when no such ETA exists, leaving the
  remaining stops empty (NaN) in that trip's column.

  The input frame is not modified, so the function can be called repeatedly
  on the same frame.

  Args:
    citybus_group: Frame with an 'etas' column holding lists of 'HH:MM' text,
      one row per stop in travel order.

  Returns:
    A copy of the frame without 'etas' and with the added columns 'Trip_1',
    'Trip_2', ... holding 'HH:MM' text.
  """
  result = citybus_group.drop('etas', axis=1)
  if citybus_group.empty:
    return result

  # Parse into local lists; the caller's frame keeps its original text
  etas_per_stop = [
    [datetime.strptime(eta, '%H:%M') for eta in etas]
    for etas in citybus_group['etas']
  ]

  trips = []
  for departure_time in etas_per_stop[0]:
    trip = [departure_time]
    current = departure_time
    for next_stop_etas in etas_per_stop[1:]:
      # Earliest arrival at the next stop that is not before the current time
      candidates = [arrival for arrival in next_stop_etas if arrival >= current]
      if not candidates:
        break
      current = min(candidates)
      trip.append(current)
    trips.append(trip)

  for trip_number, trip in enumerate(trips, start=1):
    formatted = [moment.strftime('%H:%M') for moment in trip]
    # Align with the frame's rows by position; unreached stops become NaN
    column = pd.Series(formatted, index=result.index[:len(formatted)])
    result[f'Trip_{trip_number}'] = column.reindex(result.index)

  return result

# Build the per-trip tables for both Citybus directions
citybus_frequency_inbound = citybus_frequency(citybus_group_inbound)
citybus_frequency_outbound = citybus_frequency(citybus_group_outbound)

In [None]:
# Plain-text rendering of the outbound per-trip table
print(tabulate(citybus_frequency_outbound, headers='keys', tablefmt='psql', showindex=False))

+------------------+------------------------------+----------+----------+----------+----------+----------+----------+
| mapped_stop_id   | stop_name                    | Trip_1   | Trip_2   | Trip_3   | Trip_4   | Trip_5   | Trip_6   |
|------------------+------------------------------+----------+----------+----------+----------+----------+----------|
| 003838           | 白石角                       | 17:00    | 17:30    | 18:00    | 18:30    | 19:00    | 19:30    |
| 003736           | 雲滙, 創新路                 | 17:01    | 17:31    | 18:01    | 18:31    | 19:01    | 19:31    |
| 003737           | 海日灣II, 創新路             | 17:02    | 17:32    | 18:02    | 18:32    | 19:02    | 19:32    |
| 003738           | 科研路, 創新路               | 17:09    | 17:39    | 18:11    | 18:39    | 19:08    | 19:40    |
| 003739           | 香港科學園第一期, 創新路     | 17:10    | 17:39    | 18:11    | 18:40    | 19:08    | 19:41    |
| 003740           | 香港生物科技研究院, 科學園路 | 17:11    | 17:40    | 18:18    | 18:42    | 19:1

In [None]:
def highlight_simultaneous_eta(row):
  """Styler callback: mark trip cells that appear in ``simul_arrival_df``.

  A 'Trip_*' cell is highlighted when its value equals a ``citybus_eta`` that
  ``simul_arrival_df`` lists for the row's ``mapped_stop_id``.

  Args:
    row: One row of a per-trip table, containing 'mapped_stop_id' and the
      'Trip_*' columns.

  Returns:
    A list of CSS strings, one per cell of ``row`` ('' for unmarked cells).
  """
  styles = [''] * len(row)
  # A frame built from zero matches may have no columns at all
  if 'mapped_stop_id' not in simul_arrival_df.columns:
    return styles

  same_stop = simul_arrival_df['mapped_stop_id'] == row['mapped_stop_id']
  simultaneous_etas = set(simul_arrival_df.loc[same_stop, 'citybus_eta'])

  for position, (col_name, eta_value) in enumerate(row.items()):
    if col_name.startswith('Trip_') and pd.notna(eta_value) and eta_value in simultaneous_etas:
      styles[position] = 'background-color: yellow'

  return styles

# Apply highlight_simultaneous_eta row by row to the inbound table and show it
styled_citybus_frequency_inbound = citybus_frequency_inbound.style.apply(highlight_simultaneous_eta, axis=1)
print(f"citybus的inbound方向\n")
display(styled_citybus_frequency_inbound)

citybus的inbound方向



Unnamed: 0,mapped_stop_id,stop_name,Trip_1,Trip_2,Trip_3,Trip_4,Trip_5,Trip_6
0,3841,大學站,17:16,17:43,18:19,18:56,19:14,19:43
1,3743,"馬料水公眾碼頭, 科學園路",17:19,17:49,18:21,18:58,19:16,19:45
2,3744,"水上活動中心, 科學園路",17:19,17:49,18:22,18:59,19:17,19:46
3,3745,"香港生物科技研究院, 科學園路",17:20,17:50,18:22,18:59,19:17,19:46
4,3748,"雲滙, 創新路",17:24,17:54,18:31,19:04,19:20,19:50


In [None]:
# Apply highlight_simultaneous_eta row by row to the outbound table and show it
styled_citybus_frequency_outbound = citybus_frequency_outbound.style.apply(highlight_simultaneous_eta, axis=1)
print(f"citybus的outbound方向\n")
display(styled_citybus_frequency_outbound)

citybus的outbound方向



Unnamed: 0,mapped_stop_id,stop_name,Trip_1,Trip_2,Trip_3,Trip_4,Trip_5,Trip_6
0,003838,白石角,17:00,17:30,18:00,18:30,19:00,19:30
1,003736,"雲滙, 創新路",17:01,17:31,18:01,18:31,19:01,19:31
2,003737,"海日灣II, 創新路",17:02,17:32,18:02,18:32,19:02,19:32
3,003738,"科研路, 創新路",17:09,17:39,18:11,18:39,19:08,19:40
4,003739,"香港科學園第一期, 創新路",17:10,17:39,18:11,18:40,19:08,19:41
5,003740,"香港生物科技研究院, 科學園路",17:11,17:40,18:18,18:42,19:10,19:42
6,003741,"水上活動中心, 科學園路",17:12,17:41,18:19,18:42,19:10,19:43
7,003742,"馬料水公眾碼頭, 科學園路",17:12,17:41,18:19,18:43,19:11,19:43
8,003841O,大學站,17:15,17:45,18:22,18:47,19:15,19:46
