In [None]:
import requests
import time
import pandas as pd
from datetime import datetime
from tabulate import tabulate
import concurrent.futures
import threading
import signal
import sys
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

In [None]:
# KMB API Configuration
# Base URL that every KMB request in this notebook is built from.
kmb_base_url = "https://data.etabus.gov.hk/v1/transport/kmb"
# KMB route number interpolated into the route, route-stop and ETA URLs.
kmb_route = "272A"

# Citybus API Configuration
# Base URL that every Citybus request in this notebook is built from.
citybus_base_url = "https://rt.data.gov.hk/v2/transport/citybus"
# Company code placed in the Citybus route, route-stop and ETA URLs.
citybus_company_id = "CTB"
# Citybus route number interpolated into the Citybus route, route-stop and ETA URLs.
citybus_route = "582"

In [None]:
def kmb_route_info(direction, service_type):
  """Fetch the KMB route record for ``kmb_route``.

  Args:
    direction: Direction segment placed in the URL (the notebook passes
      "inbound" / "outbound" to the sibling route-stop call).
    service_type: Service-type segment placed in the URL.

  Returns:
    The ``data`` member of the JSON reply, the string "Unknown route" when
    the reply has no ``data`` key, or None when the request fails.
  """
  endpoint = f"{kmb_base_url}/route/{kmb_route}/{direction}/{service_type}"
  try:
    reply = requests.get(endpoint, timeout=10)
    reply.raise_for_status()
    payload = reply.json()
    return payload.get('data', "Unknown route")
  except Exception as e:
    # Best effort: report the problem and let the caller deal with None.
    print(f"Failed to get KMB route information: {e}")
    return None

In [None]:
def kmb_stopid_info(stop_id):
  """Look up one KMB stop.

  Args:
    stop_id: Stop identifier placed in the ``/stop/`` URL.

  Returns:
    ``[name_tc, name_en, lat, long]`` taken from the reply's ``data`` object,
    with 'Unknown stop' substituted for any missing field, or None when the
    request fails.
  """
  url = f"{kmb_base_url}/stop/{stop_id}"
  wanted_fields = ('name_tc', 'name_en', 'lat', 'long')
  try:
    reply = requests.get(url, timeout=10)
    reply.raise_for_status()
    stop = reply.json().get('data', {})
    return [stop.get(field, 'Unknown stop') for field in wanted_fields]
  except Exception as e:
    print(f"Failed to get KMB stop {stop_id} name: {e}")
    return None

In [None]:
def kmb_stop_info(direction, service_type):
  """List the stops of ``kmb_route`` for one direction and service type.

  Args:
    direction: Direction segment placed in the route-stop URL.
    service_type: Service-type segment placed in the route-stop URL.

  Returns:
    ``(stops, stops_name)`` where ``stops`` is the list of stop ids in the
    order the reply lists them and ``stops_name`` maps each id to the result
    of ``kmb_stopid_info`` (which may be None for a failed lookup).
    ``(None, None)`` when the route-stop request fails.
  """
  url = f"{kmb_base_url}/route-stop/{kmb_route}/{direction}/{service_type}"
  try:
    reply = requests.get(url, timeout=10)
    reply.raise_for_status()
    entries = reply.json().get('data', [])
    stops = []
    for entry in entries:
      stops.append(entry['stop'])
    # One detail request per listed stop id.
    stops_name = {}
    for stop in stops:
      stops_name[stop] = kmb_stopid_info(stop)
    return stops, stops_name
  except Exception as e:
    print(f"Failed to get KMB stop IDs: {e}")
    return None, None

In [None]:
def kmb_eta_info(stop_id, service_type):
  """Fetch the ETA records of ``kmb_route`` at one KMB stop.

  ``eta`` and ``data_timestamp`` are reformatted from ISO-8601 to
  'YYYY-MM-DD HH:MM'. Each field is handled on its own, so a record that
  lacks one of them no longer makes the whole stop fail, and a record with
  no ETA still gets its timestamp normalised.

  Args:
    stop_id: Stop identifier placed in the ETA URL.
    service_type: Service-type segment placed in the ETA URL.

  Returns:
    A list of dicts restricted to route, dir, service_type, seq, eta,
    eta_seq and data_timestamp (missing or empty values stay None), or None
    when the request or parsing fails.
  """
  url = f"{kmb_base_url}/eta/{stop_id}/{kmb_route}/{service_type}"
  minute_format = '%Y-%m-%d %H:%M'
  selected_keys = ['route', 'dir', 'service_type', 'seq', 'eta', 'eta_seq', 'data_timestamp']
  try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    data = response.json().get('data', [])
    for eta in data:
      for field in ('eta', 'data_timestamp'):
        raw_value = eta.get(field)
        # Only parse values that are actually present; None/'' is kept as is.
        if raw_value:
          eta[field] = datetime.fromisoformat(raw_value).strftime(minute_format)
    return [{k: item.get(k) for k in selected_keys} for item in data]
  except Exception as e:
    print(f"Failed to get KMB stop {stop_id} ETA information: {e}")
    return None

In [None]:
def citybus_route_info():
  """Fetch the Citybus route record for ``citybus_route``.

  Returns:
    The ``data`` member of the JSON reply, the string "Unknown route" when
    the reply has no ``data`` key, or None when the request fails.
  """
  endpoint = f"{citybus_base_url}/route/{citybus_company_id}/{citybus_route}"
  try:
    reply = requests.get(endpoint, timeout=10)
    reply.raise_for_status()
    payload = reply.json()
    return payload.get('data', "Unknown route")
  except Exception as e:
    # Best effort: report the problem and let the caller deal with None.
    print(f"Failed to get Citybus route information: {e}")
    return None

In [None]:
def citybus_bus_stop_info(stop_id):
  """Look up one Citybus stop.

  Args:
    stop_id: Stop identifier placed in the ``/stop/`` URL.

  Returns:
    ``[name_tc, name_en, lat, long]`` taken from the reply's ``data`` object,
    with 'Unknown stop' substituted for any missing field, or None when the
    request fails.
  """
  url = f"{citybus_base_url}/stop/{stop_id}"
  wanted_fields = ('name_tc', 'name_en', 'lat', 'long')
  try:
    reply = requests.get(url, timeout=10)
    reply.raise_for_status()
    stop = reply.json().get('data', {})
    return [stop.get(field, 'Unknown stop') for field in wanted_fields]
  except Exception as e:
    print(f"Failed to get Citybus stop {stop_id} name: {e}")
    return None

In [None]:
def citybus_stop_info(direction):
  """List the stops of ``citybus_route`` for one direction.

  Args:
    direction: Direction segment placed in the route-stop URL.

  Returns:
    ``(stops, stops_name)`` where ``stops`` is the list of stop ids ordered
    by each entry's ``seq`` (entries without one sort as 0) and
    ``stops_name`` maps each id to the result of ``citybus_bus_stop_info``
    (which may be None for a failed lookup). ``(None, None)`` when the
    route-stop request fails.
  """
  url = f"{citybus_base_url}/route-stop/{citybus_company_id}/{citybus_route}/{direction}"
  try:
    reply = requests.get(url, timeout=10)
    reply.raise_for_status()
    payload = reply.json()
    ordered_entries = sorted(payload.get('data', []), key=lambda entry: entry.get('seq', 0))
    stops = []
    for entry in ordered_entries:
      stops.append(entry['stop'])
    # One detail request per listed stop id.
    stops_name = {}
    for stop in stops:
      stops_name[stop] = citybus_bus_stop_info(stop)
    return stops, stops_name
  except Exception as e:
    print(f"Failed to get Citybus stop IDs: {e}")
    return None, None

In [None]:
def citybus_eta_info(stop_id):
  """Fetch the ETA records of ``citybus_route`` at one Citybus stop.

  ``eta`` and ``data_timestamp`` are reformatted from ISO-8601 to
  'YYYY-MM-DD HH:MM'. Each field is handled on its own, so a record that
  lacks one of them no longer makes the whole stop fail, and a record with
  no ETA still gets its timestamp normalised.

  Args:
    stop_id: Stop identifier placed in the ETA URL.

  Returns:
    A list of dicts restricted to route, dir, seq, stop, eta, eta_seq and
    data_timestamp (missing or empty values stay None), or None when the
    request or parsing fails.
  """
  url = f"{citybus_base_url}/eta/{citybus_company_id}/{stop_id}/{citybus_route}"
  minute_format = '%Y-%m-%d %H:%M'
  selected_keys = ['route', 'dir', 'seq', 'stop', 'eta', 'eta_seq', 'data_timestamp']
  try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    data = response.json().get('data', [])
    for eta in data:
      for field in ('eta', 'data_timestamp'):
        raw_value = eta.get(field)
        # Only parse values that are actually present; None/'' is kept as is.
        if raw_value:
          eta[field] = datetime.fromisoformat(raw_value).strftime(minute_format)
    return [{k: item.get(k) for k in selected_keys} for item in data]
  except Exception as e:
    print(f"Failed to get Citybus stop {stop_id} ETA information: {e}")
    return None

In [None]:
def collect_eta_dataframe(df, stop_id, eta_func, stops_name_id, service_type=None, lock=None, source=''):
  """Poll one stop and merge its ETA records into ``df`` in place.

  A fetched record whose route/dir/seq/stop matches an existing row and
  whose ETA lies within 3 minutes of that row's ETA is treated as a revised
  prediction: the row's ``eta`` is overwritten (or left alone when equal).
  Every other record is appended as a new row once the whole batch has been
  compared. Records without an ETA are skipped.

  Args:
    df: Frame with columns route, dir, seq, stop_id, stop_name, eta,
      eta_seq, data_timestamp, source. It is mutated, not replaced.
    stop_id: Stop to poll; also used when a record has no 'stop' field.
    eta_func: Callable returning a list of ETA dicts (or None). Called as
      ``eta_func(stop_id, service_type)`` when ``service_type`` is truthy,
      otherwise as ``eta_func(stop_id)``.
    stops_name_id: Mapping of stop id to a name list; element 0 is used as
      the stop name, 'Unknown stop' when the entry is missing or None.
    service_type: Optional second argument for ``eta_func``.
    lock: Lock guarding ``df``; None means no locking.
    source: Label stored in the ``source`` column and used in messages.

  Returns:
    The same ``df`` object.
  """
  from contextlib import nullcontext  # local import: only needed when no lock is supplied

  if stop_event.is_set():
    return df

  # The request runs before the lock is taken so a slow reply never blocks
  # the other workers that share this frame.
  eta_data = eta_func(stop_id, service_type) if service_type else eta_func(stop_id)
  if not eta_data:
    print(f"Failed to get ETA information for {source} stop {stop_id}")
    return df

  eta_format = '%Y-%m-%d %H:%M'
  # Matching and writing happen under one lock acquisition; otherwise another
  # worker could append between this worker's read and its write.
  with (lock if lock is not None else nullcontext()):
    records_to_append = []
    for eta_record in eta_data:
      eta_text = eta_record.get('eta')
      if not eta_text:
        # No prediction in this record; there is nothing to store or compare.
        continue
      record_stop_id = eta_record.get('stop', stop_id)
      stop_name = (stops_name_id.get(record_stop_id) or ['Unknown stop'])[0]
      new_eta = datetime.strptime(eta_text, eta_format)
      matching_rows = df[
        (df['route'] == eta_record['route']) &
        (df['dir'] == eta_record['dir']) &
        (df['seq'] == eta_record['seq']) &
        (df['stop_id'] == record_stop_id)
      ]
      merged = False
      for index, row in matching_rows.iterrows():
        existing_eta = datetime.strptime(row['eta'], eta_format)
        minutes_apart = abs((new_eta - existing_eta).total_seconds() / 60)
        if minutes_apart <= 3:
          if minutes_apart != 0:
            df.loc[index, 'eta'] = eta_text
          merged = True
          break
      if not merged:
        records_to_append.append({
          'route': eta_record['route'],
          'dir': eta_record['dir'],
          'seq': eta_record['seq'],
          'stop_id': record_stop_id,
          'stop_name': stop_name,
          'eta': eta_text,
          'eta_seq': eta_record['eta_seq'],
          'data_timestamp': eta_record['data_timestamp'],
          'source': source
        })
    # Appending after the loop keeps records of one batch from being merged
    # into each other.
    for record in records_to_append:
      df.loc[len(df)] = record
  return df

In [None]:
def kmb_fetch_loop():
  """Poll every KMB stop once per ``interval`` seconds until ``stop_event`` is set.

  Each round submits one ``collect_eta_dataframe`` task per stop in
  ``kmb_stops_id_list`` and waits for all of them. A task that raises is
  reported instead of being dropped silently. The pause between rounds is
  taken on ``stop_event`` so a shutdown request ends the loop immediately.
  Returns at once when there are no stops, because a thread pool cannot be
  created with zero workers.
  """
  stop_ids = list(kmb_stops_id_list)
  if not stop_ids:
    print("No KMB stops to poll.")
    return
  while not stop_event.is_set():
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(stop_ids)) as executor:
      futures = [executor.submit(collect_eta_dataframe, kmb_eta_df, stop_id, kmb_eta_info, kmb_stops_name_id, kmb_service_type, kmb_lock, 'KMB') for stop_id in stop_ids]
      for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
          print(f"KMB ETA worker failed: {error}")
    # Event.wait sleeps like time.sleep but returns as soon as the event is set.
    stop_event.wait(interval)

In [None]:
def citybus_fetch_loop():
  """Poll every Citybus stop once per ``interval`` seconds until ``stop_event`` is set.

  Each round submits one ``collect_eta_dataframe`` task per stop in
  ``citybus_stops_id_list`` and waits for all of them. A task that raises is
  reported instead of being dropped silently. The pause between rounds is
  taken on ``stop_event`` so a shutdown request ends the loop immediately.
  Returns at once when there are no stops, because a thread pool cannot be
  created with zero workers.
  """
  stop_ids = list(citybus_stops_id_list)
  if not stop_ids:
    print("No Citybus stops to poll.")
    return
  while not stop_event.is_set():
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(stop_ids)) as executor:
      futures = [executor.submit(collect_eta_dataframe, citybus_eta_df, stop_id, citybus_eta_info, citybus_stops_name_id, None, citybus_lock, 'Citybus') for stop_id in stop_ids]
      for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
          print(f"Citybus ETA worker failed: {error}")
    # Event.wait sleeps like time.sleep but returns as soon as the event is set.
    stop_event.wait(interval)

In [None]:
# Handle interrupt signal
def signal_handler(sig, frame):
  """Signal callback: announce the shutdown and set ``stop_event``.

  Args:
    sig: Signal number (unused).
    frame: Interrupted stack frame (unused).
  """
  shutdown_notice = "\nShutting down program..."
  print(shutdown_notice)
  # The fetch loops and the main loop test this event before each round.
  stop_event.set()

In [None]:
# Get stop data

# KMB: collect the stops of every service type (1..kmb_service_type_num) in
# both directions into one id -> [name_tc, name_en, lat, long] mapping.
kmb_service_type_num = 4
kmb_stops_name_id = {}
for i in range(1, kmb_service_type_num + 1):
  stops_in, stops_name_in = kmb_stop_info("inbound", i)
  stops_out, stops_name_out = kmb_stop_info("outbound", i)
  # kmb_stop_info returns (None, None) on failure; treat that as "no stops".
  kmb_stops_name_id.update(stops_name_in or {})
  kmb_stops_name_id.update(stops_name_out or {})

display(pd.DataFrame(kmb_stops_name_id).T)
print(f"Number of stops: {len(kmb_stops_name_id)} \n")
kmb_stops_id_list = list(kmb_stops_name_id.keys())

citybus_stops_in, citybus_stops_name_in = citybus_stop_info("inbound")
citybus_stops_out, citybus_stops_name_out = citybus_stop_info("outbound")
# citybus_stop_info also returns (None, None) on failure; unpacking None with
# ** raises TypeError, so fall back to an empty mapping like the KMB path does.
citybus_stops_name_id = {**(citybus_stops_name_in or {}), **(citybus_stops_name_out or {})}

display(pd.DataFrame(citybus_stops_name_id).T)
print(f"Number of stops: {len(citybus_stops_name_id)} \n")
citybus_stops_id_list = list(citybus_stops_name_id.keys())

In [None]:
# Main program

stop_event = threading.Event()
# signal.signal returns the handler it replaces. It is kept so the notebook's
# normal interrupt behaviour can be put back once collection has finished.
previous_sigint_handler = signal.signal(signal.SIGINT, signal_handler)

# Initialize DataFrame and locks
columns = ['route', 'dir', 'seq', 'stop_id', 'stop_name', 'eta', 'eta_seq', 'data_timestamp', 'source']
kmb_eta_df = pd.DataFrame(columns=columns)
citybus_eta_df = pd.DataFrame(columns=columns)
kmb_lock = threading.Lock()
citybus_lock = threading.Lock()

# Manual parameters
interval = 10  # Fetch interval (seconds)
kmb_service_type = 1 # 272A has 4 route options
run_duration = 20 #3 * 3600  # Timer parameter (in seconds), default 3 hours here

combined_df = pd.DataFrame(columns=columns)


def _snapshot_combined_eta():
  """Copy both live frames under their locks and return them concatenated."""
  with kmb_lock:
    kmb_copy = kmb_eta_df.copy()
  with citybus_lock:
    citybus_copy = citybus_eta_df.copy()
  return pd.concat([kmb_copy, citybus_copy], axis=0, ignore_index=True)


# Start threads
kmb_thread = threading.Thread(target=kmb_fetch_loop, daemon=True)
citybus_thread = threading.Thread(target=citybus_fetch_loop, daemon=True)
kmb_thread.start()
citybus_thread.start()

# Start timer
timer = threading.Timer(run_duration, stop_event.set)
timer.start()

try:
  while not stop_event.is_set(): # Check if stop_event is set
    # Waiting on the event paces the merge like a sleep but wakes up as soon
    # as the timer or the signal handler requests a stop.
    stop_event.wait(interval)
    combined_df = _snapshot_combined_eta()
    print("\nCombined DataFrame content:")
    print(tabulate(combined_df, headers='keys', tablefmt='psql', showindex=False))
    print(f"\n Data count: {len(combined_df)}")
    print(f"\nNext merge waiting {interval} seconds...")

except KeyboardInterrupt:
  print("\nInterrupted")

finally:
  stop_event.set()  # Signal threads to stop, whatever ended the loop
  # Cancel timer (if user manually interrupts)
  timer.cancel()
  # Restore the previous handler so later cells can be interrupted again.
  # signal.signal may have returned None for a handler not installed from Python.
  if previous_sigint_handler is not None:
    signal.signal(signal.SIGINT, previous_sigint_handler)

# Wait for threads to end (if they are not daemon threads)
# If they are daemon threads, they will automatically terminate when main thread exits, but waiting ensures they complete current tasks
kmb_thread.join(timeout= 5) # Give threads some extra time
citybus_thread.join(timeout= 5)

# Final merge so rows written during the last polling round are not lost.
combined_df = _snapshot_combined_eta()

print("Program has been closed.")

In [None]:
# Save CSV file
# Write the merged snapshot without the index column; 'utf-8-sig' puts a BOM
# at the start of the file.
csv_path = 'current_eta_data.csv'
combined_df.to_csv(csv_path, encoding='utf-8-sig', index=False)
print(f"Data has been saved to {csv_path}")

In [None]:
# Reload the saved snapshot. The identifier columns are forced to str:
# left to dtype inference, an all-numeric column such as Citybus stop ids
# ('003841') or route '582' is read as an integer, which drops leading zeros
# and breaks the string comparisons in the cells below.
combined_df = pd.read_csv(
    'current_eta_data.csv',
    encoding='utf-8-sig',
    dtype={'route': str, 'dir': str, 'stop_id': str},
)

In [None]:
# Lookup table used by map_stop_id: a KMB row's stop_id is replaced by the
# value stored here, so that both operators share one id per physical stop.
# '003841O' is not a raw Citybus id: the analysis cell rewrites Citybus
# outbound rows with stop_id '003841' to '003841O' before the ids are compared.
# The order of the entries matters: later cells reorder their tables by
# list(stop_id_mapping.values()).
# KMB stop_id: Citybus stop_id
stop_id_mapping = {
    '9F542D4B6CF41651':'003841',  # University Station (ST905, inbound) - University Station (inbound)
    '739A5DDE0CF1970C':'003743',  # Ma Liu Shui Public Pier (PA100)
    '9E95343C77BB85E2':'003744',  # Water Sports Centre (PA104)
    'F808AF482CCA028E':'003745',  # Hong Kong Institute of Biotechnology (PA105)
    '3313B37BF82AFF18':'003748',  # Cloud Nine (PA125)
    'F8F0E91589F099CD':'003838',  # Pak Shek Kok (PA141)
    # '730AEBA1D2D8B20E':'003839',  # Hong Kong Science Park (PA112)
    # 'A9459D38A4A41F36':'003840',  # Hong Kong Science Park Phase 3 (PA115)
    '3F24CFF9046300D9':'003736',  # Cloud Nine (PA206, outbound) - Cloud Nine, Innovation Road (outbound)
    'B34F59A0270AEDA4':'003737',  # Innovation Road (PA212) - The Cove II, Innovation Road (outbound)
    '39E7051B17D302DA':'003738',  # Science Park Road (PA214)
    '1C6EAAF5F48167F9':'003739',  # Science Park (Phase 1) (PA219)
    'DA9490C24D6E6026':'003740',  # Hong Kong Institute of Biotechnology (PA230)
    '64101F297D3C1C55':'003741',  # Water Sports Centre (PA234)
    '5B39CC28607910E4':'003742',  # Ma Liu Shui Public Pier (PA237)
    'EC5018363D5C45EB':'003841O'  # University Station (ST900)
}

# KMB rows whose stop_id appears in this list get their 'dir' overwritten
# with 'I' in the analysis cell.
# NOTE(review): 'BE940A2F1B154D6E' and 'E8F5E085BAEFE100' have no entry in
# stop_id_mapping, '730AEBA1D2D8B20E' and 'A9459D38A4A41F36' are commented out
# of it, and the mapped 'F8F0E91589F099CD' is not listed here — confirm this
# is intended.
# Transform KMB direction
inbound_kmb_ids = [
    '9F542D4B6CF41651',
    '739A5DDE0CF1970C',
    '9E95343C77BB85E2',
    'F808AF482CCA028E',
    'BE940A2F1B154D6E',
    'E8F5E085BAEFE100',
    '3313B37BF82AFF18',
    '730AEBA1D2D8B20E',
    'A9459D38A4A41F36'
]

In [None]:
# Order the rows by route, seq, stop_id and eta (ascending) so that ETAs for
# the same stop sit next to each other before the merge pass below.
combined_df = combined_df.sort_values(['route', 'seq', 'stop_id', 'eta'])

print(combined_df.shape[0])

In [None]:
# Update KMB and Citybus arrival data separately
# Rows are visited in their current order. A row whose route/dir/seq/stop_id
# equals an already kept row and whose ETA lies within the route's threshold
# of that row's ETA is folded into it (the kept row takes the newer ETA);
# every other row is kept as a new record.

# Merge window in minutes per route. A route that is not listed gets 0, so
# only identical ETAs are merged for it instead of reusing whatever threshold
# the previous row happened to set.
eta_merge_threshold_min = {'272A': 4, '582': 13}

combined_df['eta'] = pd.to_datetime(combined_df['eta'])
key_columns = ['route', 'dir', 'seq', 'stop_id']
kept_records = []   # output rows, in first-seen order
kept_by_key = {}    # key tuple -> the kept records sharing that key
for _, row in combined_df.iterrows():
    key = tuple(row[column] for column in key_columns)
    if any(pd.isna(value) for value in key):
        # A missing key part never compares equal to anything, so such a row
        # cannot match an earlier one.
        candidates = []
    else:
        candidates = kept_by_key.setdefault(key, [])

    time_threshold = eta_merge_threshold_min.get(str(row['route']), 0)
    updated_existing = False
    for kept in candidates:
        time_diff = abs((row['eta'] - kept['eta']).total_seconds() / 60)
        # Time difference within threshold range
        if time_diff <= time_threshold:
            kept['eta'] = row['eta']
            updated_existing = True
            break

    # Outside time difference range, add to table
    if not updated_existing:
        record = row.to_dict()
        kept_records.append(record)
        candidates.append(record)

# Build the frame once instead of concatenating inside the loop.
update_df = pd.DataFrame(kept_records, columns=combined_df.columns)
update_df['eta'] = pd.to_datetime(update_df['eta']).dt.strftime('%Y-%m-%d %H:%M')
combined_df = update_df
display(combined_df)

In [None]:
# Change KMB direction
# Work on a copy, then mark every KMB row at one of the listed stops as 'I'.
combined_df_copy = combined_df.copy()
combined_df_copy.loc[
    combined_df_copy['source'].eq('KMB') & combined_df_copy['stop_id'].isin(inbound_kmb_ids),
    'dir',
] = 'I'

# Filter peak hour data
def is_peak_hour(eta):
  """Tell whether an ETA falls in a peak window.

  Args:
    eta: ETA as a 'YYYY-MM-DD HH:MM' string.

  Returns:
    True for 08:00-09:59 and 17:00-19:59, False otherwise.
  """
  hour = datetime.strptime(eta, '%Y-%m-%d %H:%M').hour
  in_morning_peak = hour in (8, 9)
  in_evening_peak = hour in (17, 18, 19)
  return in_morning_peak or in_evening_peak
# astype(bool) keeps the mask boolean even when the frame is empty (an empty
# non-boolean Series would be read as a list of column labels), and .copy()
# detaches the filtered rows so the assignments below act on a real frame
# rather than on a slice of the previous one.
peak_mask = combined_df_copy['eta'].apply(is_peak_hour).astype(bool)
combined_df_copy = combined_df_copy[peak_mask].copy()

# Change Citybus outbound University Station stop_id
combined_df_copy.loc[(combined_df_copy['source']=='Citybus') & (combined_df_copy['dir']=='O') & (combined_df_copy['stop_id']=='003841'),'stop_id'] = '003841O'

# Change Citybus inbound Pak Shek Kok Station stop_id
combined_df_copy.loc[(combined_df_copy['source']=='Citybus') & (combined_df_copy['dir']=='I') & (combined_df_copy['stop_id']=='003838'),'stop_id'] = '003838I'

# Change ETA display format in combined_df_copy: from here on 'eta' holds only
# the time of day ('HH:MM'); the date part is dropped.
combined_df_copy['eta'] = combined_df_copy['eta'].apply(
    lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M').strftime('%H:%M'))

# Unify stop_id
def map_stop_id(row):
  """Return the shared stop id for one record.

  Args:
    row: Mapping or Series with 'source' and 'stop_id'.

  Returns:
    For a KMB row, the entry of ``stop_id_mapping`` for its stop_id (the
    stop_id itself when it has no entry); for any other source, the stop_id
    unchanged.
  """
  is_kmb = row['source'] == 'KMB'
  own_id = row['stop_id']
  return stop_id_mapping.get(own_id, own_id) if is_kmb else own_id
# Built row by row into a list so the assignment also works when no rows
# survived the peak-hour filter (a row-wise apply on an empty frame yields a
# frame rather than a column).
combined_df_copy['mapped_stop_id'] = [map_stop_id(row) for _, row in combined_df_copy.iterrows()]

# Simultaneous arrivals
# For every shared stop, pair each KMB ETA with each Citybus ETA in the same
# direction and keep the pairs that are at most 2 minutes apart.
# NOTE(review): 'eta' is 'HH:MM' at this point, so the comparison ignores the
# date; data spanning several days would pair buses from different days —
# confirm that is acceptable.
simul_columns = [
    'mapped_stop_id', 'kmb_route', 'citybus_route', 'kmb_dir', 'citybus_dir',
    'kmb_stop_id', 'citybus_stop_id', 'kmb_stop_name', 'citybus_stop_name',
    'kmb_eta', 'citybus_eta', 'time_different(min)',
]
simul_arrival = []
for map_id in combined_df_copy['mapped_stop_id'].unique():

  group = combined_df_copy[combined_df_copy['mapped_stop_id'] == map_id]
  kmb_group = group[group['source']=='KMB']
  citybus_group = group[group['source']=='Citybus']

  for kmb_index, kmb_row in kmb_group.iterrows():
    kmb_eta = datetime.strptime(kmb_row['eta'], '%H:%M')
    for citybus_index, citybus_row in citybus_group.iterrows():
      if kmb_row['dir'] != citybus_row['dir']:
        continue
      citybus_eta = datetime.strptime(citybus_row['eta'], '%H:%M')
      time_diff = abs((kmb_eta - citybus_eta).total_seconds() / 60)
      if time_diff <= 2:
        simul_arrival.append({
            'mapped_stop_id': map_id,
            'kmb_route': kmb_row['route'],
            'citybus_route': citybus_row['route'],
            'kmb_dir': kmb_row['dir'],
            'citybus_dir': citybus_row['dir'],
            'kmb_stop_id': kmb_row['stop_id'],
            'citybus_stop_id': citybus_row['stop_id'],
            'kmb_stop_name': kmb_row['stop_name'],
            'citybus_stop_name': citybus_row['stop_name'],
            'kmb_eta': kmb_row['eta'],
            'citybus_eta': citybus_row['eta'],
            'time_different(min)': time_diff
        })

# Passing the columns explicitly keeps them present when no pair was found, so
# the sort below and later lookups by column name do not raise KeyError.
simul_arrival_df = pd.DataFrame(simul_arrival, columns=simul_columns)
simul_arrival_df = simul_arrival_df.sort_values(by=['citybus_dir','citybus_eta'])
print(f"\nKMB {kmb_route} and {citybus_route} simultaneous arrival table")
print(f"Data count: {len(simul_arrival_df)}")
print(tabulate(simul_arrival_df, headers='keys', tablefmt='psql', showindex=False))



In [None]:
# Filter records for KMB and Citybus shared stops
overlap_stops = list(set(stop_id_mapping.values()))
overlap_data = combined_df_copy[combined_df_copy['mapped_stop_id'].isin(overlap_stops)]

# Group by mapped_stop_id and source and count.
# reindex guarantees both operator columns exist (filled with 0) even when one
# operator has no record at any shared stop; selecting a missing column below
# would otherwise raise KeyError.
record_counts = (
    overlap_data.groupby(['mapped_stop_id', 'source'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['KMB', 'Citybus'], fill_value=0)
)

# To display stop names, merge mapped_stop_id with stop names
# First create a mapping from mapped_stop_id to stop_name (first row seen per stop)
stop_name_map = overlap_data.drop_duplicates(subset=['mapped_stop_id']).set_index('mapped_stop_id')['stop_name'].to_dict()

record_counts['stop_name'] = record_counts.index.map(stop_name_map)

# Reorder stops according to stop_id_mapping
exist_ids = record_counts.index.tolist()

new_index = [stop_key for stop_key in stop_id_mapping.values() if stop_key in exist_ids]
record_counts = record_counts.reindex(new_index)
record_counts = record_counts[['stop_name', 'KMB', 'Citybus']]

print("Shared stops KMB and Citybus record count:")
display(record_counts)

In [None]:
# Citybus total data at overlap stops
# Keep the Citybus rows whose shared stop id is one of the overlap stops.
citybus_overlap_data = combined_df_copy[
    combined_df_copy['source'].eq('Citybus')
    & combined_df_copy['mapped_stop_id'].isin(overlap_stops)
]
print(f"Data count: {len(citybus_overlap_data)} \n ")
display(citybus_overlap_data.head())

In [None]:
# Citybus grouping, keep only stop_id, stop_name, eta(list)
citybus_group = citybus_overlap_data.groupby('mapped_stop_id').agg(
    stop_name=('stop_name', 'first'),
    etas=('eta', list))
exist_ids = citybus_group.index.tolist()
# Present stops, in the order they appear in stop_id_mapping.
new_index = [stop_key for stop_key in stop_id_mapping.values() if stop_key in exist_ids]

citybus_group = citybus_group.reindex(new_index)
citybus_group = citybus_group.reset_index()

# Split by the direction recorded on each stop's rows instead of by position:
# a fixed "first five rows are inbound" cut puts stops in the wrong table as
# soon as one inbound stop has no data. Each stop goes to exactly one table,
# decided by the 'dir' of its first row ('I' inbound, 'O' outbound).
stop_direction = citybus_overlap_data.groupby('mapped_stop_id')['dir'].first()
group_direction = citybus_group['mapped_stop_id'].map(stop_direction)
citybus_group_inbound = citybus_group[group_direction == 'I'].reset_index(drop=True)
citybus_group_outbound = citybus_group[group_direction == 'O'].reset_index(drop=True)
display(citybus_group_outbound)

In [None]:
def citybus_frequency(citybus_group):
  """Lay out per-stop ETA lists as one column per trip.

  Starting from every ETA of the first stop, the trip is followed stop by
  stop: at each next stop the earliest ETA that is not before the previous
  stop's chosen ETA is taken. A trip ends at the first stop that has no such
  ETA.

  The input frame is not modified, so the cell can be re-run on the same
  data, and an empty frame is handled.

  Args:
    citybus_group: Frame with one row per stop in travel order and an 'etas'
      column holding a list of 'HH:MM' strings per row.

  Returns:
    A new frame (index 0..n-1) with every input column except 'etas' plus
    columns 'Trip_1', 'Trip_2', ... holding 'HH:MM' strings; stops a trip did
    not reach hold NaN.
  """
  stops = citybus_group.reset_index(drop=True)
  result = stops.drop('etas', axis=1)
  if stops.empty:
    return result

  # Parsed copies of the ETA lists; the caller's lists stay untouched.
  etas_by_stop = [
    [datetime.strptime(eta, '%H:%M') for eta in etas]
    for etas in stops['etas']
  ]

  travel_times = []
  for departure_time in etas_by_stop[0]:
    current_arrival_time = departure_time
    trip_travel_times = [current_arrival_time]

    # Traverse other stops
    for next_stop_etas in etas_by_stop[1:]:
      reachable = [eta for eta in next_stop_etas if eta >= current_arrival_time]
      if not reachable:
        break
      # min() returns the first of equal candidates, like the original scan.
      current_arrival_time = min(reachable)
      trip_travel_times.append(current_arrival_time)

    travel_times.append(trip_travel_times)

  for trip_number, trip_times in enumerate(travel_times, start=1):
    formatted_trip = [arrival.strftime('%H:%M') for arrival in trip_times]
    # reindex pads the trip with NaN for the stops it did not reach.
    result[f'Trip_{trip_number}'] = pd.Series(formatted_trip).reindex(result.index)

  return result

# Build the trip table for each direction from its grouped ETA lists.
citybus_frequency_inbound = citybus_frequency(citybus_group_inbound)
citybus_frequency_outbound = citybus_frequency(citybus_group_outbound)

In [None]:
# Plain-text view of the outbound trip table, without the index column.
print(tabulate(citybus_frequency_outbound, headers='keys', tablefmt='psql', showindex=False))

In [None]:
def highlight_simultaneous_eta(row):
  """Styler callback: highlight trip ETAs that coincide with a KMB arrival.

  Args:
    row: One row of a trip table, with 'mapped_stop_id' and 'Trip_*' entries.

  Returns:
    A list with one CSS string per entry of ``row``:
    'background-color: yellow' for every non-missing 'Trip_*' value that
    appears as a ``citybus_eta`` of the same ``mapped_stop_id`` in
    ``simul_arrival_df``, '' otherwise.
  """
  styles = [''] * len(row)

  # A frame built from no matches may lack these columns; nothing to highlight.
  if not {'mapped_stop_id', 'citybus_eta'}.issubset(simul_arrival_df.columns):
    return styles

  same_stop = simul_arrival_df['mapped_stop_id'] == row['mapped_stop_id']
  simultaneous_etas = set(simul_arrival_df.loc[same_stop, 'citybus_eta'])

  for position, (col_name, eta_value) in enumerate(row.items()):
    if str(col_name).startswith('Trip_') and pd.notna(eta_value) and eta_value in simultaneous_etas:
      styles[position] = 'background-color: yellow'

  return styles

# Apply the highlighter row by row (axis=1) to the inbound trip table.
styled_citybus_frequency_inbound = citybus_frequency_inbound.style.apply(
    highlight_simultaneous_eta,
    axis=1,
)
print("Citybus inbound direction\n")
display(styled_citybus_frequency_inbound)

In [None]:
# Apply the highlighter row by row (axis=1) to the outbound trip table.
styled_citybus_frequency_outbound = citybus_frequency_outbound.style.apply(
    highlight_simultaneous_eta,
    axis=1,
)
print("Citybus outbound direction\n")
display(styled_citybus_frequency_outbound)