In [1]:
import psycopg2
import pandas as pd
import numpy as np
import json
import os

In [5]:
# Connection settings come from the standard libpq environment variables so that
# credentials do not have to be committed with the notebook. The fallbacks are the
# values of the original local development setup, so an unconfigured environment
# connects exactly as before.
conn = psycopg2.connect(
    dbname=os.environ.get('PGDATABASE', 'vic_db'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', 'postgres'),
    host=os.environ.get('PGHOST', 'localhost'),
    port=os.environ.get('PGPORT', '5432'),
)
# Autocommit: every statement runs in its own transaction (no explicit commit needed).
conn.autocommit = True
cursor = conn.cursor()

In [19]:
# Load the GTFS calendar tables into DataFrames; column names are taken from the
# cursor metadata of the query that was just executed.
cursor.execute("SELECT * FROM gtfs_4.calendar;")
calendar_columns = [column[0] for column in cursor.description]
calendar_df = pd.DataFrame(cursor.fetchall(), columns=calendar_columns)

cursor.execute("SELECT * FROM gtfs_4.calendar_dates;")
calendar_dates_columns = [column[0] for column in cursor.description]
calendar_dates_df = pd.DataFrame(cursor.fetchall(), columns=calendar_dates_columns)

In [21]:
# Preview the calendar_dates table (service_id, date, exception_type).
calendar_dates_df

Unnamed: 0,service_id,date,exception_type
0,MF10-21-670-aus,20240404,2
1,MF10-21-670-aus,20240411,2
2,MF10-21-670-aus,20240418,2
3,MF10-21-670-aus,20240425,2
4,MF10-21-670-aus,20240502,2
...,...,...,...
65089,Sun9-53-472-aus,20240602,2
65090,Sun9-53-472-aus,20240609,2
65091,Sun9-53-472-aus,20240616,2
65092,Sun9-53-472-aus,20240623,2


In [22]:
def get_dates(monday, tuesday, wednesday, thursday, friday, saturday, sunday, start_date, end_date):
    """Return the days between start_date and end_date on which a week pattern is active.

    Args:
        monday..sunday: Per-weekday flags; anything ``int()`` accepts, where a
            non-zero value means the service runs on that weekday.
        start_date, end_date: Inclusive range bounds in ``YYYYMMDD`` form.

    Returns:
        pd.DatetimeIndex holding only the days of the range whose weekday flag is set.
    """
    day_flags = (monday, tuesday, wednesday, thursday, friday, saturday, sunday)
    # Position in this list matches pandas' dayofweek numbering (Monday == 0).
    runs_on = [bool(int(flag)) for flag in day_flags]

    first_day = pd.to_datetime(start_date, format='%Y%m%d')
    last_day = pd.to_datetime(end_date, format='%Y%m%d')
    all_days = pd.date_range(first_day, last_day)

    keep_mask = [runs_on[weekday] for weekday in all_days.dayofweek]
    return all_days[keep_mask]
    
def get_dates_df_calendar(calendar_df: pd.DataFrame, calendar_dates_df: pd.DataFrame):
    """Expand the GTFS calendar into one row per (service_id, date) and apply exceptions.

    Args:
        calendar_df: GTFS ``calendar`` table. Must contain ``service_id``, the seven
            weekday flag columns and ``start_date`` / ``end_date`` (``YYYYMMDD``).
            May be empty (feeds that define service only through calendar_dates).
        calendar_dates_df: GTFS ``calendar_dates`` table with ``service_id``, ``date``
            and ``exception_type``.

    Returns:
        DataFrame with ``service_id``, ``date`` (``YYYYMMDD`` string) and the remaining
        calendar_dates columns. ``exception_type`` is NaN for regular calendar days and
        ``'1'`` for dates listed with that exception type; rows with exception type
        ``'2'`` (service removed on that date) are dropped.
    """
    weekdate_columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    daterange_columns = ['start_date', 'end_date']
    pattern_columns = weekdate_columns + daterange_columns

    # Many services share a week pattern and date range, so expand each distinct
    # pattern only once. The explicit copy makes the frame safe to add a column to.
    df_patterns = calendar_df[pattern_columns].drop_duplicates().copy()

    # A plain loop instead of DataFrame.apply(axis=1): on an empty frame apply returns
    # a DataFrame, which cannot be assigned to a single column. The tuple order is
    # pattern_columns, which is exactly get_dates' positional signature.
    date_lists = [
        get_dates(*pattern).strftime('%Y%m%d').tolist()
        for pattern in df_patterns.itertuples(index=False, name=None)
    ]
    # dtype=object keeps the column mergeable with string dates even when it is empty.
    df_patterns['date'] = pd.Series(date_lists, index=df_patterns.index, dtype=object)

    # Attach the date list to every service that uses the pattern
    df_dates = pd.merge(calendar_df, df_patterns, on=pattern_columns, how='left')

    # Explode the date list into separate rows
    df_dates = df_dates[['service_id', 'date']].explode('date')

    # A pattern with no running day in its range explodes to a single NaN date; that is
    # not a service date and would break string sorting/comparison downstream.
    df_dates = df_dates.dropna(subset=['date'])

    # Outer join so that dates added purely through calendar_dates are kept as well
    exceptions_df = calendar_dates_df.astype({'date': str, 'exception_type': str})
    df_dates = pd.merge(df_dates, exceptions_df, on=['service_id', 'date'], how='outer')

    # Drop 2 and keep 1 and NaN
    df_dates = df_dates[df_dates['exception_type'] != '2'].reset_index(drop=True)

    return df_dates


In [62]:
# One row per (service_id, date) with the calendar_dates exceptions applied.
df = get_dates_df_calendar(calendar_df, calendar_dates_df)

# route_id is the third dash-separated token of service_id; when the id splits into
# more than four tokens the fourth one is appended with a dash.
service_id_parts = df['service_id'].map(lambda service_id: service_id.split('-'))
df['route_id'] = service_id_parts.map(
    lambda parts: '-'.join(parts[2:4]) if len(parts) > 4 else parts[2]
)

# Collapse to one row per route holding every service date in sorted order
# (a date repeats when several services of the route cover it).
df = (
    df.groupby('route_id')['date']
    .apply(lambda route_dates: sorted(np.array(route_dates)))
    .reset_index()
)

# Per-route summary: raw count, distinct dates, and first/last distinct date.
df['date_count'] = df['date'].map(len)
df['date_set'] = df['date'].map(lambda route_dates: sorted(set(route_dates)))
df['date_set_count'] = df['date_set'].map(len)
df['date_start'] = df['date_set'].map(lambda unique_dates: unique_dates[0])
df['date_end'] = df['date_set'].map(lambda unique_dates: unique_dates[-1])

In [63]:
# Routes ordered by number of distinct service dates, largest first.
df.sort_values('date_set_count', ascending=False)

Unnamed: 0,route_id,date,date_count,date_set,date_set_count,date_start,date_end
318,858,"[20240328, 20240328, 20240329, 20240330, 20240...",109,"[20240328, 20240329, 20240330, 20240331, 20240...",95,20240328,20240630
290,813,"[20240328, 20240328, 20240329, 20240329, 20240...",100,"[20240328, 20240329, 20240330, 20240331, 20240...",95,20240328,20240630
188,578,"[20240328, 20240329, 20240330, 20240331, 20240...",95,"[20240328, 20240329, 20240330, 20240331, 20240...",95,20240328,20240630
189,579,"[20240328, 20240329, 20240330, 20240331, 20240...",95,"[20240328, 20240329, 20240330, 20240331, 20240...",95,20240328,20240630
190,580,"[20240328, 20240329, 20240330, 20240331, 20240...",95,"[20240328, 20240329, 20240330, 20240331, 20240...",95,20240328,20240630
...,...,...,...,...,...,...,...
356,953,"[20240329, 20240330, 20240405, 20240406, 20240...",6,"[20240329, 20240330, 20240405, 20240406, 20240...",6,20240329,20240413
357,959,"[20240329, 20240330, 20240405, 20240406, 20240...",6,"[20240329, 20240330, 20240405, 20240406, 20240...",6,20240329,20240413
42,301,"[20240328, 20240408, 20240409, 20240410, 20240...",6,"[20240328, 20240408, 20240409, 20240410, 20240...",6,20240328,20240412
355,951,"[20240329, 20240330, 20240405, 20240406, 20240...",6,"[20240329, 20240330, 20240405, 20240406, 20240...",6,20240329,20240413


In [69]:
# Add route_id to the calendar table: the third dash-separated token of service_id,
# plus the fourth token when the id splits into more than four parts.
calendar_service_id_parts = calendar_df['service_id'].map(lambda service_id: service_id.split('-'))
calendar_df['route_id'] = calendar_service_id_parts.map(
    lambda parts: '-'.join(parts[2:4]) if len(parts) > 4 else parts[2]
)

In [73]:
# For every route, add up the weekday flags of all its services: each column then
# holds the sum of that weekday's flags over the route's services.
weekday_columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
service_availability_df = (
    calendar_df
    .groupby('route_id')
    .agg({day: 'sum' for day in weekday_columns})
    .reset_index()
)
# Same seven sums as a Monday-to-Sunday list of ints, for easy membership tests.
service_availability_df['pattern'] = service_availability_df.apply(
    lambda route_row: [int(route_row[day]) for day in weekday_columns],
    axis=1,
)

In [91]:
# Split routes by whether at least one weekday has no service at all.
has_day_without_service = service_availability_df['pattern'].apply(lambda day_counts: 0 in day_counts)
service_absent_df = service_availability_df[has_day_without_service]
service_full_df = service_availability_df[~has_day_without_service]

In [94]:
# Routes whose route_id contains a dash, i.e. ids where a fourth service_id token was appended.
service_availability_df[service_availability_df['route_id'].str.contains('-')]

Unnamed: 0,route_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,pattern
228,695-F,3,0,0,0,5,6,8,"[3, 0, 0, 0, 5, 6, 8]"


In [93]:
# Routes with no zero in their weekday pattern whose route_id starts with '7'.
service_full_df[service_full_df['route_id'].str.startswith('7')]

Unnamed: 0,route_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,pattern
232,701,2,1,1,2,2,1,1,"[2, 1, 1, 2, 2, 1, 1]"
233,703,2,1,1,2,2,1,1,"[2, 1, 1, 2, 2, 1, 1]"
234,704,2,1,1,2,2,1,1,"[2, 1, 1, 2, 2, 1, 1]"
237,708,6,3,3,6,5,3,3,"[6, 3, 3, 6, 5, 3, 3]"
238,709,5,2,2,5,4,3,3,"[5, 2, 2, 5, 4, 3, 3]"
239,732,7,4,4,7,7,6,6,"[7, 4, 4, 7, 7, 6, 6]"
240,733,2,1,1,2,2,1,1,"[2, 1, 1, 2, 2, 1, 1]"
241,734,2,1,1,3,3,4,4,"[2, 1, 1, 3, 3, 4, 4]"
242,735,5,3,3,6,6,6,6,"[5, 3, 3, 6, 6, 6, 6]"
243,736,5,3,3,6,6,6,6,"[5, 3, 3, 6, 6, 6, 6]"


In [96]:
TABLE_NAMES = ['agency', 'calendar', 'calendar_dates', 'routes', 'shapes', 'stop_times', 'stops', 'trips']
MODE_IDS = ['1', '2', '3', '4', '5', '6', '7', '8', '10', '11']

# Load the stop_times, stops and trips tables with one shared fetch loop; column
# names come from the cursor metadata of the query that was just executed.
loaded_frames = []
for table_query in (
    "SELECT * FROM gtfs_4.stop_times;",
    "SELECT * FROM gtfs_4.stops;",
    "SELECT * FROM gtfs_4.trips;",
):
    cursor.execute(table_query)
    loaded_frames.append(
        pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
    )
stop_times_df, stops_df, trips_df = loaded_frames

In [97]:
# Preview the trips table.
trips_df

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id
0,12-831-aus-1,MF2-12-831-aus,12-831--1-MF2-105,12-831-aus-1.3.H,Kingsmere Estate,0
1,12-831-aus-1,MF2-12-831-aus,12-831--1-MF2-107,12-831-aus-1.3.H,Kingsmere Estate,0
2,12-831-aus-1,MF2-12-831-aus,12-831--1-MF2-109,12-831-aus-1.3.H,Kingsmere Estate,0
3,12-831-aus-1,MF2-12-831-aus,12-831--1-MF2-111,12-831-aus-1.3.H,Kingsmere Estate,0
4,12-831-aus-1,MF2-12-831-aus,12-831--1-MF2-113,12-831-aus-1.3.H,Kingsmere Estate,0
...,...,...,...,...,...,...
143172,82-498-aus-1,Sun7-82-498-aus,82-498--1-Sun7-6640774,82-498-aus-1.3.R,Laverton Station,1
143173,82-498-aus-1,Sun7-82-498-aus,82-498--1-Sun7-6640874,82-498-aus-1.2.H,Hoppers Crossing Station,0
143174,82-498-aus-1,Sun7-82-498-aus,82-498--1-Sun7-6641174,82-498-aus-1.3.R,Laverton Station,1
143175,82-498-aus-1,Sun7-82-498-aus,82-498--1-Sun7-6641274,82-498-aus-1.2.H,Hoppers Crossing Station,0


In [None]:
def merge_stop_ids_sequence(stop_ids_list):
    """Merge several stop-id sequences into one combined, ordered stop list.

    Starts from a copy of the longest sequence and folds every sequence into it in
    turn. Each stop is searched for in the merged list from the current read position
    onwards: a stop that is found pulls in everything up to and including its match,
    a stop that is not found is inserted at the current position.

    Args:
        stop_ids_list: Non-empty collection of stop-id sequences; each sequence must
            support ``len()``, ``.copy()``, indexing and slicing (list or numpy array).

    Returns:
        list of stop ids containing every input sequence as an in-order subsequence.
    """
    merged = max(stop_ids_list, key=len).copy()
    for sequence in stop_ids_list:
        consumed = 0   # merged[:consumed] has already been copied into `rebuilt`
        rebuilt = []
        for stop_id in sequence:
            # First position at or after `consumed` holding this stop, if any.
            match_at = next(
                (pos for pos in range(consumed, len(merged)) if stop_id == merged[pos]),
                None,
            )
            if match_at is None:
                rebuilt.append(stop_id)
            else:
                rebuilt.extend(merged[consumed:match_at + 1])
                consumed = match_at + 1
        # Whatever was not reached by this sequence keeps its place at the end.
        rebuilt.extend(merged[consumed:])
        merged = rebuilt
    return merged

def get_true_stop_order_sequence(stop_ids_full: list, stop_ids: list, stops_sequence: list[int]):
    """Re-number a trip's stops by their position in the merged stop list.

    Walks ``stop_ids_full`` once and matches the trip's stops in order. A matched stop
    gets its 1-based position in the full list plus a running offset, unless the trip's
    own sequence number is larger; in that case the trip's number is kept and the
    offset grows by the difference, so every later stop is shifted by the same amount.

    Args:
        stop_ids_full: Merged stop list; must contain ``stop_ids`` as an in-order
            subsequence.
        stop_ids: Stop ids of one trip, in travel order.
        stops_sequence: The trip's own sequence number per stop; must be at least as
            long as ``stop_ids``.

    Returns:
        list with one sequence number per entry of ``stop_ids``.

    Raises:
        AssertionError: if some stop of the trip could not be matched in
            ``stop_ids_full`` in order.
    """
    stop_true_sequence = []
    j = 0        # index of the next trip stop waiting for a match
    offset = 0   # total shift accumulated from jumps in the trip's own numbering
    for i, full_stop_id in enumerate(stop_ids_full):
        if j >= len(stop_ids):
            break
        if full_stop_id == stop_ids[j]:
            new_ix = i + 1 + offset
            cur_ix = stops_sequence[j]
            if new_ix < cur_ix:
                stop_true_sequence.append(cur_ix)
                # new_ix already includes the previous offset, so the extra shift has
                # to be accumulated; overwriting the offset here would discard earlier
                # jumps and let later stops land on the slot of a skipped stop.
                offset += cur_ix - new_ix
            else:
                stop_true_sequence.append(new_ix)
            j += 1
    assert j == len(stop_ids), "stop_ids is not an in-order subsequence of stop_ids_full"
    return stop_true_sequence

In [100]:
# Close the database connection; the DataFrames built from earlier queries stay in memory.
conn.close()

In [None]:
# Build, per (route_code, direction_id), the merged "full" stop list from all distinct
# stop patterns of the trips, and attach it back to every trip.
# NOTE(review): `DF` is not defined in any cell shown in this notebook, so this cell
# only runs if `DF` is already in the kernel. Presumably DF[vid][bid]['trips'] is
# the same table as trips_df and DF[vid][bid]['trip_ids'] supplies `route_code` — confirm.
bid = '4'
vid = '20220403_025040'
# Order stop_times so that each trip's stops are in travel order before grouping.
df_trips = stop_times_df.sort_values(by=['trip_id', 'stop_sequence'])
# 5s - 10s
# One row per trip with the array of its stop ids.
df_trips1 = df_trips.groupby('trip_id')['stop_id'].apply(np.array).reset_index(name='stop_ids')
# 2s - 5s, using np.array. Faster than using list (2s - 5s)
# One row per trip with the array of its stop_sequence values (same order as stop_ids).
df_trips2 = df_trips.groupby('trip_id')['stop_sequence'].apply(np.array).reset_index(name='stops_sequence')
# 3s - 5s, using np.array. Faster than using list (1m 30s - 4m)
df_trips = pd.merge(df_trips1, df_trips2, on='trip_id')
# Bring in direction_id and the columns of the trip_ids table (left joins keep every trip).
df_trips = pd.merge(df_trips, DF[vid][bid]['trips'][['trip_id', 'direction_id']], on='trip_id', how='left')
df_trips = pd.merge(df_trips, DF[vid][bid]['trip_ids'], on='trip_id', how='left')
# String key identifying a stop pattern; str.join requires the stop ids to be strings.
df_trips['pattern'] = df_trips['stop_ids'].apply(lambda x: '-'.join(x))
# Keep one representative trip per distinct stop pattern of each route and direction.
df_tripstops_full_all_patterns = df_trips.drop_duplicates(subset=['route_code', 'direction_id', 'pattern'])
# Collect the distinct patterns per route/direction and merge them into one stop list.
df_tripstops_full = df_tripstops_full_all_patterns.groupby(['route_code', 'direction_id'])['stop_ids'].apply(np.array)
df_tripstops_full = df_tripstops_full.apply(lambda x: merge_stop_ids_sequence(x))
df_tripstops_full.rename('stop_ids_full', inplace=True)
df_tripstops_full = df_tripstops_full.reset_index()
df_tripstops_full['stop_ids_full_count'] = df_tripstops_full['stop_ids_full'].apply(len)
# Attach the merged stop list to every trip and to every representative pattern row.
df_trips_rck = pd.merge(df_trips, df_tripstops_full, on=['route_code', 'direction_id'], how='left')
df_tripstops_full_all_patterns = pd.merge(df_tripstops_full_all_patterns, df_tripstops_full, on=['route_code', 'direction_id'], how='left')
