# Imports

In [1]:
import pandas as pd
import numpy as np
import os

# Structure

In [2]:
def define_paths(data_folder=None, date=None):
    """Build the file paths used by the notebook.

    Args:
        data_folder: Folder holding the persistent CSV tables. When truthy it
            takes precedence over ``date``.
        date: Name of the GTFS snapshot sub-folder under ``../data/raw/``.

    Returns:
        With ``data_folder``: a 5-tuple of paths for routes, route_versions,
        shape_variants, shape_variant_activations and temporary_changes CSVs.
        With ``date`` only: a 4-tuple of paths for routes, trips, shapes and
        calendar_dates text files of that snapshot.

    Raises:
        ValueError: If neither argument is provided.
    """
    if data_folder:
        table_files = (
            'routes.csv',
            'route_versions.csv',
            'shape_variants.csv',
            'shape_variant_activations.csv',
            'temporary_changes.csv',
        )
        return tuple(os.path.join(data_folder, file_name) for file_name in table_files)

    if date:
        raw_root = '../data/raw/'
        snapshot_files = ('routes.txt', 'trips.txt', 'shapes.txt', 'calendar_dates.txt')
        return tuple(os.path.join(raw_root, date, file_name) for file_name in snapshot_files)

    raise ValueError("Either data_folder or date must be provided.")

In [3]:
def load_txt_data(date, print_shapes=False):
    """Read the four GTFS text files of one snapshot into DataFrames.

    Args:
        date: Snapshot folder name, forwarded to ``define_paths(date=...)``.
        print_shapes: When True, print the (rows, columns) shape of each table.

    Returns:
        Tuple ``(routes_txt, trips_txt, shapes_txt, calendar_dates_txt)``.
    """
    source_paths = define_paths(date=date)
    routes_txt, trips_txt, shapes_txt, calendar_dates_txt = (
        pd.read_csv(source_path) for source_path in source_paths
    )

    if print_shapes:
        labelled_frames = (
            ("Routes:", routes_txt),
            ("Trips:", trips_txt),
            ("Shapes:", shapes_txt),
            ("Calendar Dates:", calendar_dates_txt),
        )
        for label, frame in labelled_frames:
            print(label, frame.shape)

    return routes_txt, trips_txt, shapes_txt, calendar_dates_txt

In [4]:
# GTFS snapshot to process; define_paths uses this value as the sub-folder name under ../data/raw/.
date = '20131018'
routes_txt, trips_txt, shapes_txt, calendar_dates_txt = load_txt_data(date)

In [5]:
def load_df_data(data_folder):
    """Load the persistent tables, creating any that do not exist yet.

    Each table is handled independently: an existing CSV is read as-is, while
    a missing CSV is replaced by an empty DataFrame with the expected columns,
    which is then written to disk so that the next run finds it. Existing
    files are never overwritten here.

    Args:
        data_folder: Folder holding the CSV tables (see ``define_paths``).

    Returns:
        Tuple ``(routes_df, route_versions_df, shape_variants_df,
        shape_variant_activations_df, temporary_changes_df)``.
    """
    (routes_df_path,
     route_versions_df_path,
     shape_variants_df_path,
     shape_variant_activations_df_path,
     temporary_changes_df_path) = define_paths(data_folder=data_folder)

    # One entry per table: (path, columns of the empty table, read_csv options).
    # The order matches the order of the returned tuple.
    table_specs = [
        (
            routes_df_path,
            ["route_id", "agency_id", "route_short_name", "route_type",
             "route_color", "route_text_color"],
            {},
        ),
        (
            route_versions_df_path,
            ["version_id", "route_id", "direction_id", "route_long_name", "route_desc",
             "valid_from", "valid_to", "shape_id", "trip_headsign",
             "parent_version_id", "note"],
            {"parse_dates": ["valid_from", "valid_to"]},
        ),
        (
            shape_variants_df_path,
            ["shape_variant_id", "version_id", "shape_id", "is_main", "note"],
            {},
        ),
        (
            shape_variant_activations_df_path,
            ["date", "shape_variant_id"],
            {},
        ),
        (
            temporary_changes_df_path,
            ["detour_id", "route_id", "start_date", "end_date",
             "affects_version_id", "description"],
            {},
        ),
    ]

    frames = []
    for table_path, columns, read_options in table_specs:
        try:
            frame = pd.read_csv(table_path, **read_options)
        except FileNotFoundError:
            # First run for this table only: start from an empty frame.
            frame = pd.DataFrame(columns=columns)
            # Keep date columns as datetimes even when the table is empty.
            for date_column in read_options.get("parse_dates", []):
                frame[date_column] = pd.to_datetime(frame[date_column])
            frame.to_csv(table_path, index=False)
        frames.append(frame)

    return tuple(frames)

In [6]:
# Folder of the persistent CSV tables; load all five of them into DataFrames.
data_folder = '../data/processed/'
routes_df, route_versions_df, shape_variants_df, shape_variant_activations_df, temporary_changes_df = load_df_data(data_folder)

In [7]:
# Display the routes table as loaded from disk.
routes_df

Unnamed: 0,route_id,agency_id,route_short_name,route_type,route_color,route_text_color


## Update routes_df

In [8]:
def update_routes_df(routes_df, routes_txt):
    """Append routes from ``routes_txt`` whose route_id is not yet stored.

    Every column of ``routes_txt`` except ``route_desc`` is carried over. The
    inputs are left untouched; a report about duplicated route_id values in
    the result is printed.

    Args:
        routes_df: Stored routes table (must have a ``route_id`` column).
        routes_txt: Routes table of the current snapshot.

    Returns:
        New DataFrame with the stored routes followed by the unseen ones.
    """
    # Everything except the free-text description is kept.
    kept_columns = [name for name in routes_txt.columns if name != "route_desc"]

    # Rows of the snapshot whose route_id is unknown to the stored table.
    already_stored = routes_txt["route_id"].isin(routes_df["route_id"])
    unseen_routes = routes_txt.loc[~already_stored, kept_columns]

    updated_routes_df = pd.concat([routes_df, unseen_routes], ignore_index=True)

    # Report route_id values that occur more than once in the result.
    repeated_mask = updated_routes_df.duplicated(subset="route_id", keep=False)
    if repeated_mask.any():
        repeated_ids = updated_routes_df.loc[repeated_mask, "route_id"]
        print(f"Warning: There are {repeated_ids.nunique()} duplicated route_id(s) in routes_df!")
        print("Duplicated route_id(s):")
        print(repeated_ids.unique())
    else:
        print("No duplicate route_id found in routes_df.")

    return updated_routes_df

def save_routes(routes_df, data_folder):
    """Write ``routes_df`` to the routes CSV inside ``data_folder``.

    The target path is the first entry returned by ``define_paths``; the index
    is not written. A confirmation line is printed afterwards.
    """
    target_path = define_paths(data_folder=data_folder)[0]
    routes_df.to_csv(target_path, index=False)
    print(f"routes_df saved to {target_path}")

In [9]:
# Add the snapshot's routes that are not stored yet, then persist the merged table.
updated_routes_df = update_routes_df(routes_df, routes_txt)
save_routes(updated_routes_df, data_folder)

No duplicate route_id found in routes_df.
routes_df saved to ../data/processed/routes.csv


In [11]:
# Display the merged routes table.
updated_routes_df

Unnamed: 0,route_id,agency_id,route_short_name,route_type,route_color,route_text_color,route_long_name
0,MOZG,BKK,MOZG,3,1188FF,000000,
1,MP52,BKK,M2,3,1188FF,000000,
2,MP525,BKK,M2E,3,1188FF,000000,
3,VP01,BKK,1V,3,1188FF,000000,
4,VP02,BKK,2V,3,1188FF,000000,
...,...,...,...,...,...,...,...
315,9940,BKK,994,3,333333,FFFFFF,
316,9960,BKK,996,3,333333,FFFFFF,
317,9961,BKK,996A,3,333333,FFFFFF,
318,9980,BKK,998,3,333333,FFFFFF,


## Update route_versions_df

In [22]:
def version_exists(current_versions, row):
    """Tell whether ``row`` is already present among ``current_versions``.

    A version is identified by the combination of route_id, direction_id,
    shape_id and trip_headsign.

    Args:
        current_versions: DataFrame of stored versions to search.
        row: Series (or mapping) describing one candidate version.

    Returns:
        True if at least one row of ``current_versions`` matches all four keys.
    """
    same_version = (
        (current_versions["route_id"] == row["route_id"]) &
        (current_versions["direction_id"] == row["direction_id"]) &
        (current_versions["shape_id"] == row["shape_id"]) &
        (current_versions["trip_headsign"] == row["trip_headsign"])
    )
    return bool(same_version.any())


def update_route_versions(route_versions_df, trips_txt, routes_txt, date):
    """Extend the route-version history with the versions seen in a snapshot.

    For every route the two most frequent (shape_id, trip_headsign,
    direction_id) combinations of ``trips_txt`` are taken as candidate
    versions. Candidates that are not among the currently open versions
    (``valid_to`` is missing) are appended with ``valid_from`` set to the
    snapshot date, and the open versions of the same route *and direction*
    are closed on the day before. Open versions of a direction that did not
    change stay open.

    Args:
        route_versions_df: Stored version history; it is not modified.
        trips_txt: Trips table of the snapshot.
        routes_txt: Routes table of the snapshot (provides route_long_name and
            route_desc).
        date: Snapshot date, anything ``pd.to_datetime`` accepts.

    Returns:
        New DataFrame: the stored history (with superseded versions closed)
        followed by the newly created versions.
    """
    # First version_id handed out when the history is still empty.
    START_VERSION_ID = 100_000

    snapshot_date = pd.to_datetime(date)
    updated_versions_df = route_versions_df.copy()

    if route_versions_df.empty:
        next_version_id = START_VERSION_ID
    else:
        next_version_id = int(route_versions_df["version_id"].max()) + 1

    # Candidate versions: per route, the two combinations with the most trips,
    # ordered by direction.
    trips_grouped = trips_txt.groupby(['route_id', 'shape_id', 'trip_headsign', 'direction_id']).count()
    trips_grouped = trips_grouped.sort_values(by=['route_id', 'service_id'], ascending=[True, False])
    trips_grouped = trips_grouped.groupby('route_id').head(2).sort_values(by=['route_id', 'direction_id']).reset_index()
    trips_grouped = trips_grouped[["route_id", "shape_id", "trip_headsign", "direction_id"]]

    new_versions_df = pd.merge(trips_grouped, routes_txt[["route_id", "route_long_name", "route_desc"]], on="route_id")
    new_versions_df["valid_from"] = snapshot_date  # date of the given GTFS snapshot
    new_versions_df["valid_to"] = pd.NaT
    new_versions_df["parent_version_id"] = np.nan
    new_versions_df["note"] = np.nan

    # Versions without an end date are the ones currently in force.
    still_open = route_versions_df["valid_to"].isna()
    current_versions = route_versions_df[still_open]

    # Keep only the candidates that are not in force already. The flags are
    # collected in an explicit boolean Series so that an empty candidate table
    # is filtered correctly as well.
    already_stored = pd.Series(
        [version_exists(current_versions, row) for _, row in new_versions_df.iterrows()],
        index=new_versions_df.index,
        dtype=bool,
    )
    new_versions_filtered = new_versions_df[~already_stored].copy()

    # Close the open versions that the new ones supersede. Matching on the
    # direction as well keeps the unchanged direction of a route open.
    closing_date = snapshot_date - pd.Timedelta(days=1)
    superseded_keys = new_versions_filtered[["route_id", "direction_id"]].drop_duplicates()
    for route_id, direction_id in superseded_keys.itertuples(index=False, name=None):
        mask = (
            still_open &
            (route_versions_df["route_id"] == route_id) &
            (route_versions_df["direction_id"] == direction_id)
        )
        if mask.any():
            updated_versions_df.loc[mask, "valid_to"] = closing_date

    new_versions_filtered["version_id"] = range(next_version_id, next_version_id + len(new_versions_filtered))

    extended_route_versions_df = pd.concat([updated_versions_df, new_versions_filtered], ignore_index=True)

    return extended_route_versions_df

def save_route_versions(route_versions_df, data_folder):
    """Write ``route_versions_df`` to the route_versions CSV in ``data_folder``.

    The target path is the second entry returned by ``define_paths``; the
    index is not written. A confirmation line naming the saved table is
    printed afterwards.
    """
    _, route_versions_df_path, _, _, _ = define_paths(data_folder=data_folder)
    route_versions_df.to_csv(route_versions_df_path, index=False)
    # The message names the table that was actually written.
    print(f"route_versions_df saved to {route_versions_df_path}")

In [23]:
# Derive the route versions of this snapshot, persist the extended history and preview its first rows.
extended_route_versions_df = update_route_versions(route_versions_df, trips_txt, routes_txt, date)
save_route_versions(extended_route_versions_df, data_folder)
extended_route_versions_df.head(4)

routes_df saved to ../data/processed/route_versions.csv


Unnamed: 0,version_id,route_id,direction_id,route_long_name,route_desc,valid_from,valid_to,shape_id,trip_headsign,parent_version_id,note
0,100000,50,0,,"Pasaréti tér / Rákospalota, Kossuth utca",2013-10-18,NaT,R952,"Rákospalota, Kossuth utca",,
1,100001,50,1,,"Pasaréti tér / Rákospalota, Kossuth utca",2013-10-18,NaT,U961,Pasaréti tér,,
2,100002,70,0,,"Albertfalva vasútállomás / Újpalota, Nyírpalot...",2013-10-18,NaT,S704,"Újpalota, Nyírpalota út",,
3,100003,70,1,,"Albertfalva vasútállomás / Újpalota, Nyírpalot...",2013-10-18,NaT,S705,Albertfalva vasútállomás,,


# Check trips_bkk_ref

In [14]:
# Preview the raw trips table, including its trips_bkk_ref column.
trips_txt.head(3)

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id,wheelchair_accessible,trips_bkk_ref
0,6100,A65119ASZGPP-021,A651191,Örs vezér tere M+H,1,A65119ASZGPP-021_10A,1226,2,61001
1,6100,A65119ASZGPP-021,A6511910,Gödöllő,0,A65119ASZGPP-021_10B,1225,2,61000
2,6100,A65119ASZGPP-021,A65119100,Gödöllő,0,A65119ASZGPP-021_7B,1225,2,61000


In [31]:
# Same grouping as in update_route_versions, but with the count columns kept
# so that the number of non-missing trips_bkk_ref values per group is visible.
trips_grouped = (
    trips_txt
    .groupby(['route_id', 'shape_id', 'trip_headsign', 'direction_id'])
    .count()
    .sort_values(by=['route_id', 'service_id'], ascending=[True, False])
    .groupby('route_id')
    .head(2)
    .sort_values(by=['route_id', 'direction_id'])
    .reset_index()
)
trips_grouped.iloc[50:100]

Unnamed: 0,route_id,shape_id,trip_headsign,direction_id,service_id,trip_id,block_id,wheelchair_accessible,trips_bkk_ref
50,301,W488,Keleti pályaudvar M,1,384,384,384,384,384
51,310,C595,"Árpádföld, Bekecs utca",0,731,731,731,731,0
52,310,C597,Örs vezér tere M+H,1,747,747,747,747,0
53,320,H302,Örs vezér tere M+H,0,470,470,470,470,470
54,320,E322,Árpád híd M,1,470,470,470,470,470
55,330,U527,"Nagytétény, ipartelep",0,639,639,639,639,639
56,330,U528,Móricz Zsigmond körtér,1,621,621,621,621,621
57,340,X382,"Békásmegyer, Újmegyeri tér",0,226,226,226,226,226
58,340,X381,Árpád híd M,1,222,222,222,222,222
59,350,U719,Szentlőrinci úti lakótelep,0,393,393,393,393,393


In [34]:
# Distinct trips_bkk_ref values among the trips of route 0660.
trips_txt[trips_txt["route_id"] == "0660"]["trips_bkk_ref"].unique()

array([nan, '06600', '06601'], dtype=object)

In [47]:
# Non-missing value counts per (shape, direction) for route 0660; the trips_bkk_ref column shows how many trips carry a reference.
trips_txt[trips_txt["route_id"] == "0660"].groupby(["route_id", "shape_id", "direction_id"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,service_id,trip_id,trip_headsign,block_id,wheelchair_accessible,trips_bkk_ref
route_id,shape_id,direction_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
660,1769,1,14,14,14,14,14,0
660,J520,0,549,549,549,549,549,0
660,J521,0,35,35,35,35,35,35
660,J522,1,50,50,50,50,50,50
660,J523,1,533,533,533,533,533,0
660,P926,1,5,5,5,5,5,0


In [48]:
# Same breakdown with trips_bkk_ref added as a grouping key; by default groupby drops rows whose key is missing.
trips_txt[trips_txt["route_id"] == "0660"].groupby(["route_id", "shape_id", "direction_id", "trips_bkk_ref"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,service_id,trip_id,trip_headsign,block_id,wheelchair_accessible
route_id,shape_id,direction_id,trips_bkk_ref,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
660,J521,0,6600,35,35,35,35,35
660,J522,1,6601,50,50,50,50,50


In [49]:
# Look up the routes.txt entry of route 0660.
routes_txt[routes_txt["route_id"] == "0660"]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color
57,660,BKK,66,,"Határ út M / Soroksár, központi raktárak",3,1188FF,0
