# Imports

In [643]:
import pandas as pd
import numpy as np
import os

# Structure

In [644]:
def define_paths(data_folder=None, date=None):
    if data_folder:
        shapes_df_path = os.path.join(data_folder, 'shapes.csv')
        routes_df_path = os.path.join(data_folder, 'routes.csv')
        route_versions_df_path = os.path.join(data_folder, 'route_versions.csv')
        shape_variants_df_path = os.path.join(data_folder, 'shape_variants.csv')
        shape_variant_activations_df_path = os.path.join(data_folder, 'shape_variant_activations.csv')

        temporary_changes_df_path = os.path.join(data_folder, 'temporary_changes.csv')
        return shapes_df_path, routes_df_path, route_versions_df_path, shape_variants_df_path, shape_variant_activations_df_path, temporary_changes_df_path

    if date:
        date_folder = '../data/raw/'
        routes_path = os.path.join(date_folder, date, 'routes.txt')
        trips_path = os.path.join(date_folder, date, 'trips.txt')
        shapes_path = os.path.join(date_folder, date, 'shapes.txt')
        calendar_path = os.path.join(date_folder, date, 'calendar.txt')
        calendar_dates_path = os.path.join(date_folder, date, 'calendar_dates.txt')

        return routes_path, trips_path, shapes_path, calendar_path, calendar_dates_path

    raise ValueError("Either data_folder or date must be provided.")

In [645]:
def load_txt_data(date, print_shapes=False):
    routes_path, trips_path, shapes_path, calendar_path, calendar_dates_path = define_paths(date=date)
    routes_txt = pd.read_csv(routes_path)
    trips_txt = pd.read_csv(trips_path)
    shapes_txt = pd.read_csv(shapes_path)
    calendar_dates_txt = pd.read_csv(calendar_dates_path)

    if print_shapes:
        print("Routes:", routes_txt.shape)
        print("Trips:", trips_txt.shape)
        print("Shapes:", shapes_txt.shape)
        print("Calendar Dates:", calendar_dates_txt.shape)

    try:  # Check if the file exists
        calendar_txt = pd.read_csv(calendar_path)
    except FileNotFoundError:
        # Make empty dataframes for the first time
        print("Calendar file not found. Creating empty dataframe.")
        calendar_txt = pd.DataFrame(columns=['service_id', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'start_date', 'end_date'])
    
    return routes_txt, trips_txt, shapes_txt, calendar_txt, calendar_dates_txt

In [646]:
date = '20131018'
routes_txt, trips_txt, shapes_txt, calendar_txt, calendar_dates_txt = load_txt_data(date)

In [647]:
def load_df_data(data_folder):
    shapes_df_path, routes_df_path, route_versions_df_path, shape_variants_df_path, shape_variant_activations_df_path, temporary_changes_df_path = define_paths(data_folder=data_folder)

    try:
        shapes_df = pd.read_csv(shapes_df_path)
        routes_df = pd.read_csv(routes_df_path)
        route_versions_df = pd.read_csv(route_versions_df_path, parse_dates=['valid_from', 'valid_to'])
        shape_variants_df = pd.read_csv(shape_variants_df_path)
        shape_variant_activations_df = pd.read_csv(shape_variant_activations_df_path)
        temporary_changes_df = pd.read_csv(temporary_changes_df_path)
    except FileNotFoundError:
        # Make empty dataframes for the first time
        ### shapes_df ###
        shapes_df = pd.DataFrame(columns=[
            "shape_id", "shape_pt_lat", "shape_pt_lon", "shape_pt_sequence", "shape_dist_traveled", "shape_bkk_ref"
        ])

        ### routes_df ###
        routes_df = pd.DataFrame(columns=[
            "route_id", "agency_id", "route_short_name", "route_type", "route_color", "route_text_color"
        ])

        ### route_versions_df ###
        route_versions_df = pd.DataFrame(columns=[
            "version_id", "route_id", "direction_id", "route_long_name", "route_desc",
            "valid_from", "valid_to", "main_shape_id", "trip_headsign",
            "parent_version_id", "note"
        ])
        # valid_from and valid_to be converted to datetime
        route_versions_df['valid_from'] = pd.to_datetime(route_versions_df['valid_from'])
        route_versions_df['valid_to'] = pd.to_datetime(route_versions_df['valid_to'])
        
        ### shape_variants_df ###
        shape_variants_df = pd.DataFrame(columns=[
            "shape_variant_id", "version_id", "shape_id", "trip_headsign", "is_main", "note"
        ])

        ### shape_variant_activations_df ###
        shape_variant_activations_df = pd.DataFrame(columns=[
            "date", "shape_variant_id", "exception_type"
        ])

        ### temporary_changes_df ###
        temporary_changes_df = pd.DataFrame(columns=[
            "detour_id", "route_id", "start_date", "end_date", "affects_version_id", "description"
        ])
        # Save
        shapes_df.to_csv(shapes_df_path, index=False)
        routes_df.to_csv(routes_df_path, index=False)
        route_versions_df.to_csv(route_versions_df_path, index=False)
        shape_variants_df.to_csv(shape_variants_df_path, index=False)
        shape_variant_activations_df.to_csv(shape_variant_activations_df_path, index=False)
        temporary_changes_df.to_csv(temporary_changes_df_path, index=False)

    return shapes_df, routes_df, route_versions_df, shape_variants_df, shape_variant_activations_df, temporary_changes_df

In [648]:
data_folder = '../data/processed/'
shapes_df, routes_df, route_versions_df, shape_variants_df, shape_variant_activations_df, temporary_changes_df = load_df_data(data_folder)

In [649]:
trips_df = trips_txt.copy()
calendar_df = calendar_txt.copy()
calendar_dates_df = calendar_dates_txt.copy()

## Update routes_df

In [650]:
date_dt = pd.to_datetime(date, format="%Y%m%d")

# Dátum tartomány szűrése
calendar_df['start_date'] = pd.to_datetime(calendar_df['start_date'], format="%Y%m%d")
calendar_df['end_date'] = pd.to_datetime(calendar_df['end_date'], format="%Y%m%d")

# Nap neve (pl. 'friday')
day = date_dt.strftime('%A').lower()

# Egyszerű szűrés egy lépésben
valid_service_ids = calendar_df[
    (calendar_df['start_date'] <= date_dt) &
    (calendar_df['end_date'] >= date_dt) &
    (calendar_df[day] == 1)
]["service_id"].tolist()
valid_service_ids

['A87571AHPGPP-021',
 'A88120APPCsZ-011',
 'A88303AHPHA-0011',
 'A88327AHPSzGy011',
 'A88584AHPSzGy011',
 'A88866APPGPP-021',
 'A88957APPCsZ-011',
 'A88991APPKP-0011',
 'A89527APPKP-0011',
 'A91905AHPKZ-0011',
 'A92340AHPCsZ-031',
 'A92429APPPG-0011',
 'A92884APPPG-0011',
 'A93049APPKP-0021',
 'A93473APPKZ-0011',
 'A93584APPHA-0011',
 'A93967APPPG-0011',
 'A94562APPPG-0011',
 'A94762AHPHA-0021',
 'A94769APPCsZ-011',
 'A94963APPHA-0011',
 'A94975APPCsZ-051',
 'A94975APPCsZ-061',
 'A95090APPGER-011',
 'A95129AHPKZ-0011',
 'A95149APPMA-0011',
 'A95551APPCsZ-051',
 'A95576AHPKZ-0011',
 'A95586APPKZ-0011',
 'A95633AHPKZ-0011',
 'A95640APPMA-0021',
 'A95720APPPG-0011',
 'A95730APPHA-0061',
 'A95797AHPHA-0021',
 'A95818APPSRM-021',
 'A95868APPPG-0061',
 'A95886APPCsZ-011',
 'A95886APPCsZ-021',
 'A96075APPHA-0011',
 'A96236APPHA-0011',
 'A96245AHPBA0011',
 'A96307APPBA0011',
 'A96311AHPHA-0031',
 'A96320AHPKM-0011',
 'A96395APPPG-0041',
 'A96405APPPG-0031',
 'A96435APPSRM-021',
 'A96690TJHPSzM

In [651]:
calendar_dates_df[(calendar_dates_df["date"] == int(date)) & (calendar_dates_df["exception_type"] == 2)]

Unnamed: 0,service_id,date,exception_type
1888,A98178AHPCsZ-041,20131018,2
3632,B01134APPCsZ-021,20131018,2
3682,B01188AHPKM-0011,20131018,2
4127,B01326RA1PPHZS-011,20131018,2
4131,B01331RA1PPPG-0011,20131018,2


In [652]:
remove = calendar_dates_df[(calendar_dates_df["date"] == int(date)) & (calendar_dates_df["exception_type"] == 2)]["service_id"]
remove_set = set(remove)
filtered_service_ids = [sid for sid in valid_service_ids if sid not in remove_set]

In [653]:
len(valid_service_ids), len(remove_set), len(filtered_service_ids)

(246, 5, 241)

In [654]:
#trips2routes_df = trips_df[trips_df['service_id'].isin(valid_service_ids)]
#trips2routes_df = trips2routes_df[["service_id", "route_id", "shape_id", "trip_headsign", "direction_id", "trips_bkk_ref"]]
#trips2routes_df = trips2routes_df[trips2routes_df["trips_bkk_ref"].notna()]
#trips2routes_df = trips2routes_df.groupby(["service_id", "route_id", "shape_id", "trip_headsign", "direction_id"]).count().reset_index()
#trips2routes_df = trips2routes_df.rename(columns={"shape_id" : "main_shape_id"})
#trips2routes_df

In [655]:
trips2routes_df = trips_df[trips_df['service_id'].isin(valid_service_ids)]
trips2routes_df = trips2routes_df[["service_id", "route_id", "shape_id", "trip_headsign", "direction_id", "trips_bkk_ref"]]
#trips2routes_df = trips2routes_df[trips2routes_df["trips_bkk_ref"].notna()]
trips2routes_df = trips2routes_df.groupby(["route_id", "shape_id", "trip_headsign", "direction_id"]).count().reset_index()
trips2routes_df =trips2routes_df.sort_values(by=['route_id', 'direction_id', 'trips_bkk_ref', 'service_id'], ascending=[True, True, False, False])
trips2routes_df = trips2routes_df.drop_duplicates(subset=['route_id', 'direction_id'])
trips2routes_df = trips2routes_df.rename(columns={"shape_id" : "main_shape_id"})
trips2routes_df

Unnamed: 0,route_id,main_shape_id,trip_headsign,direction_id,service_id,trips_bkk_ref
0,0050,R952,"Rákospalota, Kossuth utca",0,112,112
2,0050,U961,Pasaréti tér,1,114,114
3,0070,S704,"Újpalota, Nyírpalota út",0,145,145
4,0070,S705,Albertfalva vasútállomás,1,145,145
5,0071,S710,Bosnyák tér,0,127,127
...,...,...,...,...,...,...
1034,VP101,X064,"Óbuda, Bogdáni út",1,486,486
1035,VP42,X162,"Kispest, Tulipán utca",0,114,114
1036,VP42,X163,Határ út M,1,115,115
1038,VP61,X017,Hűvösvölgy,0,570,570


In [656]:
trips2routes_df[80:120]

Unnamed: 0,route_id,main_shape_id,trip_headsign,direction_id,service_id,trips_bkk_ref
110,540,M124,Boráros tér H,1,27,0
119,550,M129,"Gyál, Vecsési út",0,39,39
120,550,M130,Boráros tér H,1,41,0
127,570,2375,Hűvösvölgy,0,73,73
129,580,H320,Balatoni út (Háros utca),0,81,81
130,580,U741,Savoya Park,1,85,85
131,630,N016,"Nagykovácsi, Tisza István tér",0,81,81
132,630,N017,Hűvösvölgy,1,81,81
133,640,R803,"Solymár, Auchan áruház",0,14,14
134,640,R804,Hűvösvölgy,1,13,13


In [657]:
trips2routes_df.sort_values("trips_bkk_ref", ascending=True)

Unnamed: 0,route_id,main_shape_id,trip_headsign,direction_id,service_id,trips_bkk_ref
58,0310,C595,"Árpádföld, Bekecs utca",0,101,0
59,0310,C597,Örs vezér tere M+H,1,104,0
925,9370,C123,Máramaros út,0,1,0
943,9500,H075,Határ út M,1,4,0
98,0450,A067,Örs vezér tere M+H,1,88,0
...,...,...,...,...,...,...
797,5100,1110,Mexikói út M,0,393,393
1034,VP101,X064,"Óbuda, Bogdáni út",1,486,486
1033,VP101,X034,Lehel utca/Róbert Károly körút,0,490,490
1038,VP61,X017,Hűvösvölgy,0,570,570


In [658]:
trips2routes_df[trips2routes_df["route_id"] == "0450"]

Unnamed: 0,route_id,main_shape_id,trip_headsign,direction_id,service_id,trips_bkk_ref
97,450,A066,"Cinkota, Lassú utca",0,86,86
98,450,A067,Örs vezér tere M+H,1,88,0


In [659]:
latest_routes_df = pd.merge(
    #routes_txt[["route_id", "route_long_name" ,"route_desc"]],
    routes_txt,
    trips2routes_df[["route_id", "main_shape_id", "trip_headsign", "direction_id"]],
    on="route_id",
    how="inner",)

In [660]:
latest_routes_df

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color,main_shape_id,trip_headsign,direction_id
0,MP52,BKK,M2,,Széll Kálmán tér M / Örs vezér tere M+H,3,1188FF,000000,V040,Örs vezér tere M+H,0
1,MP52,BKK,M2,,Széll Kálmán tér M / Örs vezér tere M+H,3,1188FF,000000,V041,Széll Kálmán tér M,1
2,MP525,BKK,M2E,,Széll Kálmán tér M / Örs vezér tere M+H,3,1188FF,000000,V038,Örs vezér tere M+H,0
3,MP525,BKK,M2E,,Széll Kálmán tér M / Örs vezér tere M+H,3,1188FF,000000,V039,Széll Kálmán tér M,1
4,VP01,BKK,1V,,Bécsi út (Vörösvári út) / Lehel utca/Róbert Ká...,3,1188FF,000000,X032,Lehel utca/Róbert Károly körút,0
...,...,...,...,...,...,...,...,...,...,...,...
608,9961,BKK,996A,,Cinkotai autóbuszgarázs / Újpest-Központ M,3,333333,FFFFFF,C182,Újpest-Központ M,1
609,9980,BKK,998,,"Rákoskeresztúr, városközpont / Rákoskeresztúr,...",3,333333,FFFFFF,M116,"Rákoskeresztúr, városközpont",0
610,9980,BKK,998,,"Rákoskeresztúr, városközpont / Rákoskeresztúr,...",3,333333,FFFFFF,M118,Cinkotai autóbuszgarázs,1
611,9990,BKK,999,,Határ út M / Dél-pesti autóbuszgarázs,3,333333,FFFFFF,Q605,Dél-pesti autóbuszgarázs,0


In [661]:
latest_routes_df[latest_routes_df["route_id"] == "3060"]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color,main_shape_id,trip_headsign,direction_id
427,3060,BKK,6,,Széll Kálmán tér M / Móricz Zsigmond körtér,0,FFEE00,0,B693,Móricz Zsigmond körtér,0
428,3060,BKK,6,,Széll Kálmán tér M / Móricz Zsigmond körtér,0,FFEE00,0,B692,Széll Kálmán tér M,1


In [662]:
routes_df

Unnamed: 0,route_id,agency_id,route_short_name,route_type,route_color,route_text_color


In [663]:
def update_routes_df(routes_df, latest_routes_df):
    # Use relevant columns, without route_desc
    cols_to_use = [col for col in routes_df.columns]

    # Select new rows - rows whats route_id is not in routes_df 
    new_routes = latest_routes_df[~latest_routes_df["route_id"].isin(routes_df["route_id"])][cols_to_use]
    
    # Concatenate new routes
    updated_routes_df = pd.concat([routes_df, new_routes], ignore_index=True)

    # Check for duplicates
    #duplicates = updated_routes_df[updated_routes_df.duplicated(subset="route_id", keep=False)]
    duplicates = updated_routes_df[updated_routes_df.groupby("route_id")["route_id"].transform("count") > 2]

    if not duplicates.empty:
        print(f"Warning: There are {duplicates['route_id'].nunique()} duplicated route_id(s) in routes_df!")
        print("Duplicated route_id(s):")
        print(duplicates['route_id'].unique())
    else:
        print("No duplicate route_id found in routes_df.")

    return updated_routes_df

def save_routes(routes_df, data_folder):
    _, routes_df_path, _, _, _, _ = define_paths(data_folder=data_folder)
    routes_df.to_csv(routes_df_path, index=False)
    print(f"routes_df saved to {routes_df_path}")

In [664]:
updated_routes_df = update_routes_df(routes_df, latest_routes_df)
save_routes(updated_routes_df, data_folder)

No duplicate route_id found in routes_df.
routes_df saved to ../data/processed/routes.csv


In [665]:
updated_routes_df

Unnamed: 0,route_id,agency_id,route_short_name,route_type,route_color,route_text_color
0,MP52,BKK,M2,3,1188FF,000000
1,MP52,BKK,M2,3,1188FF,000000
2,MP525,BKK,M2E,3,1188FF,000000
3,MP525,BKK,M2E,3,1188FF,000000
4,VP01,BKK,1V,3,1188FF,000000
...,...,...,...,...,...,...
608,9961,BKK,996A,3,333333,FFFFFF
609,9980,BKK,998,3,333333,FFFFFF
610,9980,BKK,998,3,333333,FFFFFF
611,9990,BKK,999,3,333333,FFFFFF


In [666]:
updated_routes_df[updated_routes_df["route_id"] == "3190"]

Unnamed: 0,route_id,agency_id,route_short_name,route_type,route_color,route_text_color
437,3190,BKK,19,0,FFEE00,0
438,3190,BKK,19,0,FFEE00,0


## Update route_versions_df

In [667]:
route_versions_df

Unnamed: 0,version_id,route_id,direction_id,route_long_name,route_desc,valid_from,valid_to,main_shape_id,trip_headsign,parent_version_id,note


In [668]:
def version_exists(current_versions, row):
    return (
        ((current_versions["route_id"] == row["route_id"]) &
         (current_versions["direction_id"] == row["direction_id"]) &
         (current_versions["main_shape_id"] == row["main_shape_id"]) &
         (current_versions["trip_headsign"] == row["trip_headsign"]))
        .any()
    )

def update_route_versions(route_versions_df, latest_routes_df, date):
    route_versions_copy_df = route_versions_df.copy()
    # version_id starting point
    START_VERSION_ID = 100_000

    # If the file is empty
    if route_versions_df.empty:
        next_version_id = START_VERSION_ID
    else:
        next_version_id = route_versions_df["version_id"].max() + 1

    # Prepare new versions
    #trips_grouped = trips_txt.groupby(['route_id', 'shape_id', 'trip_headsign', 'direction_id']).count()
    #trips_grouped = trips_grouped.sort_values(by=['route_id', 'service_id'], ascending=[True, False])
    #trips_grouped = trips_grouped.groupby('route_id').head(2).sort_values(by=['route_id', 'direction_id']).reset_index()
    #trips_grouped = trips_grouped[["route_id", "shape_id", "trip_headsign", "direction_id"]]

    # Create a new versions dataframe
    #new_versions_df = pd.merge(trips_grouped, routes_txt[["route_id", "route_long_name", "route_desc"]], on="route_id")
    new_versions_df = latest_routes_df.copy()[["route_id", "main_shape_id", "trip_headsign", "direction_id", "route_desc"]]
    new_versions_df["valid_from"] = pd.to_datetime(date)  # az adott GTFS snapshot dátuma
    new_versions_df["valid_to"] = pd.NaT
    new_versions_df["parent_version_id"] = np.nan
    new_versions_df["note"] = np.nan

    # Define the current versions
    current_versions = route_versions_df[route_versions_df["valid_to"].isna()]

    # Let only the new versions
    ##new_versions_filtered = new_versions_df[~new_versions_df.apply(version_exists, axis=1)].copy()
    new_versions_filtered = new_versions_df[~new_versions_df.apply(lambda row: version_exists(row, current_versions), axis=1)].copy()

    # Update the previous versions valid_to date
    for _, row in new_versions_filtered.iterrows():
        mask = (
            (route_versions_df["route_id"] == row["route_id"]) &
            (route_versions_df["valid_to"].isna())
        )
        route_versions_copy_df.loc[mask, "valid_to"] = row["valid_from"] - pd.Timedelta(days=1)

    new_versions_filtered["version_id"] = range(next_version_id, next_version_id + len(new_versions_filtered))

    # Concat
    extended_route_versions_df = pd.concat([route_versions_copy_df, new_versions_filtered], ignore_index=True)

    return extended_route_versions_df

def save_route_versions(route_versions_df, data_folder):
    _, _, route_versions_df_path, _, _, _ = define_paths(data_folder=data_folder)
    route_versions_df.to_csv(route_versions_df_path, index=False)
    print(f"routes_df saved to {route_versions_df_path}")

In [669]:
extended_route_versions_df = update_route_versions(route_versions_df, latest_routes_df, date)
save_route_versions(extended_route_versions_df, data_folder)
extended_route_versions_df

routes_df saved to ../data/processed/route_versions.csv


Unnamed: 0,version_id,route_id,direction_id,route_long_name,route_desc,valid_from,valid_to,main_shape_id,trip_headsign,parent_version_id,note
0,100000,MP52,0,,Széll Kálmán tér M / Örs vezér tere M+H,2013-10-18,NaT,V040,Örs vezér tere M+H,,
1,100001,MP52,1,,Széll Kálmán tér M / Örs vezér tere M+H,2013-10-18,NaT,V041,Széll Kálmán tér M,,
2,100002,MP525,0,,Széll Kálmán tér M / Örs vezér tere M+H,2013-10-18,NaT,V038,Örs vezér tere M+H,,
3,100003,MP525,1,,Széll Kálmán tér M / Örs vezér tere M+H,2013-10-18,NaT,V039,Széll Kálmán tér M,,
4,100004,VP01,0,,Bécsi út (Vörösvári út) / Lehel utca/Róbert Ká...,2013-10-18,NaT,X032,Lehel utca/Róbert Károly körút,,
...,...,...,...,...,...,...,...,...,...,...,...
608,100608,9961,1,,Cinkotai autóbuszgarázs / Újpest-Központ M,2013-10-18,NaT,C182,Újpest-Központ M,,
609,100609,9980,0,,"Rákoskeresztúr, városközpont / Rákoskeresztúr,...",2013-10-18,NaT,M116,"Rákoskeresztúr, városközpont",,
610,100610,9980,1,,"Rákoskeresztúr, városközpont / Rákoskeresztúr,...",2013-10-18,NaT,M118,Cinkotai autóbuszgarázs,,
611,100611,9990,0,,Határ út M / Dél-pesti autóbuszgarázs,2013-10-18,NaT,Q605,Dél-pesti autóbuszgarázs,,


In [670]:
extended_route_versions_df.groupby("route_id").count()["version_id"].to_dict()

{'0050': 2,
 '0070': 2,
 '0071': 2,
 '0075': 2,
 '0078': 1,
 '0080': 2,
 '0090': 2,
 '0110': 2,
 '0130': 2,
 '0131': 2,
 '0150': 2,
 '0160': 2,
 '0161': 2,
 '0205': 2,
 '0210': 2,
 '0211': 2,
 '0220': 2,
 '0221': 2,
 '0230': 2,
 '0235': 2,
 '0250': 2,
 '0260': 2,
 '0270': 2,
 '0290': 2,
 '0300': 2,
 '0301': 2,
 '0310': 2,
 '0320': 2,
 '0330': 2,
 '0340': 2,
 '0350': 2,
 '0360': 2,
 '0380': 2,
 '0390': 2,
 '0400': 2,
 '0405': 2,
 '0440': 2,
 '0450': 2,
 '0460': 2,
 '0530': 2,
 '0540': 2,
 '0550': 2,
 '0570': 1,
 '0580': 2,
 '0630': 2,
 '0640': 2,
 '0641': 2,
 '0650': 2,
 '0660': 2,
 '0665': 2,
 '0670': 2,
 '0680': 2,
 '0710': 2,
 '0845': 2,
 '0850': 2,
 '0855': 2,
 '0858': 1,
 '0860': 2,
 '0870': 2,
 '0871': 2,
 '0880': 2,
 '0895': 2,
 '0910': 2,
 '0920': 2,
 '0930': 2,
 '0931': 2,
 '0945': 2,
 '0950': 2,
 '0960': 2,
 '0975': 2,
 '0980': 2,
 '0985': 2,
 '0990': 2,
 '1000': 1,
 '1020': 2,
 '1030': 2,
 '1040': 2,
 '1041': 2,
 '1050': 2,
 '1060': 2,
 '1075': 2,
 '1110': 2,
 '1120': 2,
 '11

In [671]:
extended_route_versions_df[extended_route_versions_df["route_id"] == "3060"]

Unnamed: 0,version_id,route_id,direction_id,route_long_name,route_desc,valid_from,valid_to,main_shape_id,trip_headsign,parent_version_id,note
427,100427,3060,0,,Széll Kálmán tér M / Móricz Zsigmond körtér,2013-10-18,NaT,B693,Móricz Zsigmond körtér,,
428,100428,3060,1,,Széll Kálmán tér M / Móricz Zsigmond körtér,2013-10-18,NaT,B692,Széll Kálmán tér M,,


## Update shape_variants_df

In [672]:
calendar_dates_df['date'] = pd.to_datetime(calendar_dates_df['date'], format="%Y%m%d")

extra_service_ids = calendar_dates_df[
    (calendar_dates_df['date'] == date_dt)
][["service_id", "exception_type"]]
extra_service_ids["exception_type"] = extra_service_ids["exception_type"].astype(int)

In [673]:
extra_service_ids

Unnamed: 0,service_id,exception_type
1888,A98178AHPCsZ-041,2
3632,B01134APPCsZ-021,2
3682,B01188AHPKM-0011,2
4127,B01326RA1PPHZS-011,2
4131,B01331RA1PPPG-0011,2


In [674]:
valid_routes = extended_route_versions_df[extended_route_versions_df["valid_to"].isna()][["version_id", "route_id", "direction_id", "main_shape_id"]]
valid_routes

Unnamed: 0,version_id,route_id,direction_id,main_shape_id
0,100000,MP52,0,V040
1,100001,MP52,1,V041
2,100002,MP525,0,V038
3,100003,MP525,1,V039
4,100004,VP01,0,X032
...,...,...,...,...
608,100608,9961,1,C182
609,100609,9980,0,M116
610,100610,9980,1,M118
611,100611,9990,0,Q605


In [675]:
trips2routes_df2 = trips_df[trips_df['service_id'].isin(valid_service_ids)]
trips2routes_df2 = trips2routes_df2[["service_id", "route_id", "shape_id", "trip_headsign", "direction_id"]]
trips2routes_df2 = pd.merge(trips2routes_df2, extra_service_ids, how="left", on="service_id")
trips2routes_df2 = trips2routes_df2.groupby(["route_id", "shape_id", "trip_headsign", "direction_id"]).agg('first').reset_index()
trips2routes_df2 = trips2routes_df2[["route_id", "shape_id", "trip_headsign", "direction_id", "exception_type"]]

# This will actually convert 2.0 to "2" and keep NaN as NaN
trips2routes_df2["exception_type"] = trips2routes_df2["exception_type"].apply(
    lambda x: str(int(x)) if pd.notna(x) else x
)

trips2routes_df2

Unnamed: 0,route_id,shape_id,trip_headsign,direction_id,exception_type
0,0050,R952,"Rákospalota, Kossuth utca",0,
1,0050,U197,"Rákospalota, Kossuth utca",0,
2,0050,U961,Pasaréti tér,1,
3,0070,S704,"Újpalota, Nyírpalota út",0,
4,0070,S705,Albertfalva vasútállomás,1,
...,...,...,...,...,...
1034,VP101,X064,"Óbuda, Bogdáni út",1,2
1035,VP42,X162,"Kispest, Tulipán utca",0,
1036,VP42,X163,Határ út M,1,
1037,VP61,X016,Széll Kálmán tér M,1,2


In [676]:
merged_df = pd.merge(valid_routes, trips2routes_df2, on=["route_id", "direction_id"])
merged_df["main_shape_id"] = (merged_df["main_shape_id"] == merged_df["shape_id"]).astype(int)
merged_df = merged_df.rename(columns={"main_shape_id" : "is_main"})
merged_df

Unnamed: 0,version_id,route_id,direction_id,is_main,shape_id,trip_headsign,exception_type
0,100000,MP52,0,1,V040,Örs vezér tere M+H,
1,100000,MP52,0,0,X067,Örs vezér tere M+H,
2,100001,MP52,1,1,V041,Széll Kálmán tér M,
3,100001,MP52,1,0,X068,Széll Kálmán tér M,
4,100002,MP525,0,1,V038,Örs vezér tere M+H,
...,...,...,...,...,...,...,...
1034,100609,9980,0,1,M116,"Rákoskeresztúr, városközpont",
1035,100610,9980,1,0,M117,"Rákoskeresztúr, városközpont",
1036,100610,9980,1,1,M118,Cinkotai autóbuszgarázs,
1037,100611,9990,0,1,Q605,Dél-pesti autóbuszgarázs,


In [677]:
shape_variants_df

Unnamed: 0,shape_variant_id,version_id,shape_id,trip_headsign,is_main,note


In [678]:
shape_variant_activations_df

Unnamed: 0,date,shape_variant_id,exception_type


In [679]:
def save_shape_variants_df(shape_variants_df, data_folder):
    _, _, _, shape_variants_df_path, _, _ = define_paths(data_folder=data_folder)
    shape_variants_df.to_csv(shape_variants_df_path, index=False)
    print(f"shape_variants_df saved to {shape_variants_df_path}")

def save_shape_variant_activations_df(shape_variant_activations_df, data_folder):
    _, _, _, _, shape_variant_activations_df_path, _ = define_paths(data_folder=data_folder)
    shape_variant_activations_df.to_csv(shape_variant_activations_df_path, index=False)
    print(f"shape_variant_activations_df saved to {shape_variant_activations_df_path}")

In [680]:
# Start assigning new shape_variant_ids from here
next_id = 100000 if shape_variants_df.empty else shape_variants_df['shape_variant_id'].max() + 1

# Rebuild index if necessary
merged_df = merged_df.reset_index()

# Build lookup for existing variants
variant_cols = ["version_id", "shape_id", "trip_headsign", "is_main"]
existing_variants = shape_variants_df.set_index(variant_cols)['shape_variant_id'].to_dict()

# Lists to collect new rows
new_shape_variants = []
new_activations = []

for i, row in merged_df.iterrows():
    key = tuple(row[col] for col in variant_cols)

    # Lookup or create shape_variant_id
    if key in existing_variants:
        shape_variant_id = existing_variants[key]
    else:
        shape_variant_id = next_id
        next_id += 1

        new_shape_variants.append({
            "shape_variant_id": shape_variant_id,
            "version_id": row["version_id"],
            "shape_id": row["shape_id"],
            "trip_headsign": row["trip_headsign"],
            "is_main": row["is_main"],
            "note": None  # or "" as needed
        })

        existing_variants[key] = shape_variant_id

    # Add activation using row index as "date"
    new_activations.append({
        "date": date,  # Use original row index as the date
        "shape_variant_id": shape_variant_id,
        "exception_type": row["exception_type"]
    })

# Append new variants
if new_shape_variants:
    shape_variants_df = pd.concat([shape_variants_df, pd.DataFrame(new_shape_variants)], ignore_index=True)

# Append activations
shape_variant_activations_df = pd.concat(
    [shape_variant_activations_df, pd.DataFrame(new_activations)],
    ignore_index=True
)


save_shape_variants_df(shape_variants_df, data_folder)
save_shape_variant_activations_df(shape_variant_activations_df, data_folder)

shape_variants_df saved to ../data/processed/shape_variants.csv
shape_variant_activations_df saved to ../data/processed/shape_variant_activations.csv


In [681]:
shape_variants_df

Unnamed: 0,shape_variant_id,version_id,shape_id,trip_headsign,is_main,note
0,100000,100000,V040,Örs vezér tere M+H,1,
1,100001,100000,X067,Örs vezér tere M+H,0,
2,100002,100001,V041,Széll Kálmán tér M,1,
3,100003,100001,X068,Széll Kálmán tér M,0,
4,100004,100002,V038,Örs vezér tere M+H,1,
...,...,...,...,...,...,...
1034,101034,100609,M116,"Rákoskeresztúr, városközpont",1,
1035,101035,100610,M117,"Rákoskeresztúr, városközpont",0,
1036,101036,100610,M118,Cinkotai autóbuszgarázs,1,
1037,101037,100611,Q605,Dél-pesti autóbuszgarázs,1,


In [682]:
shape_variant_activations_df

Unnamed: 0,date,shape_variant_id,exception_type
0,20131018,100000,
1,20131018,100001,
2,20131018,100002,
3,20131018,100003,
4,20131018,100004,
...,...,...,...
1034,20131018,101034,
1035,20131018,101035,
1036,20131018,101036,
1037,20131018,101037,


## OLD Update route_versions_df

In [None]:
def version_exists(current_versions, row):
    return (
        ((current_versions["route_id"] == row["route_id"]) &
         (current_versions["direction_id"] == row["direction_id"]) &
         (current_versions["shape_id"] == row["shape_id"]) &
         (current_versions["trip_headsign"] == row["trip_headsign"]))
        .any()
    )

def update_route_versions(route_versions_df, trips_txt, routes_txt, date):
    route_versions_copy_df = route_versions_df.copy()
    # version_id starting point
    START_VERSION_ID = 100_000

    # If the file is empty
    if route_versions_df.empty:
        next_version_id = START_VERSION_ID
    else:
        next_version_id = route_versions_df["version_id"].max() + 1

    # Prepare new versions
    trips_grouped = trips_txt.groupby(['route_id', 'shape_id', 'trip_headsign', 'direction_id']).count()
    trips_grouped = trips_grouped.sort_values(by=['route_id', 'service_id'], ascending=[True, False])
    trips_grouped = trips_grouped.groupby('route_id').head(2).sort_values(by=['route_id', 'direction_id']).reset_index()
    trips_grouped = trips_grouped[["route_id", "shape_id", "trip_headsign", "direction_id"]]

    # Create a new versions dataframe
    new_versions_df = pd.merge(trips_grouped, routes_txt[["route_id", "route_long_name", "route_desc"]], on="route_id")
    new_versions_df["valid_from"] = pd.to_datetime(date)  # az adott GTFS snapshot dátuma
    new_versions_df["valid_to"] = pd.NaT
    new_versions_df["parent_version_id"] = np.nan
    new_versions_df["note"] = np.nan

    # Define the current versions
    current_versions = route_versions_df[route_versions_df["valid_to"].isna()]

    # Let only the new versions
    ##new_versions_filtered = new_versions_df[~new_versions_df.apply(version_exists, axis=1)].copy()
    new_versions_filtered = new_versions_df[~new_versions_df.apply(lambda row: version_exists(row, current_versions), axis=1)].copy()

    # Update the previous versions valid_to date
    for _, row in new_versions_filtered.iterrows():
        mask = (
            (route_versions_df["route_id"] == row["route_id"]) &
            (route_versions_df["valid_to"].isna())
        )
        route_versions_copy_df.loc[mask, "valid_to"] = row["valid_from"] - pd.Timedelta(days=1)

    new_versions_filtered["version_id"] = range(next_version_id, next_version_id + len(new_versions_filtered))

    # Concat
    extended_route_versions_df = pd.concat([route_versions_copy_df, new_versions_filtered], ignore_index=True)

    return extended_route_versions_df

def save_route_versions(route_versions_df, data_folder):
    _, route_versions_df_path, _, _, _ = define_paths(data_folder=data_folder)
    route_versions_df.to_csv(route_versions_df_path, index=False)
    print(f"routes_df saved to {route_versions_df_path}")

In [None]:
extended_route_versions_df = update_route_versions(route_versions_df, trips_txt, routes_txt, date)
save_route_versions(extended_route_versions_df, data_folder)
extended_route_versions_df.head(4)

routes_df saved to ../data/processed/route_versions.csv


Unnamed: 0,version_id,route_id,direction_id,route_long_name,route_desc,valid_from,valid_to,shape_id,trip_headsign,parent_version_id,note
0,100000,50,0,,"Pasaréti tér / Rákospalota, Kossuth utca",2013-10-18,NaT,R952,"Rákospalota, Kossuth utca",,
1,100001,50,1,,"Pasaréti tér / Rákospalota, Kossuth utca",2013-10-18,NaT,U961,Pasaréti tér,,
2,100002,70,0,,"Albertfalva vasútállomás / Újpalota, Nyírpalot...",2013-10-18,NaT,S704,"Újpalota, Nyírpalota út",,
3,100003,70,1,,"Albertfalva vasútállomás / Újpalota, Nyírpalot...",2013-10-18,NaT,S705,Albertfalva vasútállomás,,
