# Structure

In [1]:
import pandas as pd
import numpy as np
import os

date_folder = '../data/raw/'
date = '20131018'

# All GTFS tables of one snapshot are read from <date_folder>/<date>/.
snapshot_dir = os.path.join(date_folder, date)
routes_path = os.path.join(snapshot_dir, 'routes.txt')
trips_path = os.path.join(snapshot_dir, 'trips.txt')
shapes_path = os.path.join(snapshot_dir, 'shapes.txt')

routes_txt, trips_txt, shapes_txt = (
    pd.read_csv(table_path)
    for table_path in (routes_path, trips_path, shapes_path)
)

# Print (rows, columns) per table as a quick sanity check of the load.
for label, table in (("Routes:", routes_txt), ("Trips:", trips_txt), ("Shapes:", shapes_txt)):
    print(label, table.shape)


Routes: (320, 8)
Trips: (219139, 9)
Shapes: (210973, 6)


In [2]:
data_folder = '../data/processed/'

routes_df_path = os.path.join(data_folder, 'routes.csv')
route_versions_df_path = os.path.join(data_folder, 'route_versions.csv')
temporary_changes_df_path = os.path.join(data_folder, 'temporary_changes.csv')


def load_or_create_csv(path, columns, date_columns=()):
    """Read the table stored at ``path``; create an empty one if it is absent.

    Each table is handled on its own, so a single missing file can never
    cause the other, already populated CSV files to be replaced by empty ones.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    columns : list of str
        Column names used for the empty table when the file has to be created.
    date_columns : iterable of str, optional
        Columns parsed as datetimes on load and typed as datetime in the
        empty table.

    Returns
    -------
    pandas.DataFrame
        The loaded table, or the newly created (and saved) empty table.
    """
    date_columns = list(date_columns)
    read_kwargs = {"parse_dates": date_columns} if date_columns else {}
    try:
        return pd.read_csv(path, **read_kwargs)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Missing or zero-byte file: nothing to lose, fall through and create it.
        pass

    empty_df = pd.DataFrame(columns=columns)
    # Date columns must be datetime-typed even while the table is still empty.
    for column in date_columns:
        empty_df[column] = pd.to_datetime(empty_df[column])

    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    empty_df.to_csv(path, index=False)
    return empty_df


routes_df = load_or_create_csv(
    routes_df_path,
    ["route_id", "agency_id", "route_short_name", "route_type", "route_color", "route_text_color"],
)

route_versions_df = load_or_create_csv(
    route_versions_df_path,
    [
        "version_id", "route_id", "direction_id", "route_long_name", "route_desc",
        "valid_from", "valid_to", "shape_id", "trip_headsign",
        "parent_version_id", "note",
    ],
    date_columns=["valid_from", "valid_to"],
)

temporary_changes_df = load_or_create_csv(
    temporary_changes_df_path,
    ["detour_id", "route_id", "start_date", "end_date", "affects_version_id", "description"],
)


In [3]:
# Inspect the routes table as loaded (or newly created) by the previous cell.
routes_df

Unnamed: 0,route_id,agency_id,route_short_name,route_type,route_color,route_text_color


In [4]:
# Distinct long names present in the snapshot's routes table.
routes_txt["route_long_name"].unique()

array([nan, 'Gödöllői járat', 'Cinkotai járat', 'Csömöri járat',
       'Ráckevei járat', 'Tököli járat', 'Dunaharaszti járat',
       'Csepeli járat', 'Szentendrei járat', 'Békásmegyeri járat'],
      dtype=object)

## routes_df

In [5]:
# Keep every column of the snapshot's routes table except route_desc.
cols_to_use = routes_txt.columns[routes_txt.columns != "route_desc"].tolist()

# New routes are the rows whose route_id is not yet present in routes_df.
is_known_route = routes_txt["route_id"].isin(routes_df["route_id"])
new_routes = routes_txt.loc[~is_known_route, cols_to_use]

# Append the new routes below the ones already stored.
routes_df = pd.concat([routes_df, new_routes], ignore_index=True)

# Sanity check: every route_id should occur exactly once.
is_duplicated_id = routes_df["route_id"].duplicated(keep=False)
duplicates = routes_df[is_duplicated_id]

if duplicates.empty:
    print("No duplicate route_id found in routes_df.")
else:
    print(f"Warning: There are {duplicates['route_id'].nunique()} duplicated route_id(s) in routes_df!")
    print("Duplicated route_id(s):")
    print(duplicates['route_id'].unique())



No duplicate route_id found in routes_df.


In [6]:
# Preview the first four rows of routes_df.
routes_df.head(4)

Unnamed: 0,route_id,agency_id,route_short_name,route_type,route_color,route_text_color,route_long_name
0,MOZG,BKK,MOZG,3,1188FF,0,
1,MP52,BKK,M2,3,1188FF,0,
2,MP525,BKK,M2E,3,1188FF,0,
3,VP01,BKK,1V,3,1188FF,0,


## route_versions_df

In [7]:
# Inspect the route-versions table before update_route_versions is run.
route_versions_df

Unnamed: 0,version_id,route_id,direction_id,route_long_name,route_desc,valid_from,valid_to,shape_id,trip_headsign,parent_version_id,note


In [8]:
# Preview of the most frequent (shape, headsign, direction) variants per route.
variant_keys = ['route_id', 'shape_id', 'trip_headsign', 'direction_id']

# Number of trips per variant (count of non-null values in each remaining column).
trips_per_variant = trips_txt.groupby(variant_keys).count()

# Within each route, put the variants with the most trips first ...
ranked_variants = trips_per_variant.sort_values(
    by=['route_id', 'service_id'], ascending=[True, False]
)

# ... and keep the top two of every route.
top_variants = ranked_variants.groupby('route_id').head(2)

top_variants.sort_values(by=['route_id', 'direction_id']).reset_index()

Unnamed: 0,route_id,shape_id,trip_headsign,direction_id,service_id,trip_id,block_id,wheelchair_accessible,trips_bkk_ref
0,0050,R952,"Rákospalota, Kossuth utca",0,762,762,762,762,762
1,0050,U961,Pasaréti tér,1,770,770,770,770,770
2,0070,S704,"Újpalota, Nyírpalota út",0,1007,1007,1007,1007,1007
3,0070,S705,Albertfalva vasútállomás,1,1007,1007,1007,1007,1007
4,0071,S710,Bosnyák tér,0,1003,1003,1003,1003,1003
...,...,...,...,...,...,...,...,...,...
622,VP101,X064,"Óbuda, Bogdáni út",1,1100,1100,1100,1100,1100
623,VP42,X162,"Kispest, Tulipán utca",0,370,370,370,370,370
624,VP42,X163,Határ út M,1,372,372,372,372,372
625,VP61,X017,Hűvösvölgy,0,1510,1510,1510,1510,1510


In [26]:
def version_exists(current_versions, row):
    """Tell whether ``row`` matches any entry of ``current_versions``.

    Two versions are considered the same when route_id, direction_id,
    shape_id and trip_headsign are all equal.

    Parameters
    ----------
    current_versions : pandas.DataFrame
        Versions to search in.
    row : pandas.Series
        Candidate version, indexable by the four key names.

    Returns
    -------
    bool
        True if at least one entry agrees with ``row`` on all four keys.
    """
    key_columns = ("route_id", "direction_id", "shape_id", "trip_headsign")

    all_keys_equal = None
    for column in key_columns:
        # Plain element-wise equality per key, combined with logical AND.
        same_value = current_versions[column] == row[column]
        all_keys_equal = same_value if all_keys_equal is None else all_keys_equal & same_value

    return all_keys_equal.any()

def update_route_versions(route_versions_df, trips_txt, routes_txt, date):
    """Extend the route-version history with the variants of one GTFS snapshot.

    For every route the two (shape_id, trip_headsign, direction_id) variants
    with the most trips are taken as the route's versions in this snapshot.
    Variants that are not among the currently open versions (``valid_to`` is
    NaT) are appended as new versions valid from ``date``. An open version is
    closed (``valid_to`` = ``date`` minus one day) only when its own
    (route_id, direction_id) received a new version and it is no longer part
    of the snapshot; open versions of an unchanged direction stay open.

    Parameters
    ----------
    route_versions_df : pandas.DataFrame
        Existing version history; it is not modified.
    trips_txt : pandas.DataFrame
        Trips table of the snapshot (needs route_id, shape_id, trip_headsign,
        direction_id and service_id).
    routes_txt : pandas.DataFrame
        Routes table of the snapshot (needs route_id, route_long_name,
        route_desc).
    date : str or datetime-like
        Date of the snapshot, anything ``pd.to_datetime`` accepts.

    Returns
    -------
    pandas.DataFrame
        Copy of the history with superseded versions closed and the new
        versions appended.
    """
    # version_id starting point for an empty history
    START_VERSION_ID = 100_000
    key_columns = ["route_id", "direction_id", "shape_id", "trip_headsign"]

    def row_tuples(frame, columns):
        # Rows as plain tuples, usable as set members for fast lookups.
        return list(frame[columns].itertuples(index=False, name=None))

    updated_versions_df = route_versions_df.copy()
    snapshot_date = pd.to_datetime(date)

    if route_versions_df.empty:
        next_version_id = START_VERSION_ID
    else:
        next_version_id = int(route_versions_df["version_id"].max()) + 1

    # Per route, keep the two variants with the highest trip count.
    trips_grouped = trips_txt.groupby(['route_id', 'shape_id', 'trip_headsign', 'direction_id']).count()
    trips_grouped = trips_grouped.sort_values(by=['route_id', 'service_id'], ascending=[True, False])
    trips_grouped = trips_grouped.groupby('route_id').head(2).sort_values(by=['route_id', 'direction_id']).reset_index()
    trips_grouped = trips_grouped[["route_id", "shape_id", "trip_headsign", "direction_id"]]

    # Candidate versions of this snapshot, enriched with the route texts.
    new_versions_df = pd.merge(trips_grouped, routes_txt[["route_id", "route_long_name", "route_desc"]], on="route_id")
    new_versions_df["valid_from"] = snapshot_date
    new_versions_df["valid_to"] = pd.NaT
    new_versions_df["parent_version_id"] = np.nan
    new_versions_df["note"] = np.nan

    # Keep only candidates that do not already exist as an open version.
    current_versions = route_versions_df[route_versions_df["valid_to"].isna()]
    open_keys = set(row_tuples(current_versions, key_columns))
    snapshot_keys = row_tuples(new_versions_df, key_columns)
    is_new = pd.Series(
        [key not in open_keys for key in snapshot_keys],
        index=new_versions_df.index,
        dtype=bool,
    )
    new_versions_filtered = new_versions_df[is_new].copy()

    # Close the open versions that a new version replaces. Matching on
    # (route_id, direction_id) and skipping versions still present in the
    # snapshot keeps the unchanged direction of a route open.
    if not new_versions_filtered.empty and not updated_versions_df.empty:
        changed_pairs = set(row_tuples(new_versions_filtered, ["route_id", "direction_id"]))
        snapshot_key_set = set(snapshot_keys)
        replaced = pd.Series(
            [
                key[:2] in changed_pairs and key not in snapshot_key_set
                for key in row_tuples(updated_versions_df, key_columns)
            ],
            index=updated_versions_df.index,
            dtype=bool,
        )
        superseded = replaced & updated_versions_df["valid_to"].isna()
        if superseded.any():
            updated_versions_df.loc[superseded, "valid_to"] = snapshot_date - pd.Timedelta(days=1)

    new_versions_filtered["version_id"] = range(next_version_id, next_version_id + len(new_versions_filtered))

    extended_route_versions_df = pd.concat([updated_versions_df, new_versions_filtered], ignore_index=True)

    return extended_route_versions_df

In [27]:
# Build the extended version history for this snapshot. The result is stored
# under a new name; route_versions_df itself is not reassigned in this cell.
extended_route_versions_df = update_route_versions(route_versions_df, trips_txt, routes_txt, date)
extended_route_versions_df

Unnamed: 0,version_id,route_id,direction_id,route_long_name,route_desc,valid_from,valid_to,shape_id,trip_headsign,parent_version_id,note
0,100000,0050,0,,"Pasaréti tér / Rákospalota, Kossuth utca",2013-10-18,NaT,R952,"Rákospalota, Kossuth utca",,
1,100001,0050,1,,"Pasaréti tér / Rákospalota, Kossuth utca",2013-10-18,NaT,U961,Pasaréti tér,,
2,100002,0070,0,,"Albertfalva vasútállomás / Újpalota, Nyírpalot...",2013-10-18,NaT,S704,"Újpalota, Nyírpalota út",,
3,100003,0070,1,,"Albertfalva vasútállomás / Újpalota, Nyírpalot...",2013-10-18,NaT,S705,Albertfalva vasútállomás,,
4,100004,0071,0,,Bornemissza tér / Bosnyák tér,2013-10-18,NaT,S710,Bosnyák tér,,
...,...,...,...,...,...,...,...,...,...,...,...
622,100622,VP101,1,,"Óbuda, Bogdáni út / Lehel utca/Róbert Károly k...",2013-10-18,NaT,X064,"Óbuda, Bogdáni út",,
623,100623,VP42,0,,"Határ út M / Kispest, Tulipán utca",2013-10-18,NaT,X162,"Kispest, Tulipán utca",,
624,100624,VP42,1,,"Határ út M / Kispest, Tulipán utca",2013-10-18,NaT,X163,Határ út M,,
625,100625,VP61,0,,Széll Kálmán tér M / Hűvösvölgy,2013-10-18,NaT,X017,Hűvösvölgy,,


In [31]:
# NOTE(review): the saved output of this cell is populated, yet no visible cell
# assigns the result of update_route_versions back to route_versions_df.
# Confirm the assignment exists so the notebook reproduces on a fresh kernel.
route_versions_df

Unnamed: 0,version_id,route_id,direction_id,route_long_name,route_desc,valid_from,valid_to,shape_id,trip_headsign,parent_version_id,note
0,100000,0050,0,,"Pasaréti tér / Rákospalota, Kossuth utca",2013-10-18,NaT,R952,"Rákospalota, Kossuth utca",,
1,100001,0050,1,,"Pasaréti tér / Rákospalota, Kossuth utca",2013-10-18,NaT,U961,Pasaréti tér,,
2,100002,0070,0,,"Albertfalva vasútállomás / Újpalota, Nyírpalot...",2013-10-18,NaT,S704,"Újpalota, Nyírpalota út",,
3,100003,0070,1,,"Albertfalva vasútállomás / Újpalota, Nyírpalot...",2013-10-18,NaT,S705,Albertfalva vasútállomás,,
4,100004,0071,0,,Bornemissza tér / Bosnyák tér,2013-10-18,NaT,S710,Bosnyák tér,,
...,...,...,...,...,...,...,...,...,...,...,...
622,100622,VP101,1,,"Óbuda, Bogdáni út / Lehel utca/Róbert Károly k...",2013-10-18,NaT,X064,"Óbuda, Bogdáni út",,
623,100623,VP42,0,,"Határ út M / Kispest, Tulipán utca",2013-10-18,NaT,X162,"Kispest, Tulipán utca",,
624,100624,VP42,1,,"Határ út M / Kispest, Tulipán utca",2013-10-18,NaT,X163,Határ út M,,
625,100625,VP61,0,,Széll Kálmán tér M / Hűvösvölgy,2013-10-18,NaT,X017,Hűvösvölgy,,


In [29]:
# Show the stored versions of route 5200 only.
is_route_5200 = route_versions_df["route_id"].eq("5200")
route_versions_df.loc[is_route_5200]

Unnamed: 0,version_id,route_id,direction_id,route_long_name,route_desc,valid_from,valid_to,shape_id,trip_headsign,parent_version_id,note
503,100503,5200,0,,Örs vezér tere M+H / Déli pályaudvar M,2013-10-18,NaT,1112,Örs vezér tere M+H,,
504,100504,5200,1,,Örs vezér tere M+H / Déli pályaudvar M,2013-10-18,NaT,1111,Déli pályaudvar M,,
