# Structure

In [8]:
import pandas as pd
import os

# Raw feed snapshot to load: the input files live under <date_folder>/<date>/.
# NOTE(review): the file names look like a GTFS feed — confirm.
date_folder = '../data/raw/'
date = '20131018'

# Build the path of each input table inside the snapshot directory.
routes_path, trips_path, shapes_path = (
    os.path.join(date_folder, date, file_name)
    for file_name in ('routes.txt', 'trips.txt', 'shapes.txt')
)

# Parse the three tables with pandas' default CSV settings (same order as the paths).
routes_txt, trips_txt, shapes_txt = map(
    pd.read_csv, (routes_path, trips_path, shapes_path)
)

# Report (rows, columns) of every table as a quick sanity check.
for _label, _frame in (
    ("Routes:", routes_txt),
    ("Trips:", trips_txt),
    ("Shapes:", shapes_txt),
):
    print(_label, _frame.shape)


Routes: (320, 8)
Trips: (219139, 9)
Shapes: (210973, 6)


In [None]:
data_folder = '../data/processed/'

routes_df_path = os.path.join(data_folder, 'routes.csv')
route_versions_df_path = os.path.join(data_folder, 'route_versions.csv')
temporary_changes_df_path = os.path.join(data_folder, 'temporary_changes.csv')


def load_or_create_csv(path, columns):
    """Load the CSV at ``path``; if it does not exist yet, create it empty.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    columns : list of str
        Column names of the header-only file written when ``path`` is missing.

    Returns
    -------
    pandas.DataFrame
        The contents of the file, or an empty frame with ``columns`` when the
        file had to be created.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        # First run for this table: make sure the target directory exists
        # (to_csv does not create it) and persist a header-only file.
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        empty_df = pd.DataFrame(columns=columns)
        empty_df.to_csv(path, index=False)
        return empty_df


# Each table is loaded independently, so one missing file never causes the
# other, already populated files to be replaced by empty frames.
routes_df = load_or_create_csv(routes_df_path, [
    "route_id", "agency_id", "route_short_name", "route_long_name", "route_type", "route_color", "route_text_color"
])

route_versions_df = load_or_create_csv(route_versions_df_path, [
    "version_id", "route_id", "valid_from", "valid_to", "route_desc", "route_long_name", "route_short_name", "parent_route_id"
])

temporary_changes_df = load_or_create_csv(temporary_changes_df_path, [
    "detour_id", "route_id", "start_date", "end_date", "description", "affects_version_id"
])


In [10]:
# Show the processed routes table as loaded (or newly initialised as empty) above.
routes_df

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color


In [15]:
# List the distinct long names present in the raw routes table (NaN included).
routes_txt["route_long_name"].unique()

array([nan, 'Gödöllői járat', 'Cinkotai járat', 'Csömöri járat',
       'Ráckevei járat', 'Tököli járat', 'Dunaharaszti járat',
       'Csepeli járat', 'Szentendrei járat', 'Békásmegyeri járat'],
      dtype=object)

In [19]:
# Keep every column of the raw routes table except route_desc.
keep_column = routes_txt.columns != "route_desc"
cols_to_use = routes_txt.columns[keep_column].tolist()

# New rows are those whose route_id is not yet present in routes_df.
already_known = routes_txt["route_id"].isin(routes_df["route_id"])
new_routes = routes_txt.loc[~already_known, cols_to_use]

# Append the new routes below the existing ones and renumber the index.
routes_df = pd.concat([routes_df, new_routes], ignore_index=True)

# Sanity check: collect every row whose route_id occurs more than once.
repeated_id = routes_df.duplicated(subset="route_id", keep=False)
duplicates = routes_df[repeated_id]

if duplicates.empty:
    print("No duplicate route_id found in routes_df.")
else:
    print(f"Warning: There are {duplicates['route_id'].nunique()} duplicated route_id(s) in routes_df!")
    print("Duplicated route_id(s):")
    print(duplicates['route_id'].unique())



No duplicate route_id found in routes_df.


In [20]:
# Display routes_df again, now that the new routes have been appended.
routes_df

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color
0,MOZG,BKK,MOZG,,3,1188FF,000000
1,MP52,BKK,M2,,3,1188FF,000000
2,MP525,BKK,M2E,,3,1188FF,000000
3,VP01,BKK,1V,,3,1188FF,000000
4,VP02,BKK,2V,,3,1188FF,000000
...,...,...,...,...,...,...,...
315,9940,BKK,994,,3,333333,FFFFFF
316,9960,BKK,996,,3,333333,FFFFFF
317,9961,BKK,996A,,3,333333,FFFFFF
318,9980,BKK,998,,3,333333,FFFFFF
