# Notebook for extracting railroad traffic from a large GTFS dataset
This notebook is customized for the GTFS Sverige 2 dataset from [Trafiklab.se](https://www.trafiklab.se/api/trafiklab-apis/gtfs-sverige-2/).




In [5]:
import pandas as pd
from datetime import datetime, timedelta

# Folder holding the raw GTFS feed, and folder receiving the filtered feed.
path = '../data/gtfs/sweden (11)'
savepath = '../data/gtfs'

# routes.txt
# route_id is kept as text; route_type must be numeric for the comparison below.
routes_df = pd.read_csv(path + '/routes.txt', dtype={'route_id': str, 'route_type': int})

# Keep routes whose route_type is at most 106 and whose agency is not 275 (SL).
# NOTE(review): presumably the feed uses extended route types where 100-106 are
# rail services - confirm; basic types 0-12 would also pass this filter.
has_train_type = routes_df['route_type'].le(106)
is_not_sl = routes_df['agency_id'].ne(275)
train_routes = routes_df.loc[has_train_type & is_not_sl]

train_routes.to_csv(savepath + '/routes.txt', index=False)

# agency.txt
# Keep only the agencies referenced by at least one of the retained routes.
agency_df = pd.read_csv(path + '/agency.txt')
train_agency_ids = train_routes['agency_id'].unique()
train_agency = agency_df.loc[agency_df['agency_id'].isin(train_agency_ids)]
train_agency.to_csv(savepath + '/agency.txt', index=False)

# trips.txt
# Keep only the trips that run on one of the retained routes. All id columns
# are read as text so they compare reliably with the other GTFS files.
trips_df = pd.read_csv(path + '/trips.txt', dtype={'trip_id': str, 'service_id': str, 'route_id': str})
train_trips = trips_df[trips_df['route_id'].isin(train_routes['route_id'])]
# index=False: without it pandas writes its row index as an extra unnamed
# first column, which is not part of the GTFS trips.txt layout.
train_trips.to_csv(savepath + '/trips.txt', index=False)

# calendar_dates.txt
# GTFS stores dates as YYYYMMDD. Read the column as text and parse it with an
# explicit format: left to inference, read_csv yields integers, and
# pd.to_datetime interprets integers as nanoseconds since the epoch (1970),
# so the date window below would never match anything.
calendar_times_df = pd.read_csv(path + '/calendar_dates.txt', dtype={'service_id': str, 'date': str})
calendar_times_df['date'] = pd.to_datetime(calendar_times_df['date'], format='%Y%m%d')

# Date window: from today (at midnight) through `days_to_add` days ahead, inclusive.
today = pd.Timestamp.today().normalize()
days_to_add = 30
future_date = today + timedelta(days=days_to_add)

# Keep the service dates of the retained trips that fall inside the window.
train_calendar_times = calendar_times_df[
    calendar_times_df['service_id'].isin(train_trips['service_id'])
    & calendar_times_df['date'].between(today, future_date)
]
# date_format restores the YYYYMMDD form; the pandas default would write ISO dates.
train_calendar_times.to_csv(savepath + '/calendar_dates.txt', index=False, date_format='%Y%m%d')

# stop_times.txt
# Keep only the stop events that belong to one of the retained trips.
stop_times_df = pd.read_csv(path + '/stop_times.txt', dtype={'trip_id': str, 'stop_id': str})
on_train_trip = stop_times_df['trip_id'].isin(train_trips['trip_id'])
train_stop_times = stop_times_df.loc[on_train_trip]
train_stop_times.to_csv(savepath + '/stop_times.txt', index=False)

# stops.txt
# Keep only the stops that appear in the retained stop_times rows.
stops_df = pd.read_csv(path + '/stops.txt', dtype={'stop_id': str})
served_by_train = stops_df['stop_id'].isin(train_stop_times['stop_id'])
train_stops = stops_df.loc[served_by_train]
train_stops.to_csv(savepath + '/stops.txt', index=False)

# transfers.txt
transfers_df = pd.read_csv(path + '/transfers.txt', dtype={'from_stop_id': str, 'to_stop_id': str})

# Keep a transfer when either of its ends is a retained stop. A boolean mask is
# used instead of merging with the stops table: a merge would append every
# stops.txt column to the output and list a transfer twice when both of its
# ends are retained stops.
# NOTE(review): a transfer kept because of only one end still references a stop
# that is absent from the written stops.txt - use `&` instead of `|` if both
# ends must exist in the filtered feed.
train_stop_ids = train_stops['stop_id']
touches_train_stop = (
    transfers_df['from_stop_id'].isin(train_stop_ids)
    | transfers_df['to_stop_id'].isin(train_stop_ids)
)
transfers_filtered = transfers_df[touches_train_stop].drop_duplicates()

# index=False keeps the pandas row index out of the file.
transfers_filtered.to_csv(savepath + '/transfers.txt', index=False)

# Signal that every file has been written.
print('Done!')


Done!
