In [59]:
import requests
import zipfile
import io
import pandas as pd
import os
from tabulate import tabulate

In [2]:
def process_gtfs_from_zipfile_object(main_zip_ref : zipfile.ZipFile):
    """Load every numbered GTFS feed bundled inside a parent ZIP archive.

    The archive is expected to hold top-level folders whose names are
    integers, each containing a nested ``google_transit.zip``. Every ``.txt``
    member of a nested ZIP is parsed with ``pd.read_csv``.

    Parameters
    ----------
    main_zip_ref : zipfile.ZipFile
        An open archive to read from. It is not closed by this function.

    Returns
    -------
    pd.Series
        Indexed by folder number in ascending order. Each value is itself a
        ``pd.Series`` mapping table name (the member name without ``.txt``)
        to its DataFrame. Empty when no usable folder is found.

    Notes
    -----
    Folders whose name is not an integer, and numbered folders without a
    nested ``google_transit.zip``, are reported with ``print`` and skipped.

    NOTE(review): tables are read with pandas' default type inference (no
    ``dtype`` is passed), so identifier-like columns may not come back as
    strings - confirm this is acceptable for downstream use.
    """
    member_names = main_zip_ref.namelist()
    # Membership tests go through a set so the archive listing is scanned once
    # instead of once per folder.
    member_lookup = set(member_names)

    # Top-level folder names in first-seen order. They are derived from the
    # member paths rather than from explicit "name/" directory entries, because
    # an archive may store "1/google_transit.zip" without a separate "1/" entry.
    top_level_folders = {}
    for member_name in member_names:
        folder, separator, _ = member_name.partition('/')
        if separator and folder:
            top_level_folders.setdefault(folder, None)

    all_data = {}
    for subdir_name in top_level_folders:
        try:
            folder_number = int(subdir_name)
        except ValueError:
            # A stray folder should not abort the whole load.
            print("Skipping non-numeric folder", subdir_name)
            continue

        nested_zip_path = f"{subdir_name}/google_transit.zip"
        if nested_zip_path not in member_lookup:
            print("Nested ZIP file not found in", subdir_name)
            continue

        # Read the nested archive fully into memory; ZipFile needs a seekable
        # file object, which the stream returned by open() may not be.
        with main_zip_ref.open(nested_zip_path) as nested_zip_file:
            nested_bytes = io.BytesIO(nested_zip_file.read())

        folder_data = {}
        with zipfile.ZipFile(nested_bytes) as nested_zip_ref:
            for nested_file_name in nested_zip_ref.namelist():
                if not nested_file_name.endswith('.txt'):
                    continue
                with nested_zip_ref.open(nested_file_name) as nested_file:
                    folder_data[nested_file_name.removesuffix('.txt')] = pd.read_csv(nested_file)

        # dtype=object keeps the container well-defined even when it is empty.
        all_data[folder_number] = pd.Series(folder_data, dtype=object)

    return pd.Series(all_data, dtype=object).sort_index()

def process_gtfs_from_url(url, timeout=60):
    """Download a GTFS bundle over HTTP and parse it.

    Parameters
    ----------
    url : str
        Address of the main GTFS ZIP file.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout the request
        can wait forever on an unresponsive server.

    Returns
    -------
    pd.Series or None
        The result of ``process_gtfs_from_zipfile_object`` when the server
        answers with status 200. Otherwise a message is printed and ``None``
        is returned.

    Exceptions raised by ``requests.get`` or by ``zipfile.ZipFile`` are not
    caught here.
    """
    # No stream=True: the whole body has to be in memory to build the ZipFile,
    # so deferring the download buys nothing.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(response, "Failed to fetch the main GTFS ZIP file.")
        return None

    with zipfile.ZipFile(io.BytesIO(response.content)) as main_zip_ref:
        return process_gtfs_from_zipfile_object(main_zip_ref)

def process_gtfs_from_local_zip(zip_path):
    """Parse the GTFS bundle stored in the ZIP file at ``zip_path``.

    The archive is opened for the duration of the call and closed afterwards.
    Returns whatever ``process_gtfs_from_zipfile_object`` returns for it.
    """
    with zipfile.ZipFile(zip_path) as archive:
        parsed_feeds = process_gtfs_from_zipfile_object(archive)
        return parsed_feeds

In [3]:
# Gather every archived GTFS bundle found under downloads/, keyed by the name
# of the folder that holds the ZIP, then add the live download under 'main'.
# The entries are collected in a dict and converted once at the end, so the
# Series is created with an explicit dtype instead of a bare pd.Series().
snapshots = {}

for dirpath, dirnames, filenames in os.walk('downloads'):

    for filename in filenames:

        gtfs_zip_file_path = os.path.join(dirpath, filename)

        # Skip anything that is not a ZIP archive; opening such a file with
        # zipfile.ZipFile would raise BadZipFile and abort the whole cell.
        if not zipfile.is_zipfile(gtfs_zip_file_path):
            continue

        gtfs_zip_parent_folder_name = os.path.basename(os.path.dirname(gtfs_zip_file_path))

        snapshots[gtfs_zip_parent_folder_name] = process_gtfs_from_local_zip(gtfs_zip_file_path)

snapshots['main'] = process_gtfs_from_url("http://data.ptv.vic.gov.au/downloads/gtfs.zip")

data = pd.Series(snapshots, dtype=object)

  data = pd.Series()


In [4]:
# Labels of the loaded bundles (Series.keys() is an alias for the index).
data.index

Index(['20220403_025040', '20220704_003721', '20220715_191941',
       '20230805_030129', 'main'],
      dtype='object')

In [10]:
# Column names of the routes table in folder 2 of the 20220704_003721 bundle.
data.loc['20220704_003721'].loc[2].loc['routes'].columns

Index(['route_id', 'agency_id', 'route_short_name', 'route_long_name',
       'route_type', 'route_color', 'route_text_color'],
      dtype='object')

In [84]:
# Build a Markdown table that previews each route's colour scheme and save it
# to table.md. Works on a deep copy so the loaded routes table is not modified.
df :pd.DataFrame = data['20230805_030129'][3]['routes'].copy(deep=True)

# A missing value would turn the whole concatenated string into NaN, so fill
# the gaps first. The fallbacks are the defaults the GTFS reference gives for
# an omitted route_color (white) and route_text_color (black).
background_hex = df['route_color'].fillna('FFFFFF')
text_hex = df['route_text_color'].fillna('000000')
# astype(str) keeps the concatenation working when the column was parsed as numbers.
badge_label = df['route_short_name'].fillna('').astype(str)

df['Demo'] = (
    '<span style="background-color:#' + background_hex
    + ';color:#' + text_hex
    + ';padding-left:5px;padding-right:5px;font-weight:bold;border-radius:2px">'
    + badge_label
    + '</span>'
)

df['Color'] = "`#" + background_hex + "`"
df['Text'] = "`#" + text_hex + "`"
df['Name'] = df['route_short_name']
df['Long name'] = df['route_long_name']
markdown_table = tabulate(df[['Name', 'Long name', 'Color', 'Text', 'Demo']], showindex=False, headers='keys', tablefmt='pipe')
# Explicit encoding: route names may contain non-ASCII characters and the
# platform default encoding is not guaranteed to handle them.
with open('table.md', 'w', encoding='utf-8') as f:
    f.write(markdown_table)

In [60]:
# Render the whole frame, index column included, as a pipe-format Markdown
# table; the cell output is the resulting string.
tabulate(df, tablefmt='pipe', headers='keys')

'|    | route_id    |   agency_id | route_short_name   | route_long_name                                  |   route_type | route_color   | route_text_color   |\n|---:|:------------|------------:|:-------------------|:-------------------------------------------------|-------------:|:--------------|:-------------------|\n|  0 | 3-109-mjp-1 |         nan | 109                | Port Melbourne - Box Hill                        |            0 | E87722        | 000000             |\n|  1 | 3-11-mjp-1  |         nan | 11                 | Victoria Harbour Docklands - West Preston        |            0 | 6ECEB2        | 000000             |\n|  2 | 3-12-mjp-1  |         nan | 12                 | St Kilda (Fitzroy St) - Victoria Gardens         |            0 | 007E92        | FFFFFF             |\n|  3 | 3-16-mjp-1  |         nan | 16                 | Melbourne University - Kew via St Kilda Beach    |            0 | FBD872        | 000000             |\n|  4 | 3-19-mjp-1  |         nan | 19  

In [93]:
# Write the routes table from folder 4 of the 'main' bundle to CSV, ordered by
# short name.
(
    data.loc['main'].loc[4].loc['routes']
    .sort_values(by='route_short_name')
    .to_csv('route_short_name.csv')
)

In [None]:
# Check whether folder 1's calendar_dates table is identical in the
# 20220704_003721 and 20220715_191941 bundles.
(
    data.loc['20220704_003721'].loc[1].loc['calendar_dates']
    .equals(data.loc['20220715_191941'].loc[1].loc['calendar_dates'])
)

In [None]:
# Frequency of each route_color value in folder 2 of the 20220403_025040 bundle.
(
    data.loc['20220403_025040'].loc[2].loc['routes']
    .loc[:, 'route_color']
    .value_counts()
)

In [None]:
# Frequency of each route_color value in folder 2 of the 20220715_191941 bundle.
(
    data.loc['20220715_191941'].loc[2].loc['routes']
    .loc[:, 'route_color']
    .value_counts()
)