In [1]:
import requests
import zipfile
import io
import pandas as pd
import os
from tabulate import tabulate

In [25]:
# Example URL of a main GTFS ZIP file (kept for reference, not used below):
# url = "http://data.ptv.vic.gov.au/downloads/gtfs.zip"

def process_google_transit_from_zipfile_object(google_transit_zip_ref):
    """Parse every ``.txt`` member of an open ZIP archive into a DataFrame.

    Parameters
    ----------
    google_transit_zip_ref : zipfile.ZipFile
        An already-open archive. Members whose names end in ``.txt`` are
        read as CSV; all other members are ignored.

    Returns
    -------
    pandas.Series
        One entry per ``.txt`` member, indexed by the member name with the
        ``.txt`` suffix removed. Each value is the parsed DataFrame.
    """
    tables = {}

    for member_name in google_transit_zip_ref.namelist():
        # Guard clause: only .txt members are treated as tables.
        if not member_name.endswith('.txt'):
            continue

        table_key = member_name.removesuffix('.txt')
        with google_transit_zip_ref.open(member_name) as member_file:
            # keep_default_na=False turns off pandas' default NaN markers,
            # so empty fields are not converted to NaN.
            tables[table_key] = pd.read_csv(member_file, keep_default_na=False)

    return pd.Series(tables)

def process_gtfs_from_zipfile_object(main_zip_ref):
    """Load every ``<subdir>/google_transit.zip`` found in a bundle archive.

    Subdirectories are discovered from explicit directory entries (names
    ending in ``/``) and also from the paths of nested ``google_transit.zip``
    members, because a ZIP archive is not required to contain directory
    entries. A message is printed for each directory entry that has no
    nested ZIP.

    Parameters
    ----------
    main_zip_ref : zipfile.ZipFile
        An already-open archive containing the subdirectories.

    Returns
    -------
    pandas.Series
        Indexed by the subdirectory name converted to ``int`` and sorted by
        that index. Each value is whatever
        ``process_google_transit_from_zipfile_object`` returns for the
        nested archive.

    Raises
    ------
    ValueError
        If a subdirectory that holds a nested ZIP has a name that ``int()``
        cannot parse.
    """
    nested_zip_basename = "google_transit.zip"

    # Read the member list once; the set gives O(1) membership checks
    # instead of rebuilding and scanning the list for every directory.
    member_names = main_zip_ref.namelist()
    member_lookup = set(member_names)

    # A dict is used as an ordered set: archive order is kept, duplicates dropped.
    subdir_names = {}
    for member_name in member_names:
        if member_name.endswith('/'):
            subdir_names.setdefault(member_name.strip('/'))
        elif member_name.endswith('/' + nested_zip_basename):
            # Archive without a directory entry: derive the folder from the path.
            subdir_names.setdefault(member_name[:-len(nested_zip_basename)].strip('/'))

    all_data = {}
    for subdir_name in subdir_names:
        nested_zip_path = f"{subdir_name}/{nested_zip_basename}"

        if nested_zip_path not in member_lookup:
            print("Nested ZIP file not found in", subdir_name)
            continue

        # The nested archive is read into memory so ZipFile gets a seekable buffer.
        with main_zip_ref.open(nested_zip_path) as nested_zip_file:
            with zipfile.ZipFile(io.BytesIO(nested_zip_file.read())) as nested_zip_ref:
                all_data[int(subdir_name)] = process_google_transit_from_zipfile_object(nested_zip_ref)

    # Sort by folder number so the result order does not depend on archive order.
    return pd.Series(all_data).sort_index()

def process_google_transit_from_url(url, timeout=30):
    """Download one ZIP archive over HTTP and load its ``.txt`` tables.

    Parameters
    ----------
    url : str
        Address of the ZIP file to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled server
        would block the call indefinitely.

    Returns
    -------
    pandas.Series or None
        The result of ``process_google_transit_from_zipfile_object`` when the
        server answers with status 200. Otherwise a message is printed and
        ``None`` is returned.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    zipfile.BadZipFile
        If the downloaded body is not a ZIP archive.
    """
    # stream=True is deliberately not used: the whole body is read through
    # response.content anyway, and a streamed response whose body is never
    # read (the non-200 path) is not released back to the connection pool.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(response, "Failed to fetch the main GTFS ZIP file.")
        return None

    # Wrap the downloaded bytes in a seekable buffer for ZipFile.
    with zipfile.ZipFile(io.BytesIO(response.content)) as main_zip_ref:
        return process_google_transit_from_zipfile_object(main_zip_ref)

def process_google_transit_from_local_zip(zip_path):
    """Open a ZIP file on disk and load its ``.txt`` tables.

    Parameters
    ----------
    zip_path : path-like or file-like
        Passed straight to ``zipfile.ZipFile``.

    Returns
    -------
    Whatever ``process_google_transit_from_zipfile_object`` returns for the
    opened archive.
    """
    archive = zipfile.ZipFile(zip_path)
    # The context manager closes the archive once the tables are loaded.
    with archive:
        tables = process_google_transit_from_zipfile_object(archive)
    return tables

def process_gtfs_from_url(url, timeout=30):
    """Download a bundle ZIP over HTTP and load its nested archives.

    Parameters
    ----------
    url : str
        Address of the main ZIP file to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled server
        would block the call indefinitely.

    Returns
    -------
    pandas.Series or None
        The result of ``process_gtfs_from_zipfile_object`` when the server
        answers with status 200. Otherwise a message is printed and
        ``None`` is returned.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    zipfile.BadZipFile
        If the downloaded body is not a ZIP archive.
    """
    # stream=True is deliberately not used: the whole body is read through
    # response.content anyway, and a streamed response whose body is never
    # read (the non-200 path) is not released back to the connection pool.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(response, "Failed to fetch the main GTFS ZIP file.")
        return None

    # Wrap the downloaded bytes in a seekable buffer for ZipFile.
    with zipfile.ZipFile(io.BytesIO(response.content)) as main_zip_ref:
        return process_gtfs_from_zipfile_object(main_zip_ref)

def process_gtfs_from_local_zip(zip_path):
    """Open a bundle ZIP file on disk and load its nested archives.

    Parameters
    ----------
    zip_path : path-like or file-like
        Passed straight to ``zipfile.ZipFile``.

    Returns
    -------
    Whatever ``process_gtfs_from_zipfile_object`` returns for the opened
    archive.
    """
    archive = zipfile.ZipFile(zip_path)
    # The context manager closes the archive once the data is loaded.
    with archive:
        feeds = process_gtfs_from_zipfile_object(archive)
    return feeds

In [79]:
# The per-borough feeds share one URL pattern; only the borough slug changes.
borough = ["bronx", "brooklyn", "manhattan", "queens", "staten_island"]

feeds_by_name = {}
for b in borough:
    feed_url = "http://web.mta.info/developers/data/nyct/bus/google_transit_%s.zip" % b
    feeds_by_name[b] = process_google_transit_from_url(feed_url)

# The feed under data/busco has no borough suffix and is stored under the key 'mta'.
feeds_by_name['mta'] = process_google_transit_from_url("http://web.mta.info/developers/data/busco/google_transit.zip")

# One Series entry per feed, indexed by the names used above.
mta_buses = pd.Series(feeds_by_name)

In [None]:
# LIRR feed URL (kept for reference, not loaded in this notebook section):
# http://web.mta.info/developers/data/lirr/google_transit.zip

In [77]:
# List the table names available in the feed held in `mta_data`.
# NOTE(review): `mta_data` is not assigned in any cell visible here — confirm
# which cell loads it, otherwise this fails on a fresh kernel.
mta_data.keys()

Index(['agency', 'calendar', 'calendar_dates', 'routes', 'shapes', 'stops',
       'stop_times', 'trips'],
      dtype='object')

In [78]:
# Distinct values of the routes table's `route_color` column, sorted ascending.
# NOTE(review): `mta_data` is not assigned in any cell visible here — confirm
# which cell loads it.
mta_data['routes']['route_color'].sort_values().unique()

array(['006CB7', '00933C', '00AEEF', '6CBE45', 'B933AD', 'D0006F',
       'EE352E', 'FAA61A'], dtype=object)

In [56]:
# Routes table of the feed held in `mta_data`.
# NOTE(review): `mta_data` is not assigned in any cell visible here — confirm
# which cell loads it.
df: pd.DataFrame = mta_data['routes']

# Indexing a GroupBy with [...] selects *columns*, which is why ['006CB7']
# raised "KeyError: 'Column not found: 006CB7'". get_group() returns the rows
# belonging to a single group key instead.
df.groupby('route_color').get_group('006CB7')

In [38]:
# Load the colour reference CSV straight from its URL; keep_default_na=False
# turns off pandas' default NaN markers so empty fields are not converted to NaN.
# NOTE(review): in the displayed result the first CSV line ("MTA Color Standards")
# is used as the header and the real column names (MTA Mode, Line/Branch, RGB Hex,
# ...) appear as data row 1 — a `header=`/`skiprows=` argument is probably wanted.
pd.read_csv("http://web.mta.info/developers/data/colors.csv", keep_default_na=False)

Unnamed: 0,MTA Color Standards,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,,,,,
1,MTA Mode,Line/Branch,RGB Hex,Pantone CVC,CMYK
2,,,,,
3,NYCT Subway,A/C/E,0039A6,PMS 286,c100;m56
4,NYCT Subway,B/D/F/M,FF6319,PMS 165,m60;y100
5,NYCT Subway,G,6CBE45,PMS 376,c56;y100
6,NYCT Subway,J/Z,996633,PMS 154,m43;y100;k34
7,NYCT Subway,L,A7A9AC,50% black,50k
8,NYCT Subway,N/Q/R,FCCC0A,PMS 116,m15;y94
9,NYCT Subway,S,808183,70% black,70k


In [37]:
# Display the routes table of the feed held in `mta_data`.
# NOTE(review): `mta_data` is not assigned in any cell visible here — confirm
# which cell loads it. Consider `.head()` to keep the rendered output short.
mta_data['routes']

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,1,MTA NYCT,1,Broadway - 7 Avenue Local,Trains operate between 242 St in the Bronx and...,1,http://web.mta.info/nyct/service/pdf/t1cur.pdf,EE352E,
1,2,MTA NYCT,2,7 Avenue Express,"Trains operate between Wakefield-241 St, Bronx...",1,http://web.mta.info/nyct/service/pdf/t2cur.pdf,EE352E,
2,3,MTA NYCT,3,7 Avenue Express,"Trains operate between 148 St, 7 Av, Manhattan...",1,http://web.mta.info/nyct/service/pdf/t3cur.pdf,EE352E,
3,4,MTA NYCT,4,Lexington Avenue Express,Trains operate daily between Woodlawn/Jerome A...,1,http://web.mta.info/nyct/service/pdf/t4cur.pdf,00933C,
4,5,MTA NYCT,5,Lexington Avenue Express,"Weekdays daytime, most trains operate between ...",1,http://web.mta.info/nyct/service/pdf/t5cur.pdf,00933C,
5,5X,MTA NYCT,5X,Lexington Avenue Express,"Weekdays daytime, most trains operate between ...",1,http://web.mta.info/nyct/service/pdf/t5cur.pdf,00933C,
6,6,MTA NYCT,6,Lexington Avenue Local,Local trains operate between Pelham Bay Park/B...,1,http://web.mta.info/nyct/service/pdf/t6cur.pdf,00933C,
7,6X,MTA NYCT,6X,Pelham Bay Park Express,Express trains operate between Pelham Bay Park...,1,http://web.mta.info/nyct/service/pdf/t6cur.pdf,00933C,
8,7,MTA NYCT,7,Flushing Local,"Trains operate between Main St-Flushing, Queen...",1,http://web.mta.info/nyct/service/pdf/t7cur.pdf,B933AD,
9,7X,MTA NYCT,7X,Flushing Express,"Trains operate between Main St-Flushing, Queen...",1,http://web.mta.info/nyct/service/pdf/t7cur.pdf,B933AD,
