# In [17]:
import io
import os
import urllib.parse
import zipfile

import pandas as pd
import psycopg2
import requests
from sqlalchemy import create_engine

# Database connection details.
# Every setting can be overridden through an environment variable so real
# credentials do not have to live in the source; the fallbacks are the
# local-development values this script has always used.
db_user = os.environ.get("GTFS_DB_USER", "postgres")
db_password = os.environ.get("GTFS_DB_PASSWORD", "postgres")
db_host = os.environ.get("GTFS_DB_HOST", "localhost")
db_port = int(os.environ.get("GTFS_DB_PORT", "5432"))
db_name = os.environ.get("GTFS_DB_NAME", "gtfs")

# Create the database URL. User and password are percent-encoded because
# characters such as "@", ":" or "/" would otherwise be parsed as URL
# delimiters. With the default values the encoding is a no-op.
db_url = (
    f"postgresql://{urllib.parse.quote(db_user, safe='')}"
    f":{urllib.parse.quote(db_password, safe='')}"
    f"@{db_host}:{db_port}/{db_name}"
)

# SQLAlchemy engine; the DataFrame.to_sql() calls further down write through it.
engine = create_engine(db_url)

# Separate raw psycopg2 connection and cursor to the same database.
# NOTE(review): nothing in the visible code below uses `conn` or `cursor`
# (the only references are commented out) and neither is closed here; close
# them explicitly once they are no longer needed.
conn = psycopg2.connect(
    host=db_host,
    port=db_port,
    database=db_name,
    user=db_user,
    password=db_password,
)
cursor = conn.cursor()

# # Create table
# cursor.execute(
#     "CREATE TABLE IF NOT EXISTS tweets (id SERIAL PRIMARY KEY, tweet VARCHAR(255), sentiment VARCHAR(255))"
# )

# conn.commit()

# cursor.close()
# conn.close()




# # URL of the main GTFS ZIP file
# url = "http://data.ptv.vic.gov.au/downloads/gtfs.zip"
# https://discover.data.vic.gov.au/dataset/timetable-and-geographic-information-gtfs
# https://data.ptv.vic.gov.au/downloads/GTFSReleaseNotes.pdf

# GTFS Release
# The DTP GTFS data has been exported by operational branches listed in the folder numbers below:
# 1 - Regional Train
# 2 - Metropolitan Train
# 3 - Metropolitan Tram
# 4 - Metropolitan Bus
# 5 - Regional Coach
# 6 - Regional Bus
# 7 - TeleBus
# 8 - Night Bus
# 10 - Interstate
# 11 - SkyBus
# The GTFS data provided for each of the 10 Operational branches is in the form of 8 files and is
# described in the following table: 



def process_google_transit_from_zipfile_object(google_transit_zip_ref):
    """Read every ``.txt`` member of an open ZIP archive into a DataFrame.

    Args:
        google_transit_zip_ref: An open ``zipfile.ZipFile``.

    Returns:
        A ``pandas.Series`` whose index holds the member names with the
        ``.txt`` suffix removed and whose values are the parsed DataFrames.
        Members without a ``.txt`` suffix are ignored.
    """

    def read_table(member_name):
        # keep_default_na=False leaves empty fields as empty strings rather
        # than converting them to NaN.
        with google_transit_zip_ref.open(member_name) as member_file:
            return pd.read_csv(member_file, keep_default_na=False)

    tables = {
        member_name.removesuffix('.txt'): read_table(member_name)
        for member_name in google_transit_zip_ref.namelist()
        if member_name.endswith('.txt')
    }
    return pd.Series(tables)

def process_gtfs_from_zipfile_object(main_zip_ref):
    """Load each branch folder's nested ``google_transit.zip`` from the main ZIP.

    For every directory entry ``<name>/`` in the archive, the member
    ``<name>/google_transit.zip`` is opened from memory and parsed with
    ``process_google_transit_from_zipfile_object``. Directories without that
    member are reported on stdout and skipped.

    Args:
        main_zip_ref: An open ``zipfile.ZipFile`` for the main GTFS archive.

    Returns:
        A ``pandas.Series`` indexed by the integer folder name, sorted by
        index, whose values are the per-folder results.

    Raises:
        ValueError: If a directory that contains ``google_transit.zip`` has a
            name that ``int()`` cannot parse.
    """
    member_names = main_zip_ref.namelist()
    # Build the lookup once: namelist() creates a new list on every call, and
    # testing membership against a list is a linear scan per directory.
    known_members = set(member_names)

    all_data = {}
    for file_name in member_names:
        # Only directory entries (names ending in "/") are treated as folders.
        if not file_name.endswith('/'):
            continue

        subdir_name = file_name.strip('/')
        nested_zip_path = f"{subdir_name}/google_transit.zip"

        if nested_zip_path not in known_members:
            print("Nested ZIP file not found in", subdir_name)
            continue

        # Read the nested archive fully into memory and reopen it from a
        # BytesIO buffer; nothing is extracted to disk.
        with main_zip_ref.open(nested_zip_path) as nested_zip_file:
            with zipfile.ZipFile(io.BytesIO(nested_zip_file.read())) as nested_zip_ref:
                google_transit_data = process_google_transit_from_zipfile_object(nested_zip_ref)

        all_data[int(subdir_name)] = google_transit_data

    # Convert to a Series ordered by folder number.
    return pd.Series(all_data).sort_index()

def process_google_transit_from_url(url):
    """Download a ZIP archive from ``url`` and load its ``.txt`` tables.

    Args:
        url: Address of the ZIP file to fetch with an HTTP GET request.

    Returns:
        The result of ``process_google_transit_from_zipfile_object`` for the
        downloaded archive, or ``None`` (after printing a message) when the
        response status is not 200.
    """
    # With stream=True the body is not downloaded until it is read, so the
    # non-200 branch would otherwise leave the connection open. The context
    # manager closes the response on every path.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            print(response, "Failed to fetch the main GTFS ZIP file.")
            return None

        # Create a ZipFile object from the fully downloaded response body.
        with zipfile.ZipFile(io.BytesIO(response.content)) as main_zip_ref:
            return process_google_transit_from_zipfile_object(main_zip_ref)

def process_google_transit_from_local_zip(zip_path):
    """Open the ZIP archive at ``zip_path`` and load its ``.txt`` tables.

    Returns whatever ``process_google_transit_from_zipfile_object`` returns
    for the opened archive; the archive is closed before returning.
    """
    archive = zipfile.ZipFile(zip_path)
    with archive:
        tables = process_google_transit_from_zipfile_object(archive)
    return tables

def process_gtfs_from_url(url):
    """Download the main GTFS ZIP from ``url`` and load all branch folders.

    Args:
        url: Address of the main GTFS ZIP file to fetch with an HTTP GET.

    Returns:
        The result of ``process_gtfs_from_zipfile_object`` for the downloaded
        archive, or ``None`` (after printing a message) when the response
        status is not 200.
    """
    # With stream=True the body is not downloaded until it is read, so the
    # non-200 branch would otherwise leave the connection open. The context
    # manager closes the response on every path.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            print(response, "Failed to fetch the main GTFS ZIP file.")
            return None

        # Create a ZipFile object from the fully downloaded response body.
        with zipfile.ZipFile(io.BytesIO(response.content)) as main_zip_ref:
            return process_gtfs_from_zipfile_object(main_zip_ref)

def process_gtfs_from_local_zip(zip_path):
    """Open the main GTFS ZIP at ``zip_path`` and load all branch folders.

    Returns whatever ``process_gtfs_from_zipfile_object`` returns for the
    opened archive; the archive is closed before returning.
    """
    archive = zipfile.ZipFile(zip_path)
    with archive:
        branches = process_gtfs_from_zipfile_object(archive)
    return branches
    

# All loaded GTFS releases, keyed by a version id (parent folder name for
# local archives, download timestamp for the live feed).
# dtype=object is stated explicitly: the values are nested Series, and older
# pandas releases default an empty Series to float64 with a deprecation warning.
data = pd.Series(dtype=object)

for dirpath, dirnames, filenames in os.walk('downloads'):

    for filename in filenames:

        gtfs_zip_file_path = os.path.join(dirpath, filename)

        # os.walk yields every file under 'downloads'; anything that is not a
        # ZIP archive would make zipfile.ZipFile raise BadZipFile.
        if not zipfile.is_zipfile(gtfs_zip_file_path):
            print("Skipping non-ZIP file", gtfs_zip_file_path)
            continue

        # The version id is the name of the folder that directly contains the
        # archive, so archives in the same folder share one key.
        gtfs_zip_parent_folder_name = gtfs_zip_file_path.split(os.sep)[-2]

        data[gtfs_zip_parent_folder_name] = process_gtfs_from_local_zip(gtfs_zip_file_path)

# Get current time and date in format YYYYMMDD_HHMMSS
current_time = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")

# process_gtfs_from_url returns None when the download fails; storing that
# would crash the insert loop below on dfx.items().
latest_release = process_gtfs_from_url("http://data.ptv.vic.gov.au/downloads/gtfs.zip")
if latest_release is not None:
    data[current_time] = latest_release

# Append every table to the database, tagging each row with the branch and
# version it came from. if_exists='append' means re-running adds rows again.
for version_id, dfx in data.items():
    for branch_id, dfs in dfx.items():
        for df_name, df in dfs.items():
            df['x_branch_id'] = branch_id
            df['x_version_id'] = version_id
            df.to_sql(df_name, engine, if_exists='append', index=False)