In [1]:
import os
from datetime import datetime

import nest_asyncio
import pandas as pd
from dateutil.relativedelta import relativedelta
from pymongo import MongoClient
from sodapy import Socrata

In [2]:
# Connection settings shared by every cell below.
#
# Secrets are read from the environment so that no credential is stored in
# the notebook (or in its version history). The values that used to be
# hardcoded here must be treated as compromised and rotated.
#   SOCRATA_APP_TOKEN  - Socrata application token
#   SOCRATA_USERNAME   - Socrata account name
#   SOCRATA_PASSWORD   - Socrata account password
# When a variable is unset the corresponding value is None.
APP_TOKEN = os.environ.get("SOCRATA_APP_TOKEN")
USERNAME = os.environ.get("SOCRATA_USERNAME")
PASSWORD = os.environ.get("SOCRATA_PASSWORD")

# Non-secret endpoints keep their previous values as defaults and can be
# overridden per environment.
DATA_SOURCE = os.environ.get("SOCRATA_DOMAIN", "data.ny.gov")
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://my-mongo-db:27017/")

# Surface a missing credential here rather than as an obscure API error later.
missing_credentials = [
    env_name
    for env_name, value in (
        ("SOCRATA_APP_TOKEN", APP_TOKEN),
        ("SOCRATA_USERNAME", USERNAME),
        ("SOCRATA_PASSWORD", PASSWORD),
    )
    if not value
]
if missing_credentials:
    print(f"WARNING: environment variables not set: {', '.join(missing_credentials)}")

### Getting the Ridership data

In [7]:
# Source dataset on the Socrata portal and the MongoDB destination for it.
DATASET_ID = 'wujg-7c2s'
DATABASE = "ny_transit_data"
DB_COLLECTION = "ridership"

# Maximum number of rows requested for a single month; raise it if a month
# holds more rows than this.
RECORD_LIMIT_PER_MONTH = 5_000_000

# --- Fetch window ---
# First and last calendar month to retrieve; the end month is included.
START_YEAR, START_MONTH = 2024, 12
END_YEAR, END_MONTH = 2024, 12

In [8]:
"""
Fetches data from the Socrata API for a specified date range,
one month at a time.
"""
# Bind both handles before the try block so that `finally` only closes clients
# created by THIS run. Testing `'client' in locals()` would also match stale
# (already closed) objects left in the kernel by another cell.
client = None
mongo_client = None

try:
    # Authenticated client to connect to Socrata API
    client = Socrata(DATA_SOURCE,
                     APP_TOKEN,
                     username=USERNAME,
                     password=PASSWORD,
                     timeout=3600)  # Increased timeout for large queries

    print(f"Successfully authenticated with {DATA_SOURCE}")

    mongo_client = MongoClient(MONGO_URI)

    # Check that the DB is up. Re-raise on failure: continuing would only hit
    # the same connection error again on the first write.
    try:
        mongo_client.admin.command('ping')
        print("DB connected!")
    except Exception:
        print("DB connection error!")
        raise

    # Destination collection
    collection = mongo_client[DATABASE][DB_COLLECTION]

    # Columns parsed into numbers / explicit text after download. A column is
    # only converted when it is present in the downloaded frame.
    numeric_cols = ['ridership', 'transfers', 'latitude', 'longitude']
    string_cols = [
        'transit_mode', 'station_complex_id', 'station_complex',
        'borough', 'payment_method', 'fare_class_category'
    ]

    # Loop bounds: first day of the start month, and first day of the month
    # *after* the configured end month (so the end month is included).
    start_date = datetime(START_YEAR, START_MONTH, 1)
    end_date = datetime(END_YEAR, END_MONTH, 1) + relativedelta(months=1)

    # Old data is wiped only once the first month has been downloaded and
    # cleaned, so a failed API call cannot leave the collection empty. If no
    # month returns any rows the existing data is left untouched.
    collection_cleared = False

    current_date = start_date
    while current_date < end_date:
        next_month = current_date + relativedelta(months=1)
        month_label = current_date.strftime('%Y-%m')

        # Half-open window [first of month, first of next month): covers the
        # whole month without depending on the timestamp precision.
        window_start = current_date.strftime('%Y-%m-%dT00:00:00.000')
        window_end = next_month.strftime('%Y-%m-%dT00:00:00.000')
        soql_query = (
            f"transit_timestamp >= '{window_start}' "
            f"AND transit_timestamp < '{window_end}'"
        )

        print(f"Fetching data for: {month_label}...")

        # Make the API call with the 'where' filter for the current month
        results = client.get(DATASET_ID, where=soql_query, limit=RECORD_LIMIT_PER_MONTH)

        if results:
            # Reaching the limit means the API may have had more rows to give.
            if len(results) >= RECORD_LIMIT_PER_MONTH:
                print(
                    f"WARNING: {month_label} returned {len(results)} records, "
                    "which reaches RECORD_LIMIT_PER_MONTH; the month may be truncated."
                )

            # Convert the list of dictionaries to a pandas DataFrame
            results_df = pd.DataFrame.from_records(results)
            print(f"Successfully fetched {len(results_df)} records for {month_label}.")

            for col in numeric_cols:
                if col in results_df.columns:
                    results_df[col] = pd.to_numeric(results_df[col], errors='coerce')

            # Unparseable timestamps become NaT (handled before insertion below)
            results_df['transit_timestamp'] = pd.to_datetime(results_df['transit_timestamp'], errors='coerce')

            for col in string_cols:
                if col in results_df.columns:
                    results_df[col] = results_df[col].astype('string')

            print("\n--- Data Types After Conversion ---")
            # info() prints its report itself and returns None, so it is not
            # wrapped in print().
            results_df.info()

            print("\n--- First 5 Rows of Cleaned Data ---")
            print(results_df.head())

            # Convert the DataFrame to a list of dictionaries. Missing values
            # (NaN, pd.NA, NaT) are turned into None first: PyMongo cannot
            # encode pd.NA / NaT, and None is stored as a BSON null.
            records_to_insert = (
                results_df.astype(object)
                .where(results_df.notna(), None)
                .to_dict('records')
            )

            # Clear old data once, right before the first insert, to avoid duplicates
            if not collection_cleared:
                collection.delete_many({})
                collection_cleared = True

            # Insert the records into the collection
            collection.insert_many(records_to_insert)

            print(f"Mongo DB : Successfully inserted {len(records_to_insert)} records into the '{collection.name}' collection.")
        else:
            print(f"No records found for {month_label}.")

        # Move to the next month
        current_date = next_month

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    if client is not None:
        client.close()
        print("Socrata client connection closed.")
    if mongo_client is not None:
        mongo_client.close()
        print("Mongo DB client connection closed.")

Successfully authenticated with data.ny.gov
DB connected!
Fetching data for: 2024-12...
Successfully fetched 2446236 records for 2024-12.
Mongo DB : Successfully inserted 2446236 records into the 'ridership' collection.
Socrata client connection closed.
Mongo DB client connection closed.


### Getting the Station Data

In [15]:
# Source dataset and MongoDB destination for the station data.
# NOTE: this rebinds DATASET_ID / DATABASE / DB_COLLECTION, which the ridership
# cells also read; re-run the ridership config cell before re-running the
# ridership load.
DATASET_ID = '39hk-dx4f'
DATABASE = "ny_transit_data"
DB_COLLECTION = "station"

# Maximum number of rows requested from the API in the single station query.
RECORD_LIMIT = 5_000

In [18]:
# Download the station dataset from Socrata and replace the contents of the
# MongoDB collection DATABASE.DB_COLLECTION with it.

# Bind both handles before the try block so that `finally` only closes clients
# created by THIS run. Testing `'client' in locals()` would also match stale
# (already closed) objects left in the kernel by another cell.
client = None
mongo_client = None

try:
    # Authenticated client to connect to Socrata API
    client = Socrata(DATA_SOURCE,
                     APP_TOKEN,
                     username=USERNAME,
                     password=PASSWORD,
                     timeout=3600)  # Increased timeout for large queries

    print(f"Successfully authenticated with {DATA_SOURCE}")

    # Returned as JSON from the API / converted to a list of dictionaries by sodapy.
    results = client.get(DATASET_ID, limit=RECORD_LIMIT)

    # Reaching the limit means the API may have had more rows to give.
    if len(results) >= RECORD_LIMIT:
        print(
            f"WARNING: the API returned {len(results)} records, which reaches "
            "RECORD_LIMIT; the dataset may be truncated."
        )

    # Convert to pandas DataFrame
    results_df = pd.DataFrame.from_records(results)

    # Columns to convert to numbers (only those present in the download)
    numeric_cols = ['station_id', 'complex_id', 'gtfs_latitude', 'gtfs_longitude', 'ada', 'ada_northbound', 'ada_southbound']
    for col in numeric_cols:
        if col in results_df.columns:
            results_df[col] = pd.to_numeric(results_df[col], errors='coerce')

    # Boolean 'cbd' column: accept the string "TRUE" in any casing as well as a
    # real boolean True; everything else (including missing values) is False.
    if 'cbd' in results_df.columns:
        results_df['cbd'] = results_df['cbd'].map(lambda x: str(x).strip().upper() == 'TRUE')

    mongo_client = MongoClient(MONGO_URI)

    # Check that the DB is up. Re-raise on failure: continuing would only hit
    # the same connection error again on the first write.
    try:
        mongo_client.admin.command('ping')
        print("DB connected!")
    except Exception:
        print("DB connection error!")
        raise

    # Destination collection
    collection = mongo_client[DATABASE][DB_COLLECTION]

    # Convert the DataFrame to a list of dictionaries. Missing values are
    # turned into None so they are stored as BSON null rather than NaN.
    records_to_insert = (
        results_df.astype(object)
        .where(results_df.notna(), None)
        .to_dict('records')
    )

    if records_to_insert:
        # Clear old data to avoid duplicates, then load the fresh copy
        collection.delete_many({})
        collection.insert_many(records_to_insert)

        print(f"Mongo DB : Successfully inserted {len(records_to_insert)} records into the '{collection.name}' collection.")
    else:
        # insert_many() rejects an empty list, and wiping the collection with
        # nothing to replace it would only lose data.
        print(f"No records returned for dataset '{DATASET_ID}'; the '{collection.name}' collection was left unchanged.")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    if client is not None:
        client.close()
        print("Socrata client connection closed.")
    if mongo_client is not None:
        mongo_client.close()
        print("Mongo DB client connection closed.")

Successfully authenticated with data.ny.gov
DB connected!
Mongo DB : Successfully inserted 496 records into the 'station' collection.
Socrata client connection closed.
Mongo DB client connection closed.
