In [1]:
import pandas as pd
from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# MongoDB connection string; get_mongo_data_to_pandas passes it to MongoClient.
MONGO_URI = "mongodb://my-mongo-db:27017/"

In [3]:
def get_mongo_data_to_pandas(db_name, collection_name):
    """
    Connects to MongoDB, fetches all documents from a collection,
    and returns them as a Pandas DataFrame.

    The client is opened with a context manager so it is closed on every
    path, including when the query or cursor iteration raises.

    Args:
        db_name (str): The name of the database to connect to.
        collection_name (str): The name of the collection to read from.

    Returns:
        pandas.DataFrame: A DataFrame containing the data from the collection
                          (without the '_id' column), an empty DataFrame if the
                          collection has no documents, or None if an error occurs.
    """
    print(f"Connecting to MongoDB at {MONGO_URI}...")
    try:
        # `with` guarantees client.close() runs even if find()/list() fails,
        # so a failed fetch does not leak the connection pool.
        with MongoClient(MONGO_URI) as client:
            collection = client[db_name][collection_name]

            print(f"Fetching data from '{db_name}.{collection_name}'...")
            # Materialize the cursor while the client is still open.
            data_list = list(collection.find())

        if not data_list:
            print("Warning: No data found in the collection.")
            return pd.DataFrame()  # Return an empty DataFrame

        # Create a Pandas DataFrame from the list of dictionaries.
        # The '_id' column from MongoDB is dropped as it's not needed for analysis;
        # errors='ignore' keeps this safe if the column is absent.
        df = pd.DataFrame(data_list).drop(columns=['_id'], errors='ignore')
        print("Successfully loaded data into Pandas DataFrame.")
        return df

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"An error occurred while connecting to MongoDB: {e}")
        return None

In [4]:
DB_NAME = "ny_transit_data"
COLLECTION_NAME = "ridership"
pandas_df = get_mongo_data_to_pandas(DB_NAME, COLLECTION_NAME)

# The loader reports failures by printing a message and returning None.
# Stop here so the real cause is visible instead of an AttributeError
# surfacing in a later cell.
if pandas_df is None:
    raise RuntimeError(
        f"Could not load '{DB_NAME}.{COLLECTION_NAME}' from MongoDB; "
        "see the error message printed by the loader."
    )

Connecting to MongoDB at mongodb://my-mongo-db:27017/...
Fetching data from 'ny_transit_data.ridership'...
Successfully loaded data into Pandas DataFrame.


In [5]:
# Show the first rows of the loaded frame under a banner.
print("\n--- Pandas DataFrame Head ---", pandas_df.head(), sep="\n")

# info() writes its own report to stdout, so it is called bare rather than printed.
print("\n--- Pandas DataFrame Info ---")
pandas_df.info()


--- Pandas DataFrame Head ---
         transit_timestamp transit_mode station_complex_id  \
0  2024-12-01T00:00:00.000         tram              TRAM2   
1  2024-12-01T00:00:00.000         tram              TRAM1   
2  2024-12-01T00:00:00.000         tram              TRAM1   
3  2024-12-01T00:00:00.000         tram              TRAM2   
4  2024-12-01T00:00:00.000         tram              TRAM1   

          station_complex    borough payment_method  \
0  RI Tramway (Roosevelt)  Manhattan      metrocard   
1  RI Tramway (Manhattan)  Manhattan      metrocard   
2  RI Tramway (Manhattan)  Manhattan      metrocard   
3  RI Tramway (Roosevelt)  Manhattan      metrocard   
4  RI Tramway (Manhattan)  Manhattan           omny   

                fare_class_category ridership transfers   latitude  longitude  \
0             Metrocard - Full Fare       6.0       0.0   40.75734  -73.95412   
1             Metrocard - Full Fare       8.0       3.0  40.761337  -73.96416   
2       Metrocard - Un

In [6]:
print("\nInitializing SparkSession...")
try:
    # Parenthesized chain instead of backslash continuations.
    spark = (
        SparkSession.builder
        .appName("MongoToSpark")
        .config("spark.driver.memory", "16g")
        .getOrCreate()
    )
except Exception as e:
    print(f"Failed to initialize SparkSession: {e}")
    # Re-raise: without a session every later cell fails, and swallowing the
    # error here would leave `spark` undefined (or pointing at a stale session).
    raise
else:
    print("SparkSession initialized successfully.")
spark


Initializing SparkSession...
SparkSession initialized successfully.


In [7]:
# The loader can hand back None (on error) or an empty frame (no documents).
# Spark cannot infer a schema from either, so fail with a clear message first.
if pandas_df is None or pandas_df.empty:
    raise ValueError("pandas_df has no data to convert to a Spark DataFrame.")

spark_df = spark.createDataFrame(pandas_df)

print("Conversion successful!")

# Show results from the Spark DataFrame (show() displays 20 rows by default).
print("\n--- Spark DataFrame (Top 20 Rows) ---")
spark_df.show()

print("\n--- Spark DataFrame Schema ---")
spark_df.printSchema()

Conversion successful!

--- Spark DataFrame (Top 20 Rows) ---
+--------------------+------------+------------------+--------------------+---------+--------------+--------------------+---------+---------+---------+----------+--------------------+
|   transit_timestamp|transit_mode|station_complex_id|     station_complex|  borough|payment_method| fare_class_category|ridership|transfers| latitude| longitude|        georeference|
+--------------------+------------+------------------+--------------------+---------+--------------+--------------------+---------+---------+---------+----------+--------------------+
|2024-12-01T00:00:...|        tram|             TRAM2|RI Tramway (Roose...|Manhattan|     metrocard|Metrocard - Full ...|      6.0|      0.0| 40.75734| -73.95412|{coordinates -> [...|
|2024-12-01T00:00:...|        tram|             TRAM1|RI Tramway (Manha...|Manhattan|     metrocard|Metrocard - Full ...|      8.0|      3.0|40.761337| -73.96416|{coordinates -> [...|
|2024-12-01T00:00: