In [6]:
from pyspark.sql import SparkSession

# Create SparkSession connecting to the Spark cluster (NOT Spark Connect)
# This approach allows for true distributed data loading
spark = (
    SparkSession.builder.appName("TaxiDataAnalysis")
    .remote("sc://192.168.1.7:15002")  # Direct connection to Spark master (not Connect)
    .config("spark.sql.ansi.enabled", "false")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

# Enable eager evaluation for nice HTML tables in Jupyter
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

spark

<pyspark.sql.connect.session.SparkSession at 0x23b97bd5940>

In [None]:
# Read parquet files from all three locations (master + 2 workers)
# The master can access all three directories, but workers can only access their own
taxi_data = spark.read.parquet(
    "/opt/spark/data/worker1/*.parquet", "/opt/spark/data/worker2/*.parquet"
)
taxi_data.createOrReplaceTempView("taxi_data")
taxi_data

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
1,2025-01-01 00:18:38,2025-01-01 00:26:59,1,1.6,1,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,2025-01-01 00:32:40,2025-01-01 00:35:13,1,0.5,1,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
1,2025-01-01 00:44:04,2025-01-01 00:46:01,1,0.6,1,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0
2,2025-01-01 00:14:27,2025-01-01 00:20:01,3,0.52,1,N,244,244,2,7.2,1.0,0.5,0.0,0.0,1.0,9.7,0.0,0.0,0.0
2,2025-01-01 00:21:34,2025-01-01 00:25:06,3,0.66,1,N,244,116,2,5.8,1.0,0.5,0.0,0.0,1.0,8.3,0.0,0.0,0.0
2,2025-01-01 00:48:24,2025-01-01 01:08:26,2,2.63,1,N,239,68,2,19.1,1.0,0.5,0.0,0.0,1.0,24.1,2.5,0.0,0.0
1,2025-01-01 00:14:47,2025-01-01 00:16:15,0,0.4,1,N,170,170,1,4.4,3.5,0.5,2.35,0.0,1.0,11.75,2.5,0.0,0.0
1,2025-01-01 00:39:27,2025-01-01 00:51:51,0,1.6,1,N,234,148,1,12.1,3.5,0.5,2.0,0.0,1.0,19.1,2.5,0.0,0.0
1,2025-01-01 00:53:43,2025-01-01 01:13:23,0,2.8,1,N,148,170,1,19.1,3.5,0.5,3.0,0.0,1.0,27.1,2.5,0.0,0.0
2,2025-01-01 00:00:02,2025-01-01 00:09:36,1,1.71,1,N,237,262,2,11.4,1.0,0.5,0.0,0.0,1.0,16.4,2.5,0.0,0.0


In [3]:
spark.sql("select * from taxi_data order by tpep_pickup_datetime asc limit 40")

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
2,2024-12-31 20:47:55,2024-12-31 20:54:00,2,1.72,1,N,48,246,1,9.3,1.0,0.5,2.86,0.0,1.0,17.16,2.5,0.0,0.0
2,2024-12-31 20:52:50,2024-12-31 21:09:34,2,5.05,1,N,249,262,1,23.3,1.0,0.5,4.0,0.0,1.0,32.3,2.5,0.0,0.0
2,2024-12-31 20:54:50,2024-12-31 21:30:18,2,1.39,1,N,246,48,1,28.2,1.0,0.5,6.64,0.0,1.0,39.84,2.5,0.0,0.0
2,2024-12-31 21:15:22,2024-12-31 21:26:00,2,3.2,1,N,141,146,1,15.6,1.0,0.5,3.0,0.0,1.0,23.6,2.5,0.0,0.0
2,2024-12-31 21:20:05,2024-12-31 21:35:13,2,2.64,1,N,42,238,1,16.3,1.0,0.5,2.0,0.0,1.0,23.3,2.5,0.0,0.0
2,2024-12-31 21:33:43,2024-12-31 21:39:00,2,1.12,1,N,179,7,2,7.9,1.0,0.5,0.0,0.0,1.0,10.4,0.0,0.0,0.0
2,2024-12-31 23:24:31,2024-12-31 23:25:35,5,0.28,1,N,132,132,2,3.7,1.0,0.5,0.0,0.0,1.0,7.95,0.0,1.75,0.0
2,2024-12-31 23:25:38,2025-01-01 01:01:53,5,18.78,2,N,132,142,1,70.0,0.0,0.5,18.5,0.0,1.0,92.5,2.5,0.0,0.0
2,2024-12-31 23:27:13,2024-12-31 23:35:48,1,1.53,1,N,170,141,1,10.0,1.0,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
2,2024-12-31 23:30:03,2024-12-31 23:43:02,1,3.0,1,N,246,13,1,16.3,1.0,0.5,5.32,0.0,1.0,26.62,2.5,0.0,0.0


In [4]:
spark.sql(
    """
SELECT 
    MIN(tpep_pickup_datetime) as earliest_date,
    MAX(tpep_pickup_datetime) as latest_date,
    COUNT(DISTINCT DATE_FORMAT(tpep_pickup_datetime, 'yyyy-MM')) as num_months,
    COLLECT_SET(DATE_FORMAT(tpep_pickup_datetime, 'yyyy-MM')) as months_covered
FROM taxi_data
"""
)

earliest_date,latest_date,num_months,months_covered
2024-12-31 20:47:55,2025-02-01 00:00:44,3,"[2025-01, 2025-02..."
