In [1]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Gold-layer build step: announce the job, then load its two inputs.
print("GOLD LAYER: Creating FactTaxiDaily (2019-2024)")

print("\nReading source tables...")
# Trip-level input table and the zone dimension used to resolve zone keys.
df_taxi = spark.table("silver_nyc_taxi")
df_dim_zone = spark.table("DimZone")

# The row counts are only reported; nothing below branches on them.
taxi_count, zone_count = df_taxi.count(), df_dim_zone.count()

print(f"   Silver NYC Taxi: {taxi_count:,} records")
print(f"   DimZone: {zone_count:,} zones")

# Resolve pickup and dropoff location IDs to DimZone surrogate keys, then
# report how many trips could not be mapped.
print("\nJoining with DimZone to get zone keys...")

# DimZone is joined twice, once per role. Each lookup renames LocationID to
# the trip-side join column and ZoneKey to its role-specific name up front, so
# no post-join rename is needed.
pickup_zone_lookup = df_dim_zone.select(
    col("LocationID").alias("PULocationID"),
    col("ZoneKey").alias("PickupZoneKey"),
)
dropoff_zone_lookup = df_dim_zone.select(
    col("LocationID").alias("DOLocationID"),
    col("ZoneKey").alias("DropoffZoneKey"),
)

# Left joins keep every trip; a trip whose location ID has no DimZone match
# ends up with a null zone key.
df_taxi_with_zones = (
    df_taxi
    .join(pickup_zone_lookup, "PULocationID", "left")
    .join(dropoff_zone_lookup, "DOLocationID", "left")
)

# Count both kinds of unmapped trips in ONE pass over the joined data.
# count() skips nulls and when() without otherwise() yields null for
# non-matching rows, so each expression counts only the matching rows.
# Two separate filter(...).count() actions would run the double join twice.
unmapped_counts = df_taxi_with_zones.agg(
    count(when(col("PickupZoneKey").isNull(), 1)).alias("unmapped_pickup"),
    count(when(col("DropoffZoneKey").isNull(), 1)).alias("unmapped_dropoff"),
).first()
unmapped_pickup = unmapped_counts["unmapped_pickup"]
unmapped_dropoff = unmapped_counts["unmapped_dropoff"]

if unmapped_pickup > 0 or unmapped_dropoff > 0:
    print(f"   Warning: {unmapped_pickup:,} trips with unmapped pickup zones")
    print(f"   Warning: {unmapped_dropoff:,} trips with unmapped dropoff zones")
    print("   These will be filtered out in the aggregation")
else:
    print("   All zones mapped successfully!")

# Roll trips up to one row per (pickup date, pickup zone, dropoff zone).
print("\nAggregating to daily metrics by pickup/dropoff zone...")

# Keep only trips where both zone keys resolved (neither one is null).
both_zones_mapped = ~(
    col("PickupZoneKey").isNull() | col("DropoffZoneKey").isNull()
)

# Grain of the fact table.
daily_zone_pair_keys = [
    "pickup_year",
    "pickup_month",
    "pickup_day",
    "PickupZoneKey",
    "DropoffZoneKey",
]

# Totals first, then averages; monetary and distance figures are rounded to
# two decimals. A null airport_fee is treated as 0 before summing.
daily_metrics = [
    count("*").alias("TripCount"),
    sum("passenger_count").cast("int").alias("TotalPassengers"),
    round(sum("trip_distance"), 2).alias("TotalDistance"),
    round(sum("fare_amount"), 2).alias("TotalFareAmount"),
    round(sum("tip_amount"), 2).alias("TotalTipAmount"),
    round(sum("tolls_amount"), 2).alias("TotalTollAmount"),
    round(sum("total_amount"), 2).alias("TotalAmount"),
    round(sum(coalesce(col("airport_fee"), lit(0))), 2).alias("TotalAirportFee"),
    round(avg("trip_distance"), 2).alias("AvgTripDistance"),
    round(avg("fare_amount"), 2).alias("AvgFareAmount"),
    round(avg("trip_duration_minutes"), 2).alias("AvgTripDuration"),
    round(avg("speed_mph"), 2).alias("AvgSpeed"),
]

df_fact_taxi = (
    df_taxi_with_zones
    .where(both_zones_mapped)
    .groupBy(*daily_zone_pair_keys)
    .agg(*daily_metrics)
)

# Add the date key and a sequential surrogate key, then project the final
# fact-table column order.
print("\nCreating DateKey and surrogate keys...")

# DateKey packs year/month/day into a single yyyymmdd integer.
date_key_expr = (
    col("pickup_year") * 10000 + col("pickup_month") * 100 + col("pickup_day")
).cast("int")

# NOTE(review): this window has no partitionBy, so numbering requires a global
# ordering of every row; Spark is expected to do that in a single partition.
# Left as-is because it yields dense, ordered keys starting at 1.
surrogate_key_window = Window.orderBy("DateKey", "PickupZoneKey", "DropoffZoneKey")

df_fact_taxi = (
    df_fact_taxi
    .withColumn("DateKey", date_key_expr)
    .withColumn("TaxiDailyKey", row_number().over(surrogate_key_window))
)

# Final column order: keys, then totals, then averages. The pickup_year /
# pickup_month / pickup_day columns are dropped in favour of DateKey.
fact_columns = [
    "TaxiDailyKey",
    "DateKey",
    "PickupZoneKey",
    "DropoffZoneKey",
    "TripCount",
    "TotalPassengers",
    "TotalDistance",
    "TotalFareAmount",
    "TotalTipAmount",
    "TotalTollAmount",
    "TotalAirportFee",
    "TotalAmount",
    "AvgTripDistance",
    "AvgFareAmount",
    "AvgTripDuration",
    "AvgSpeed",
]
df_fact_taxi_final = df_fact_taxi.select(*fact_columns)

# Report, preview, and persist the fact table to the Gold layer.
#
# count(), show() and the write below are three separate Spark actions. Without
# caching, each one would re-execute the entire upstream plan for
# df_fact_taxi_final. Persisting the result lets the plan run once; the cache
# is released as soon as the table has been written (or if anything fails).
df_fact_taxi_final.persist()
try:
    daily_count = df_fact_taxi_final.count()
    print(f"   Daily aggregated records: {daily_count:,}")

    print("\nSample of aggregated data (most recent):")
    df_fact_taxi_final.orderBy(desc("DateKey")).show(10, truncate=False)

    print("\nSaving to Gold layer...")
    # NOTE(review): mode("overwrite") + overwriteSchema already replaces the
    # table contents and schema; the explicit DROP is kept because it may be
    # intentional (e.g. to discard an incompatible prior table) -- confirm.
    spark.sql("DROP TABLE IF EXISTS FactTaxiDaily")

    df_fact_taxi_final.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable("FactTaxiDaily")
finally:
    df_fact_taxi_final.unpersist()

print(f"   FactTaxiDaily created with {daily_count:,} records!")

# Per-year roll-up of the saved fact table.
print("SUMMARY STATISTICS BY YEAR (2019-2024)")

# The per-trip averages are computed from the stored totals, weighted by
# TripCount. Taking AVG() of the per-row averages would give every
# (day, zone pair) row equal weight regardless of how many trips it holds,
# which is not an average per trip.
#
# AvgSpeedMph: the fact table holds no speed total, so it is rebuilt as a
# TripCount-weighted mean of AvgSpeed over rows where AvgSpeed is not null.
# NOTE(review): these match true per-trip means only if every trip in a row
# contributed a non-null fare / distance / speed -- confirm against the
# silver-layer cleaning rules.
df_year_summary = spark.sql("""
SELECT 
    CAST(DateKey / 10000 AS INT) as Year,
    COUNT(DISTINCT DateKey) as UniqueDays,
    COUNT(*) as DailyZonePairs,
    FORMAT_NUMBER(SUM(TripCount), 0) as TotalTrips,
    FORMAT_NUMBER(SUM(TotalPassengers), 0) as TotalPassengers,
    FORMAT_NUMBER(ROUND(SUM(TotalDistance), 0), 0) as TotalMiles,
    CONCAT('$', FORMAT_NUMBER(ROUND(SUM(TotalAmount), 0), 0)) as TotalRevenue,
    CONCAT('$', FORMAT_NUMBER(ROUND(SUM(TotalFareAmount) / SUM(TripCount), 2), 2)) as AvgFarePerTrip,
    ROUND(SUM(TotalDistance) / SUM(TripCount), 2) as AvgTripMiles,
    ROUND(
        SUM(AvgSpeed * TripCount)
        / SUM(CASE WHEN AvgSpeed IS NOT NULL THEN TripCount END),
        1
    ) as AvgSpeedMph
FROM FactTaxiDaily
GROUP BY CAST(DateKey / 10000 AS INT)
ORDER BY Year
""")

df_year_summary.show(truncate=False)

# Rank pickup/dropoff zone pairs by total trips over the whole fact table.
print("TOP 10 BUSIEST ZONE PAIRS (2019-2024)")

# DimZone is joined twice (alias p for pickup, d for dropoff) to turn the two
# zone keys into zone names and boroughs. Ordering uses the raw SUM, not the
# formatted string, so the ranking is numeric.
top_zone_pairs_sql = """
SELECT 
    p.ZoneName as PickupZone,
    p.Borough as PickupBorough,
    d.ZoneName as DropoffZone,
    d.Borough as DropoffBorough,
    FORMAT_NUMBER(SUM(f.TripCount), 0) as TotalTrips
FROM FactTaxiDaily f
JOIN DimZone p ON f.PickupZoneKey = p.ZoneKey
JOIN DimZone d ON f.DropoffZoneKey = d.ZoneKey
GROUP BY p.ZoneName, p.Borough, d.ZoneName, d.Borough
ORDER BY SUM(f.TripCount) DESC
LIMIT 10
"""

df_top_zones = spark.sql(top_zone_pairs_sql)
df_top_zones.show(truncate=False)

# Trip totals for the 13 most recent months present in the fact table.
print("MONTHLY TREND: Last 13 Months")

# There is no fixed DateKey cutoff: ordering newest-first and taking 13 rows
# always returns the latest 13 months that exist in the data, so the report
# stays correct when the loaded date range changes.
# Year and month are taken from the yyyymmdd DateKey with integer division
# (DIV) rather than floating-point division followed by a cast.
df_monthly = spark.sql("""
SELECT 
    CAST(DateKey DIV 10000 AS INT) as Year,
    CAST((DateKey DIV 100) % 100 AS INT) as Month,
    FORMAT_NUMBER(SUM(TripCount), 0) as Trips
FROM FactTaxiDaily
GROUP BY CAST(DateKey DIV 10000 AS INT), CAST((DateKey DIV 100) % 100 AS INT)
ORDER BY Year DESC, Month DESC
LIMIT 13
""")

df_monthly.show(truncate=False)

print("ANALYSIS COMPLETE!")

StatementMeta(, cbd96dfd-cebe-4270-8d3c-1f15996d71c4, 3, Finished, Available, Finished)

GOLD LAYER: Creating FactTaxiDaily (2019-2024)

Reading source tables...
   Silver NYC Taxi: 240,890,050 records
   DimZone: 265 zones

Joining with DimZone to get zone keys...
   All zones mapped successfully!

Aggregating to daily metrics by pickup/dropoff zone...

Creating DateKey and surrogate keys...
   Daily aggregated records: 14,827,172

Sample of aggregated data (most recent):
+------------+--------+-------------+--------------+---------+---------------+-------------+---------------+--------------+---------------+---------------+-----------+---------------+-------------+---------------+--------+
|TaxiDailyKey|DateKey |PickupZoneKey|DropoffZoneKey|TripCount|TotalPassengers|TotalDistance|TotalFareAmount|TotalTipAmount|TotalTollAmount|TotalAirportFee|TotalAmount|AvgTripDistance|AvgFareAmount|AvgTripDuration|AvgSpeed|
+------------+--------+-------------+--------------+---------+---------------+-------------+---------------+--------------+---------------+---------------+----------