In [1]:
from pyspark.sql import functions as F

# Load the daily taxi fact table and the two dimension tables used to enrich it.
df_fact = spark.table("dbo.gold_fact_taxi_daily")
df_fx = spark.table("dbo.dim_fx")
df_gdp = spark.table("dbo.dim_gdp")

# Derive the calendar year from trip_date, then left-join both dimensions on it.
# Left joins keep every fact row even when a dimension has no row for that year.
df_gold = (
    df_fact
    .withColumn("year", F.year(F.col("trip_date")))
    # FX
    .join(df_fx, ["year"], "left")
    # GDP (US): only rows whose country is exactly "United States" take part in the join.
    .join(
        df_gdp.where(F.col("country") == "United States"),
        ["year"],
        "left",
    )
)



StatementMeta(, b86a05b0-9d43-429d-ae66-a7924d6ff466, 3, Finished, Available, Finished)

In [6]:
# Print the column names, types and nullability of the joined frame to check the join result.
df_gold.printSchema()



StatementMeta(, b86a05b0-9d43-429d-ae66-a7924d6ff466, 8, Finished, Available, Finished)

root
 |-- year: integer (nullable = true)
 |-- trip_date: date (nullable = true)
 |-- trips_count: long (nullable = true)
 |-- total_revenue_usd: double (nullable = true)
 |-- avg_fare_usd: double (nullable = true)
 |-- avg_distance: double (nullable = true)
 |-- avg_usd_eur_rate: double (nullable = true)
 |-- gdp_usd: double (nullable = true)
 |-- country: string (nullable = true)



In [7]:
from pyspark.sql import functions as F

# Rename the fact measures to their published KPI names, then pin the output
# column set and order for the table that is written downstream.
df_gold_final = (
    df_gold
    .withColumnRenamed("trips_count", "trips_cnt")
    .withColumnRenamed("total_revenue_usd", "revenue_usd")
    .withColumnRenamed("avg_fare_usd", "avg_revenue_per_trip_usd")
    .select(
        "trip_date",
        "year",
        "trips_cnt",
        "revenue_usd",
        "avg_revenue_per_trip_usd",
        "avg_distance",
        "avg_usd_eur_rate",
        "gdp_usd",
        "country",
    )
)


StatementMeta(, b86a05b0-9d43-429d-ae66-a7924d6ff466, 9, Finished, Available, Finished)

In [8]:
# Persist the KPI frame as the Delta table dbo.gold_taxi_kpi_daily.
# mode="overwrite" replaces whatever data the table held before this run.
df_gold_final.write.saveAsTable(
    "dbo.gold_taxi_kpi_daily",
    format="delta",
    mode="overwrite",
)


StatementMeta(, b86a05b0-9d43-429d-ae66-a7924d6ff466, 10, Finished, Available, Finished)

In [9]:
# Sanity check on the written table: row count, non-null counts for trip_date
# and trips_cnt, the total number of trips, and the covered date range.
kpi_summary_sql = """
SELECT
    COUNT(*)               AS days,
    COUNT(trip_date)       AS non_null_dates,
    COUNT(trips_cnt)       AS non_null_trips,
    SUM(trips_cnt)         AS total_trips,
    MIN(trip_date),
    MAX(trip_date)
FROM dbo.gold_taxi_kpi_daily;
"""
spark.sql(kpi_summary_sql).show()

StatementMeta(, b86a05b0-9d43-429d-ae66-a7924d6ff466, 11, Finished, Available, Finished)

+----+--------------+--------------+-----------+--------------+--------------+
|days|non_null_dates|non_null_trips|total_trips|min(trip_date)|max(trip_date)|
+----+--------------+--------------+-----------+--------------+--------------+
|  33|            33|            33|    2840785|    2024-12-31|    2025-02-01|
+----+--------------+--------------+-----------+--------------+--------------+

