In [0]:
from pyspark.sql.functions import *
from delta.tables import *

In [0]:
# Location of the silver layer. Table paths are formed by appending the table
# name directly to dfs_path, so the trailing slash is significant.
storage_account = 'sgonpremtocloudm'
silver_container = 'silver'
# abfss:// URI of the container root on the storage account's dfs endpoint.
dfs_path = f"abfss://{silver_container}@{storage_account}.dfs.core.windows.net/"

In [0]:
def _read_silver(table_name):
    """Return the Delta table stored at ``dfs_path`` + ``table_name`` as a DataFrame."""
    return spark.read.format("delta").load(f"{dfs_path}{table_name}")


# Orders and OrderItems keep only rows whose IsCurrent flag is true;
# Customers and Payments are read without any filter.
# NOTE(review): presumably IsCurrent marks the live version of a versioned
# row -- confirm whether Customers/Payments carry the same flag.
df_customers = _read_silver("Customers")
df_orders = _read_silver("Orders").filter(col("IsCurrent") == True)
df_orderitems = _read_silver("OrderItems").filter("IsCurrent = true")
df_payments = _read_silver("Payments")

In [0]:
# Orders enriched with their line items ("oi") and payments ("p").
# Both joins are left joins keyed on the order id, so every order is kept even
# when it has no items or no payments.
# NOTE(review): because both joins share the same key, an order with several
# items AND several payments yields one row per (item, payment) pair --
# sums/averages over item or payment columns will count values repeatedly.
df_orders_joined = (
    df_orders.alias("o")
    .join(
        df_orderitems.alias("oi"),
        on=col("o.OrderId") == col("oi.OrderID"),
        how="left",
    )
    .join(
        df_payments.alias("p"),
        on=col("o.OrderId") == col("p.OrderID"),
        how="left",
    )
)

In [0]:
# Customer-level order metrics: one row per CustomerID with TotalOrders,
# TotalSpent, AvgOrderValue, FirstOrderDate and RecentOrderDate.
#
# Built in two steps so each line item is counted exactly once:
#   1. collapse OrderItems to a single total per order;
#   2. roll the per-order totals up to the customer.
# Aggregating df_orders_joined directly repeats every line item once per
# payment row of the same order (both joins share the OrderID key), which
# inflates TotalSpent whenever an order has more than one payment, and
# avg() over line items yields an average line price rather than an average
# order value.
#
# sum / avg / min / max / round below are the pyspark.sql.functions versions
# brought in by the star import, not the Python builtins.
df_order_totals = (
    df_orderitems
    .groupBy("OrderID")
    .agg(sum("TotalPrice").alias("OrderTotal"))
)

df_customer_summary = (
    df_orders.alias("o")
    # Left join keeps orders that have no line items (OrderTotal is null).
    .join(df_order_totals.alias("ot"), col("o.OrderId") == col("ot.OrderID"), "left")
    .groupBy("o.CustomerID")
    .agg(
        countDistinct("o.OrderId").alias("TotalOrders"),
        sum("ot.OrderTotal").alias("TotalSpent"),
        # Mean of per-order totals; orders without line items have a null
        # total and are ignored by avg().
        round(avg("ot.OrderTotal"), 2).alias("AvgOrderValue"),
        min("o.OrderDate").alias("FirstOrderDate"),
        max("o.OrderDate").alias("RecentOrderDate"),
    )
)

In [0]:
# Final customer summary: identity columns from Customers plus the order
# metrics. The left join keeps customers that have no row in
# df_customer_summary; their numeric metrics are filled with zero.
df_customer_summary_final = (
    df_customers
    .select("CustomerID", "FirstName", "LastName", "Email")
    .join(df_customer_summary, on="CustomerID", how="left")
    .na.fill({"TotalOrders": 0, "TotalSpent": 0.0, "AvgOrderValue": 0.0})
    # FirstOrderDate / RecentOrderDate are not filled and stay null for
    # customers without orders.
    .withColumn("LoadDate", current_timestamp())
)