In [0]:
# Load the online-orders table into a Spark DataFrame.
# The aggregation cells below all read from `df_orders`.
df_orders = spark.read.table("dev_data.test.orders_online")

In [0]:
# groupBy() groups rows by key across partitions; count() then reports
# the number of order rows for each customer_id.
(
    df_orders
    .groupBy("customer_id")
    .count()
    .show()
)

In [0]:
%sql
-- Number of order rows per customer, expressed in SQL against the same table.
SELECT
  customer_id,
  COUNT(*)
FROM dev_data.test.orders_online
GROUP BY customer_id

In [0]:
# agg() computes several named metrics per group in a single groupBy call.
# NOTE: `sum` imported here shadows Python's built-in sum() in the notebook
# namespace; the aggregation cells below rely on these imported names.
from pyspark.sql.functions import sum, avg, count

# One Column expression per metric, each labelled with its output column name.
customer_metrics = [
    count("order_id").alias("total_orders"),
    sum("order_amount").alias("total_amount"),
    avg("order_amount").alias("avg_order_value"),
]

df_orders.groupBy("customer_id").agg(*customer_metrics).show()

In [0]:
# distinct(): use it when only the unique values are needed, not per-group metrics.
# The cell's value is the number of unique customer_id values.
(
    df_orders
    .select("customer_id")
    .distinct()
    .count()
)

In [0]:
# groupBy(): one output row per customer_id, paired with a metric (here the row count).
(
    df_orders
    .groupBy("customer_id")
    .count()
    .show()
)

In [0]:
# Conditional aggregation: count, per customer, the orders whose amount exceeds 200.
from pyspark.sql.functions import when, col

# 1 when order_amount > 200, otherwise 0; summing the flags yields the count.
high_value_flag = when(col("order_amount") > 200, 1).otherwise(0)

# `sum` is the pyspark.sql.functions aggregate imported in an earlier cell.
df_orders.groupBy("customer_id").agg(
    sum(high_value_flag).alias("high_value_orders")
).show()

In [0]:
# Render the orders DataFrame for visual inspection.
# NOTE(review): display() is not imported in the cells shown here; presumably it is
# the helper supplied by the notebook runtime — confirm.
display(df_orders)

In [0]:
# Load the online-customers table and print a sample of its rows.
df_customers = spark.read.table("dev_data.test.customers_online")

df_customers.show()

In [0]:
# Aggregation after a join: a typical reporting query.
# Customers ("c") are inner-joined to their orders ("o") on the customer key;
# `col`, `count` and `sum` come from the pyspark.sql.functions imports in earlier cells.
df_joined = df_customers.alias("c").join(
    df_orders.alias("o"),
    on=col("c.cust_id") == col("o.customer_id"),
    how="inner",
)

# Order count and revenue (summed order_amount) per customer city.
(
    df_joined
    .groupBy("c.city")
    .agg(
        count("o.order_id").alias("total_orders"),
        sum("o.order_amount").alias("revenue"),
    )
    .show()
)

In [0]:
# Total orders per customer: count of order_id values within each customer_id group.
(
    df_orders
    .groupBy("customer_id")
    .agg(count("order_id").alias("total_orders"))
    .show()
)

In [0]:
# Total number of orders and total order amount per customer.
# Output columns use the same lower-case snake_case names ("total_orders",
# "total_amount") as the other aggregation cells, so results line up across cells.
# `count` and `sum` are the pyspark.sql.functions aggregates imported in an earlier cell.
df_orders.groupBy("customer_id").agg(
    count("order_id").alias("total_orders"),
    sum("order_amount").alias("total_amount")
).show()

In [0]:
# Average order amount per city, computed on the joined customers/orders DataFrame.
df_avg = (
    df_joined
    .groupBy("c.city")
    .agg(avg("o.order_amount").alias("avg_order_amount_per_city"))
)

df_avg.show()

In [0]:
# Customers whose total order amount exceeds 300.
# The condition is applied to the aggregated column, i.e. after grouping.
(
    df_orders
    .groupBy("customer_id")
    .agg(sum("order_amount").alias("total_amount"))
    .where(col("total_amount") > 300)
    .show()
)