In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's SparkSession: YARN master, Hive support
# enabled, and the SQL warehouse directory pointed at the path below.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("aggregation functions")
    .enableHiveSupport()
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Last expression of the cell: display the session object.
spark

In [4]:
# Schema string, one "<column> <type>" entry per field, joined with ", ".
# NOTE(review): not referenced by any of the visible cells, and these column
# names differ from the header of order_data.csv shown by df.show() below
# (InvoiceNo, StockCode, ...) -- confirm it is still needed.
orders_schema = ", ".join(
    (
        "order_id long",
        "order_date date",
        "customer_id long",
        "order_status string",
    )
)

In [12]:
# Load the order CSV with the "header" and "inferSchema" reader options
# switched on (the explicit orders_schema string is not used here).
df = (
    spark.read.format("csv")
    .options(inferSchema="true", header="true")
    .load("/public/trendytech/datasets/order_data.csv")
)

In [13]:
# Quick preview of the loaded data; the arguments are show()'s defaults, spelled out.
df.show(n=20, truncate=True)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|
|   536378|   85071B|RED CHARLIE+LOLA ...|      96|01-12-2010 9.37|     0.38|     14688|United Kingdom|
|   536378|    21931|JUMBO STORAGE BAG...|      10|01-12-2010 9.

In [14]:
from pyspark.sql.functions import *

In [34]:
# Cache df: the cells that follow run several separate actions on it
# (count, agg, distinct). The call is the cell's last expression, so the
# notebook renders its return value as the cell output.
df.cache()

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
536378,,PACK OF 60 DINOSA...,24,01-12-2010 9.37,0.55,14688,United Kingdom
536378,,PACK OF 60 PINK P...,24,01-12-2010 9.37,0.55,14688,United Kingdom
536378,84991,60 TEATIME FAIRY ...,24,01-12-2010 9.37,0.55,14688,United Kingdom
536378,84519A,TOMATO CHARLIE+LO...,6,01-12-2010 9.37,2.95,14688,United Kingdom
536378,85183B,CHARLIE & LOLA WA...,48,01-12-2010 9.37,1.25,14688,United Kingdom
536378,85071B,RED CHARLIE+LOLA ...,96,01-12-2010 9.37,0.38,14688,United Kingdom
536378,21931,JUMBO STORAGE BAG...,10,01-12-2010 9.37,1.95,14688,United Kingdom
536378,21929,JUMBO BAG PINK VI...,10,01-12-2010 9.37,1.95,14688,United Kingdom
536380,22961,JAM MAKING SET PR...,24,01-12-2010 9.41,1.45,17809,United Kingdom
536381,22139,RETROSPOT TEA SET...,23,01-12-2010 9.41,4.25,15311,United Kingdom


In [35]:
# Total number of rows in df.
df.count()

541782

In [36]:
# Largest Quantity over the whole table. `max` here is the function brought in
# by `from pyspark.sql.functions import *`, not Python's builtin max.
df.agg(max("Quantity")).show()

+-------------+
|max(Quantity)|
+-------------+
|        80995|
+-------------+



In [37]:
# Number of distinct InvoiceNo values.
df.select("InvoiceNo").distinct().count()

25858

In [38]:
# Row count of the InvoiceNo column without de-duplication, for comparison
# with the distinct count.
df.select("InvoiceNo").count()

541782

#### SIMPLE AGGREGATIONS

In [47]:
# Whole-table summary using Column expressions: row count, number of distinct
# invoices, total quantity and mean unit price.
# The aliases are snake_case with no spaces so the result columns match the
# selectExpr and SQL versions of this same query (row_count, unique_invoice,
# total_quantity, avg_price) and never need backtick quoting.
# count / countDistinct / sum / avg come from the pyspark.sql.functions star
# import, so `sum` is the Spark aggregate, not Python's builtin.
df.select(
    count("*").alias("row_count"),
    countDistinct("InvoiceNo").alias("unique_invoice"),
    sum("quantity").alias("total_quantity"),
    avg("UnitPrice").alias("avg_price"),
).show()

+---------+--------------+--------------+-----------------+
|row_count|unique_invoice|total quantity|   avg unit_price|
+---------+--------------+--------------+-----------------+
|   541782|         25858|       5175855|4.611565323321927|
+---------+--------------+--------------+-----------------+



In [51]:
# Same whole-table summary, written as SQL expression strings (one per output column).
df.selectExpr(
    "count(*) as row_count",
    "count(distinct InvoiceNo) as unique_invoice",
    "sum(quantity) as total_quantity",
    "avg(UnitPrice) as avg_price",
).show()

+---------+--------------+--------------+-----------------+
|row_count|unique_invoice|total_quantity|        avg_price|
+---------+--------------+--------------+-----------------+
|   541782|         25858|       5175855|4.611565323321927|
+---------+--------------+--------------+-----------------+



In [52]:
# Register df as the temp view "orders_df" so it can be queried by name
# through spark.sql.
df.createOrReplaceTempView("orders_df")

In [53]:
# Same whole-table summary once more, as a plain SQL query against the
# orders_df temp view.
spark.sql(
    """select count(*) as row_count, count(distinct InvoiceNo) as unique_invoice,
sum(quantity) as total_quantity,
avg(UnitPrice) as avg_price from orders_df"""
).show()

+---------+--------------+--------------+-----------------+
|row_count|unique_invoice|total_quantity|        avg_price|
+---------+--------------+--------------+-----------------+
|   541782|         25858|       5175855|4.611565323321927|
+---------+--------------+--------------+-----------------+



#### GROUPING AGGREGATIONS