In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

spark = SparkSession\
    .builder\
    .appName("chapter-07-aggregation")\
    .getOrCreate()

import os
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

In [2]:
file_path = SPARK_BOOK_DATA_PATH + "/data/retail-data/all/*.csv"

df = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load(file_path)\
  .coalesce(3)

df.show(5,False)

+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate   |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |12/1/2010 8:26|2.55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |12/1/2010 8:26|2.75     |17850     |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
+---------+---------+-----------------------------------

In [3]:
df.rdd.getNumPartitions()

3

In [4]:
df.count()

541909

In [5]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]>

In [6]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [7]:
schema = df.schema
schema

StructType(List(StructField(InvoiceNo,StringType,true),StructField(StockCode,StringType,true),StructField(Description,StringType,true),StructField(Quantity,IntegerType,true),StructField(InvoiceDate,StringType,true),StructField(UnitPrice,DoubleType,true),StructField(CustomerID,IntegerType,true),StructField(Country,StringType,true)))

In [8]:
df.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [9]:
df.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]

In [10]:
df.is_cached

True

In [11]:
df.storageLevel

StorageLevel(True, True, False, True, 1)

In [12]:
df.createOrReplaceTempView("dfTable")

In [13]:
spark.sql("select count(*) as Total from dfTable").show()

+------+
| Total|
+------+
|541909|
+------+



In [14]:
spark.sql("select InvoiceNo,StockCode,Description from dfTable limit 3").show()

+---------+---------+--------------------+
|InvoiceNo|StockCode|         Description|
+---------+---------+--------------------+
|   536365|   85123A|WHITE HANGING HEA...|
|   536365|    71053| WHITE METAL LANTERN|
|   536365|   84406B|CREAM CUPID HEART...|
+---------+---------+--------------------+



In [15]:
# COMMAND ----------

df.select(F.count("StockCode"), F.count("InvoiceDate")).show() # 541909

+----------------+------------------+
|count(StockCode)|count(InvoiceDate)|
+----------------+------------------+
|          541909|            541909|
+----------------+------------------+



In [17]:
# use count to detect columns with null
df.agg(*[F.count(c).alias(c) for c in df.columns]).show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|   541909|   541909|     540455|  541909|     541909|   541909|    406829| 541909|
+---------+---------+-----------+--------+-----------+---------+----------+-------+



In [21]:
# use countDistinct for unique values in each column
df.agg(*[F.countDistinct(c).alias(c) for c in df.columns]).show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+--------------+------------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|InvoiceDate_ts|Invoice_Date|
+---------+---------+-----------+--------+-----------+---------+----------+-------+--------------+------------+
|    25900|     4070|       4223|     722|      23260|     1630|      4372|     38|         23260|         305|
+---------+---------+-----------+--------+-----------+---------+----------+-------+--------------+------------+



In [22]:
# use approx_count_distinct to estimate counts quickly
df.agg(*[F.approx_count_distinct(c, 0.1).alias(c) for c in df.columns]).show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+--------------+------------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|InvoiceDate_ts|Invoice_Date|
+---------+---------+-----------+--------+-----------+---------+----------+-------+--------------+------------+
|    25085|     3364|       4240|     698|      23267|     1694|      4336|     33|         23938|         338|
+---------+---------+-----------+--------+-----------+---------+----------+-------+--------------+------------+



In [18]:
from datetime import datetime
udf_to_date =  F.udf (lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M'), DateType())
udf_to_datetime =  F.udf (lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M'), TimestampType())

In [19]:
# change InvoiceDate format
df = (df
      .withColumn("InvoiceDate_ts", udf_to_datetime(F.col("InvoiceDate")))
      .withColumn("Invoice_Date", udf_to_date(F.col("InvoiceDate")))
     )

In [20]:
df.show(5,truncate=False, vertical=True)

-RECORD 0---------------------------------------------
 InvoiceNo      | 536365                              
 StockCode      | 85123A                              
 Description    | WHITE HANGING HEART T-LIGHT HOLDER  
 Quantity       | 6                                   
 InvoiceDate    | 12/1/2010 8:26                      
 UnitPrice      | 2.55                                
 CustomerID     | 17850                               
 Country        | United Kingdom                      
 InvoiceDate_ts | 2010-12-01 08:26:00                 
 Invoice_Date   | 2010-12-01                          
-RECORD 1---------------------------------------------
 InvoiceNo      | 536365                              
 StockCode      | 71053                               
 Description    | WHITE METAL LANTERN                 
 Quantity       | 6                                   
 InvoiceDate    | 12/1/2010 8:26                      
 UnitPrice      | 3.39                                
 CustomerI

In [58]:
df.groupBy("StockCode").agg(F.expr("count(StockCode)")).show(10, False)

+---------+----------------+
|StockCode|count(StockCode)|
+---------+----------------+
|22728    |810             |
|21889    |607             |
|90210B   |7               |
|21259    |296             |
|21894    |135             |
|21452    |200             |
|22121    |141             |
|90022    |21              |
|21249    |119             |
|90143    |22              |
+---------+----------------+
only showing top 10 rows



In [59]:
# df.where("StockCode in ('90026D', '90210B') ").orderBy("StockCode", desc("InvoiceDate")).show(10, False)
df.where("StockCode in ('90026D', '90210B') ").orderBy("StockCode", F.desc("InvoiceDate_ts")).show(10, False)

+---------+---------+---------------------------------+--------+----------------+---------+----------+--------------+-------------------+------------+
|InvoiceNo|StockCode|Description                      |Quantity|InvoiceDate     |UnitPrice|CustomerID|Country       |InvoiceDate_ts     |Invoice_Date|
+---------+---------+---------------------------------+--------+----------------+---------+----------+--------------+-------------------+------------+
|577315   |90026D   |GLASS BEAD HOOP NECKLACE AMETHYST|1       |11/18/2011 13:25|8.5      |17811     |United Kingdom|2011-11-18 13:25:00|2011-11-18  |
|548545   |90026D   |GLASS BEAD HOOP NECKLACE AMETHYST|1       |3/31/2011 19:12 |8.5      |13118     |United Kingdom|2011-03-31 19:12:00|2011-03-31  |
|544463   |90026D   |GLASS BEAD HOOP NECKLACE AMETHYST|1       |2/20/2011 14:31 |8.5      |12988     |United Kingdom|2011-02-20 14:31:00|2011-02-20  |
|581434   |90210B   |CLEAR ACRYLIC FACETED BANGLE     |10      |12/8/2011 16:10 |1.0      |135

In [60]:
# COMMAND ----------

# from pyspark.sql.functions import approx_count_distinct
df.select(F.approx_count_distinct("StockCode", 0.1)).show() # 3364

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [61]:
# COMMAND ----------

# from pyspark.sql.functions import first, last
df.select(F.first("StockCode"), F.last("StockCode")).show()

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|          85123A|          22138|
+----------------+---------------+



In [62]:
# COMMAND ----------

# from pyspark.sql.functions import min, max
df.select(F.min("Quantity"), F.max("Quantity")).show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [63]:
# COMMAND ----------

# from pyspark.sql.functions import sum
df.select(F.sum("Quantity")).show() # 5176450

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [64]:
# COMMAND ----------

# from pyspark.sql.functions import sumDistinct
df.select(F.sumDistinct("Quantity")).show() # 29310

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [24]:
# COMMAND ----------

# from pyspark.sql.functions import sum, count, avg, expr

(df.select(
    F.count("Quantity").alias("total_transactions"),
    F.sum("Quantity").alias("total_purchases"),
    F.avg("Quantity").alias("avg_purchases"),
    F.expr("mean(Quantity)").alias("mean_purchases"))
  .selectExpr(
    "total_purchases/total_transactions as avg_purchase",
    "avg_purchases",
    "mean_purchases")
 .show(vertical=True)
)

-RECORD 0--------------------------
 avg_purchase   | 9.55224954743324 
 avg_purchases  | 9.55224954743324 
 mean_purchases | 9.55224954743324 



In [49]:
# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), 
          var_samp("Quantity"),
          stddev_pop("Quantity"), 
          stddev_samp("Quantity")).show()

+-----------------+------------------+--------------------+---------------------+
|var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+-----------------+------------------+--------------------+---------------------+
|47559.30364660928| 47559.39140929898|  218.08095663447847|    218.0811578502347|
+-----------------+------------------+--------------------+---------------------+



In [50]:
# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

+--------------------+------------------+
|  skewness(Quantity)|kurtosis(Quantity)|
+--------------------+------------------+
|-0.26407557610528154|  119768.054955306|
+--------------------+------------------+



In [51]:
# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(
        corr("InvoiceNo", "Quantity"), 
        covar_samp("InvoiceNo", "Quantity"),
        covar_pop("InvoiceNo", "Quantity")).show()

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085639875E-4|             1052.7280543916152|             1052.726077875511|
+-------------------------+-------------------------------+------------------------------+



In [53]:
df.select(
        corr("UnitPrice", "Quantity"), 
        covar_samp("UnitPrice", "Quantity"),
        covar_pop("UnitPrice", "Quantity")).show()

+-------------------------+-------------------------------+------------------------------+
|corr(UnitPrice, Quantity)|covar_samp(UnitPrice, Quantity)|covar_pop(UnitPrice, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     -0.00123492454487...|             -26.05876125793698|           -26.058713170968026|
+-------------------------+-------------------------------+------------------------------+



In [56]:
df.agg(F.collect_set("Country")).show(20, False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|collect_set(Country)                                                                                                                                                                                                                                                                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [58]:
# COMMAND ----------

from pyspark.sql.functions import count

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536596|   6|              6|
|   536938|  14|             14|
|   537252|   1|              1|
|   537691|  20|             20|
|   538041|   1|              1|
|   538184|  26|             26|
|   538517|  53|             53|
|   538879|  19|             19|
|   539275|   6|              6|
|   539630|  12|             12|
|   540499|  24|             24|
|   540540|  22|             22|
|  C540850|   1|              1|
|   540976|  48|             48|
|   541432|   4|              4|
|   541518| 101|            101|
|   541783|  35|             35|
|   542026|   9|              9|
|   542375|   6|              6|
|  C542604|   8|              8|
+---------+----+---------------+
only showing top 20 rows



In [59]:
# COMMAND ----------

df.groupBy("InvoiceNo").agg(
        expr("avg(Quantity)"),
        expr("stddev_pop(Quantity)")).show()

+---------+------------------+--------------------+
|InvoiceNo|     avg(Quantity)|stddev_pop(Quantity)|
+---------+------------------+--------------------+
|   536596|               1.5|  1.1180339887498947|
|   536938|33.142857142857146|  20.698023172885524|
|   537252|              31.0|                 0.0|
|   537691|              8.15|   5.597097462078001|
|   538041|              30.0|                 0.0|
|   538184|12.076923076923077|   8.142590198943392|
|   538517|3.0377358490566038|  2.3946659604837897|
|   538879|21.157894736842106|  11.811070444356483|
|   539275|              26.0|  12.806248474865697|
|   539630|20.333333333333332|  10.225241100118645|
|   540499|              3.75|  2.6653642652865788|
|   540540|2.1363636363636362|  1.0572457590557278|
|  C540850|              -1.0|                 0.0|
|   540976|10.520833333333334|   6.496760677872902|
|   541432|             12.25|  10.825317547305483|
|   541518| 23.10891089108911|  20.550782784878713|
|   541783|1

In [65]:
# COMMAND ----------

from pyspark.sql.functions import col, to_date
# dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "MM/dd/yyyy HH:mm"))
dfWithDate = df.withColumn("date", udf_to_datetime(col("InvoiceDate")))

In [67]:
dfWithDate.show(5, False)

+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+---------------+-------------------+-------------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate   |UnitPrice|CustomerID|Country       |InvoiceDate_std|InvoiceDate_ts     |date               |
+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+---------------+-------------------+-------------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |12/1/2010 8:26|2.55     |17850     |United Kingdom|null           |2010-12-01 08:26:00|2010-12-01 08:26:00|
|536365   |71053    |WHITE METAL LANTERN                |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|null           |2010-12-01 08:26:00|2010-12-01 08:26:00|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |12/1/2010 8:26|2.75     |17850     |United Kingdom|null         

In [68]:
dfWithDate.createOrReplaceTempView("dfWithDate")

In [69]:
# COMMAND ----------

from pyspark.sql.window import Window
from pyspark.sql.functions import desc
windowSpec = Window\
  .partitionBy("CustomerId", "date")\
  .orderBy(desc("Quantity"))\
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

In [74]:
# COMMAND ----------

from pyspark.sql.functions import max
maxPurchaseQuantity = max(F.col("Quantity")).over(windowSpec)

In [75]:
# COMMAND ----------

from pyspark.sql.functions import dense_rank, rank
purchaseDenseRank = dense_rank().over(windowSpec)
purchaseRank = rank().over(windowSpec)

In [76]:
# COMMAND ----------

from pyspark.sql.functions import col

dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank"),
    maxPurchaseQuantity.alias("maxPurchaseQuantity")).show()

+----------+-------------------+--------+------------+-----------------+-------------------+
|CustomerId|               date|Quantity|quantityRank|quantityDenseRank|maxPurchaseQuantity|
+----------+-------------------+--------+------------+-----------------+-------------------+
|     12346|2011-01-18 10:01:00|   74215|           1|                1|              74215|
|     12346|2011-01-18 10:17:00|  -74215|           1|                1|             -74215|
|     12347|2010-12-07 14:57:00|      36|           1|                1|                 36|
|     12347|2010-12-07 14:57:00|      30|           2|                2|                 36|
|     12347|2010-12-07 14:57:00|      24|           3|                3|                 36|
|     12347|2010-12-07 14:57:00|      12|           4|                4|                 36|
|     12347|2010-12-07 14:57:00|      12|           4|                4|                 36|
|     12347|2010-12-07 14:57:00|      12|           4|                

In [77]:
# COMMAND ----------

dfNoNull = dfWithDate.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")

In [78]:
# COMMAND ----------

rolledUpDF = dfNoNull.rollup("Date", "Country").agg(sum("Quantity"))\
  .selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity")\
  .orderBy("Date")
rolledUpDF.show()

+-------------------+--------------+--------------+
|               Date|       Country|total_quantity|
+-------------------+--------------+--------------+
|               null|          null|       5176450|
|2010-12-01 08:26:00|United Kingdom|            40|
|2010-12-01 08:26:00|          null|            40|
|2010-12-01 08:28:00|United Kingdom|            12|
|2010-12-01 08:28:00|          null|            12|
|2010-12-01 08:34:00|          null|            98|
|2010-12-01 08:34:00|United Kingdom|            98|
|2010-12-01 08:35:00|          null|             3|
|2010-12-01 08:35:00|United Kingdom|             3|
|2010-12-01 08:45:00|        France|           449|
|2010-12-01 08:45:00|          null|           449|
|2010-12-01 09:00:00|United Kingdom|            80|
|2010-12-01 09:00:00|          null|            80|
|2010-12-01 09:01:00|United Kingdom|            12|
|2010-12-01 09:01:00|          null|            12|
|2010-12-01 09:02:00|          null|            88|
|2010-12-01 

In [79]:
# COMMAND ----------

from pyspark.sql.functions import sum

dfNoNull.cube("Date", "Country")\
    .agg(sum(col("Quantity")), count(col("Quantity")))\
    .select("Date", "Country", "sum(Quantity)", "count(Quantity)")\
    .orderBy("Date").show()

+----+--------------------+-------------+---------------+
|Date|             Country|sum(Quantity)|count(Quantity)|
+----+--------------------+-------------+---------------+
|null|               Japan|        25218|            358|
|null|            Portugal|        16180|           1519|
|null|           Australia|        83653|           1259|
|null|                 RSA|          352|             58|
|null|                null|      5176450|         541909|
|null|         Unspecified|         3300|            446|
|null|             Finland|        10666|            695|
|null|           Hong Kong|         4769|            288|
|null|             Germany|       117448|           9495|
|null|             Lebanon|          386|             45|
|null|              Cyprus|         6317|            622|
|null|           Singapore|         5234|            229|
|null|United Arab Emirates|          982|             68|
|null|     Channel Islands|         9479|            758|
|null|        

In [80]:
# COMMAND ----------

pivoted = dfWithDate.groupBy("date").pivot("Country").sum()


# COMMAND ----------

In [82]:
# pivoted.show()

### Window

https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#window

In [3]:
from pyspark.sql import Window

In [16]:
data = [(1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")]
columns = ["id", "category"]
df=spark.createDataFrame(data=data,schema=columns)
df.show()

+---+--------+
| id|category|
+---+--------+
|  1|       a|
|  1|       a|
|  2|       a|
|  1|       b|
|  2|       b|
|  3|       b|
+---+--------+



In [17]:
window = (Window.partitionBy("category")
        .orderBy("id")
        .rangeBetween(Window.currentRow, 1)
    )

In [18]:
df = (df.withColumn("sum", F.sum("id").over(window))
    .sort("category", "id")
     )
df.select("category","id","sum").show()

+--------+---+---+
|category| id|sum|
+--------+---+---+
|       a|  1|  4|
|       a|  1|  4|
|       a|  2|  2|
|       b|  1|  3|
|       b|  2|  5|
|       b|  3|  3|
+--------+---+---+



In [19]:
df.storageLevel

StorageLevel(False, False, False, False, 1)

In [20]:
df.show()

+---+--------+---+
| id|category|sum|
+---+--------+---+
|  1|       a|  4|
|  1|       a|  4|
|  2|       a|  2|
|  1|       b|  3|
|  2|       b|  5|
|  3|       b|  3|
+---+--------+---+



In [21]:
df.rollup("id","category").count().orderBy("category","id").show()

+----+--------+-----+
|  id|category|count|
+----+--------+-----+
|null|    null|    6|
|   1|    null|    3|
|   2|    null|    2|
|   3|    null|    1|
|   1|       a|    2|
|   2|       a|    1|
|   1|       b|    1|
|   2|       b|    1|
|   3|       b|    1|
+----+--------+-----+



In [22]:
df.cube("id","category").count().orderBy("category","id").show()

+----+--------+-----+
|  id|category|count|
+----+--------+-----+
|null|    null|    6|
|   1|    null|    3|
|   2|    null|    2|
|   3|    null|    1|
|null|       a|    3|
|   1|       a|    2|
|   2|       a|    1|
|null|       b|    3|
|   1|       b|    1|
|   2|       b|    1|
|   3|       b|    1|
+----+--------+-----+

