In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain the SparkSession for this notebook: getOrCreate() returns the
# already-running session if there is one, otherwise it starts a new one
# under the application name "chapter-03-tour".
spark = (
    SparkSession.builder
    .appName("chapter-03-tour")
    .getOrCreate()
)

import os
# Root directory of the book's sample data, taken from the environment.
# Indexing os.environ raises KeyError right here if the variable is not set.
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

### Spark SQL

In [2]:
# Glob pattern matching every per-day CSV file of the retail data set.
file_path = SPARK_BOOK_DATA_PATH + "/data/retail-data/by-day/*.csv"

# Read all matching files into one DataFrame: the first line of each file
# supplies the column names, and column types are inferred from the data.
retail_df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)

In [3]:
# Number of rows across all the per-day files that were loaded.
retail_df.count()

541909

In [4]:
# Register the DataFrame as the temporary view "retail_table" so that it can
# be queried by name through spark.sql(...).
retail_df.createOrReplaceTempView("retail_table")

In [5]:
# Keep the schema inferred for the batch read; the streaming reader further
# down is given this schema explicitly via .schema(staticSchema).
staticSchema = retail_df.schema

In [6]:
# Single-line text form of the schema (printSchema() gives the tree view).
print(staticSchema)

StructType(List(StructField(InvoiceNo,StringType,true),StructField(StockCode,StringType,true),StructField(Description,StringType,true),StructField(Quantity,IntegerType,true),StructField(InvoiceDate,StringType,true),StructField(UnitPrice,DoubleType,true),StructField(CustomerID,DoubleType,true),StructField(Country,StringType,true)))


In [7]:
# Tree view of the schema: column name, type and nullability per column.
retail_df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [8]:
# Summary statistics per column (count, mean, stddev, min, ...).
retail_df.describe().show()

+-------+------------------+------------------+--------------------+-----------------+-------------------+-----------------+------------------+-----------+
|summary|         InvoiceNo|         StockCode|         Description|         Quantity|        InvoiceDate|        UnitPrice|        CustomerID|    Country|
+-------+------------------+------------------+--------------------+-----------------+-------------------+-----------------+------------------+-----------+
|  count|            541909|            541909|              540455|           541909|             541909|           541909|            406829|     541909|
|   mean|  559965.752026781|27623.240210938104|             20713.0| 9.55224954743324|               null|4.611113626089641|15287.690570239585|       null|
| stddev|13428.417280796697|16799.737628427683|                 NaN|218.0811578502335|               null|96.75985306117963| 1713.600303321597|       null|
|    min|            536365|             10002| 4 PURPLE FLOCK D

In [9]:
# Query the "retail_table" temp view with SQL; the result is an ordinary
# DataFrame limited to five rows.
df = spark.sql(
    "select * from retail_table limit 5"
)

In [13]:
# truncate=False prints every column at full width, so the Description
# values are not cut off.
df.show(truncate=False)

+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                    |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+
|580538   |23084    |RABBIT NIGHT LIGHT             |48      |2011-12-05 08:38:00|1.79     |14075.0   |United Kingdom|
|580538   |23077    |DOUGHNUT LIP GLOSS             |20      |2011-12-05 08:38:00|1.25     |14075.0   |United Kingdom|
|580538   |22906    |12 MESSAGE CARDS WITH ENVELOPES|24      |2011-12-05 08:38:00|1.65     |14075.0   |United Kingdom|
|580538   |21914    |BLUE HARMONICA IN BOX          |24      |2011-12-05 08:38:00|1.25     |14075.0   |United Kingdom|
|580538   |22467    |GUMBALL COAT RACK              |6       |2011-12-05 08:38:00|2.55     |14075.0   |United Kingdom|
+---------+---------+---------------------------

In [14]:
# COMMAND ----------

from pyspark.sql.functions import window, column, desc, col

# Total spend per customer per one-day window, largest totals first.
(
    retail_df
    # Line total = unit price x quantity; keep the grouping keys alongside it.
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    # Bucket rows by customer and by 1-day window over InvoiceDate.
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    # The aggregate column is auto-named "sum(total_cost)".
    .sort(desc("sum(total_cost)"))
    .show(5)
)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   17450.0|[2011-09-19 20:00...|          71601.44|
|      null|[2011-11-13 19:00...|          55316.08|
|      null|[2011-11-06 19:00...|          42939.17|
|      null|[2011-03-28 20:00...| 33521.39999999998|
|      null|[2011-12-07 19:00...|31975.590000000007|
+----------+--------------------+------------------+
only showing top 5 rows



In [17]:
# Same aggregation as the previous cell, but columns are referenced by plain
# name strings (no col(...) wrapper), and the generated output columns are
# renamed to friendlier names before display.
(
    retail_df
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy("CustomerId", window("InvoiceDate", "1 day"))
    .sum("total_cost")
    .sort(desc("sum(total_cost)"))
    .withColumnRenamed("sum(total_cost)", "sum_total_cost")
    .withColumnRenamed("window", "InvoiceDateWindow")
    .show(5, truncate=False)
)

+----------+------------------------------------------+------------------+
|CustomerId|InvoiceDateWindow                         |sum_total_cost    |
+----------+------------------------------------------+------------------+
|17450.0   |[2011-09-19 20:00:00, 2011-09-20 20:00:00]|71601.44          |
|null      |[2011-11-13 19:00:00, 2011-11-14 19:00:00]|55316.08          |
|null      |[2011-11-06 19:00:00, 2011-11-07 19:00:00]|42939.17          |
|null      |[2011-03-28 20:00:00, 2011-03-29 20:00:00]|33521.39999999998 |
|null      |[2011-12-07 19:00:00, 2011-12-08 19:00:00]|31975.590000000007|
+----------+------------------------------------------+------------------+
only showing top 5 rows



### Spark Streaming

In [19]:
# COMMAND ----------

# Streaming counterpart of the batch read: the same CSV files are treated as
# a stream, using the schema captured earlier from the batch DataFrame.
streamingDataFrame = (
    spark.readStream
    .format("csv")
    .schema(staticSchema)
    # Take at most one file per trigger so results build up incrementally.
    .option("maxFilesPerTrigger", 1)
    .option("header", "true")
    .load(SPARK_BOOK_DATA_PATH + "/data/retail-data/by-day/*.csv")
)

In [20]:
# COMMAND ----------

# Streaming version of the per-customer spend aggregation.
# NOTE: despite the "PerHour" in the variable name, the window used below is
# one day wide ("1 day"), matching the batch query earlier in the notebook.
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

In [21]:
# COMMAND ----------

# A query name must be unique among the session's active streaming queries,
# so re-running this cell while an earlier run is still active would fail.
# Stop any earlier instance first to keep the cell re-runnable.
for active_query in spark.streams.active:
    if active_query.name == "customer_purchases":
        active_query.stop()

# Start the streaming aggregation and keep the StreamingQuery handle, so the
# query can later be inspected (.status, .lastProgress) or shut down (.stop()).
customer_purchases_query = (
    purchaseByCustomerPerHour
    .writeStream
    .format("memory")                  # keep results in an in-memory table
    .queryName("customer_purchases")   # name of the query and of that table
    .outputMode("complete")            # rewrite the full aggregate each trigger
    .start()
)

# Last expression of the cell: display the handle, as the original cell did.
customer_purchases_query

<pyspark.sql.streaming.StreamingQuery at 0x7f551e51af10>

The `memory` sink exposes the streaming results as an in-memory SQL table named after the query's `queryName`, here `customer_purchases`.

#### Use `Ctrl-Enter` to execute the cell below repeatedly to see the streaming result update as more data is read

In [22]:
# COMMAND ----------

# Snapshot of the in-memory table fed by the streaming query, ordered by
# total spend (largest first). Re-run to see it change as files are read.
ranked_purchases = spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """)
ranked_purchases.show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|      null|[2010-11-30 19:00...|12584.299999999988|
|   13777.0|[2010-11-30 19:00...|           6585.16|
|   16029.0|[2010-11-30 19:00...|           3702.12|
|   16210.0|[2010-11-30 19:00...|2474.7399999999993|
|   12433.0|[2010-11-30 19:00...|1919.1400000000008|
+----------+--------------------+------------------+
only showing top 5 rows



In [33]:
# COMMAND ----------

# Same query against the streaming results table, shown with truncate=False
# so that the full window bounds are visible.
ranked_purchases = spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """)
ranked_purchases.show(5, truncate=False)

+----------+------------------------------------------+------------------+
|CustomerId|window                                    |sum(total_cost)   |
+----------+------------------------------------------+------------------+
|null      |[2010-12-20 19:00:00, 2010-12-21 19:00:00]|31347.479999999938|
|18102.0   |[2010-12-06 19:00:00, 2010-12-07 19:00:00]|25920.37          |
|null      |[2010-12-09 19:00:00, 2010-12-10 19:00:00]|25399.560000000012|
|null      |[2010-12-16 19:00:00, 2010-12-17 19:00:00]|25375.189999999766|
|null      |[2010-12-05 19:00:00, 2010-12-06 19:00:00]|23395.099999999904|
+----------+------------------------------------------+------------------+
only showing top 5 rows



### Spark ML Pipeline

In [34]:
# COMMAND ----------

from pyspark.sql.functions import date_format, col

# Prepare the data for the ML pipeline:
#   * na.fill(0)   - replace nulls with 0
#   * day_of_week  - weekday name derived from InvoiceDate ("EEEE" pattern,
#                    e.g. "Monday")
#   * coalesce(5)  - reduce the DataFrame to at most 5 partitions
preppedDataFrame = (
    retail_df
    .na.fill(0)
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))
    .coalesce(5)
)

# Peek at the first rows to check the new day_of_week column.
preppedDataFrame.show(3, truncate=False)

+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|Description                    |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |day_of_week|
+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+-----------+
|580538   |23084    |RABBIT NIGHT LIGHT             |48      |2011-12-05 08:38:00|1.79     |14075.0   |United Kingdom|Monday     |
|580538   |23077    |DOUGHNUT LIP GLOSS             |20      |2011-12-05 08:38:00|1.25     |14075.0   |United Kingdom|Monday     |
|580538   |22906    |12 MESSAGE CARDS WITH ENVELOPES|24      |2011-12-05 08:38:00|1.65     |14075.0   |United Kingdom|Monday     |
+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+-----------+
only showing top 3 rows



In [35]:
# COMMAND ----------

# Chronological hold-out split on InvoiceDate: rows before 2011-07-01 are
# used for training, rows on or after that date for testing. The two
# predicates are complementary, so no row lands in both sets.
trainDataFrame = preppedDataFrame.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.where("InvoiceDate >= '2011-07-01'")

In [36]:
# Sample of the training split (InvoiceDate before 2011-07-01).
trainDataFrame.show(3)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   537226|    22811|SET OF 6 T-LIGHTS...|       6|2010-12-06 08:34:00|     2.95|   15987.0|United Kingdom|     Monday|
|   537226|    21713|CITRONELLA CANDLE...|       8|2010-12-06 08:34:00|      2.1|   15987.0|United Kingdom|     Monday|
|   537226|    22927|GREEN GIANT GARDE...|       2|2010-12-06 08:34:00|     5.95|   15987.0|United Kingdom|     Monday|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
only showing top 3 rows



In [37]:
# Sample of the test split (InvoiceDate on or after 2011-07-01).
testDataFrame.show(3)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|     Monday|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|     Monday|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|     Monday|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
only showing top 3 rows



In [38]:
# COMMAND ----------

from pyspark.ml.feature import StringIndexer

# Stage 1: turn the weekday name into a numeric index column.
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)

In [39]:
# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder

# Stage 2: one-hot encode the weekday index into a sparse vector column.
encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

In [40]:
# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# Stage 3: concatenate the numeric inputs and the encoded weekday into the
# single "features" vector column used for model training.
vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

In [41]:
# COMMAND ----------

from pyspark.ml import Pipeline

# Chain the three feature stages in execution order: index -> encode -> assemble.
transformationPipeline = Pipeline(
    stages=[indexer, encoder, vectorAssembler],
)

In [42]:
# COMMAND ----------

# Fit the pipeline's stages on the training split only.
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [43]:
# COMMAND ----------

# Apply the fitted stages to the training split; this appends the
# day_of_week_index, day_of_week_encoded and features columns.
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [45]:
# Inspect the transformed rows, including the assembled feature vectors.
transformedTraining.show(5, truncate=False)

+---------+---------+------------------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+---------------------------+
|InvoiceNo|StockCode|Description                   |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |day_of_week|day_of_week_index|day_of_week_encoded|features                   |
+---------+---------+------------------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+---------------------------+
|537226   |22811    |SET OF 6 T-LIGHTS CACTI       |6       |2010-12-06 08:34:00|2.95     |15987.0   |United Kingdom|Monday     |2.0              |(5,[2],[1.0])      |(7,[0,1,4],[2.95,6.0,1.0]) |
|537226   |21713    |CITRONELLA CANDLE FLOWERPOT   |8       |2010-12-06 08:34:00|2.1      |15987.0   |United Kingdom|Monday     |2.0              |(5,[2],[1.0])      |(7,[0,1,4],[2.1,8.0,1.0])  |
|537226   |22927    

### Spark ML Clustering

In [46]:
# COMMAND ----------

from pyspark.ml.clustering import KMeans

# K-means estimator with 20 clusters; the fixed seed makes the random
# initialisation repeatable between runs.
kmeans = KMeans(k=20, seed=10)

In [47]:
# COMMAND ----------

# Train the clustering model on the transformed training data (the vectors
# produced by the VectorAssembler stage live in the "features" column).
kmModel = kmeans.fit(transformedTraining)

In [48]:
# Check what fit() returned: a fitted KMeansModel rather than the estimator.
type(kmModel)

pyspark.ml.clustering.KMeansModel

In [51]:
# Training summary object of the fitted model. Its default repr only shows
# the object's type and address, so inspect its attributes for details.
kmModel.summary

<pyspark.ml.clustering.KMeansSummary at 0x7f551744e1f0>