In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
orders_df = spark.read \
.format("csv") \
.option("inferSchema","true") \
.option("header","true") \
.load("/public/trendytech/datasets/windowdatamodified.csv")

In [3]:
orders_df.show()

+--------------+-------+-----------+-------------+------------+
|       country|weeknum|numinvoices|totalquantity|invoicevalue|
+--------------+-------+-----------+-------------+------------+
|         Spain|     49|          1|           67|      174.72|
|       Germany|     48|         11|         1795|      1600.0|
|     Lithuania|     48|          3|          622|     1598.06|
|       Germany|     49|         12|         1852|      1800.0|
|       Bahrain|     51|          1|           54|      205.74|
|       Iceland|     49|          1|          319|      711.79|
|         India|     51|          5|           95|       300.0|
|     Australia|     50|          2|          133|      387.95|
|         Italy|     49|          1|           -2|       -17.0|
|         India|     49|          5|         1280|      3284.1|
|         Spain|     50|          2|          400|     1049.01|
|United Kingdom|     51|        200|        28782|    75103.46|
|        Norway|     49|          1|    

In [4]:
orders_df.orderBy("country", "invoicevalue").show()

+---------------+-------+-----------+-------------+------------+
|        country|weeknum|numinvoices|totalquantity|invoicevalue|
+---------------+-------+-----------+-------------+------------+
|      Australia|     49|          1|          214|       258.9|
|      Australia|     48|          1|          107|      358.25|
|      Australia|     50|          2|          133|      387.95|
|        Austria|     50|          2|            3|      257.04|
|        Bahrain|     51|          1|           54|      205.74|
|        Belgium|     50|          2|          285|      625.16|
|        Belgium|     48|          1|          528|       800.0|
|        Belgium|     51|          2|          942|       800.0|
|Channel Islands|     49|          1|           80|      363.53|
|         Cyprus|     50|          1|          917|     1590.82|
|        Denmark|     49|          1|          454|      1281.5|
|        Finland|     50|          1|         1254|       892.8|
|         France|     49|

In [5]:
from pyspark.sql import *

In [6]:
myWindow = Window.partitionBy("Country") \
.orderBy("weeknum") \
.rowsBetween(Window.unboundedPreceding, Window.currentRow)

In [7]:
from pyspark.sql.functions import *

In [8]:
orders_df.withColumn("running_total", sum("invoicevalue").over(myWindow)).show()

+-------+-------+-----------+-------------+------------+------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|     running_total|
+-------+-------+-----------+-------------+------------+------------------+
| Sweden|     50|          3|         3714|      2646.3|            2646.3|
|Germany|     48|         11|         1795|      1600.0|            1600.0|
|Germany|     49|         12|         1852|      1800.0|            3400.0|
|Germany|     50|         15|         1973|      1800.0|            5200.0|
|Germany|     51|          5|         1103|      1600.0|            6800.0|
| France|     48|          4|         1299|       500.0|             500.0|
| France|     49|          9|         2303|       500.0|            1000.0|
| France|     50|          6|          529|      537.32|1537.3200000000002|
| France|     51|          5|          847|       500.0|2037.3200000000002|
|Belgium|     48|          1|          528|       800.0|             800.0|
|Belgium|   

In [9]:
mywindow = Window.partitionBy("country") \
.orderBy(desc("invoicevalue"))

In [10]:
result_df = orders_df.withColumn("rank", rank().over(mywindow))

In [11]:
result_df.show()

+-------+-------+-----------+-------------+------------+----+
|country|weeknum|numinvoices|totalquantity|invoicevalue|rank|
+-------+-------+-----------+-------------+------------+----+
| Sweden|     50|          3|         3714|      2646.3|   1|
|Germany|     49|         12|         1852|      1800.0|   1|
|Germany|     50|         15|         1973|      1800.0|   1|
|Germany|     48|         11|         1795|      1600.0|   3|
|Germany|     51|          5|         1103|      1600.0|   3|
| France|     50|          6|          529|      537.32|   1|
| France|     51|          5|          847|       500.0|   2|
| France|     49|          9|         2303|       500.0|   2|
| France|     48|          4|         1299|       500.0|   2|
|Belgium|     48|          1|          528|       800.0|   1|
|Belgium|     51|          2|          942|       800.0|   1|
|Belgium|     50|          2|          285|      625.16|   3|
|Finland|     50|          1|         1254|       892.8|   1|
|  India

In [13]:
result_df = orders_df.withColumn("rank", row_number().over(mywindow))

In [14]:
result_df.show()

+-------+-------+-----------+-------------+------------+----+
|country|weeknum|numinvoices|totalquantity|invoicevalue|rank|
+-------+-------+-----------+-------------+------------+----+
| Sweden|     50|          3|         3714|      2646.3|   1|
|Germany|     49|         12|         1852|      1800.0|   1|
|Germany|     50|         15|         1973|      1800.0|   2|
|Germany|     48|         11|         1795|      1600.0|   3|
|Germany|     51|          5|         1103|      1600.0|   4|
| France|     50|          6|          529|      537.32|   1|
| France|     51|          5|          847|       500.0|   2|
| France|     49|          9|         2303|       500.0|   3|
| France|     48|          4|         1299|       500.0|   4|
|Belgium|     48|          1|          528|       800.0|   1|
|Belgium|     51|          2|          942|       800.0|   2|
|Belgium|     50|          2|          285|      625.16|   3|
|Finland|     50|          1|         1254|       892.8|   1|
|  India

In [15]:
result_df.select("*").where("rank == 1").show()

+---------------+-------+-----------+-------------+------------+----+
|        country|weeknum|numinvoices|totalquantity|invoicevalue|rank|
+---------------+-------+-----------+-------------+------------+----+
|         Sweden|     50|          3|         3714|      2646.3|   1|
|        Germany|     49|         12|         1852|      1800.0|   1|
|         France|     50|          6|          529|      537.32|   1|
|        Belgium|     48|          1|          528|       800.0|   1|
|        Finland|     50|          1|         1254|       892.8|   1|
|          India|     49|          5|         1280|      3284.1|   1|
|          Italy|     48|          1|          164|       427.8|   1|
|      Lithuania|     48|          3|          622|     1598.06|   1|
|         Norway|     48|          1|         1852|     1919.14|   1|
|          Spain|     50|          2|          400|     1049.01|   1|
|        Denmark|     49|          1|          454|      1281.5|   1|
|        Iceland|   

In [16]:
result_df.select("*").where("rank == 1").drop("rank").show()

+---------------+-------+-----------+-------------+------------+
|        country|weeknum|numinvoices|totalquantity|invoicevalue|
+---------------+-------+-----------+-------------+------------+
|         Sweden|     50|          3|         3714|      2646.3|
|        Germany|     49|         12|         1852|      1800.0|
|         France|     50|          6|          529|      537.32|
|        Belgium|     48|          1|          528|       800.0|
|        Finland|     50|          1|         1254|       892.8|
|          India|     49|          5|         1280|      3284.1|
|          Italy|     48|          1|          164|       427.8|
|      Lithuania|     48|          3|          622|     1598.06|
|         Norway|     48|          1|         1852|     1919.14|
|          Spain|     50|          2|          400|     1049.01|
|        Denmark|     49|          1|          454|      1281.5|
|        Iceland|     49|          1|          319|      711.79|
|         Israel|     50|

In [17]:
mywindow = Window.partitionBy("country") \
.orderBy("weeknum")

In [18]:
orders_df.orderBy("country","weeknum").show(50)

+---------------+-------+-----------+-------------+------------+
|        country|weeknum|numinvoices|totalquantity|invoicevalue|
+---------------+-------+-----------+-------------+------------+
|      Australia|     48|          1|          107|      358.25|
|      Australia|     49|          1|          214|       258.9|
|      Australia|     50|          2|          133|      387.95|
|        Austria|     50|          2|            3|      257.04|
|        Bahrain|     51|          1|           54|      205.74|
|        Belgium|     48|          1|          528|       800.0|
|        Belgium|     50|          2|          285|      625.16|
|        Belgium|     51|          2|          942|       800.0|
|Channel Islands|     49|          1|           80|      363.53|
|         Cyprus|     50|          1|          917|     1590.82|
|        Denmark|     49|          1|          454|      1281.5|
|        Finland|     50|          1|         1254|       892.8|
|         France|     48|

In [19]:
results_df = orders_df.withColumn("previous_week",lag("invoiceValue").over(mywindow))

In [20]:
results_df.show(10)

+-------+-------+-----------+-------------+------------+-------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|previous_week|
+-------+-------+-----------+-------------+------------+-------------+
| Sweden|     50|          3|         3714|      2646.3|         null|
|Germany|     48|         11|         1795|      1600.0|         null|
|Germany|     49|         12|         1852|      1800.0|       1600.0|
|Germany|     50|         15|         1973|      1800.0|       1800.0|
|Germany|     51|          5|         1103|      1600.0|       1800.0|
| France|     48|          4|         1299|       500.0|         null|
| France|     49|          9|         2303|       500.0|        500.0|
| France|     50|          6|          529|      537.32|        500.0|
| France|     51|          5|          847|       500.0|       537.32|
|Belgium|     48|          1|          528|       800.0|         null|
+-------+-------+-----------+-------------+------------+-------------+
only s

In [21]:
final_df = results_df.withColumn("invoice_diff",expr("invoicevalue - previous_week"))

In [22]:
final_df.show()

+-------+-------+-----------+-------------+------------+-------------+-------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|previous_week|       invoice_diff|
+-------+-------+-----------+-------------+------------+-------------+-------------------+
| Sweden|     50|          3|         3714|      2646.3|         null|               null|
|Germany|     48|         11|         1795|      1600.0|         null|               null|
|Germany|     49|         12|         1852|      1800.0|       1600.0|              200.0|
|Germany|     50|         15|         1973|      1800.0|       1800.0|                0.0|
|Germany|     51|          5|         1103|      1600.0|       1800.0|             -200.0|
| France|     48|          4|         1299|       500.0|         null|               null|
| France|     49|          9|         2303|       500.0|        500.0|                0.0|
| France|     50|          6|          529|      537.32|        500.0|  37.32000000000005|

In [23]:
res_df = Window.partitionBy("Country")

In [24]:
result = orders_df.withColumn("total_invoice_value",sum("invoicevalue").over(res_df))

In [25]:
result.show()

+-------+-------+-----------+-------------+------------+-------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|total_invoice_value|
+-------+-------+-----------+-------------+------------+-------------------+
| Sweden|     50|          3|         3714|      2646.3|             2646.3|
|Germany|     48|         11|         1795|      1600.0|             6800.0|
|Germany|     49|         12|         1852|      1800.0|             6800.0|
|Germany|     50|         15|         1973|      1800.0|             6800.0|
|Germany|     51|          5|         1103|      1600.0|             6800.0|
| France|     51|          5|          847|       500.0| 2037.3200000000002|
| France|     49|          9|         2303|       500.0| 2037.3200000000002|
| France|     48|          4|         1299|       500.0| 2037.3200000000002|
| France|     50|          6|          529|      537.32| 2037.3200000000002|
|Belgium|     48|          1|          528|       800.0|            2225.16|