In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

# Build (or reuse) a Hive-enabled SparkSession that targets the YARN master.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("window functions")
    .enableHiveSupport()
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Bare last expression so the notebook displays the session object.
spark

In [27]:
# Read the window-functions sample CSV: the header row supplies the column
# names and Spark is asked to infer the column types.
df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("/public/trendytech/datasets/windowdata.csv")
)

In [6]:
# Preview the raw rows ordered by country (orderBy is an alias of sort).
df.orderBy("country").show()

+---------------+-------+-----------+-------------+------------+
|        country|weeknum|numinvoices|totalquantity|invoicevalue|
+---------------+-------+-----------+-------------+------------+
|      Australia|     49|          1|          214|       258.9|
|      Australia|     48|          1|          107|      358.25|
|      Australia|     50|          2|          133|      387.95|
|        Austria|     50|          2|            3|      257.04|
|        Bahrain|     51|          1|           54|      205.74|
|        Belgium|     48|          1|          528|       346.1|
|        Belgium|     50|          2|          285|      625.16|
|        Belgium|     51|          2|          942|      838.65|
|Channel Islands|     49|          1|           80|      363.53|
|         Cyprus|     50|          1|          917|     1590.82|
|        Denmark|     49|          1|          454|      1281.5|
|        Finland|     50|          1|         1254|       892.8|
|         France|     49|

In [10]:
# Largest totalquantity per country, ordered by country.
# Note: `max` and `col` come from the star import of pyspark.sql.functions,
# so `max` here is the Spark aggregate, not the Python builtin.
# Keep the aggregated DataFrame in grpd_cntry: DataFrame.show() only prints
# and returns None, so chaining it into the assignment would store None.
grpd_cntry = (
    df.groupBy("country")
    .agg(max(col("totalquantity")).alias("max"))
    .sort("country")
)
grpd_cntry.show()

+---------------+----+
|        country| max|
+---------------+----+
|      Australia| 214|
|        Austria|   3|
|        Bahrain|  54|
|        Belgium| 942|
|Channel Islands|  80|
|         Cyprus| 917|
|        Denmark| 454|
|        Finland|1254|
|         France|2303|
|        Germany|1973|
|        Iceland| 319|
|          India|2822|
|         Israel| -56|
|          Italy| 164|
|          Japan|3897|
|      Lithuania| 622|
|    Netherlands|6714|
|         Norway|1852|
|         Poland| 140|
|       Portugal| 726|
+---------------+----+
only showing top 20 rows



In [28]:
# Cumulative frame: within each country, ordered by week number, span from
# the first row of the partition up to and including the current row.
my_window = (
    Window.partitionBy("country")
    .orderBy("weeknum")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [29]:
# Add a per-country running total of invoicevalue, evaluated over my_window.
# `sum` is pyspark.sql.functions.sum (star-imported), not the Python builtin.
res = df.withColumn(
    "runningTotal",
    sum("invoicevalue").over(my_window),
)

In [32]:
# Display ordered by country then week so each country's totals read in sequence.
res.orderBy("country", "weeknum").show()

+---------------+-------+-----------+-------------+------------+------------------+
|        country|weeknum|numinvoices|totalquantity|invoicevalue|      runningTotal|
+---------------+-------+-----------+-------------+------------+------------------+
|      Australia|     48|          1|          107|      358.25|            358.25|
|      Australia|     49|          1|          214|       258.9|            617.15|
|      Australia|     50|          2|          133|      387.95|1005.0999999999999|
|        Austria|     50|          2|            3|      257.04|            257.04|
|        Bahrain|     51|          1|           54|      205.74|            205.74|
|        Belgium|     48|          1|          528|       346.1|             346.1|
|        Belgium|     50|          2|          285|      625.16|            971.26|
|        Belgium|     51|          2|          942|      838.65|1809.9099999999999|
|Channel Islands|     49|          1|           80|      363.53|            

In [33]:
# Sliding frame: the current row plus the two rows before it within each
# country (ordered by week number), i.e. at most three rows per frame.
my_window = (
    Window.partitionBy("country")
    .orderBy("weeknum")
    .rowsBetween(-2, Window.currentRow)
)

In [34]:
# Sum invoicevalue over whatever frame my_window currently holds.
# NOTE(review): the cell just before this one redefines my_window as
# rowsBetween(-2, currentRow), so this is a 3-row rolling sum even though
# the column is still named runningTotal.
res = df.withColumn(
    "runningTotal",
    sum("invoicevalue").over(my_window),
)

In [35]:
# Display ordered by country then week so each country's sums read in sequence.
res.orderBy("country", "weeknum").show()

+---------------+-------+-----------+-------------+------------+------------------+
|        country|weeknum|numinvoices|totalquantity|invoicevalue|      runningTotal|
+---------------+-------+-----------+-------------+------------+------------------+
|      Australia|     48|          1|          107|      358.25|            358.25|
|      Australia|     49|          1|          214|       258.9|            617.15|
|      Australia|     50|          2|          133|      387.95|1005.0999999999999|
|        Austria|     50|          2|            3|      257.04|            257.04|
|        Bahrain|     51|          1|           54|      205.74|            205.74|
|        Belgium|     48|          1|          528|       346.1|             346.1|
|        Belgium|     50|          2|          285|      625.16|            971.26|
|        Belgium|     51|          2|          942|      838.65|1809.9099999999999|
|Channel Islands|     49|          1|           80|      363.53|            

In [42]:
# Running (cumulative) sum of quantity: within each country, ordered by week
# number, the frame spans from the start of the partition to the current row.
my_window_new = (
    Window.partitionBy("country")
    .orderBy("weeknum")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [43]:
# Add the per-country sum of totalquantity evaluated over my_window_new.
# `sum` is pyspark.sql.functions.sum (star-imported), not the Python builtin.
res2 = df.withColumn(
    "running_quantity_sum",
    sum("totalquantity").over(my_window_new),
)

In [44]:
# Print the result with the default show() settings. No explicit sort is
# applied here, so the order in which countries appear is whatever Spark
# produces (presumably not guaranteed to be stable across runs - verify).
res2.show()

+-------+-------+-----------+-------------+------------+--------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|running_quantity_sum|
+-------+-------+-----------+-------------+------------+--------------------+
| Sweden|     50|          3|         3714|      2646.3|                3714|
|Germany|     48|         11|         1795|     3309.75|                1795|
|Germany|     49|         12|         1852|     4521.39|                3647|
|Germany|     50|         15|         1973|     5065.79|                5620|
|Germany|     51|          5|         1103|     1665.91|                6723|
| France|     48|          4|         1299|     2808.16|                1299|
| France|     49|          9|         2303|     4527.01|                3602|
| France|     50|          6|          529|      537.32|                4131|
| France|     51|          5|          847|     1702.87|                4978|
|Belgium|     48|          1|          528|       346.1|        

In [45]:
# Rolling sum of quantity over a two-row frame: the previous row and the
# current row within each country, ordered by week number.
my_window_new = (
    Window.partitionBy("country")
    .orderBy("weeknum")
    .rowsBetween(-1, Window.currentRow)
)

# NOTE(review): the column keeps the name running_quantity_sum although this
# frame yields a rolling (not cumulative) sum.
res3 = df.withColumn(
    "running_quantity_sum",
    sum("totalquantity").over(my_window_new),
)
res3.show()

+-------+-------+-----------+-------------+------------+--------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|running_quantity_sum|
+-------+-------+-----------+-------------+------------+--------------------+
| Sweden|     50|          3|         3714|      2646.3|                3714|
|Germany|     48|         11|         1795|     3309.75|                1795|
|Germany|     49|         12|         1852|     4521.39|                3647|
|Germany|     50|         15|         1973|     5065.79|                3825|
|Germany|     51|          5|         1103|     1665.91|                3076|
| France|     48|          4|         1299|     2808.16|                1299|
| France|     49|          9|         2303|     4527.01|                3602|
| France|     50|          6|          529|      537.32|                2832|
| France|     51|          5|          847|     1702.87|                1376|
|Belgium|     48|          1|          528|       346.1|        