In [29]:
import pyspark
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [30]:
from pyspark.sql import SparkSession
# Start (or reuse, via getOrCreate) a Spark session named 'test' on the
# in-process "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [31]:
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType

In [32]:
# Column layout for the sample visits table; every column is nullable.
schema = StructType(
    [
        StructField("customer_id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("visited_on", DateType(), True),
        StructField("amount", IntegerType(), True),
    ]
)

# Sample rows as (customer_id, name, visit day, amount); the visit day is kept
# as text here and parsed into a datetime below.
visit_rows = [
    (1, "Jhon", "2019-01-01", 100),
    (2, "Daniel", "2019-01-02", 110),
    (3, "Jade", "2019-01-03", 120),
    (4, "Khaled", "2019-01-04", 130),
    (5, "Winston", "2019-01-05", 110),
    (6, "Elvis", "2019-01-06", 140),
    (7, "Anna", "2019-01-07", 150),
    (8, "Maria", "2019-01-08", 80),
    (9, "Jaze", "2019-01-09", 110),
    (1, "Jhon", "2019-01-10", 130),
    (3, "Jade", "2019-01-10", 150),
]
data = [
    (customer_id, name, datetime.strptime(day, "%Y-%m-%d"), amount)
    for customer_id, name, day, amount in visit_rows
]

# Build the frame with the date-typed schema, then widen visited_on to a
# timestamp column in the same expression.
df = spark.createDataFrame(data, schema).withColumn(
    "visited_on", col("visited_on").cast("timestamp")
)
df.show()
print(df.dtypes)

+-----------+-------+-------------------+------+
|customer_id|   name|         visited_on|amount|
+-----------+-------+-------------------+------+
|          1|   Jhon|2019-01-01 00:00:00|   100|
|          2| Daniel|2019-01-02 00:00:00|   110|
|          3|   Jade|2019-01-03 00:00:00|   120|
|          4| Khaled|2019-01-04 00:00:00|   130|
|          5|Winston|2019-01-05 00:00:00|   110|
|          6|  Elvis|2019-01-06 00:00:00|   140|
|          7|   Anna|2019-01-07 00:00:00|   150|
|          8|  Maria|2019-01-08 00:00:00|    80|
|          9|   Jaze|2019-01-09 00:00:00|   110|
|          1|   Jhon|2019-01-10 00:00:00|   130|
|          3|   Jade|2019-01-10 00:00:00|   150|
+-----------+-------+-------------------+------+

[('customer_id', 'int'), ('name', 'string'), ('visited_on', 'timestamp'), ('amount', 'int')]


In [33]:
# Register df as the temporary view "Customer" so it can be referenced by
# name from spark.sql(...) queries.
df.createOrReplaceTempView("Customer")

In [34]:
# Total amount per visit date: one output row per distinct visited_on.
# GROUP BY alone gives no guarantee about row order, so the explicit
# ORDER BY keeps the displayed table deterministic across runs.
result = spark.sql(
    """
    SELECT visited_on, SUM(amount) AS amount
    FROM Customer
    GROUP BY visited_on
    ORDER BY visited_on
    """
)
result.show()

+-------------------+------+
|         visited_on|amount|
+-------------------+------+
|2019-01-01 00:00:00|   100|
|2019-01-02 00:00:00|   110|
|2019-01-03 00:00:00|   120|
|2019-01-04 00:00:00|   130|
|2019-01-05 00:00:00|   110|
|2019-01-06 00:00:00|   140|
|2019-01-07 00:00:00|   150|
|2019-01-08 00:00:00|    80|
|2019-01-09 00:00:00|   110|
|2019-01-10 00:00:00|   280|
+-------------------+------+



In [40]:
# Rolling 7-day total per visit date.
# The inner query collapses Customer to one row per date; the window then sums
# each date together with everything up to 6 days before it (RANGE frame, so
# it is measured in calendar time rather than in row count).
# The ORDER BY inside OVER(...) only defines the window frame; the outer
# ORDER BY is what fixes the order of the rows that show() prints.
result = spark.sql(
    """
    SELECT
        visited_on,
        SUM(amount) OVER (
            ORDER BY visited_on
            RANGE BETWEEN interval 6 day preceding AND CURRENT ROW
        ) AS sum_seven_days
    FROM (
        SELECT visited_on, SUM(amount) AS amount
        FROM Customer
        GROUP BY visited_on
    ) AS sum_by_date
    ORDER BY visited_on
    """
)
result.show()

23/11/21 19:42:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:42:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:42:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+--------------+
|         visited_on|sum_seven_days|
+-------------------+--------------+
|2019-01-01 00:00:00|           100|
|2019-01-02 00:00:00|           210|
|2019-01-03 00:00:00|           330|
|2019-01-04 00:00:00|           460|
|2019-01-05 00:00:00|           570|
|2019-01-06 00:00:00|           710|
|2019-01-07 00:00:00|           860|
|2019-01-08 00:00:00|           840|
|2019-01-09 00:00:00|           840|
|2019-01-10 00:00:00|          1000|
+-------------------+--------------+



23/11/21 19:42:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:42:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:42:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:42:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [37]:
# Rolling 7-day total and average per visit date, restricted to dates that
# have a full 7-day window behind them.
#   sum_by_date : one row per date with that date's total amount.
#   rolling     : for each date, the total over that date and the 6 days
#                 before it (RANGE frame over visited_on).
# average_amount always divides by 7, independent of how many dates actually
# fall inside the window.
# The WHERE clause keeps only dates at least 6 days after the earliest visit
# in Customer. The final ORDER BY makes the row order explicit instead of
# relying on whatever order the window stage happens to emit.
result = spark.sql(
    """
    WITH sum_by_date AS (
        SELECT visited_on, SUM(amount) AS amount
        FROM Customer
        GROUP BY visited_on
    ),
    rolling AS (
        SELECT
            visited_on,
            SUM(amount) OVER (
                ORDER BY visited_on
                RANGE BETWEEN interval 6 day preceding AND CURRENT ROW
            ) AS sum_seven_days
        FROM sum_by_date
    )
    SELECT
        visited_on,
        sum_seven_days AS amount,
        ROUND(sum_seven_days / 7, 2) AS average_amount
    FROM rolling
    WHERE DATEDIFF(visited_on, (SELECT MIN(visited_on) FROM Customer)) >= 6
    ORDER BY visited_on
    """
)
result.show()

23/11/21 19:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/21 19:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+------+--------------+
|         visited_on|amount|average_amount|
+-------------------+------+--------------+
|2019-01-07 00:00:00|   860|        122.86|
|2019-01-08 00:00:00|   840|         120.0|
|2019-01-09 00:00:00|   840|         120.0|
|2019-01-10 00:00:00|  1000|        142.86|
+-------------------+------+--------------+

