In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder.appName("test").getOrCreate()

### Q1: LAG

In [3]:
data = [
    (1, "2024-01-01", "I1", 10, 1000),
    (2, "2024-01-15", "I2", 20, 2000),
    (3, "2024-02-01", "I3", 10, 1500),
    (4, "2024-02-15", "I4", 20, 2500),
    (5, "2024-03-01", "I5", 30, 3000),
    (6, "2024-03-10", "I6", 40, 3500),
    (7, "2024-03-20", "I7", 20, 2500),
    (8, "2024-03-30", "I8", 10, 1000),
]
schema = ["so_id", "so_date", "item_id", "item_qty", "item_value"]
df = spark.createDataFrame(data, schema)
df.show()

+-----+----------+-------+--------+----------+
|so_id|   so_date|item_id|item_qty|item_value|
+-----+----------+-------+--------+----------+
|    1|2024-01-01|     I1|      10|      1000|
|    2|2024-01-15|     I2|      20|      2000|
|    3|2024-02-01|     I3|      10|      1500|
|    4|2024-02-15|     I4|      20|      2500|
|    5|2024-03-01|     I5|      30|      3000|
|    6|2024-03-10|     I6|      40|      3500|
|    7|2024-03-20|     I7|      20|      2500|
|    8|2024-03-30|     I8|      10|      1000|
+-----+----------+-------+--------+----------+



In [4]:
df = df.withColumn("so_date", col("so_date").cast(DateType()))
df.printSchema()

root
 |-- so_id: long (nullable = true)
 |-- so_date: date (nullable = true)
 |-- item_id: string (nullable = true)
 |-- item_qty: long (nullable = true)
 |-- item_value: long (nullable = true)



In [5]:
df1 = df.select(
    month(col("so_date")).alias("month"),
    year(col("so_date")).alias("year"),
    col("item_value"),
)
df1.show()

+-----+----+----------+
|month|year|item_value|
+-----+----+----------+
|    1|2024|      1000|
|    1|2024|      2000|
|    2|2024|      1500|
|    2|2024|      2500|
|    3|2024|      3000|
|    3|2024|      3500|
|    3|2024|      2500|
|    3|2024|      1000|
+-----+----+----------+



In [6]:
df2 = df1.groupBy("month", "year").agg(sum("item_value").alias("total_sales"))
df2.show()

+-----+----+-----------+
|month|year|total_sales|
+-----+----+-----------+
|    1|2024|       3000|
|    2|2024|       4000|
|    3|2024|      10000|
+-----+----+-----------+



In [7]:
df3 = df2.select(
    "*",
    lag(col("total_sales"))
    .over(Window.orderBy(col("month"), col("year")))
    .alias("prev_sales"),
)
df3.show()

+-----+----+-----------+----------+
|month|year|total_sales|prev_sales|
+-----+----+-----------+----------+
|    1|2024|       3000|      null|
|    2|2024|       4000|      3000|
|    3|2024|      10000|      4000|
+-----+----+-----------+----------+



In [8]:
df3.select(
    "*",
    (col("total_sales") - col("prev_sales") * 100 / col("total_sales")).alias(
        "pct_diff_prev_month"
    ),
).show()

+-----+----+-----------+----------+-------------------+
|month|year|total_sales|prev_sales|pct_diff_prev_month|
+-----+----+-----------+----------+-------------------+
|    1|2024|       3000|      null|               null|
|    2|2024|       4000|      3000|             3925.0|
|    3|2024|      10000|      4000|             9960.0|
+-----+----+-----------+----------+-------------------+



### Q2: Strategy

In [9]:
data = [
    (0, 0, "start", 0.712),
    (0, 0, "end", 1.520),
    (0, 1, "start", 3.140),
    (0, 1, "end", 4.120),
    (1, 0, "start", 0.550),
    (1, 0, "end", 1.550),
    (1, 1, "start", 0.430),
    (1, 1, "end", 1.420),
    (2, 0, "start", 4.100),
    (2, 0, "end", 4.512),
    (2, 1, "start", 2.500),
    (2, 1, "end", 5.000),
]
schema = ["machine_id", "process_id", "activity_id", "timestamp"]
df = spark.createDataFrame(data, schema)
df.show()

+----------+----------+-----------+---------+
|machine_id|process_id|activity_id|timestamp|
+----------+----------+-----------+---------+
|         0|         0|      start|    0.712|
|         0|         0|        end|     1.52|
|         0|         1|      start|     3.14|
|         0|         1|        end|     4.12|
|         1|         0|      start|     0.55|
|         1|         0|        end|     1.55|
|         1|         1|      start|     0.43|
|         1|         1|        end|     1.42|
|         2|         0|      start|      4.1|
|         2|         0|        end|    4.512|
|         2|         1|      start|      2.5|
|         2|         1|        end|      5.0|
+----------+----------+-----------+---------+



In [10]:
df1 = df.select(
    "machine_id",
    "process_id",
    when(col("activity_id") == "start", col("timestamp")).alias("start_time"),
    when(col("activity_id") == "end", col("timestamp")).alias("end_time"),
)
df1.show()

+----------+----------+----------+--------+
|machine_id|process_id|start_time|end_time|
+----------+----------+----------+--------+
|         0|         0|     0.712|    null|
|         0|         0|      null|    1.52|
|         0|         1|      3.14|    null|
|         0|         1|      null|    4.12|
|         1|         0|      0.55|    null|
|         1|         0|      null|    1.55|
|         1|         1|      0.43|    null|
|         1|         1|      null|    1.42|
|         2|         0|       4.1|    null|
|         2|         0|      null|   4.512|
|         2|         1|       2.5|    null|
|         2|         1|      null|     5.0|
+----------+----------+----------+--------+



In [11]:
df2 = (
    df1
    .groupBy(col("machine_id"), col("process_id"))
    .agg((max(col("end_time")) - max(col("start_time"))).alias("diff"))
)
df2.show()

+----------+----------+------------------+
|machine_id|process_id|              diff|
+----------+----------+------------------+
|         0|         0|             0.808|
|         0|         1|              0.98|
|         1|         0|               1.0|
|         1|         1|              0.99|
|         2|         0|0.4119999999999999|
|         2|         1|               2.5|
+----------+----------+------------------+



In [12]:
df3 = df2.groupBy(col("machine_id")).agg(avg(col("diff")).alias("avg_processing_time"))
df3.show()

+----------+-------------------+
|machine_id|avg_processing_time|
+----------+-------------------+
|         0|              0.894|
|         1|              0.995|
|         2|              1.456|
+----------+-------------------+

