In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql import Window, functions as F
# Sample salary history: one (emp_id, year, salary) tuple per employee per year.
data = [
    (1, 2022, 70000), (1, 2023, 75000), (1, 2024, 80000),
    (2, 2022, 60000), (2, 2023, 58000),
    (3, 2023, 90000), (3, 2024, 95000),
]

# Every column is a nullable integer, so the fields are generated from the
# column names instead of being spelled out one by one.
schema = StructType(
    [StructField(column, IntegerType(), True) for column in ("emp_id", "year", "salary")]
)

df = spark.createDataFrame(data, schema=schema)
df.show()


+------+----+------+
|emp_id|year|salary|
+------+----+------+
|     1|2022| 70000|
|     1|2023| 75000|
|     1|2024| 80000|
|     2|2022| 60000|
|     2|2023| 58000|
|     3|2023| 90000|
|     3|2024| 95000|
+------+----+------+



In [0]:
# One window per employee, rows ordered by year, so lag() reads the
# preceding row of the same employee.
window = Window.partitionBy("emp_id").orderBy(F.col("year"))

# previous_salary is NULL on each employee's first row, where no earlier row exists.
previous_salary = F.lag(F.col("salary"), 1).over(window)
df = df.withColumn("previous_salary", previous_salary)
df.show()

+------+----+------+---------------+
|emp_id|year|salary|previous_salary|
+------+----+------+---------------+
|     1|2022| 70000|           NULL|
|     1|2023| 75000|          70000|
|     1|2024| 80000|          75000|
|     2|2022| 60000|           NULL|
|     2|2023| 58000|          60000|
|     3|2023| 90000|           NULL|
|     3|2024| 95000|          90000|
+------+----+------+---------------+



In [0]:
# Keep only the rows where salary is higher than on the employee's previous row.
# Rows with a NULL previous_salary are dropped because the comparison yields NULL.
(
    df
    .where(F.col("previous_salary") < F.col("salary"))
    .select("emp_id", "year", "salary")
    .show()
)

+------+----+------+
|emp_id|year|salary|
+------+----+------+
|     1|2023| 75000|
|     1|2024| 80000|
|     3|2024| 95000|
+------+----+------+

