In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col

# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)

# Sample events: (date string in dd-MM-yyyy form, outcome) pairs
data = [
    ("01-01-2024", "won"),
    ("02-01-2024", "won"),
    ("03-01-2024", "won"),
    ("04-01-2024", "lost"),
    ("05-01-2024", "lost"),
    ("06-01-2024", "lost"),
    ("07-01-2024", "won"),
    ("08-01-2024", "won"),
    ("09-01-2024", "won"),
]

# Raw DataFrame built from the sample rows; event_date is still a plain string here
df = spark.createDataFrame(data, ["event_date", "event_status"])

# Parse the dd-MM-yyyy strings into a date column (same column name), keep the status untouched
parsed_event_date = to_date(col("event_date"), 'dd-MM-yyyy').alias("event_date")
df_date = df.select(parsed_event_date, col("event_status"))

# Print the parsed DataFrame
df_date.show()

+----------+------------+
|event_date|event_status|
+----------+------------+
|2024-01-01|         won|
|2024-01-02|         won|
|2024-01-03|         won|
|2024-01-04|        lost|
|2024-01-05|        lost|
|2024-01-06|        lost|
|2024-01-07|         won|
|2024-01-08|         won|
|2024-01-09|         won|
+----------+------------+



In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# `df.event_date` is a dd-MM-yyyy *string*; ordering the window on the raw string is
# lexicographic (e.g. "01-02-2024" sorts before "02-01-2024"), which breaks lag() as soon
# as the data spans more than one month. Order on the parsed date instead.
windowSpec = Window.orderBy(to_date(col("event_date"), "dd-MM-yyyy"))

# True when the status differs from the chronologically previous row. For the first row
# lag() is null, the comparison is null, and when(...) falls through to otherwise(0).
status_changed = col("event_status") != lag(col("event_status")).over(windowSpec)

# Keep the original columns (event_date stays a string) and add a 0/1 change marker.
df_date = df.select(
    col("event_date"),
    col("event_status"),
    when(status_changed, 1).otherwise(0).alias("event_change"),
)
display(df_date)

event_date,event_status,event_change
01-01-2024,won,0
02-01-2024,won,0
03-01-2024,won,0
04-01-2024,lost,1
05-01-2024,lost,0
06-01-2024,lost,0
07-01-2024,won,1
08-01-2024,won,0
09-01-2024,won,0


In [0]:
# Namespaced imports so the aggregate below cannot be confused with Python's builtin sum().
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# event_date is a dd-MM-yyyy string at this point, so the running total must be ordered by
# the parsed date; ordering by the raw string is lexicographic and wrong across months.
running_window = Window.orderBy(F.to_date(F.col("event_date"), "dd-MM-yyyy"))

# Running total of the change markers: every row in the same unbroken streak of
# event_status ends up with the same event_sum value, which serves as a streak id.
df_evensum = df_date.withColumn(
    "event_sum",
    F.sum(F.col("event_change")).over(running_window),
)
display(df_evensum)

event_date,event_status,event_change,event_sum
01-01-2024,won,0,0
02-01-2024,won,0,0
03-01-2024,won,0,0
04-01-2024,lost,1,1
05-01-2024,lost,0,1
06-01-2024,lost,0,1
07-01-2024,won,1,2
08-01-2024,won,0,2
09-01-2024,won,0,2


In [0]:
from pyspark.sql import functions as F

# Date strings in this notebook are dd-MM-yyyy.
_EVENT_DATE_FORMAT = "dd-MM-yyyy"
_event_day = F.to_date(F.col("event_date"), _EVENT_DATE_FORMAT)

# One row per streak (event_sum, event_status) with its first and last date.
# first()/last() after a groupBy depend on row order, which Spark does not guarantee once
# the data is shuffled, so the boundaries are computed as min/max of the parsed date and
# formatted back to the original string form. Column names match what first()/last()
# produced so existing consumers of df_final keep working.
df_final = (
    df_evensum
    .groupBy(F.col("event_sum"), "event_status")
    .agg(
        F.date_format(F.min(_event_day), _EVENT_DATE_FORMAT).alias("first(event_date)"),
        F.date_format(F.max(_event_day), _EVENT_DATE_FORMAT).alias("last(event_date)"),
    )
    # groupBy output order is arbitrary; sort by streak id for a stable display.
    .orderBy("event_sum")
)
display(df_final)

event_sum,event_status,first(event_date),last(event_date)
0,won,01-01-2024,03-01-2024
1,lost,04-01-2024,06-01-2024
2,won,07-01-2024,09-01-2024
