In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *

In [2]:
# Reuse the active Spark session if one exists; otherwise start one named "test".
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

In [3]:
# Sample input: one row per day with that day's outcome.
# Dates arrive as dd-MM-yyyy strings and are parsed into a date column below.
data = [
    ["01-06-2020", "Won"],
    ["02-06-2020", "Won"],
    ["03-06-2020", "Won"],
    ["04-06-2020", "Lost"],
    ["05-06-2020", "Lost"],
    ["06-06-2020", "Lost"],
    ["07-06-2020", "Won"],
]
column_names = ["event_date", "event_status"]

df = spark.createDataFrame(data, column_names).withColumn(
    "event_date", to_date(col("event_date"), "dd-MM-yyyy")
)
df.show()

+----------+------------+
|event_date|event_status|
+----------+------------+
|2020-06-01|         Won|
|2020-06-02|         Won|
|2020-06-03|         Won|
|2020-06-04|        Lost|
|2020-06-05|        Lost|
|2020-06-06|        Lost|
|2020-06-07|         Won|
+----------+------------+



In [4]:
# Check the column types: event_date should now be a date (parsed by to_date), not a string.
df.printSchema()

root
 |-- event_date: date (nullable = true)
 |-- event_status: string (nullable = true)



In [5]:
# Step 1: flag the rows where the status differs from the previous day's status.
# The first row has no predecessor (lag yields NULL), so the comparison is not
# true and the row falls through to the otherwise(0) branch.
date_order = Window.orderBy("event_date")
previous_status = lag(col("event_status")).over(date_order)
status_changed = col("event_status") != previous_status

df1 = df.withColumn("event_change", when(status_changed, 1).otherwise(0))
df1.show()

+----------+------------+------------+
|event_date|event_status|event_change|
+----------+------------+------------+
|2020-06-01|         Won|           0|
|2020-06-02|         Won|           0|
|2020-06-03|         Won|           0|
|2020-06-04|        Lost|           1|
|2020-06-05|        Lost|           0|
|2020-06-06|        Lost|           0|
|2020-06-07|         Won|           1|
+----------+------------+------------+



In [6]:
# Step 2: a running total of the change flags gives every consecutive streak
# its own id (0, 1, 2, ...).
# NOTE: `sum` here is pyspark.sql.functions.sum, pulled in by the wildcard
# import at the top of the notebook, not Python's builtin sum.
running_change_count = sum("event_change").over(Window.orderBy("event_date"))

df2 = df1.withColumn("event_group", running_change_count)
df2.show()

+----------+------------+------------+-----------+
|event_date|event_status|event_change|event_group|
+----------+------------+------------+-----------+
|2020-06-01|         Won|           0|          0|
|2020-06-02|         Won|           0|          0|
|2020-06-03|         Won|           0|          0|
|2020-06-04|        Lost|           1|          1|
|2020-06-05|        Lost|           0|          1|
|2020-06-06|        Lost|           0|          1|
|2020-06-07|         Won|           1|          2|
+----------+------------+------------+-----------+



In [7]:
# Step 3: collapse each streak (one event_group) into a single row holding its
# status and date range.
#
# min/max are used rather than first/last: first/last return whichever row
# happens to come first/last inside the group, and Spark does not guarantee row
# order after the groupBy shuffle, so they can return the wrong dates on real
# (multi-partition) data. min/max give the streak boundaries regardless of order.
# NOTE: `min`/`max` here are the pyspark.sql.functions versions from the
# wildcard import, not Python's builtins.
df3 = (
    df2.groupBy("event_group", "event_status")
    .agg(min("event_date").alias("start_date"), max("event_date").alias("end_date"))
    # Sort by streak id so the rows come out chronologically on every run,
    # matching the ORDER BY in the SQL version of this query.
    .orderBy("event_group")
    # event_group is only a helper key; event_change no longer exists after agg,
    # so there is nothing else to drop.
    .drop("event_group")
)
df3.show()

+------------+----------+----------+
|event_status|start_date|  end_date|
+------------+----------+----------+
|         Won|2020-06-01|2020-06-03|
|        Lost|2020-06-04|2020-06-06|
|         Won|2020-06-07|2020-06-07|
+------------+----------+----------+



In [8]:
# Register the base DataFrame as a temporary view named "events" so the same
# analysis can be expressed in SQL through spark.sql.
df.createOrReplaceTempView("events")

In [9]:
# Quick check that the "events" view is queryable and holds the expected rows.
events_preview = spark.sql("SELECT * FROM events")
events_preview.show()

+----------+------------+
|event_date|event_status|
+----------+------------+
|2020-06-01|         Won|
|2020-06-02|         Won|
|2020-06-03|         Won|
|2020-06-04|        Lost|
|2020-06-05|        Lost|
|2020-06-06|        Lost|
|2020-06-07|         Won|
+----------+------------+



In [10]:
# SQL version of the same streak analysis:
#   cte  - flag rows whose status differs from the previous day's status
#          (LAG is NULL on the first row, so it falls into ELSE 0)
#   cte2 - running sum of the flags = one id per consecutive streak
#   final SELECT - one row per streak with its date range
#
# MIN/MAX are used rather than FIRST/LAST: FIRST/LAST depend on the order in
# which rows reach the aggregate, which Spark does not guarantee after the
# GROUP BY shuffle, so they can return the wrong boundary dates.
query = """
WITH cte AS (
    SELECT
        event_date,
        event_status,
        CASE WHEN event_status != LAG(event_status) OVER (ORDER BY event_date) THEN 1 ELSE 0 END AS event_change
    FROM events
),
cte2 AS (
    SELECT
        event_date,
        event_status,
        SUM(event_change) OVER (ORDER BY event_date) AS event_group
    FROM cte
)
SELECT
    event_status,
    MIN(event_date) AS start_date,
    MAX(event_date) AS end_date
FROM cte2
GROUP BY event_group, event_status
ORDER BY event_group
"""

spark.sql(query).show()

+------------+----------+----------+
|event_status|start_date|  end_date|
+------------+----------+----------+
|         Won|2020-06-01|2020-06-03|
|        Lost|2020-06-04|2020-06-06|
|         Won|2020-06-07|2020-06-07|
+------------+----------+----------+

