In [1]:
import pyspark
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [2]:
from pyspark.sql import SparkSession
# Start (or reuse, if one is already running) the Spark session for this notebook.
# The builder chain is wrapped in parentheses instead of using backslash continuations.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/25 09:45:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/25 09:45:49 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [31]:
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType

In [3]:
# Explicit schema for the sample Activity rows; every column is declared nullable.
# event_date is kept as a string: the sample values below are all 'yyyy-MM-dd',
# so comparing/sorting them as text gives the same order as comparing them as dates.
schema = StructType([
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in (
        ("player_id", IntegerType()),
        ("device_id", IntegerType()),
        ("event_date", StringType()),
        ("games_played", IntegerType()),
    )
])

# One tuple per row, in schema order: (player_id, device_id, event_date, games_played).
data = [
    (1, 2, '2016-03-01', 5),
    (1, 2, '2016-05-02', 6),
    (1, 3, '2017-06-25', 1),
    (3, 1, '2016-03-02', 0),
    (3, 4, '2018-07-03', 5),
]

df = spark.createDataFrame(data, schema=schema)

# Show the rows, then the (column, type) pairs to confirm the schema was applied.
df.show()
print(df.dtypes)

                                                                                

+---------+---------+----------+------------+
|player_id|device_id|event_date|games_played|
+---------+---------+----------+------------+
|        1|        2|2016-03-01|           5|
|        1|        2|2016-05-02|           6|
|        1|        3|2017-06-25|           1|
|        3|        1|2016-03-02|           0|
|        3|        4|2018-07-03|           5|
+---------+---------+----------+------------+

[('player_id', 'int'), ('device_id', 'int'), ('event_date', 'string'), ('games_played', 'int')]


In [7]:
# Register the DataFrame as the temporary view "Activity" so the SQL queries can refer to it by name.
df.createOrReplaceTempView("Activity")

In [9]:
# Running total of games played per player, computed with a window function.
# The frame ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW sums every row of the
# player from the earliest event_date up to and including the current row.
# The final ORDER BY pins the order of the returned rows; without it the order in
# which result.show() prints them is not guaranteed.
result = spark.sql(
    """
    SELECT player_id, event_date,
        sum(games_played)
            OVER (PARTITION BY player_id ORDER BY event_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
            AS games_played_so_far
    FROM Activity
    ORDER BY player_id, event_date
    """
)
result.show()



+---------+----------+-------------------+
|player_id|event_date|games_played_so_far|
+---------+----------+-------------------+
|        1|2016-03-01|                  5|
|        1|2016-05-02|                 11|
|        1|2017-06-25|                 12|
|        3|2016-03-02|                  0|
|        3|2018-07-03|                  5|
+---------+----------+-------------------+



                                                                                

In [14]:
# Running total of games played per player, computed with a self-join:
# each row `a` is paired with every row `b` of the same player whose event_date is
# on or before a's, and the paired games_played values are summed per (player, date).
# NOTE(review): this assumes (player_id, event_date) is unique in Activity; duplicate
# pairs would be joined to each other and counted more than once -- confirm for real data.
# The final ORDER BY pins the order of the returned rows; GROUP BY alone gives no
# guarantee about the order in which result.show() prints them.
result = spark.sql(
    """
    SELECT a.player_id, a.event_date, sum(b.games_played) AS games_played_so_far
    FROM Activity a LEFT JOIN  Activity b ON a.player_id = b.player_id AND a.event_date >= b.event_date
    GROUP BY a.player_id, a.event_date
    ORDER BY a.player_id, a.event_date
    """
)
result.show()

[Stage 16:>                                                         (0 + 8) / 8]

+---------+----------+-------------------+
|player_id|event_date|games_played_so_far|
+---------+----------+-------------------+
|        1|2016-03-01|                  5|
|        1|2016-05-02|                 11|
|        1|2017-06-25|                 12|
|        3|2016-03-02|                  0|
|        3|2018-07-03|                  5|
+---------+----------+-------------------+



                                                                                