In [3]:
import polars as pl

In [42]:
# Load the raw event log. The reader is told there is no header row, so the
# two columns arrive under polars' default names (column_1, column_2) and are
# renamed to something meaningful before the timestamp strings are parsed.
df = pl.read_excel(
    'Data for Student Hiring Project - Citizen Science .xlsx',
    read_options={"has_header": False},
).rename(
    {'column_1': 'user_id', 'column_2': 'timestamp'}
).with_columns(
    # No explicit format is given, so polars infers it from the data.
    pl.col('timestamp').str.to_datetime()
)

In [43]:
# Check the loaded frame's (rows, columns) dimensions.
df.shape

(99999, 2)

In [44]:
# Preview the first rows to confirm the renamed columns and parsed timestamps.
df.head()

user_id,timestamp
str,datetime[μs]
"""54999c8d9cd118282b001784""",2014-12-23 04:53:19
"""54999c91bb7b56040d0011e8""",2014-12-23 04:53:20
"""54999c8fbb7b565d7e000a7c""",2014-12-23 04:53:21
"""54999cb29cd1184d77000539""",2014-12-23 04:53:24
"""54068fab91ad6b597e000f97""",2014-12-23 04:53:28


In [110]:
def find_sessions(df, threshold):
    """Label every event with a session number based on inactivity gaps.

    Rows are sorted by ``user_id`` and then ``timestamp``. A row opens a new
    session when the user differs from the previous row's user, or when the
    gap to the previous row exceeds ``threshold``.

    Parameters
    ----------
    df : polars.DataFrame
        Must contain ``user_id`` and ``timestamp`` columns. ``timestamp``
        must support ``diff()`` (a datetime column in this notebook).
    threshold : int or datetime.timedelta
        Largest gap allowed inside one session.

        * ``int`` - compared against the raw integer value of the timestamp
          difference, i.e. expressed in the time unit of the ``timestamp``
          column (microseconds for ``datetime[μs]``).
        * ``timedelta`` - compared as a duration, so the result does not
          depend on the column's time unit.

    Returns
    -------
    polars.DataFrame
        The sorted frame with four extra columns:

        * ``ts_diff`` - gap to the previous row exceeds ``threshold``
          (``True`` where there is no previous row).
        * ``user_diff`` - ``user_id`` differs from the previous row
          (``True`` where there is no previous row).
        * ``new_session_mark`` - ``ts_diff | user_diff``.
        * ``session`` - running count of ``new_session_mark``; ids are
          global across users, not restarted per user.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from datetime import timedelta

    gap = pl.col('timestamp').diff()
    if isinstance(threshold, timedelta):
        # Duration-vs-duration comparison: independent of the time unit.
        gap_exceeded = gap > threshold
    else:
        # Original behaviour: threshold is in the column's own time unit.
        gap_exceeded = gap.cast(pl.Int64) > threshold

    previous_user = pl.col('user_id').shift()

    return (
        df.sort(['user_id', 'timestamp'])
        .with_columns([
            # The diff is taken over the whole sorted frame, so at a user
            # boundary it mixes two users; `user_diff` covers that case.
            # The first row has no predecessor (null) and counts as a start.
            gap_exceeded.fill_null(True).alias('ts_diff'),
            pl.col('user_id').ne(previous_user).fill_null(True).alias('user_diff'),
        ])
        .with_columns([
            (pl.col("ts_diff") | pl.col("user_diff")).alias("new_session_mark")
        ])
        .with_columns([
            # Each True increments the counter, giving one id per session.
            pl.col("new_session_mark").cum_sum().alias("session")
        ])
    )

In [112]:
# 30 minutes of inactivity ends a session. The threshold is given in
# microseconds (30 min * 60 s * 1_000_000 us) to match the datetime[μs] column.
df_with_sessions = find_sessions(df, threshold=30 * 60 * 1_000_000)

In [114]:
# Inspect the last 10 rows of the session-labelled frame.
df_with_sessions.tail(10)

user_id,timestamp,ts_diff,user_diff,new_session_mark,session
str,datetime[μs],bool,bool,bool,u32
"""54aaaba29cd1180cf7000070""",2015-01-05 03:23:09,False,False,False,2288
"""54aaaba29cd1180cf7000070""",2015-01-05 03:23:37,False,False,False,2288
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:01,False,False,False,2288
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:17,False,False,False,2288
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:30,False,False,False,2288
"""54aac9549cd11825500000f8""",2015-01-05 05:26:57,True,True,True,2289
"""54aac9549cd11825500000f8""",2015-01-05 05:27:23,False,False,False,2289
"""54aac9549cd11825500000f8""",2015-01-05 05:27:48,False,False,False,2289
"""54aac9549cd11825500000f8""",2015-01-05 05:27:52,False,False,False,2289
"""54aac9549cd11825500000f8""",2015-01-05 05:28:02,False,False,False,2289
