In [1]:
import polars as pl

from sklearn.model_selection import GroupShuffleSplit
import numpy as np

## The data was downloaded locally and is read from the local disk.
### apparently no header

In [18]:
# The sheet is read without a header row, so the two columns arrive as
# column_1 / column_2; rename them and parse the timestamp strings.
df = pl.read_excel(
    'Data for Student Hiring Project - Citizen Science .xlsx',
    read_options=dict(has_header=False),
).rename(
    dict(column_1='user_id', column_2='timestamp')
).with_columns(
    pl.col('timestamp').str.to_datetime()
)

In [19]:
# Sanity check: dimensions of the loaded frame as (rows, columns).
df.shape

(99999, 2)

In [20]:
# Peek at the first rows to confirm the renamed columns and parsed dtypes.
df.head()

user_id,timestamp
str,datetime[μs]
"""54999c8d9cd118282b001784""",2014-12-23 04:53:19
"""54999c91bb7b56040d0011e8""",2014-12-23 04:53:20
"""54999c8fbb7b565d7e000a7c""",2014-12-23 04:53:21
"""54999cb29cd1184d77000539""",2014-12-23 04:53:24
"""54068fab91ad6b597e000f97""",2014-12-23 04:53:28


In [21]:
def find_sessions(df, threshold: int):
    """Identify sessions, giving a numeric id to each session and to each user.

    The frame is sorted by ``user_id`` and then ``timestamp``. A row starts a
    new session when it is the first row of a user, or when the gap to the
    previous row is strictly greater than ``threshold``.

    Args:
        df: polars DataFrame with a ``user_id`` column and a datetime
            ``timestamp`` column.
        threshold: largest gap, in microseconds, allowed between two
            consecutive rows of the same session.

    Returns:
        The sorted frame with these columns appended, in order:
        ``ts_diff`` (gap to previous row exceeds the threshold),
        ``user_diff`` (user differs from previous row),
        ``new_session_mark`` (either of the two),
        ``session`` (running count of session starts, beginning at 1) and
        ``user`` (running count of user changes, beginning at 1).
    """
    # Convert the gap to microseconds explicitly instead of casting the
    # Duration to its raw integer: the raw value depends on the column's time
    # unit (ms / us / ns), which would silently change what `threshold` means.
    gap_us = pl.col('timestamp').diff().dt.total_microseconds()
    user_changed = pl.col('user_id').ne(pl.col('user_id').shift())

    return (
        df.sort(['user_id', 'timestamp'])
        .with_columns([
            # The first row has no predecessor (null); treat it as a boundary.
            (gap_us > threshold).fill_null(True).alias('ts_diff'),
            user_changed.fill_null(True).alias('user_diff'),
        ])
        .with_columns([
            # The gap is taken over the whole sorted frame, so the first row of
            # a user may compare against another user's last row; OR-ing with
            # user_diff guarantees a session break there regardless.
            (pl.col("ts_diff") | pl.col("user_diff")).alias("new_session_mark")
        ])
        .with_columns([
            pl.col("new_session_mark").cum_sum().alias("session"),
            pl.col("user_diff").cum_sum().alias("user"),
        ])
    )

### We add session ids: a new session starts when the user_id changes or when more than 30 minutes have passed since the previous hit.

In [22]:
# Session gap of 30 minutes, expressed in microseconds (30 min * 60 s * 1e6 us).
df_with_sessions = find_sessions(df, threshold=30 * 60 * 1_000_000)

In [23]:
# Inspect the last rows to check the session/user markers and running ids.
df_with_sessions.tail(10)

user_id,timestamp,ts_diff,user_diff,new_session_mark,session,user
str,datetime[μs],bool,bool,bool,u32,u32
"""54aaaba29cd1180cf7000070""",2015-01-05 03:23:09,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:23:37,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:01,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:17,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:30,False,False,False,2288,942
"""54aac9549cd11825500000f8""",2015-01-05 05:26:57,True,True,True,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:27:23,False,False,False,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:27:48,False,False,False,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:27:52,False,False,False,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:28:02,False,False,False,2289,943


### For the train/test split, I've decided to group by user: if the same user appeared in both sets, information could leak between them.

In [24]:
# Numeric user id of every row; used as the grouping key for the split.
groups = df_with_sessions.get_column('user')

### The requested .75/.25 split here is based on users. We may end up with a different ratio with respect to the hits (or with respect to sessions).

In [27]:
# Group-aware shuffle splitter: 75% of the groups go to train, with a fixed
# seed for reproducibility; two splits are generated.
gss = GroupShuffleSplit(
    n_splits=2,
    train_size=0.75,
    random_state=1,
)

In [28]:
# For every split, report how many rows and how many distinct users land on
# each side. Splitting is done by `groups`, so the row counts need not follow
# the same ratio as the user counts.
for i, (train_index, test_index) in enumerate(gss.split(df_with_sessions, groups=groups)):
    print(f"Fold {i}:")
    # index = number of rows; group = number of distinct values of `groups` among those rows
    print(f"  Train: index={len(train_index)}, group={len(np.unique(groups[train_index]))}")
    print(f"  Test:  index={len(test_index)}, group={len(np.unique(groups[test_index]))}")

Fold 0:
  Train: index=70774, group=707
  Test:  index=29225, group=236
Fold 1:
  Train: index=76157, group=707
  Test:  index=23842, group=236
