In [1]:
import polars as pl

#from sklearn.model_selection import GroupShuffleSplit
import numpy as np

from sklearn.model_selection import StratifiedGroupKFold

## The data was dowloaded locally, and is read from there (the local disk).
### apparently no header

In [2]:
df = (
    pl.read_excel('Data for Student Hiring Project - Citizen Science .xlsx', read_options={"has_header": False})
    .rename({'column_1': 'user_id', 'column_2': 'timestamp'})
    .with_columns(pl.col('timestamp').str.to_datetime())
)

In [3]:
df.shape

(99999, 2)

In [4]:
df.head()

user_id,timestamp
str,datetime[μs]
"""54999c8d9cd118282b001784""",2014-12-23 04:53:19
"""54999c91bb7b56040d0011e8""",2014-12-23 04:53:20
"""54999c8fbb7b565d7e000a7c""",2014-12-23 04:53:21
"""54999cb29cd1184d77000539""",2014-12-23 04:53:24
"""54068fab91ad6b597e000f97""",2014-12-23 04:53:28


In [136]:
def find_sessions(df, threshold_in_sec: int):
    """Identifing the sessions. giving an id for each session. Also giving a numeric id to a user.
    """
    return (
        df.sort(['user_id', 'timestamp'])
        .with_columns((pl.col('timestamp').diff().dt.total_seconds() > threshold_in_sec).fill_null(True).alias('ts_diff'))
        .with_columns((pl.col('user_id').ne(pl.col('user_id').shift())).fill_null(True).alias('user_diff'))
        .with_columns([
            (pl.col("ts_diff") | pl.col("user_diff")).alias("new_session_mark")
        ])
        .with_columns([
            pl.col("new_session_mark").cum_sum().alias("session")
        ])
        .with_columns([
            pl.col("user_diff").cum_sum().alias("user")
        ])
    )

### We add session ids based on the user_id and whether 30 minutes have passed.

In [137]:
df_with_sessions = df.pipe(find_sessions, threshold_in_sec=60 * 30)

In [138]:
df_with_sessions.tail(10)

user_id,timestamp,ts_diff,user_diff,new_session_mark,session,user
str,datetime[μs],bool,bool,bool,u32,u32
"""54aaaba29cd1180cf7000070""",2015-01-05 03:23:09,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:23:37,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:01,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:17,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:30,False,False,False,2288,942
"""54aac9549cd11825500000f8""",2015-01-05 05:26:57,True,True,True,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:27:23,False,False,False,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:27:48,False,False,False,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:27:52,False,False,False,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:28:02,False,False,False,2289,943


In [139]:
df_with_sessions.head()

user_id,timestamp,ts_diff,user_diff,new_session_mark,session,user
str,datetime[μs],bool,bool,bool,u32,u32
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:02,True,True,True,1,1
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:53,False,False,False,1,1
"""4d5835cc2a82e278e6000028""",2015-01-05 03:05:45,False,False,False,1,1
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:10,False,False,False,1,1
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:25,False,False,False,1,1


### note above both session ids and user ids start with '1'

### Let's learn/predict the additional time in the system (in seconds). Therefore < 5 * 60 means "about to disengage".

In [140]:
df_with_target = (
    df_with_sessions
    .with_columns(
        ((pl.col("timestamp").max().over("session") - pl.col('timestamp')).dt.total_seconds()).alias('target')
    )
)

In [141]:
df_with_target['target'].plot.kde()

In [142]:
df_with_target['target'].log().plot.kde()

In [143]:
y = (df_with_target['target'] < 5 * 60).alias('disengage')
y.value_counts() # note used here '<' rather than '<='. important?

disengage,count
bool,u32
False,67819
True,32180


### For the sake of train/test split, I've decided to group by users, after all if the same user is used in both, we can potentially leak information.

In [144]:
groups = df_with_sessions['user']

In [145]:
# ### The requested .75/.25 split here is based on users. We may end with a differnt ratio with respect to the hits (or also with respect to sessions). 

In [146]:
# gss = GroupShuffleSplit(n_splits=2, train_size=.75, random_state=1)

In [147]:
# for i, (train_index, test_index) in enumerate(gss.split(df_with_sessions, groups=groups)):
#     print(f"Fold {i}:")
#     print(f"  Train: index={len(train_index)}, group={len(np.unique(groups[train_index]))}")
#     print(f"  Test:  index={len(test_index)}, group={len(np.unique(groups[test_index]))}")

In [148]:
sgkf = StratifiedGroupKFold(n_splits=4)

In [149]:
for i, (train_index, test_index) in enumerate(sgkf.split(df_with_target, y, groups)):
     print(f"Fold {i}:")
     print(f"  Train: index={len(train_index)}")
     print(f"         group={len(np.unique(groups[train_index]))}")
     print(f"  Test:  index={len(test_index)}")
     print(f"         group={len(np.unique(groups[test_index]))}")

Fold 0:
  Train: index=74976
         group=703
  Test:  index=25023
         group=240
Fold 1:
  Train: index=75013
         group=691
  Test:  index=24986
         group=252
Fold 2:
  Train: index=75013
         group=726
  Test:  index=24986
         group=217
Fold 3:
  Train: index=74995
         group=709
  Test:  index=25004
         group=234


In [150]:
df_with_target = (
    df_with_target
    .with_columns(pl.repeat(-1, df_with_target.shape[0]).alias('fold'))
)

In [151]:
for i, (train_index, test_index) in enumerate(sgkf.split(df_with_target, y, groups)):
    df_with_target[test_index, 'fold'] = i

In [152]:
df_with_target['fold'].value_counts()

fold,count
i32,u32
0,25023
1,24986
3,25004
2,24986


In [153]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

In [154]:
train_index = df_with_target['fold'] != 0
test_index = df_with_target['fold'] == 0

### Dummy baseline

In [155]:
dummy_model = DummyClassifier()
dummy_model.fit(df_with_target.filter(train_index).select(['fold', 'user']), y.filter(train_index))
predictions = dummy_model.predict(df_with_target.filter(test_index).select(['fold', 'user']))

In [156]:
accuracy_score(y.filter(test_index), predictions)

0.6780961515405827

In [157]:
df_with_target.head()

user_id,timestamp,ts_diff,user_diff,new_session_mark,session,user,target,fold
str,datetime[μs],bool,bool,bool,u32,u32,i64,i32
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:02,True,True,True,1,1,2898,2
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:53,False,False,False,1,1,2847,2
"""4d5835cc2a82e278e6000028""",2015-01-05 03:05:45,False,False,False,1,1,2795,2
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:10,False,False,False,1,1,2770,2
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:25,False,False,False,1,1,2755,2


In [324]:
df_with_features = (
    df_with_target
    .with_columns((pl.col('timestamp') - pl.col('timestamp').dt.date()).dt.total_minutes().alias('time_in_min'))
    .with_columns((pl.col('timestamp').dt.date()).dt.weekday().alias('weekday'))
    .sort(['user_id','timestamp']).with_columns(rn_log = pl.col("timestamp").cum_count().over("session").log())
    .with_columns(diff_from_prev = pl.col('timestamp').diff().dt.total_seconds().fill_null(0))
    .with_columns(diff_from_prev=pl.when(pl.col("new_session_mark"))
                    .then(0)
                    .otherwise(pl.col("diff_from_prev"))
                 )
    # .head()
)

In [325]:
df_with_features.head(20)

user_id,timestamp,ts_diff,user_diff,new_session_mark,session,user,target,fold,time_in_min,weekday,rn_log,diff_from_prev
str,datetime[μs],bool,bool,bool,u32,u32,i64,i32,i64,i8,f64,i64
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:02,true,true,true,1,1,2898,2,184,1,0.0,0
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:53,false,false,false,1,1,2847,2,184,1,0.693147,51
"""4d5835cc2a82e278e6000028""",2015-01-05 03:05:45,false,false,false,1,1,2795,2,185,1,1.098612,52
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:10,false,false,false,1,1,2770,2,186,1,1.386294,25
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:25,false,false,false,1,1,2755,2,186,1,1.609438,15
…,…,…,…,…,…,…,…,…,…,…,…,…
"""4d5835cc2a82e278e6000028""",2015-01-05 03:17:01,false,false,false,1,1,2119,2,197,1,2.772589,49
"""4d5835cc2a82e278e6000028""",2015-01-05 03:17:24,false,false,false,1,1,2096,2,197,1,2.833213,23
"""4d5835cc2a82e278e6000028""",2015-01-05 03:17:54,false,false,false,1,1,2066,2,197,1,2.890372,30
"""4d5835cc2a82e278e6000028""",2015-01-05 03:17:59,false,false,false,1,1,2061,2,197,1,2.944439,5


In [326]:
features = ['time_in_min', 'weekday', 'rn_log', 'diff_from_prev']

In [381]:
from sklearn.linear_model import PoissonRegressor
# from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [459]:
pr = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(max_depth=6, n_estimators=40)
) # PoissonRegressor())

In [460]:
pr.fit(df_with_features.filter(train_index)[features], df_with_features.filter(train_index)['target'])

In [461]:
predictions = pr.predict(df_with_features.filter(test_index)[features])

In [462]:
final_predictions = predictions < 5 * 60

In [463]:
accuracy_score(y.filter(test_index), final_predictions)

0.6846501218878631

In [464]:
sum(final_predictions) / len(final_predictions)

0.008472205570874795

In [465]:
pl.DataFrame(predictions).plot.kde()

In [466]:
df_with_features['weekday'].plot.kde()

In [467]:
df_with_features['time_in_min'].plot.kde()

In [302]:
df_with_features['rn_log'].plot.kde()

In [468]:
for fold in range(4):
    train_index = df_with_target['fold'] != 0
    test_index = df_with_target['fold'] == 0
    pr.fit(df_with_features.filter(train_index)[features], df_with_features.filter(train_index)['target'])
    predictions = pr.predict(df_with_features.filter(test_index)[features])
    final_predictions = predictions < 5 * 60
    acc = accuracy_score(y.filter(test_index), final_predictions)
    print(fold, acc)

0 0.6847700115893378
1 0.6844503057187388
2 0.6848499380569876
3 0.6847700115893378
