In [1]:
import numpy as np
import polars as pl

#from sklearn.model_selection import GroupShuffleSplit
from sklearn.base import clone
from sklearn.model_selection import StratifiedGroupKFold

## The data was downloaded beforehand and is read from the local disk.
### The spreadsheet apparently has no header row.

In [2]:
# Load the raw hit log and give its two columns meaningful names.
df = (
    # Read without a header row; the two columns then come back as column_1 / column_2.
    pl.read_excel('Data for Student Hiring Project - Citizen Science .xlsx', read_options={"has_header": False})
    .rename({'column_1': 'user_id', 'column_2': 'timestamp'})
    # The timestamp column is read as text; parse it into a datetime column.
    .with_columns(pl.col('timestamp').str.to_datetime())
)

In [3]:
df.shape

(99999, 2)

In [4]:
df.head()

user_id,timestamp
str,datetime[μs]
"""54999c8d9cd118282b001784""",2014-12-23 04:53:19
"""54999c91bb7b56040d0011e8""",2014-12-23 04:53:20
"""54999c8fbb7b565d7e000a7c""",2014-12-23 04:53:21
"""54999cb29cd1184d77000539""",2014-12-23 04:53:24
"""54068fab91ad6b597e000f97""",2014-12-23 04:53:28


In [5]:
def find_sessions(df, threshold_in_sec: int):
    """Split each user's hits into sessions and number both sessions and users.

    The frame is ordered by ``user_id`` and ``timestamp``. A row opens a new
    session when it is the first row of a user or when more than
    ``threshold_in_sec`` seconds separate it from the previous row.

    Args:
        df: frame with a ``user_id`` column and a datetime ``timestamp`` column.
        threshold_in_sec: inactivity gap, in seconds, above which a new
            session starts.

    Returns:
        The sorted frame with five extra columns, in this order:
        ``ts_diff`` (gap exceeded), ``user_diff`` (user changed),
        ``new_session_mark`` (either of the two), ``session`` (running session
        id) and ``user`` (running numeric user id). Both ids start at 1,
        because the very first row is always flagged.
    """
    # The first row has no predecessor, so both comparisons yield null there;
    # treating null as True makes the first row open session 1 / user 1.
    gap_exceeded = (
        pl.col('timestamp').diff().dt.total_seconds() > threshold_in_sec
    ).fill_null(True)
    user_changed = pl.col('user_id').ne(pl.col('user_id').shift()).fill_null(True)

    ordered = df.sort(['user_id', 'timestamp'])
    flagged = ordered.with_columns([
        gap_exceeded.alias('ts_diff'),
        user_changed.alias('user_diff'),
    ])
    # The timestamp gap is computed across user boundaries too; OR-ing with the
    # user-change flag guarantees a new session whenever the user changes.
    marked = flagged.with_columns([
        (pl.col("ts_diff") | pl.col("user_diff")).alias("new_session_mark")
    ])
    # Running counts of the boolean flags act as incrementing ids.
    return marked.with_columns([
        pl.col("new_session_mark").cum_sum().alias("session"),
        pl.col("user_diff").cum_sum().alias("user"),
    ])

### We add session ids: a new session starts when the user_id changes or when more than 30 minutes have passed since the previous hit.

In [6]:
# Sessionize the hits using a 30-minute inactivity threshold (given in seconds).
df_with_sessions = find_sessions(df, threshold_in_sec=30 * 60)

In [7]:
df_with_sessions.tail(10)

user_id,timestamp,ts_diff,user_diff,new_session_mark,session,user
str,datetime[μs],bool,bool,bool,u32,u32
"""54aaaba29cd1180cf7000070""",2015-01-05 03:23:09,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:23:37,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:01,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:17,False,False,False,2288,942
"""54aaaba29cd1180cf7000070""",2015-01-05 03:24:30,False,False,False,2288,942
"""54aac9549cd11825500000f8""",2015-01-05 05:26:57,True,True,True,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:27:23,False,False,False,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:27:48,False,False,False,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:27:52,False,False,False,2289,943
"""54aac9549cd11825500000f8""",2015-01-05 05:28:02,False,False,False,2289,943


In [8]:
df_with_sessions.head()

user_id,timestamp,ts_diff,user_diff,new_session_mark,session,user
str,datetime[μs],bool,bool,bool,u32,u32
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:02,True,True,True,1,1
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:53,False,False,False,1,1
"""4d5835cc2a82e278e6000028""",2015-01-05 03:05:45,False,False,False,1,1
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:10,False,False,False,1,1
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:25,False,False,False,1,1


### Note that, as shown above, both session ids and user ids start at 1.

### Let's learn/predict the additional time the user stays in the session (in seconds). A value below 5 * 60 seconds therefore means "about to disengage".

In [9]:
# Regression target: seconds between each hit and the last hit of its session
# (0 for the session's final hit).
df_with_target = df_with_sessions.with_columns(
    target=(
        pl.col('timestamp').max().over('session') - pl.col('timestamp')
    ).dt.total_seconds()
)
# A cap on the target was tried and is left disabled:
# .with_columns(
#     target=pl.when(pl.col('target') > 6000).then(6000).otherwise(pl.col('target'))
# )

In [10]:
df_with_target['target'].plot.kde()

%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)


In [11]:
df_with_target['target'].log().plot.kde()

In [12]:
# Binary label: fewer than 5 minutes of session time remain after this hit.
# NOTE: the comparison is strict ('<' rather than '<='); is that important?
y = df_with_target['target'].lt(5 * 60).alias('disengage')
y.value_counts()

disengage,count
bool,u32
True,32180
False,67819


### For the train/test split, I've decided to group by user: if the same user appears in both train and test, information can potentially leak between them.

In [13]:
# One group label per row: the numeric user id, so a user's rows stay together when splitting.
groups = df_with_sessions.get_column('user')

In [14]:
# ### The requested .75/.25 split here is based on users. We may end up with a different ratio with respect to the hits (and also with respect to sessions).

In [15]:
# gss = GroupShuffleSplit(n_splits=2, train_size=.75, random_state=1)

In [16]:
# for i, (train_index, test_index) in enumerate(gss.split(df_with_sessions, groups=groups)):
#     print(f"Fold {i}:")
#     print(f"  Train: index={len(train_index)}, group={len(np.unique(groups[train_index]))}")
#     print(f"  Test:  index={len(test_index)}, group={len(np.unique(groups[test_index]))}")

In [17]:
# 4-fold splitter; it is called below with `y` as the stratification target
# and `groups` (the numeric user ids) as the grouping key.
sgkf = StratifiedGroupKFold(n_splits=4)

In [18]:
# Report, for every fold, how many rows and how many distinct users land on
# the train side and on the test side.
for i, (train_index, test_index) in enumerate(sgkf.split(df_with_target, y, groups)):
    print(f"Fold {i}:")
    # The labels are padded to equal width so the printed columns line up.
    for label, row_idx in (("Train:", train_index), ("Test: ", test_index)):
        print(f"  {label} index={len(row_idx)}")
        print(f"         group={len(np.unique(groups[row_idx]))}")

Fold 0:
  Train: index=74976
         group=703
  Test:  index=25023
         group=240
Fold 1:
  Train: index=75013
         group=691
  Test:  index=24986
         group=252
Fold 2:
  Train: index=75013
         group=726
  Test:  index=24986
         group=217
Fold 3:
  Train: index=74995
         group=709
  Test:  index=25004
         group=234


In [19]:
# Add a 'fold' column set to -1 for every row; the real fold ids are written in the next cell.
df_with_target = df_with_target.with_columns(
    fold=pl.repeat(-1, df_with_target.height)
)

In [20]:
# Label every row with the index of the fold in which it is a *test* row.
# The split is recomputed here with the same splitter and inputs as in the
# reporting loop.
for i, (train_index, test_index) in enumerate(sgkf.split(df_with_target, y, groups)):
    # In-place write into the 'fold' column for this fold's test rows.
    df_with_target[test_index, 'fold'] = i

In [21]:
df_with_target['fold'].value_counts()

fold,count
i32,u32
1,24986
2,24986
0,25023
3,25004


In [22]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

In [23]:
# Boolean row masks for a single split: fold 0 is held out, the other folds are used for training.
test_index = df_with_target['fold'].eq(0)
train_index = df_with_target['fold'].ne(0)

### Dummy baseline

In [24]:
# Baseline that does not learn from the inputs; the two columns passed in are
# only placeholders so that fit/predict receive a frame of the right length.
placeholder_cols = ['fold', 'user']
dummy_model = DummyClassifier().fit(
    df_with_target.filter(train_index).select(placeholder_cols),
    y.filter(train_index),
)
predictions = dummy_model.predict(
    df_with_target.filter(test_index).select(placeholder_cols)
)

In [25]:
accuracy_score(y.filter(test_index), predictions)

0.6780961515405827

In [26]:
df_with_target.head()

user_id,timestamp,ts_diff,user_diff,new_session_mark,session,user,target,fold
str,datetime[μs],bool,bool,bool,u32,u32,i64,i32
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:02,True,True,True,1,1,2898,2
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:53,False,False,False,1,1,2847,2
"""4d5835cc2a82e278e6000028""",2015-01-05 03:05:45,False,False,False,1,1,2795,2
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:10,False,False,False,1,1,2770,2
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:25,False,False,False,1,1,2755,2


In [27]:
# Feature engineering. Columns are added in this order:
#   time_in_min            minutes elapsed since midnight of the hit's day
#   weekday                day of week of the hit
#   rn / rn_log            1-based position of the hit inside its session, and its log
#   diff_from_prev         seconds since the previous hit (0 for a session's first hit)
#   since_start_of_session seconds since the first hit of the session
#   avg_per_hit            since_start_of_session divided by rn
df_with_features = (
    df_with_target
    # Purely row-wise features, derived from the timestamp alone.
    .with_columns([
        (pl.col('timestamp') - pl.col('timestamp').dt.date()).dt.total_minutes().alias('time_in_min'),
        pl.col('timestamp').dt.date().dt.weekday().alias('weekday'),
    ])
    # The order-dependent features below need rows in user/time order.
    .sort(['user_id', 'timestamp'])
    .with_columns(pl.col('timestamp').cum_count().over('session').alias('rn'))
    .with_columns(pl.col('rn').log().alias('rn_log'))
    .with_columns(
        # The gap to the previous row is meaningless on the first hit of a
        # session (it would span sessions or users), so it is forced to 0 there.
        pl.when(pl.col('new_session_mark'))
        .then(0)
        .otherwise(pl.col('timestamp').diff().dt.total_seconds().fill_null(0))
        .alias('diff_from_prev')
    )
    .with_columns(
        (pl.col('timestamp') - pl.col('timestamp').min().over('session'))
        .dt.total_seconds()
        .alias('since_start_of_session')
    )
    .with_columns(
        (pl.col('since_start_of_session') / pl.col('rn')).alias('avg_per_hit')
    )
)

In [28]:
df_with_features.head(20)

user_id,timestamp,ts_diff,user_diff,new_session_mark,session,user,target,fold,time_in_min,weekday,rn,rn_log,diff_from_prev,since_start_of_session,avg_per_hit
str,datetime[μs],bool,bool,bool,u32,u32,i64,i32,i64,i8,u32,f64,i64,i64,f64
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:02,true,true,true,1,1,2898,2,184,1,1,0.0,0,0,0.0
"""4d5835cc2a82e278e6000028""",2015-01-05 03:04:53,false,false,false,1,1,2847,2,184,1,2,0.693147,51,51,25.5
"""4d5835cc2a82e278e6000028""",2015-01-05 03:05:45,false,false,false,1,1,2795,2,185,1,3,1.098612,52,103,34.333333
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:10,false,false,false,1,1,2770,2,186,1,4,1.386294,25,128,32.0
"""4d5835cc2a82e278e6000028""",2015-01-05 03:06:25,false,false,false,1,1,2755,2,186,1,5,1.609438,15,143,28.6
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4d5835cc2a82e278e6000028""",2015-01-05 03:17:01,false,false,false,1,1,2119,2,197,1,16,2.772589,49,779,48.6875
"""4d5835cc2a82e278e6000028""",2015-01-05 03:17:24,false,false,false,1,1,2096,2,197,1,17,2.833213,23,802,47.176471
"""4d5835cc2a82e278e6000028""",2015-01-05 03:17:54,false,false,false,1,1,2066,2,197,1,18,2.890372,30,832,46.222222
"""4d5835cc2a82e278e6000028""",2015-01-05 03:17:59,false,false,false,1,1,2061,2,197,1,19,2.944439,5,837,44.052632


In [29]:
features = ['time_in_min', 'weekday', 'rn_log', 'diff_from_prev'] # , 'avg_per_hit', 'since_start_of_session']

In [30]:
from sklearn.linear_model import PoissonRegressor
# from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression

In [31]:
# Model under test: a small random forest wrapped in a pipeline; the cells
# below fit it on the 'target' column (remaining session time in seconds).
# The commented-out entries are alternatives that were tried.
model = make_pipeline(
    # StandardScaler(),
    # PoissonRegressor()
    RandomForestRegressor(max_depth=6, n_estimators=40, random_state=1)  # fixed random_state
    #  LogisticRegression()
)

In [42]:
# Fit on the training rows only; filter once and reuse the result for X and y.
train_rows = df_with_features.filter(train_index)
model.fit(train_rows[features], train_rows['target'])

In [43]:
predictions = model.predict(df_with_features.filter(test_index)[features])
#final_predictions = model.predict(df_with_features.filter(test_index)[features])

In [44]:
# Turn the regression output (predicted remaining seconds) into the binary
# "disengage" label, using the same 5-minute cutoff that defines `y`.
final_predictions = predictions < 5 * 60

In [45]:
accuracy_score(y.filter(test_index), final_predictions)

0.684090545512718

In [46]:
sum(final_predictions) / len(final_predictions)

0.009878419452887538

In [47]:
pl.DataFrame(predictions).plot.kde()

In [48]:
df_with_features['weekday'].plot.kde()

In [49]:
df_with_features['time_in_min'].plot.kde()

In [50]:
df_with_features['rn_log'].plot.kde()

In [51]:
# Cross-validated accuracy of the regressor-as-classifier: one line per fold.
#
# Everything inside the loop uses loop-local names and a fresh clone of
# `model`. The single-split cells earlier in the notebook read the globals
# `train_index`, `test_index`, `predictions`, `final_predictions` and the
# fitted `model`; rebinding them here would silently change what those cells
# compute when they are re-run after this one (they would then report the
# last fold's numbers instead of fold 0's).
for fold in range(sgkf.get_n_splits()):  # follow the splitter rather than a hard-coded 4
    fold_test_mask = df_with_target['fold'] == fold
    fold_train_mask = df_with_target['fold'] != fold
    fold_train = df_with_features.filter(fold_train_mask)
    fold_test = df_with_features.filter(fold_test_mask)

    # clone() gives an unfitted copy with identical hyper-parameters (including
    # random_state), so the scores match a refit of `model` without mutating it.
    fold_model = clone(model).fit(fold_train[features], fold_train['target'])

    # The regressor predicts remaining seconds; under 5 minutes counts as
    # "disengage", the same cutoff used to build `y`.
    fold_disengage = fold_model.predict(fold_test[features]) < 5 * 60
    acc = accuracy_score(y.filter(fold_test_mask), fold_disengage)
    print(fold, acc)

0 0.6843304160172641
1 0.6801809013047306
2 0.6840630753221805
3 0.684090545512718
