In [1]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from datetime import datetime

In [2]:
sessions = pd.read_csv("sessions.csv")
sessions = sessions.drop(columns = ['original_start_time', 'original_end_time'])
sessions.head()

Unnamed: 0,hashedEmail,start_time,end_time
0,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,30/06/2024 18:12,30/06/2024 18:24
1,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,17/06/2024 23:33,17/06/2024 23:46
2,f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3...,25/07/2024 17:34,25/07/2024 17:57
3,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,25/07/2024 03:22,25/07/2024 03:58
4,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,25/05/2024 16:01,25/05/2024 16:12


In [3]:
sessions['start_time'] = pd.to_datetime(sessions['start_time'], format='%d/%m/%Y %H:%M')
sessions['end_time'] = pd.to_datetime(sessions['end_time'], format='%d/%m/%Y %H:%M')

sessions['start_hour'] = sessions['start_time'].dt.hour
sessions['end_hour'] = sessions['end_time'].dt.hour
sessions['start_min'] = sessions['start_time'].dt.minute
sessions['end_min'] = sessions['end_time'].dt.minute
sessions.head()

Unnamed: 0,hashedEmail,start_time,end_time,start_hour,end_hour,start_min,end_min
0,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,2024-06-30 18:12:00,2024-06-30 18:24:00,18,18.0,12,24.0
1,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,2024-06-17 23:33:00,2024-06-17 23:46:00,23,23.0,33,46.0
2,f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3...,2024-07-25 17:34:00,2024-07-25 17:57:00,17,17.0,34,57.0
3,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,2024-07-25 03:22:00,2024-07-25 03:58:00,3,3.0,22,58.0
4,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,2024-05-25 16:01:00,2024-05-25 16:12:00,16,16.0,1,12.0


In [4]:
sessions = sessions.drop(columns = ["start_time","end_time","hashedEmail"])
sessions['start_hour'] = sessions['start_hour'].astype(float)
sessions['end_hour'] = sessions['end_hour'].astype(float)
sessions['start_min'] = sessions['start_min'].astype(float)
sessions['end_min'] = sessions['end_min'].astype(float)
sessions.head()

Unnamed: 0,start_hour,end_hour,start_min,end_min
0,18.0,18.0,12.0,24.0
1,23.0,23.0,33.0,46.0
2,17.0,17.0,34.0,57.0
3,3.0,3.0,22.0,58.0
4,16.0,16.0,1.0,12.0


In [5]:
# Calculate start and end times in minutes since midnight
sessions['start_time_in_minutes'] = sessions['start_hour'] * 60 + sessions['start_min']
sessions['end_time_in_minutes'] = sessions['end_hour'] * 60 + sessions['end_min']
sessions.head()

Unnamed: 0,start_hour,end_hour,start_min,end_min,start_time_in_minutes,end_time_in_minutes
0,18.0,18.0,12.0,24.0,1092.0,1104.0
1,23.0,23.0,33.0,46.0,1413.0,1426.0
2,17.0,17.0,34.0,57.0,1054.0,1077.0
3,3.0,3.0,22.0,58.0,202.0,238.0
4,16.0,16.0,1.0,12.0,961.0,972.0


In [6]:
# Create an empty list to store active players count
activity_metric = []

# Loop through every 10 minutes of the day (0 to 1439 minutes)
for minute in range(0, 24 * 60, 10):  # Step size of 10
    active_players = 0
    # Check each session if it's active during this time interval
    for _, session in sessions.iterrows():
        if session['start_time_in_minutes'] <= minute < session['end_time_in_minutes']:
            active_players += 1
    # Append the count of active players for this 10-minute interval
    activity_metric.append(active_players)

# Create a DataFrame with the results
activity_df = pd.DataFrame({
    'time_in_minutes': range(0, 24 * 60, 10),  # 10-minute intervals
    'active_players': activity_metric
})

activity_df.head()

Unnamed: 0,time_in_minutes,active_players
0,0,3
1,10,21
2,20,38
3,30,49
4,40,60


In [7]:
# Split the data into training and testing sets
activity_training, activity_testing = train_test_split(activity_df, test_size=0.25, random_state=100)

# Prepare the train and test sets
X_train = activity_training[["time_in_minutes"]]
X_test = activity_testing[["time_in_minutes"]]  
y_train = activity_training["active_players"]  
y_test = activity_testing["active_players"]  


In [8]:
# Create the pipeline
pipeline = make_pipeline(StandardScaler(),KNeighborsRegressor(),)

#Cross Validation
activity_cv = pd.DataFrame(cross_validate(estimator = pipeline, cv = 5, X = X_train, y = y_train,
                                          scoring="neg_root_mean_squared_error",return_train_score=True,))

activity_cv

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.003172,0.001687,-6.878689,-7.886816
1,0.003562,0.001431,-12.423365,-7.394498
2,0.002298,0.001763,-6.636949,-7.364907
3,0.002248,0.001369,-11.102295,-8.105823
4,0.002222,0.001318,-8.597896,-6.330986


In [9]:
param_grid = {"kneighborsregressor__n_neighbors":range(1,50,1)}
activity_tuned = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring="neg_root_mean_squared_error")
activity_results = pd.DataFrame(activity_tuned.fit(X_train, y_train).cv_results_) 
activity_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsregressor__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002691,0.000373,0.001638,0.000157,1,{'kneighborsregressor__n_neighbors': 1},-7.769404,-6.292131,-5.364022,-6.747133,-4.961759,-6.22689,0.999936,2
1,0.002199,2e-05,0.001433,9.7e-05,2,{'kneighborsregressor__n_neighbors': 2},-5.493798,-7.012165,-3.316625,-5.598682,-6.349916,-5.554237,1.246691,1
2,0.002191,4.8e-05,0.001344,2.3e-05,3,{'kneighborsregressor__n_neighbors': 3},-5.801863,-8.94399,-4.720469,-7.298982,-6.642416,-6.681544,1.422749,3
3,0.002148,2.3e-05,0.001299,6e-06,4,{'kneighborsregressor__n_neighbors': 4},-6.114904,-10.794438,-7.151367,-9.56463,-7.125574,-8.150182,1.742559,4
4,0.0022,6.4e-05,0.001321,5e-06,5,{'kneighborsregressor__n_neighbors': 5},-6.878689,-12.423365,-6.636949,-11.102295,-8.597896,-9.127839,2.293601,5


In [10]:
activity_min = activity_tuned.best_params_
activity_best_RMSPE = -activity_tuned.best_score_
activity_min

{'kneighborsregressor__n_neighbors': 2}

In [11]:
activity_best_RMSPE

np.float64(5.554237030768104)

In [12]:
activity_prediction = activity_tuned.predict(X_test)
activity_summary = mean_squared_error(y_true = y_test, 
                                      y_pred = activity_prediction)**(1/2)
activity_summary

np.float64(4.360491817317043)

In [14]:
activity_preds_train = activity_training.assign(predictions= activity_tuned.predict(X_train))
base_plot = alt.Chart(activity_preds_train).mark_circle(opacity=0.4).encode(
    x=alt.X("time_in_minutes").title("Time in minutes (from midnight)"),
    y=alt.Y("active_players").title("Number of players active"))

line = alt.Chart(activity_preds_train).mark_line(color="black").encode(
    x="time_in_minutes",
    y="predictions")
activity_plot=alt.layer(base_plot,line)
activity_plot

In [16]:
activity_preds_test = activity_testing.assign(predictions= activity_tuned.predict(X_test))
base_plot = alt.Chart(activity_preds_test).mark_circle(opacity=0.4).encode(
    x=alt.X("time_in_minutes").title("Time in minutes (from midnight)"),
    y=alt.Y("active_players").title("Number of players active"))

line = alt.Chart(activity_preds_train).mark_line(color="black").encode(
    x="time_in_minutes",
    y="predictions")
activity_plot=alt.layer(base_plot,line)
activity_plot