In [2]:
library(tidyverse)
library(tidymodels)
library(lubridate)
library(ggplot2)
library(cowplot)

players <- read_csv("data/players.csv")
sessions <- read_csv("data/sessions.csv")
players <- players %>%
  mutate(hashedEmail = str_sub(hashedEmail, 1, 10))
sessions <- sessions %>%
  mutate(hashedEmail = str_sub(hashedEmail, 1, 10))
head(players)
head(sessions)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

experience,subscribe,hashedEmail,played_hours,name,gender,Age
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Pro,True,f6daba428a,30.3,Morgan,Male,9
Veteran,True,f3c813577c,3.8,Christian,Male,17
Veteran,False,b674dd7ee0,0.0,Blake,Male,17
Amateur,True,23fe711e0e,0.7,Flora,Female,21
Regular,True,7dc01f10bf,0.1,Kylie,Male,21
Amateur,True,f58aad5996,0.0,Adrian,Female,17


hashedEmail,start_time,end_time,original_start_time,original_end_time
<chr>,<chr>,<chr>,<dbl>,<dbl>
bfce39c89d,30/06/2024 18:12,30/06/2024 18:24,1719770000000.0,1719770000000.0
36d9cbb4c6,17/06/2024 23:33,17/06/2024 23:46,1718670000000.0,1718670000000.0
f8f5477f5a,25/07/2024 17:34,25/07/2024 17:57,1721930000000.0,1721930000000.0
bfce39c89d,25/07/2024 03:22,25/07/2024 03:58,1721880000000.0,1721880000000.0
36d9cbb4c6,25/05/2024 16:01,25/05/2024 16:12,1716650000000.0,1716650000000.0
bfce39c89d,23/06/2024 15:08,23/06/2024 17:10,1719160000000.0,1719160000000.0


In [3]:
sessions <- sessions %>%
  mutate(
    start_dt = dmy_hm(start_time),
    end_dt   = dmy_hm(end_time),
    duration_mins = as.numeric(difftime(end_dt, start_dt, units = "mins")),
    hour = hour(start_dt),
    wday = wday(start_dt, label = TRUE),
    night = if_else(hour >= 20 | hour < 6, 1, 0),
    weekend = if_else(wday %in% c("Sat", "Sun"), 1, 0)
  )

session_features <- sessions %>%
  group_by(hashedEmail) %>%
  summarize(
    n_sessions = n(),
    avg_duration = mean(duration_mins, na.rm = TRUE),
    prop_night = mean(night, na.rm = TRUE),
    prop_weekend = mean(weekend, na.rm = TRUE),
    .groups = "drop"
  )
session_features <- session_features %>%
  mutate(hashedEmail = str_sub(hashedEmail, 1, 10))
head(session_features)

hashedEmail,n_sessions,avg_duration,prop_night,prop_weekend
<chr>,<int>,<dbl>,<dbl>,<dbl>
0088b5e134,2,53.0,1.0,0.0
060aca80f8,1,30.0,1.0,0.0
0ce7bfa910,1,11.0,1.0,0.0
0d4d71be33,13,32.15385,0.6923077,0.4615385
0d70dd9cac,2,35.0,0.5,0.5
11006065e9,1,10.0,0.0,1.0


In [4]:
modeling_data <- players %>%
  left_join(session_features, by = "hashedEmail") %>%
  replace_na(list(n_sessions = 0, avg_duration = 0, 
                  prop_night = 0, prop_weekend = 0)) %>%
  mutate(subscribe = if_else(subscribe, "Subscribed", "Not Subscribed") %>% 
                     factor(levels = c("Not Subscribed", "Subscribed")),
         experience = factor(experience, 
                             levels = c("Beginner", "Amateur", "Regular", 
                                        "Veteran", "Pro"),
                             ordered = FALSE)) %>%
  select(-hashedEmail, -name)

glimpse(modeling_data)

Rows: 196
Columns: 9
$ experience   [3m[90m<fct>[39m[23m Pro, Veteran, Veteran, Amateur, Regular, Amateur, Regular…
$ subscribe    [3m[90m<fct>[39m[23m Subscribed, Subscribed, Not Subscribed, Subscribed, Subsc…
$ played_hours [3m[90m<dbl>[39m[23m 30.3, 3.8, 0.0, 0.7, 0.1, 0.0, 0.0, 0.0, 0.1, 0.0, 1.6, 0…
$ gender       [3m[90m<chr>[39m[23m "Male", "Male", "Male", "Female", "Male", "Female", "Fema…
$ Age          [3m[90m<dbl>[39m[23m 9, 17, 17, 21, 21, 17, 19, 21, 17, 22, 23, 17, 25, 22, 17…
$ n_sessions   [3m[90m<int>[39m[23m 27, 3, 1, 1, 1, 0, 0, 1, 1, 0, 3, 0, 1, 1, 0, 0, 0, 37, 2…
$ avg_duration [3m[90m<dbl>[39m[23m 74.77778, 85.00000, 5.00000, 50.00000, 9.00000, 0.00000, …
$ prop_night   [3m[90m<dbl>[39m[23m 0.8518519, 1.0000000, 1.0000000, 1.0000000, 1.0000000, 0.…
$ prop_weekend [3m[90m<dbl>[39m[23m 0.5555556, 0.0000000, 0.0000000, 1.0000000, 0.0000000, 0.…


In [5]:
set.seed(123)
data_split <- initial_split(modeling_data, prop = 0.75, strata = subscribe)
train_data <- training(data_split)
test_data  <- testing(data_split)

In [6]:
subscription_recipe <- recipe(subscribe ~ experience + played_hours + n_sessions + 
                                avg_duration + prop_night + prop_weekend, 
                              data = train_data)%>%
  step_mutate(subscribe = factor(subscribe, levels = c("Not Subscribed", "Subscribed")))

logistic_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

subscription_workflow <- workflow() %>%
  add_recipe(subscription_recipe) %>%
  add_model(logistic_spec)

subscription_recipe
logistic_spec



[36m──[39m [1mRecipe[22m [36m──────────────────────────────────────────────────────────────────────[39m



── Inputs 

Number of variables by role

outcome:   1
predictor: 6



── Operations 

[36m•[39m Variable mutation for: [34mfactor(subscribe, levels = c("Not Subscribed",[39m
  [34m"Subscribed"))[39m



Logistic Regression Model Specification (classification)

Computational engine: glm 


In [7]:
set.seed(123)
cv_folds <- vfold_cv(train_data, v = 5, strata = subscribe)

cv_results <- fit_resamples(
  subscription_workflow,
  resamples = cv_folds,
  metrics = metric_set(accuracy, roc_auc),
  control = control_resamples(save_pred = TRUE)
)

cv_metrics <- collect_metrics(cv_results)
cv_metrics


There were issues with some computations   [1m[33mA[39m[22m: x1

There were issues with some computations   [1m[33mA[39m[22m: x5





.metric,.estimator,mean,n,std_err,.config
<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
accuracy,binary,0.7210181,5,0.006325144,Preprocessor1_Model1
roc_auc,binary,0.4641543,5,0.021288113,Preprocessor1_Model1


In [8]:
final_results <- last_fit(subscription_workflow, split = data_split, 
                          metrics = metric_set(accuracy, roc_auc))

test_metrics <- collect_metrics(final_results)
test_metrics


There were issues with some computations   [1m[33mA[39m[22m: x1

There were issues with some computations   [1m[33mA[39m[22m: x1





.metric,.estimator,.estimate,.config
<chr>,<chr>,<dbl>,<chr>
accuracy,binary,0.7346939,Preprocessor1_Model1
roc_auc,binary,0.4433761,Preprocessor1_Model1
