In [79]:
library(tidyverse)
library(dplyr)
library(tidyr)
library(recipes)
library(tidymodels)
options(repr.matrix.max.rows = 6)

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39m 1.2.1
[32m✔[39m [34mdials       [39m 1.3.0     [32m✔[39m [34mtune        [39m 1.1.2
[32m✔[39m [34minfer       [39m 1.0.7     [32m✔[39m [34mworkflows   [39m 1.1.4
[32m✔[39m [34mmodeldata   [39m 1.4.0     [32m✔[39m [34mworkflowsets[39m 1.0.1
[32m✔[39m [34mparsnip     [39m 1.2.1     [32m✔[39m [34myardstick   [39m 1.3.1

── [1mConflicts[22m ───────────────────────────────────────── tidymodels_conflicts() ──
[31m✖[39m [34mscales[39m::[32mdiscard()[39m masks [34mpurrr[39m::discard()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m   masks [34mstats[39m::filter()
[31m✖[39m [34mrecipes[39m::[32mfixed()[39m  masks [34mstringr[39m::fixed()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m      masks [34mstats[39m::lag()
[31m✖[39m [34myardstick[39m::[32mspec()[39m masks [34m

In [92]:
players_csv <- read_csv('players.csv')
players_csv

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


experience,subscribe,hashedEmail,played_hours,name,gender,Age
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Pro,TRUE,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9
Veteran,TRUE,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17
Veteran,FALSE,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17
⋮,⋮,⋮,⋮,⋮,⋮,⋮
Amateur,FALSE,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db299bd4fedb06a46ad5bb,0.0,Dylan,Prefer not to say,57
Amateur,FALSE,f19e136ddde68f365afc860c725ccff54307dedd13968e896a9f890c40aea436,2.3,Harlow,Male,17
Pro,TRUE,d9473710057f7d42f36570f0be83817a4eea614029ff90cf50d8889cdd729d11,0.2,Ahmed,Other,


In [96]:
#-------------------------- Making Dummy Variables for Experience and Gender --------------------------#

# Dummy variables for experience variable
# (Note that experience = Amateur is not listed as a variable as it is the value assigned if all other variables are 0 value)
exp_dummy_rec <- players_csv |>
    select(experience) |>
    mutate(Original_exp = experience)

dummy_recipe_exp <- recipe(~., data = exp_dummy_rec)

exp_dummy <- dummy_recipe_exp |>
    step_dummy(experience) |>
    prep(training = exp_dummy_rec) |>
    bake(new_data = NULL, Original_exp, starts_with('experience')) |>
    distinct()

# Dummy variables for gender variable
# (Note that gender = agender is not listed as a variable as it is the value assigned if all other variables are 0 value)
gender_dummy_rec <- players_csv |>
    select(gender) |>
    mutate(Original_gender = gender)

dummy_recipe_gender <- recipe(~., data = gender_dummy_rec)

gender_dummy <- dummy_recipe_gender |>
    step_dummy(gender) |>
    prep(training = gender_dummy_rec) |>
    bake(new_data = NULL, Original_gender, starts_with('gender')) |>
    distinct()

#exp_dummy
#gender_dummy

In [99]:
#-------------------------------------- Making Combined Dataset ---------------------------------------#

# Contains the players_csv datset along with the dummy variables for each for use in the model
players_csv_combined <- bind_rows(players_csv, exp_dummy, gender_dummy)

#players_csv_combined

In [118]:
possible_predictors <- players_csv_combined |>
    select(Age,
           experience_Beginner,
           experience_Pro,
           experience_Regular,
           experience_Veteran,
           gender_Female,
           gender_Two.Spirited,
           gender_Male,
           gender_Non.binary,
           gender_Other,
           gender_Prefer.not.to.say)

#possible_predictors

played_hours_formula <- paste('played_hours', '~', paste(possible_predictors, collapse = '+'))
#played_hours_formula

possible_recipes <- recipe(as.formula(model_string), data = players_training)
possible_recipes



[36m──[39m [1mRecipe[22m [36m──────────────────────────────────────────────────────────────────────[39m



── Inputs 

Number of variables by role

outcome: 1



In [126]:
#----------------------------------- Making Linear Regression Model -----------------------------------#

# SPLITTING DATA SETS INTO TESTING AND TRAINING SETS:
players_split <- players_csv_combined |>
    initial_split(prop = 0.75, strata = played_hours)
players_training <- training(players_split)
players_testing <- testing(players_split)

# FINDING BEST PREDICTOR COMBINATION VIA FOWARD SELECTION:

# Making tbl containing all the predictors
possible_predictors <- colnames(players_csv_combined |> select(Age,
                                                               experience_Beginner,
                                                               experience_Pro,
                                                               experience_Regular,
                                                               experience_Veteran,
                                                               gender_Female,
                                                               gender_Two.Spirited,
                                                               gender_Male,
                                                               gender_Non.binary,
                                                               gender_Other,
                                                               gender_Prefer.not.to.say)

# Finding most meaningful predictors

# Empty tibble for accuracy value
accuracy_val <- tibble(predictor_combination = character(),
                       accuracy = numeric())

# Making model specification
players_spec <- linear_reg() |>
    set_engine('lm') |>
    set_mode('regression')

# Store the number of predictors
n_total <- length(possible_predictors)

# Store selected predictors
selected <- c()

# for every size from 1 to the total number of predictors
    for (i in 1:n_total) {
        
# for every predictor still not added yet
    accs <- list()
    models <- list()
    for (j in 1:length(possible_predictors)) {
        
# create a model string for this combination of predictors
        preds_new <- c(selected, possible_predictors[[j]])
        print(possible_predictors)
        print(preds_new)
        model_string <- paste('played_hours', '~', paste(preds_new, collapse = '+'))
        print(model_string)

# create a recipe from each combination of predictors
possible_recipes <- recipe(as.formula(model_string), data = players_training)

# getting summary data into model
test_fit <- workflow() |>
          add_recipe(possible_recipes) |>
          add_model(players_spec) |>
          collect_metrics() |>
          filter(.metric == "accuracy") |>
          summarize(mx = max(mean))
        acc <- acc$mx |> unlist()

        # add this result to the dataframe
        accs[[j]] <- acc
        models[[j]] <- model_string
    }
    jstar <- which.max(unlist(accs))
    accuracies <- accuracies |>
      add_row(size = i,
              model_string = models[[jstar]],
              accuracy = accs[[jstar]])
    selected <- c(selected, names[[jstar]])
    names <- names[-jstar]
}
accuracies


 [1] "Age"                      "experience_Beginner"     
 [3] "experience_Pro"           "experience_Regular"      
 [5] "experience_Veteran"       "gender_Female"           
 [7] "gender_Two.Spirited"      "gender_Male"             
 [9] "gender_Non.binary"        "gender_Other"            
[11] "gender_Prefer.not.to.say"
[1] "Age"


ERROR: Error in eval(expr, envir, enclos): object 'predictor_combination' not found
