In [None]:
library(tidyverse)
library(dplyr)
library(tidyr)
library(recipes)
library(tidymodels)
options(repr.matrix.max.rows = 6)

In [None]:
# Load the raw player data; the relative path is resolved against the working directory.
players_csv <- read_csv(file = "players.csv")
#players_csv

In [None]:
#-------------------------- Making Dummy Variables for Experience and Gender --------------------------#

# Builds two small lookup tables (exp_dummy, gender_dummy) that show, for every level of
# `experience` and `gender`, which dummy (indicator) columns step_dummy() produces for it.
# The baseline level of each variable gets no column of its own: it is the row where every
# dummy column is 0 (experience = Amateur, gender = agender).

# Step 1: isolate each variable and keep an untouched copy of it (Original_*), because
# step_dummy() replaces the column it encodes.
exp_dummy_rec <- players_csv |>
    transmute(experience, Original_exp = experience)

gender_dummy_rec <- players_csv |>
    transmute(gender, Original_gender = gender)

# Step 2: one recipe per variable, with every column treated as a predictor.
dummy_recipe_exp <- recipe(~ ., data = exp_dummy_rec)
dummy_recipe_gender <- recipe(~ ., data = gender_dummy_rec)

# Step 3: encode, bake the training data back out, and keep one row per distinct level.
exp_dummy <- dummy_recipe_exp |>
    step_dummy(experience) |>
    prep(training = exp_dummy_rec) |>
    bake(new_data = NULL, Original_exp, starts_with("experience")) |>
    distinct()

gender_dummy <- dummy_recipe_gender |>
    step_dummy(gender) |>
    prep(training = gender_dummy_rec) |>
    bake(new_data = NULL, Original_gender, starts_with("gender")) |>
    distinct()

#exp_dummy
#gender_dummy

In [None]:
#-------------------------------------- Making Combined Dataset ---------------------------------------#

# Contains the players_csv dataset along with the dummy variables for experience and gender,
# for use in the model.
#
# exp_dummy and gender_dummy are lookup tables with one row per level, so they are joined
# onto the player rows by level. bind_rows() would instead stack them underneath the
# players as extra rows that are NA in every player column (including played_hours).
#
# The Original_* key columns may come back from bake() as factors, so they are converted
# to character to match the key columns read by read_csv().
players_csv_combined <- players_csv |>
    left_join(exp_dummy |> mutate(Original_exp = as.character(Original_exp)),
              by = c("experience" = "Original_exp")) |>
    left_join(gender_dummy |> mutate(Original_gender = as.character(Original_gender)),
              by = c("gender" = "Original_gender"))

players_csv_combined

In [None]:
#----------------------------------- Making Linear Regression Model -----------------------------------#

# Fix the RNG seed so the train/test split and the cross-validation folds (and therefore
# the forward-selection results below) are the same on every run.
set.seed(1234)

# SPLITTING DATA SETS INTO TESTING AND TRAINING SETS:
players_split <- players_csv_combined |>
    initial_split(prop = 0.75, strata = played_hours)
players_training <- training(players_split)
players_testing <- testing(players_split)

# FINDING BEST PREDICTOR COMBINATION VIA FORWARD SELECTION:

# Names of all candidate predictors. The categorical ones are listed as the original
# columns; their dummy columns are generated by step_dummy() inside the recipe.
possible_predictors <- players_csv_combined |>
    select(Age, gender, experience) |>
    colnames()

# Finding most meaningful predictors

# create an empty tibble to store the best model found at each size
accuracies <- tibble(size = integer(),
                     model_string = character(),
                     rmse = numeric())

# create a model specification
players_spec <- linear_reg() |>
    set_engine("lm") |>
    set_mode("regression")

# create a 5-fold cross-validation object
players_vfold <- vfold_cv(players_training, v = 5)

# store the total number of predictors
n_total <- length(possible_predictors)

# stores selected predictors
selected <- c()

# for every size from 1 to the total number of predictors
for (i in seq_len(n_total)) {

    # cross-validated RMSE and formula for each predictor that has not been added yet
    accs <- numeric(length(possible_predictors))
    models <- character(length(possible_predictors))

    for (j in seq_along(possible_predictors)) {

        # create a model string for this combination of predictors
        preds_new <- c(selected, possible_predictors[[j]])
        model_string <- paste("played_hours", "~", paste(preds_new, collapse = "+"))

        # create a recipe from the model string.
        # all_nominal_predictors() encodes only the categorical predictors that are
        # actually in this formula; naming gender/experience explicitly fails for
        # formulas that leave one of them out (e.g. played_hours ~ Age).
        players_recipe <- recipe(as.formula(model_string), data = players_training) |>
            step_dummy(all_nominal_predictors()) |>
            step_scale(all_predictors()) |>
            step_center(all_predictors())

        # estimate the RMSE of this candidate model with cross-validation
        acc <- workflow() |>
            add_recipe(players_recipe) |>
            add_model(players_spec) |>
            fit_resamples(resamples = players_vfold, metrics = metric_set(rmse)) |>
            collect_metrics() |>
            filter(.metric == "rmse") |>
            pull(mean)

        # record the result for this candidate
        accs[[j]] <- acc
        models[[j]] <- model_string
    }

    # keep the candidate with the lowest RMSE and remove it from the pool
    jstar <- which.min(accs)
    accuracies <- accuracies |>
        add_row(size = i,
                model_string = models[[jstar]],
                rmse = accs[[jstar]])
    selected <- c(selected, possible_predictors[[jstar]])
    possible_predictors <- possible_predictors[-jstar]
}
accuracies