In [11]:
library(tidyverse)
library(tidymodels)

# Fix the RNG seed so the random steps later in this script (the stratified
# train/test split and the cross-validation folds) are reproducible.
set.seed(123)

# Load the player records and prepare them for modelling:
#   * rename the camel-case columns to snake_case;
#   * drop rows with a missing value in a column the classifier actually uses
#     (the response `subscribe`, the predictors `age` and `played_hours`).
#     A gap in an unused column such as `name` or `gender` no longer discards
#     an otherwise usable observation;
#   * convert `subscribe` (read by `read_csv()` as logical) to a factor so it
#     can serve as a classification outcome.
url_players <- "https://raw.githubusercontent.com/Markus888888/dsci_100_project/refs/heads/main/players.csv"
players_data <- read_csv(url_players) |>
    rename(age = Age, hashed_email = hashedEmail) |>
    drop_na(age, played_hours, subscribe) |>
    mutate(subscribe = as_factor(subscribe))

# Reserve 80% of the rows for training and keep the rest as a test set.
# The split is stratified on `subscribe`.
players_split <- players_data |>
    initial_split(prop = 0.8, strata = subscribe)

# Materialise the two partitions as separate data frames.
players_training <- players_split |> training()
players_testing <- players_split |> testing()

# Preprocessing recipe: predict `subscribe` from `age` and `played_hours`.
# The recipe is declared against the training partition only.
players_recipe <- recipe(
    subscribe ~ age + played_hours,
    data = players_training
)

# Scale and then centre both predictors so that neither one dominates the
# distance computation of the nearest-neighbour model purely because of its
# units.
players_recipe <- players_recipe |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

# 10-fold cross-validation folds on the training data, stratified on
# `subscribe`; used below to tune the number of neighbours.
players_vfold <- players_training |>
    vfold_cv(v = 10, strata = subscribe)

# K-nearest-neighbours classifier with unweighted ("rectangular") votes.
# `neighbors` is left as a tuning placeholder to be filled in by the grid.
players_spec <- nearest_neighbor(
    weight_func = "rectangular",
    neighbors = tune()
) |>
    set_engine("kknn") |>
    set_mode("classification")

# Candidate values of k: every whole number from 1 to 50.
k_vals <- tibble(neighbors = seq(from = 1, to = 50, by = 1))

# Evaluate each candidate k on the cross-validation folds. Despite its name,
# `players_fit` holds the tuning results, not a fitted model.
players_fit <- tune_grid(
    workflow() |>
        add_model(players_spec) |>
        add_recipe(players_recipe),
    resamples = players_vfold,
    grid = k_vals
)

# Select the number of neighbours with the highest mean cross-validated
# accuracy. `with_ties = FALSE` guarantees a single value: if several k share
# the top accuracy, the first of them in the metrics table is kept.
players_best_k_result <- players_fit |>
    collect_metrics() |>
    filter(.metric == "accuracy") |>
    slice_max(mean, n = 1, with_ties = FALSE) |>
    pull(neighbors)

# Display the chosen k as the cell's output.
players_best_k_result



Rows: 196 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): experience, hashedEmail, name, gender
dbl (2): played_hours, Age
lgl (1): subscribe

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
