tidymodels · topepo · Aug 19, 2022 · Aug 18, 2022 · Aug 18, 2022 · Aug 18, 2022
diff --git a/classwork/05-classwork.qmd b/classwork/05-classwork.qmd
@@ -52,7 +52,7 @@ nhl_train <- analysis(nhl_val$splits[[1]])
 set.seed(100)
 nhl_train %>% 
   sample_n(200) %>%
-  plot_nhl_shots(emphasis = position)
+  plot_nhl_shots(emphasis = shooter_type)
 
 # Your code here!
 
@@ -191,14 +191,37 @@ autoplot(roc_curve_points)
 
 Compute and plot an ROC curve for your current model.
 
+What data is being used for this ROC curve plot?
+
 ```{r}
 # Your code here!
 
 ```
 
-## Your turn
+## Collapsing factor levels
 
-What data is being used for this ROC curve plot?
+```{r}
+nhl_other_rec <-
+  recipe(on_goal ~ ., data = nhl_train) %>%
+  # Any player with <= 0.01% of shots is set to "other"
+  step_other(shooter, threshold = 0.001) %>%
+  step_dummy(all_nominal_predictors()) %>%
+  step_zv(all_predictors())
+```
+
+## Does othering help? 
+
+```{r}
+nhl_other_wflow <-
+  nhl_glm_wflow %>%
+  update_recipe(nhl_other_rec)
+
+nhl_other_res <-
+  nhl_other_wflow %>%
+  fit_resamples(nhl_val, control = ctrl)
+
+collect_metrics(nhl_other_res)
+```
 
 ## Player effects
 
@@ -207,7 +230,7 @@ library(embed)
 
 nhl_effect_rec <-
   recipe(on_goal ~ ., data = nhl_train) %>%
-  step_lencode_mixed(player, outcome = vars(on_goal)) %>%
+  step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
   step_dummy(all_nominal_predictors()) %>%
   step_zv(all_predictors())
 ```
@@ -221,7 +244,7 @@ nhl_effect_wflow <-
 
 nhl_effect_res <-
   nhl_effect_wflow %>%
-  fit_resamples(nhl_val)
+  fit_resamples(nhl_val, control = ctrl)
 
 collect_metrics(nhl_effect_res)
 ```
@@ -231,36 +254,40 @@ collect_metrics(nhl_effect_res)
 ```{r}
 # angle
 nhl_angle_rec <-
-  nhl_indicators %>%
+  nhl_effect_rec %>%
   step_mutate(
-    angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi))
+    angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) )
   )
 
-# distance
-nhl_distance_rec <-
+# defensive zone
+nhl_zone_rec <-
   nhl_angle_rec %>%
   step_mutate(
-    distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
-    distance = log(distance)
+    defensive_zone = ifelse(coord_x <= -25.5, 1, 0)
   )
 
 # behind goal line
 nhl_behind_rec <-
-  nhl_distance_rec %>%
+  nhl_zone_rec %>%
   step_mutate(
-    behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
+    behind_goal_line = ifelse(coord_x >= 89, 1, 0)
   )
 ```
 
 ## Fit different recipes
 
 ```{r}
+no_coord_rec <- 
+  nhl_indicators %>% 
+  step_rm(starts_with("coord"))
+
 set.seed(9)
 
 nhl_glm_set_res <-
   workflow_set(
-    list(`1_dummy` = nhl_indicators, `2_angle` = nhl_angle_rec, 
-         `3_dist` = nhl_distance_rec, `4_bgl` = nhl_behind_rec),
+    list(`1_no_coord` = no_coord_rec,   `2_other` = nhl_other_rec, 
+         `3_effects`  = nhl_effect_rec, `4_angle` = nhl_angle_rec, 
+         `5_zone`     = nhl_zone_rec,   `6_bgl`   = nhl_behind_rec),
     list(logistic = logistic_reg())
   ) %>%
   workflow_map(fn = "fit_resamples", resamples = nhl_val, verbose = TRUE, control = ctrl)
@@ -294,3 +321,13 @@ collect_metrics(nhl_glm_set_res) %>%
   geom_point(size = 3) +
   labs(y = NULL, x = "ROC AUC (validation set)")
 ```
+
+## Debugging a recipe
+
+```{r}
+nhl_angle_fit <- prep(nhl_angle_rec)
+
+tidy(nhl_angle_fit, number = 1) %>% slice(1:4)
+
+bake(nhl_angle_fit, nhl_train %>% slice(1:3), starts_with("coord"), angle, shooter)
+```
diff --git a/classwork/06-classwork.qmd b/classwork/06-classwork.qmd
@@ -1,5 +1,5 @@
 ---
-title: "3 - Tuning Hyperparameters - Classwork"
+title: "6 - Tuning Hyperparameters - Classwork"
 subtitle: "Machine learning with tidymodels"
 editor_options: 
   chunk_output_type: console
@@ -30,25 +30,24 @@ nhl_val <- validation_split(nhl_train_and_val, prop = 0.80)
 
 nhl_train <- analysis(nhl_val$splits[[1]])
 
-nhl_distance_rec <-
+nhl_position_rec <-
   recipe(on_goal ~ ., data = nhl_train) %>%
-  step_lencode_mixed(player, outcome = vars(on_goal)) %>%
+  step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
   step_other(all_nominal_predictors()) %>% # TODO: keep this?
   step_dummy(all_nominal_predictors()) %>%
   step_zv(all_predictors()) %>%
   step_mutate(
-    angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
-    distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
-    distance = log(distance)
+    angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi)),
+    behind_goal_line = ifelse(coord_x >= 89, 1, 0)
   )
 
-nhl_distance_wflow <-
+nhl_position_wflow <-
   workflow() %>%
-  add_recipe(nhl_distance_rec) %>%
+  add_recipe(nhl_position_rec) %>%
   add_model(logistic_reg())
 
-nhl_distance_res <-
-  nhl_distance_wflow %>%
+nhl_position_res <-
+  nhl_position_wflow %>%
   fit_resamples(nhl_val)
 ```
 
@@ -57,18 +56,16 @@ nhl_distance_res <-
 ```{r}
 glm_rec <-
   recipe(on_goal ~ ., data = nhl_train) %>%
-  step_lencode_mixed(player, outcome = vars(on_goal)) %>%
+  step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
   step_dummy(all_nominal_predictors()) %>%
   step_mutate(
-    angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
-    distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
-    distance = log(distance),
-    behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
+    angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) ),
+    defensive_zone = ifelse(coord_x <= -25.5, 1, 0),
+    behind_goal_line = ifelse(coord_x >= 89, 1, 0)
   ) %>%
-  step_rm(coord_x, coord_y) %>%
   step_zv(all_predictors()) %>%
   step_ns(angle, deg_free = tune("angle")) %>%
-  step_ns(distance, deg_free = tune("distance")) %>%
+  step_ns(coord_x, deg_free = tune("coord_x")) %>%
   step_normalize(all_numeric_predictors())
 
 glm_spline_wflow <-
@@ -80,7 +77,7 @@ glm_spline_wflow <-
 ## Create a grid
 
 ```{r}
-set.seed(2)
+set.seed(12)
 grid <- 
   glm_spline_wflow %>% 
   extract_parameter_set_dials() %>% 
@@ -101,16 +98,16 @@ Try creating a regular grid.
 ## Update parameter ranges
 
 ```{r}
-set.seed(2)
+set.seed(12)
 grid <- 
   glm_spline_wflow %>% 
   extract_parameter_set_dials() %>% 
-  update(angle = spline_degree(c(2L, 20L)),
-         distance = spline_degree(c(2L, 20L))) %>% 
+  update(angle = spline_degree(c(2L, 50L)),
+         coord_x = spline_degree(c(2L, 50L))) %>% 
   grid_latin_hypercube(size = 25)
 
 grid %>% 
-  ggplot(aes(angle, distance)) +
+  ggplot(aes(angle, coord_x)) +
   geom_point(size = 4)
 ```
 
@@ -123,6 +120,7 @@ ctrl <- control_grid(save_pred = TRUE, parallel_over = "everything")
 glm_spline_res <-
   glm_spline_wflow %>%
   tune_grid(resamples = nhl_val, grid = grid, control = ctrl)
+
 glm_spline_res
 ```
 
@@ -157,33 +155,20 @@ show_best(glm_spline_res, metric = "roc_auc")
 select_best(glm_spline_res, metric = "roc_auc")
 ```
 
-## Your turn
-
-Try an alternative selection strategy.
-
-Read the docs for `select_by_pct_loss()`.
-
-Try choosing a model that has a simpler (less "wiggly") relationship for `distance`.
-
-```{r}
-# Your code here!
-
-```
-
 ## Boosted trees
 
 ```{r}
 xgb_spec <-
   boost_tree(
-    trees = 500, min_n = tune(), stop_iter = tune(), tree_depth = tune(),
+    trees = tune(), min_n = tune(), tree_depth = tune(),
     learn_rate = tune(), loss_reduction = tune()
   ) %>%
   set_mode("classification") %>% 
-  set_engine("xgboost", validation = 1/10) # <- for better early stopping
+  set_engine("xgboost") 
 
 xgb_rec <- 
   recipe(on_goal ~ ., data = nhl_train) %>% 
-  step_lencode_mixed(player, outcome = vars(on_goal)) %>% 
+  step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>% 
   step_dummy(all_nominal_predictors()) %>%
   step_zv(all_predictors())
 
@@ -218,7 +203,7 @@ set.seed(9)
 
 xgb_res <-
   xgb_wflow %>%
-  tune_grid(resamples = nhl_val, grid = 15, control = ctrl) # automatic grid now!
+  tune_grid(resamples = nhl_val, grid = 30, control = ctrl) # automatic grid now!
 ```
 
 ## Your turn 
@@ -246,12 +231,10 @@ autoplot(xgb_res)
 coord_rec <- 
   xgb_rec %>%
   step_mutate(
-    angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
-    distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
-    distance = log(distance),
-    behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
-  ) %>% 
-  step_rm(coord_x, coord_y)
+    angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) ),
+    defensive_zone = ifelse(coord_x <= -25.5, 1, 0),
+    behind_goal_line = ifelse(coord_x >= 89, 1, 0)
+  )
 
 xgb_coord_wflow <- 
   workflow() %>% 
@@ -261,14 +244,16 @@ xgb_coord_wflow <-
 set.seed(9)
 xgb_coord_res <-
   xgb_coord_wflow %>%
-  tune_grid(resamples = nhl_val, grid = 20, control = ctrl)
+  tune_grid(resamples = nhl_val, grid = 30, control = ctrl)
 ```
 
 ## Did the machine figure it out? 
 
 ```{r}
-show_best(xgb_res, metric = "roc_auc")
-show_best(xgb_coord_res, metric = "roc_auc")
+# no extra features
+show_best(xgb_res, metric = "roc_auc", n = 3)
+# with additional coordinate features
+show_best(xgb_coord_res, metric = "roc_auc", n = 3)
 ```
 
 ## Compare models
@@ -282,22 +267,11 @@ glm_spline_res %>%
 
 ```{r}
 # Best boosting results
-xgb_coord_res %>% 
+xgb_res %>% 
   show_best(metric = "roc_auc", n = 1) %>% 
   select(.metric, .estimator, mean, n, std_err, .config)
 ```
 
-## Your turn
-
-Can you get better ROC results with xgboost?
-
-Try increasing `learn_rate` beyond the original range.
-
-```{r}
-# Your code here!
-
-```
-
 ## Updating the workflow
 
 ```{r}
@@ -366,7 +340,7 @@ glm_explainer <- explain_tidymodels(
   final_glm_spline_wflow,
   data = dplyr::select(nhl_train, -on_goal),
   # DALEX required an integer for factors:
-  y = as.integer(nhl_train$on_goal),
+  y = as.integer(nhl_train$on_goal) - 1,
   verbose = FALSE
 )
 ```
@@ -381,13 +355,13 @@ pdp_coord_x <- model_profile(
   glm_explainer,
   variables = "coord_x",
   N = 500,
-  groups = "position"
+  groups = "strength"
 )
 ```
 
 ## Your turn 
 
-Try grouping by another variable, like `game_type` or `dow`.
+Try grouping by another variable, like `extra_attacker` or `game_seconds`.
 
 ```{r}
 # Your code here!