Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 52 additions & 15 deletions classwork/05-classwork.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ nhl_train <- analysis(nhl_val$splits[[1]])
set.seed(100)
nhl_train %>%
sample_n(200) %>%
plot_nhl_shots(emphasis = position)
plot_nhl_shots(emphasis = shooter_type)

# Your code here!

Expand Down Expand Up @@ -191,14 +191,37 @@ autoplot(roc_curve_points)

Compute and plot an ROC curve for your current model.

What data is being used for this ROC curve plot?

```{r}
# Your code here!

```

## Your turn
## Collapsing factor levels

What data is being used for this ROC curve plot?
```{r}
nhl_other_rec <-
recipe(on_goal ~ ., data = nhl_train) %>%
# Any player with <= 0.01% of shots is set to "other"
step_other(shooter, threshold = 0.001) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())
```

## Does othering help?

```{r}
nhl_other_wflow <-
nhl_glm_wflow %>%
update_recipe(nhl_other_rec)

nhl_other_res <-
nhl_other_wflow %>%
fit_resamples(nhl_val, control = ctrl)

collect_metrics(nhl_other_res)
```

## Player effects

Expand All @@ -207,7 +230,7 @@ library(embed)

nhl_effect_rec <-
recipe(on_goal ~ ., data = nhl_train) %>%
step_lencode_mixed(player, outcome = vars(on_goal)) %>%
step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())
```
Expand All @@ -221,7 +244,7 @@ nhl_effect_wflow <-

nhl_effect_res <-
nhl_effect_wflow %>%
fit_resamples(nhl_val)
fit_resamples(nhl_val, control = ctrl)

collect_metrics(nhl_effect_res)
```
Expand All @@ -231,36 +254,40 @@ collect_metrics(nhl_effect_res)
```{r}
# angle
nhl_angle_rec <-
nhl_indicators %>%
nhl_effect_rec %>%
step_mutate(
angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi))
angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) )
)

# distance
nhl_distance_rec <-
# defensive zone
nhl_zone_rec <-
nhl_angle_rec %>%
step_mutate(
distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
distance = log(distance)
defensive_zone = ifelse(coord_x <= -25.5, 1, 0)
)

# behind goal line
nhl_behind_rec <-
nhl_distance_rec %>%
nhl_zone_rec %>%
step_mutate(
behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
behind_goal_line = ifelse(coord_x >= 89, 1, 0)
)
```

## Fit different recipes

```{r}
no_coord_rec <-
nhl_indicators %>%
step_rm(starts_with("coord"))

set.seed(9)

nhl_glm_set_res <-
workflow_set(
list(`1_dummy` = nhl_indicators, `2_angle` = nhl_angle_rec,
`3_dist` = nhl_distance_rec, `4_bgl` = nhl_behind_rec),
list(`1_no_coord` = no_coord_rec, `2_other` = nhl_other_rec,
`3_effects` = nhl_effect_rec, `4_angle` = nhl_angle_rec,
`5_zone` = nhl_zone_rec, `6_bgl` = nhl_behind_rec),
list(logistic = logistic_reg())
) %>%
workflow_map(fn = "fit_resamples", resamples = nhl_val, verbose = TRUE, control = ctrl)
Expand Down Expand Up @@ -294,3 +321,13 @@ collect_metrics(nhl_glm_set_res) %>%
geom_point(size = 3) +
labs(y = NULL, x = "ROC AUC (validation set)")
```

## Debugging a recipe

```{r}
nhl_angle_fit <- prep(nhl_angle_rec)

tidy(nhl_angle_fit, number = 1) %>% slice(1:4)

bake(nhl_angle_fit, nhl_train %>% slice(1:3), starts_with("coord"), angle, shooter)
```
100 changes: 37 additions & 63 deletions classwork/06-classwork.qmd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
title: "3 - Tuning Hyperparameters - Classwork"
title: "6 - Tuning Hyperparameters - Classwork"
subtitle: "Machine learning with tidymodels"
editor_options:
chunk_output_type: console
Expand Down Expand Up @@ -30,25 +30,24 @@ nhl_val <- validation_split(nhl_train_and_val, prop = 0.80)

nhl_train <- analysis(nhl_val$splits[[1]])

nhl_distance_rec <-
nhl_position_rec <-
recipe(on_goal ~ ., data = nhl_train) %>%
step_lencode_mixed(player, outcome = vars(on_goal)) %>%
step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
step_other(all_nominal_predictors()) %>% # TODO: keep this?
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_mutate(
angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
distance = log(distance)
angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi)),
behind_goal_line = ifelse(coord_x >= 89, 1, 0)
)

nhl_distance_wflow <-
nhl_position_wflow <-
workflow() %>%
add_recipe(nhl_distance_rec) %>%
add_recipe(nhl_position_rec) %>%
add_model(logistic_reg())

nhl_distance_res <-
nhl_distance_wflow %>%
nhl_position_res <-
nhl_position_wflow %>%
fit_resamples(nhl_val)
```

Expand All @@ -57,18 +56,16 @@ nhl_distance_res <-
```{r}
glm_rec <-
recipe(on_goal ~ ., data = nhl_train) %>%
step_lencode_mixed(player, outcome = vars(on_goal)) %>%
step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
step_dummy(all_nominal_predictors()) %>%
step_mutate(
angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
distance = log(distance),
behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) ),
defensive_zone = ifelse(coord_x <= -25.5, 1, 0),
behind_goal_line = ifelse(coord_x >= 89, 1, 0)
) %>%
step_rm(coord_x, coord_y) %>%
step_zv(all_predictors()) %>%
step_ns(angle, deg_free = tune("angle")) %>%
step_ns(distance, deg_free = tune("distance")) %>%
step_ns(coord_x, deg_free = tune("coord_x")) %>%
step_normalize(all_numeric_predictors())

glm_spline_wflow <-
Expand All @@ -80,7 +77,7 @@ glm_spline_wflow <-
## Create a grid

```{r}
set.seed(2)
set.seed(12)
grid <-
glm_spline_wflow %>%
extract_parameter_set_dials() %>%
Expand All @@ -101,16 +98,16 @@ Try creating a regular grid.
## Update parameter ranges

```{r}
set.seed(2)
set.seed(12)
grid <-
glm_spline_wflow %>%
extract_parameter_set_dials() %>%
update(angle = spline_degree(c(2L, 20L)),
distance = spline_degree(c(2L, 20L))) %>%
update(angle = spline_degree(c(2L, 50L)),
coord_x = spline_degree(c(2L, 50L))) %>%
grid_latin_hypercube(size = 25)

grid %>%
ggplot(aes(angle, distance)) +
ggplot(aes(angle, coord_x)) +
geom_point(size = 4)
```

Expand All @@ -123,6 +120,7 @@ ctrl <- control_grid(save_pred = TRUE, parallel_over = "everything")
glm_spline_res <-
glm_spline_wflow %>%
tune_grid(resamples = nhl_val, grid = grid, control = ctrl)

glm_spline_res
```

Expand Down Expand Up @@ -157,33 +155,20 @@ show_best(glm_spline_res, metric = "roc_auc")
select_best(glm_spline_res, metric = "roc_auc")
```

## Your turn

Try an alternative selection strategy.

Read the docs for `select_by_pct_loss()`.

Try choosing a model that has a simpler (less "wiggly") relationship for `distance`.

```{r}
# Your code here!

```

## Boosted trees

```{r}
xgb_spec <-
boost_tree(
trees = 500, min_n = tune(), stop_iter = tune(), tree_depth = tune(),
trees = tune(), min_n = tune(), tree_depth = tune(),
learn_rate = tune(), loss_reduction = tune()
) %>%
set_mode("classification") %>%
set_engine("xgboost", validation = 1/10) # <- for better early stopping
set_engine("xgboost")

xgb_rec <-
recipe(on_goal ~ ., data = nhl_train) %>%
step_lencode_mixed(player, outcome = vars(on_goal)) %>%
step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())

Expand Down Expand Up @@ -218,7 +203,7 @@ set.seed(9)

xgb_res <-
xgb_wflow %>%
tune_grid(resamples = nhl_val, grid = 15, control = ctrl) # automatic grid now!
tune_grid(resamples = nhl_val, grid = 30, control = ctrl) # automatic grid now!
```

## Your turn
Expand Down Expand Up @@ -246,12 +231,10 @@ autoplot(xgb_res)
coord_rec <-
xgb_rec %>%
step_mutate(
angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
distance = log(distance),
behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
) %>%
step_rm(coord_x, coord_y)
angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) ),
defensive_zone = ifelse(coord_x <= -25.5, 1, 0),
behind_goal_line = ifelse(coord_x >= 89, 1, 0)
)

xgb_coord_wflow <-
workflow() %>%
Expand All @@ -261,14 +244,16 @@ xgb_coord_wflow <-
set.seed(9)
xgb_coord_res <-
xgb_coord_wflow %>%
tune_grid(resamples = nhl_val, grid = 20, control = ctrl)
tune_grid(resamples = nhl_val, grid = 30, control = ctrl)
```

## Did the machine figure it out?

```{r}
show_best(xgb_res, metric = "roc_auc")
show_best(xgb_coord_res, metric = "roc_auc")
# no extra features
show_best(xgb_res, metric = "roc_auc", n = 3)
# with additional coordinate features
show_best(xgb_coord_res, metric = "roc_auc", n = 3)
```

## Compare models
Expand All @@ -282,22 +267,11 @@ glm_spline_res %>%

```{r}
# Best boosting results
xgb_coord_res %>%
xgb_res %>%
show_best(metric = "roc_auc", n = 1) %>%
select(.metric, .estimator, mean, n, std_err, .config)
```

## Your turn

Can you get better ROC results with xgboost?

Try increasing `learn_rate` beyond the original range.

```{r}
# Your code here!

```

## Updating the workflow

```{r}
Expand Down Expand Up @@ -366,7 +340,7 @@ glm_explainer <- explain_tidymodels(
final_glm_spline_wflow,
data = dplyr::select(nhl_train, -on_goal),
# DALEX required an integer for factors:
y = as.integer(nhl_train$on_goal),
y = as.integer(nhl_train$on_goal) - 1,
verbose = FALSE
)
```
Expand All @@ -381,13 +355,13 @@ pdp_coord_x <- model_profile(
glm_explainer,
variables = "coord_x",
N = 500,
groups = "position"
groups = "strength"
)
```

## Your turn

Try grouping by another variable, like `game_type` or `dow`.
Try grouping by another variable, like `extra_attacker` or `game_seconds`.

```{r}
# Your code here!
Expand Down
Loading