Permalink
Cannot retrieve contributors at this time
FES/07_Detecting_Interaction_Effects/7_04_The_Brute-Force_Approach_to_Identifying_Predictive_Interactions/ames_glmnet.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
332 lines (280 sloc)
9.14 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # ------------------------------------------------------------------------------ | |
| # Feature Engineering and Selection: A Practical Approach for Predictive Models | |
| # by Max Kuhn and Kjell Johnson | |
| # | |
| # ------------------------------------------------------------------------------ | |
| # | |
| # Code for Section 7.5.3 at | |
| # https://bookdown.org/max/FES/detecting-interaction-effects.html#approaches-when-complete-enumeration-is-practically-impossible | |
| # | |
| # ------------------------------------------------------------------------------ | |
| # | |
| # Code requires these packages: | |
| library(caret) | |
| library(glmnet) | |
| library(tidymodels) | |
| library(AmesHousing) | |
| library(gridExtra) | |
| library(stringr) | |
| source("../clean_value.R") | |
| # ------------------------------------------------------------------------------ | |
| ames <- make_ames() | |
| set.seed(955) | |
| ames_split <- initial_split(ames) | |
| ames_train <- training(ames_split) | |
| set.seed(24873) | |
| ames_folds <- vfold_cv(ames_train) | |
| ames_ind <- rsample2caret(ames_folds) | |
| # ------------------------------------------------------------------------------ | |
| main_rec <- | |
| recipe(Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + | |
| Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area + | |
| Central_Air + Longitude + Latitude + MS_SubClass + | |
| Alley + Lot_Frontage + Pool_Area + Garage_Finish + | |
| Foundation + Land_Contour + Roof_Style, | |
| data = ames_train) %>% | |
| step_log(Sale_Price, base = 10) %>% | |
| step_BoxCox(Lot_Area, Gr_Liv_Area, Lot_Frontage) %>% | |
| step_other(Neighborhood, threshold = 0.05) %>% | |
| step_dummy(all_nominal()) %>% | |
| step_zv(all_predictors()) %>% | |
| step_bs(Longitude, Latitude, options = list(df = 5)) %>% | |
| step_center(all_predictors()) %>% | |
| step_scale(all_predictors()) | |
| # ------------------------------------------------------------------------------ | |
| int_vars <- | |
| main_rec %>% | |
| pluck("var_info") %>% | |
| dplyr::filter(role == "predictor") %>% | |
| pull(variable) | |
| interactions <- t(combn(as.character(int_vars), 2)) | |
| colnames(interactions) <- c("var1", "var2") | |
| interactions <- | |
| interactions %>% | |
| as_tibble() %>% | |
| mutate( | |
| term = | |
| paste0( | |
| "starts_with('", | |
| var1, | |
| "'):starts_with('", | |
| var2, | |
| "')" | |
| ) | |
| ) %>% | |
| pull(term) %>% | |
| paste(collapse = "+") | |
| interactions <- paste("~", interactions) | |
| interactions <- as.formula(interactions) | |
| int_rec <- | |
| recipe(Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + | |
| Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area + | |
| Central_Air + Longitude + Latitude + MS_SubClass + | |
| Alley + Lot_Frontage + Pool_Area + Garage_Finish + | |
| Foundation + Land_Contour + Roof_Style, | |
| data = ames_train) %>% | |
| step_log(Sale_Price, base = 10) %>% | |
| step_BoxCox(Lot_Area, Gr_Liv_Area, Lot_Frontage) %>% | |
| step_other(Neighborhood, threshold = 0.05) %>% | |
| step_dummy(all_nominal()) %>% | |
| step_interact(interactions) %>% | |
| step_zv(all_predictors()) %>% | |
| step_bs(Longitude, Latitude, options = list(df = 5)) %>% | |
| step_center(all_predictors()) %>% | |
| step_scale(all_predictors()) | |
| # ------------------------------------------------------------------------------ | |
| ctrl <- | |
| trainControl( | |
| method = "cv", | |
| index = ames_ind$index, | |
| indexOut = ames_ind$indexOut | |
| ) | |
| glmn_grid <- expand.grid(alpha = seq(.2, 1, by = .2), lambda = 10^seq(-4, -1, by = 0.1)) | |
| main_glmn <- | |
| train(main_rec, | |
| data = ames_train, | |
| method = "glmnet", | |
| tuneGrid = glmn_grid, | |
| trControl = ctrl | |
| ) | |
| int_glmn <- | |
| train(int_rec, | |
| data = ames_train, | |
| method = "glmnet", | |
| tuneGrid = glmn_grid, | |
| trControl = ctrl | |
| ) | |
| int_glmn$bestTune | |
| tune_plot <- | |
| int_glmn %>% | |
| pluck("results") %>% | |
| mutate(alpha = factor(alpha)) %>% | |
| ggplot(aes(x = lambda, y = RMSE, col = alpha)) + | |
| geom_line() + | |
| geom_point() + | |
| scale_x_log10() + | |
| ggtitle("(a)") + | |
| theme( | |
| legend.position = c(0.50, 0.75), | |
| legend.background = element_blank() | |
| ) + | |
| labs( | |
| colour = "Mixing Percentage", | |
| x = "Regularization Parameter", | |
| y = "RMSE (Cross-Validation)" | |
| ) | |
| top_interaction <- | |
| ggplot(ames_train, aes(x = Year_Built*Gr_Liv_Area, y = Sale_Price)) + | |
| geom_point(alpha = 0.5) + | |
| scale_y_log10() + | |
| scale_x_log10() + | |
| geom_smooth(se = FALSE) + | |
| ylab("Sale_Price") + | |
| xlab("Year Built * Living Area") + | |
| ggtitle("(b)") | |
| # https://bookdown.org/max/FES/complete-enumeration.html#fig:interactions-lasso | |
| # grid.arrange(tune_plot, top_interaction, nrow = 1) | |
| clean_value <- function(x) { | |
| x <- str_replace(x, "Neighborhood_", "Neighborhood: ") | |
| x <- str_replace(x, "MS_SubClass_", "MS SubClass: ") | |
| x <- str_replace(x, "Land_Contour_", "Land Contour: ") | |
| x <- str_replace(x, "Roof_Style_", "Roof Style: ") | |
| x <- str_replace(x, "Foundation_", "Foundation: ") | |
| x <- str_replace(x, "Garage_Finish_", "Garage Finish: ") | |
| x <- str_replace(x, "Central_Air", "Central Air:") | |
| x <- str_replace(x, "Bldg_Type", "Building Type") | |
| x <- str_replace(x, "Alley", "Alley:") | |
| x <- str_replace(x, "Gr_Liv_Area", "Living Area") | |
| x <- str_replace(x, "_bs_1", " (spline)") | |
| x <- str_replace(x, "_bs_2", " (spline)") | |
| x <- str_replace(x, "_bs_3", " (spline)") | |
| x <- str_replace_all(x, "_", " ") | |
| x | |
| } | |
| # https://bookdown.org/max/FES/complete-enumeration.html#tab:interactions-lasso-top | |
| lasso_int_coefs <- | |
| int_glmn %>% | |
| pluck("finalModel") %>% | |
| coef(s = int_glmn %>% pluck("bestTune") %>% pluck("lambda")) %>% | |
| as.matrix() %>% | |
| as.data.frame() %>% | |
| setNames("Coefficient") %>% | |
| rownames_to_column("Predictor") %>% | |
| dplyr::filter(grepl("_x_", Predictor)) %>% | |
| arrange(desc(abs(Coefficient))) %>% | |
| dplyr::filter(Predictor != "(Intercept)") %>% | |
| slice(1:15) %>% | |
| mutate( | |
| split = str_split(Predictor, "_x_"), | |
| split = map(split, sort), | |
| var_1 = map_chr(split, pluck, 1), | |
| var_2 = map_chr(split, pluck, 2), | |
| var_1 = map_chr(var_1, clean_value), | |
| var_2 = map_chr(var_2, clean_value) | |
| ) %>% | |
| dplyr::select(`Predictor 1` = var_1, `Predictor 2` = var_2, Coefficient) | |
| # ------------------------------------------------------------------------------ | |
| main_info <- | |
| list( | |
| all = | |
| main_rec %>% | |
| prep(ames_train) %>% | |
| juice(all_predictors()) %>% | |
| ncol(), | |
| main = length(predictors(main_glmn)), | |
| perf = getTrainPerf(main_glmn) | |
| ) | |
| int_info <- | |
| list( | |
| all = | |
| int_rec %>% | |
| prep(ames_train) %>% | |
| juice(all_predictors()) %>% | |
| ncol(), | |
| main = sum(!grepl("_x_", predictors(int_glmn))), | |
| int = sum(grepl("_x_", predictors(int_glmn))), | |
| perf = getTrainPerf(int_glmn) | |
| ) | |
| save(main_info, int_info, int_glmn, file = "ames_glmnet.RData") | |
| # ------------------------------------------------------------------------------ | |
| # Determine predictors involved with main effects | |
| main_vars <- tibble(predictor = predictors(main_glmn)) | |
| all_dummies <- | |
| main_rec %>% | |
| prep(ames_train) %>% | |
| tidy(number = 4) %>% | |
| mutate(predictor = main_rec$steps[[4]]$naming(terms, columns)) | |
| qual_vars <- | |
| inner_join(all_dummies, main_vars, by = "predictor") %>% | |
| distinct(terms) %>% | |
| pull(terms) | |
| quant_vars <- | |
| summary(main_rec) %>% | |
| dplyr::select(predictor = variable) %>% | |
| inner_join(main_vars, by = "predictor") %>% | |
| pull(predictor) | |
| used_main <- c(qual_vars, quant_vars) | |
| # make their interactions | |
| interaction_subset <- t(combn(as.character(used_main), 2)) | |
| colnames(interaction_subset) <- c("var1", "var2") | |
| interaction_subset <- | |
| interaction_subset %>% | |
| as_tibble() %>% | |
| mutate( | |
| term = | |
| paste0( | |
| "starts_with('", | |
| var1, | |
| "'):starts_with('", | |
| var2, | |
| "')" | |
| ) | |
| ) %>% | |
| pull(term) %>% | |
| paste(collapse = "+") | |
| interaction_subset <- paste("~", interaction_subset) | |
| interaction_subset <- as.formula(interaction_subset) | |
| two_stage_rec <- | |
| recipe(Sale_Price ~ Bldg_Type + Neighborhood + Year_Built + | |
| Gr_Liv_Area + Full_Bath + Year_Sold + Lot_Area + | |
| Central_Air + Longitude + Latitude + MS_SubClass + | |
| Alley + Lot_Frontage + Pool_Area + Garage_Finish + | |
| Foundation + Land_Contour + Roof_Style, | |
| data = ames_train) %>% | |
| step_log(Sale_Price, base = 10) %>% | |
| step_BoxCox(Lot_Area, Gr_Liv_Area, Lot_Frontage) %>% | |
| step_other(Neighborhood, threshold = 0.05) %>% | |
| step_dummy(all_nominal()) %>% | |
| step_interact(interaction_subset) %>% | |
| step_zv(all_predictors()) %>% | |
| step_bs(Longitude, Latitude, options = list(df = 5)) %>% | |
| step_center(all_predictors()) %>% | |
| step_scale(all_predictors()) | |
| two_stage_glmn <- | |
| train(two_stage_rec, | |
| data = ames_train, | |
| method = "glmnet", | |
| tuneGrid = glmn_grid, | |
| trControl = ctrl | |
| ) | |
| two_stage_info <- | |
| list( | |
| all = | |
| two_stage_rec %>% | |
| prep(ames_train) %>% | |
| juice(all_predictors()) %>% | |
| ncol(), | |
| main = sum(!grepl("_x_", predictors(two_stage_glmn))), | |
| int = sum(grepl("_x_", predictors(two_stage_glmn))), | |
| perf = getTrainPerf(two_stage_glmn) | |
| ) | |
| save(used_main, two_stage_info, two_stage_glmn, file = "ames_glmnet_two_stage.RData") | |
| # notes from | |
| # https://bookdown.org/max/FES/approaches-when-complete-enumeration-is-practically-impossible.html | |
| two_stage_glmn$bestTune | |
| two_stage_info$main | |
| two_stage_info$int | |
| # ------------------------------------------------------------------------------ | |
| sessionInfo() | |
| if (!interactive()) | |
| q("no") |