# Code for Machine Learning With tidymodels (Regression)

Author: Shinin Varongchayakul

Date: 20 Mar 2025

Language: R

## Step 0. Prepare the Packages & Dataset

### Install and Load the Library

In [29]:
# Install
install.packages("tidymodels")
install.packages("MASS")
install.packages("dplyr")

"package 'tidymodels' is in use and will not be installed"
"package 'MASS' is in use and will not be installed"
"package 'dplyr' is in use and will not be installed"


In [1]:
# Load
library(tidymodels)
library(MASS)
library(dplyr)

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.3.0 ──

[32m✔[39m [34mbroom       [39m 1.0.7     [32m✔[39m [34mrecipes     [39m 1.2.0
[32m✔[39m [34mdials       [39m 1.4.0     [32m✔[39m [34mrsample     [39m 1.2.1
[32m✔[39m [34mdplyr       [39m 1.1.4     [32m✔[39m [34mtibble      [39m 3.2.1
[32m✔[39m [34mggplot2     [39m 3.5.1     [32m✔[39m [34mtidyr       [39m 1.3.1
[32m✔[39m [34minfer       [39m 1.0.7     [32m✔[39m [34mtune        [39m 1.3.0
[32m✔[39m [34mmodeldata   [39m 1.4.0     [32m✔[39m [34mworkflows   [39m 1.2.0
[32m✔[39m [34mparsnip     [39m 1.3.1     [32m✔[39m [34mworkflowsets[39m 1.1.0
[32m✔[39m [34mpurrr       [39m 1.0.4     [32m✔[39m [34myardstick   [39m 1.3.2

── [1mConflicts[22m ───────────────────────────────────────── tidymodels_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32mdiscard()[39m masks [34mscales[39m::discard()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m

### Load the Dataset

In [2]:
# Load the dataset
data(Boston)

# Preview
head(Boston)

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
6,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7


In [3]:
# View the structure
glimpse(Boston)

Rows: 506
Columns: 14
$ crim    [3m[90m<dbl>[39m[23m 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
$ zn      [3m[90m<dbl>[39m[23m 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
$ indus   [3m[90m<dbl>[39m[23m 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
$ chas    [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ nox     [3m[90m<dbl>[39m[23m 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
$ rm      [3m[90m<dbl>[39m[23m 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
$ age     [3m[90m<dbl>[39m[23m 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
$ dis     [3m[90m<dbl>[39m[23m 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
$ rad     [3m[90m<int>[39m[23m 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
$ tax     [3m[90m<dbl>[39m[23m 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 31

### Prepare the Data for Regression Task

In [4]:
# Create a copy of Boston
bt <- Boston

In [5]:
# Convert `chas` to factor
bt$chas <- factor(bt$chas,
                  levels = c(1, 0),
                  labels = c("tract bounds river", "otherwise"))

In [6]:
# Check the results
glimpse(bt)

Rows: 506
Columns: 14
$ crim    [3m[90m<dbl>[39m[23m 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
$ zn      [3m[90m<dbl>[39m[23m 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
$ indus   [3m[90m<dbl>[39m[23m 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
$ chas    [3m[90m<fct>[39m[23m otherwise, otherwise, otherwise, otherwise, otherwise, otherwi…
$ nox     [3m[90m<dbl>[39m[23m 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
$ rm      [3m[90m<dbl>[39m[23m 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
$ age     [3m[90m<dbl>[39m[23m 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
$ dis     [3m[90m<dbl>[39m[23m 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
$ rad     [3m[90m<int>[39m[23m 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
$ tax     [3m[90m<dbl>[39m[23m 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 31

## Step 1. Split the Data

In [7]:
# Create training index

## Set seed for reproducibility
set.seed(100)

## Create index
bt_split <- initial_split(bt,
                          prop = 0.8,
                          strata = medv)

In [8]:
# Create training and test sets

## Training set
bt_train <- training(bt_split)

## Test set
bt_test <- testing(bt_split)

## Step 2. Prepare the Recipe

In [9]:
# Create a recipe
bt_recipe <- recipe(medv ~ .,
                    bt_train) |>
                
                ## Remove near-zero variance predictors
                step_nzv(all_numeric_predictors()) |>

                ## Handle multicollinearity
                step_corr(all_numeric_predictors(),
                          threshold = 0.8) |>

                ## Dummy-code nominal predictors
                step_dummy(all_nominal_predictors()) |>

                ## Normalsie the predictors
                step_normalize(all_numeric_predictors()) |>

                ## Log-transform the outcome
                step_log(all_outcomes())

## Step 3. Prepare & Bake

In [10]:
# Prepare the recipe
bt_recipe_prep <- prep(bt_recipe,
                       data = bt_train)

In [11]:
# Bake the training and test sets

## Training set
bt_train_prep <- bake(bt_recipe_prep,
                      new_data = NULL)

## Test set
bt_test_prep <- bake(bt_recipe_prep,
                     new_data = bt_test)

## Step 4. Instantiate the Model

In [12]:
# Instantiate decision tree
dt_model <- decision_tree() |>

    ## Set engine
    set_engine("rpart") |>

    ## Set mode
    set_mode("regression")

## Step 5. Train the Model

In [13]:
# Train the model
dt_model_fit <- fit(dt_model,
                    medv ~ .,
                    data = bt_train_prep)

## Step 6. Make Predictions

In [14]:
# Predict the outcome
y_pred <- predict(dt_model_fit,
                  new_data = bt_test_prep,
                  type = "numeric")

# View the result
head(y_pred)

.pred
<dbl>
3.535555
2.881556
2.999022
2.999022
2.732528
2.468922


In [15]:
# Store the results in a tibble
dt_results <- tibble(actual = bt_test_prep$medv,
                     predicted = y_pred$.pred)

# View the tibble
head(dt_results)

actual,predicted
<dbl>,<dbl>
3.589059,3.535555
2.80336,2.881556
2.862201,2.999022
2.97553,2.999022
2.747271,2.732528
2.541602,2.468922


## Step 7. Evaluate the Model

In [16]:
# Create a set of custom metrics
dt_metrics <- metric_set(mae,
                         rmse)

In [17]:
# Apply the custom metric set
dt_metrics(dt_results,
           truth = actual,
           estimate = predicted)

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
mae,standard,0.1420251
rmse,standard,0.1879423


## Step 8. Hyperparametre Tuning

In [18]:
# Create a tune model
dt_tune <- decision_tree(cost_complexity = tune(),
                         tree_depth = tune(),
                         min_n = tune()) |>
            
            ## Set engine
            set_engine("rpart") |>

            ## Set mode
            set_mode("regression")

In [19]:
# Create grid random
dt_grid <- grid_random(cost_complexity(range = c(-5, 0), trans = log10_trans()),
                       tree_depth(range = c(1, 20)),
                       min_n(range = c(2, 50)),
                       size = 20)

In [20]:
# Create v-fold cross validation

## Set seed for reproducibility
set.seed(100)

# Create folds
dt_cv <- vfold_cv(bt_train,
                  v = 5,
                  strata = medv)

In [21]:
# Create a workflow
dt_wf <- workflow() |>

            ## Add recipe
            add_recipe(bt_recipe) |>

            ## Add model
            add_model(dt_tune)

In [22]:
# Tune the model
dt_tune_results <- tune_grid(dt_wf,
                             resamples = dt_cv,
                             grid = dt_grid,
                             metrics = dt_metrics)

## Step 9. Select the Best Parametre Combination

In [23]:
# View the best parametre combination
dt_tune_results |> show_best(metric = "rmse",
                             n = 5)

cost_complexity,tree_depth,min_n,.metric,.estimator,mean,n,std_err,.config
<dbl>,<int>,<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
0.0047900531,9,18,rmse,standard,0.2001291,5,0.004814137,Preprocessor1_Model16
0.0003390114,17,16,rmse,standard,0.2019157,5,0.005244076,Preprocessor1_Model07
0.0067354148,11,7,rmse,standard,0.2028539,5,0.011383164,Preprocessor1_Model20
0.0001850074,5,22,rmse,standard,0.2032098,5,0.00698633,Preprocessor1_Model01
0.0006427211,20,28,rmse,standard,0.2058226,5,0.004145649,Preprocessor1_Model06


In [24]:
# Select the best combination
best_params <- select_best(dt_tune_results,
                           metric = "rmse")

In [25]:
# Add the combination to the model
dt_best_wf <- finalize_workflow(dt_wf,
                                best_params)

## Step 10. Fit the Best Model

In [26]:
# Fit the model
dt_best_fit <- last_fit(dt_best_wf,
                        split = bt_split,
                        metrics = dt_metrics)

## Step 11. Make Predictions With the Best Model

In [27]:
# Make predictions
dt_best_predictions <- collect_predictions(dt_best_fit)

# Preview the results
head(dt_best_predictions)

.pred,id,.row,medv,.config
<dbl>,<chr>,<int>,<dbl>,<chr>
3.535555,train/test split,5,3.589059,Preprocessor1_Model1
2.94397,train/test split,9,2.80336,Preprocessor1_Model1
2.999022,train/test split,18,2.862201,Preprocessor1_Model1
2.999022,train/test split,22,2.97553,Preprocessor1_Model1
2.732528,train/test split,25,2.747271,Preprocessor1_Model1
2.468922,train/test split,31,2.541602,Preprocessor1_Model1


## Step 12. Evaluate the Best Model

In [28]:
# Collect metrics
collect_metrics(dt_best_fit)

.metric,.estimator,.estimate,.config
<chr>,<chr>,<dbl>,<chr>
mae,standard,0.1266422,Preprocessor1_Model1
rmse,standard,0.1797016,Preprocessor1_Model1
