# Code for Machine Learning With tidymodels

Language: R

## Step 0. Prepare the Packages & Dataset

### Install and Load the Library

In [32]:
# Install
install.packages("tidymodels")
install.packages("MASS")
install.packages("dplyr")

"package 'tidymodels' is in use and will not be installed"
"package 'MASS' is in use and will not be installed"
"package 'dplyr' is in use and will not be installed"


In [33]:
# Load
library(tidymodels)
library(MASS)
library(dplyr)

### Load the Datasete

In [34]:
# Load the dataset
data(Boston)

# Preview
head(Boston)

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
6,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7


In [35]:
# View the structure
glimpse(Boston)

Rows: 506
Columns: 14
$ crim    [3m[90m<dbl>[39m[23m 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
$ zn      [3m[90m<dbl>[39m[23m 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
$ indus   [3m[90m<dbl>[39m[23m 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
$ chas    [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ nox     [3m[90m<dbl>[39m[23m 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
$ rm      [3m[90m<dbl>[39m[23m 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
$ age     [3m[90m<dbl>[39m[23m 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
$ dis     [3m[90m<dbl>[39m[23m 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
$ rad     [3m[90m<int>[39m[23m 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
$ tax     [3m[90m<dbl>[39m[23m 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 31

### Prepare the Data for Classification Task

In [36]:
# Create a copy of Boston
bt <- Boston

In [37]:
# Convert `chas` to factor
bt$chas <- factor(bt$chas,
                  levels = c(1, 0),
                  labels = c("tract bounds river", "otherwise"))

In [38]:
# Create a new categorical outcome
bt <- bt |>
        mutate(medv_class = if_else(medv > quantile(medv, 0.5),
                                    "high",
                                    "low"))

In [39]:
# Convert `medv_class` to factor
bt$medv_class <- factor(bt$medv_class,
                        levels = c("high", "low"))

In [40]:
# Drop `medv`
bt$medv <- NULL

In [41]:
# Check the results
glimpse(bt)

Rows: 506
Columns: 14
$ crim       [3m[90m<dbl>[39m[23m 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.088…
$ zn         [3m[90m<dbl>[39m[23m 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5…
$ indus      [3m[90m<dbl>[39m[23m 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87,…
$ chas       [3m[90m<fct>[39m[23m otherwise, otherwise, otherwise, otherwise, otherwise, othe…
$ nox        [3m[90m<dbl>[39m[23m 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.5…
$ rm         [3m[90m<dbl>[39m[23m 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.6…
$ age        [3m[90m<dbl>[39m[23m 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9…
$ dis        [3m[90m<dbl>[39m[23m 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9…
$ rad        [3m[90m<int>[39m[23m 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4,…
$ tax        [3m[90m<dbl>[39m[23m 296, 242, 242, 222, 222, 222, 311, 311, 311, 311,

## Step 1. Split the Data

In [42]:
# Create training index
bt_split <- initial_split(bt,
                          prop = 0.8,
                          strata = medv_class)

In [43]:
# Create training and test sets

## Training set
bt_train <- training(bt_split)

## Test set
bt_test <- testing(bt_split)

## Step 2. Prepare the Recipe

In [44]:
# Create a recipe
bt_recipe <- recipe(bt_train) |>
                
                ## Remove near-zero variance predictors
                step_nzv(all_numeric_predictors()) |>

                ## Handle multicollinearity
                step_corr(all_numeric_predictors(),
                          threshold = 0.8) |>

                ## Dummy-code nominal predictors
                step_dummy(all_nominal_predictors()) |>

                ## Normalsie the data
                step_normalize(all_numeric_predictors())

## Step 3. Prepare & Bake

In [45]:
# Prepare the recipe
bt_recipe_prep <- prep(bt_recipe,
                       data = bt_train)

In [46]:
# Bake the training and test sets

## Training set
bt_train_prep <- bake(bt_recipe_prep,
                      new_data = NULL)

## Test set
bt_test_prep <- bake(bt_recipe_prep,
                     new_data = bt_test)

## Step 4. Instantiate the Model

In [47]:
# Instantiate decision tree
dt_model <- decision_tree() |>

    ## Set engine
    set_engine("rpart") |>

    ## Set mode
    set_mode("classification")

## Step 5. Train the Model

In [48]:
# Train the model
dt_model_fit <- fit(dt_model,
                    medv_class ~ .,
                    data = bt_train_prep)

## Step 6. Make Predictions

In [None]:
# Predict the outcome
y_pred <- predict(dt_model_fit,
                  new_data = bt_test_prep,
                  type = "class")

# View the result
head(y_pred)

.pred_class
<fct>
high
high
low
low
low
low


In [55]:
# Get predicted probabilities
y_prob <- predict(dt_model_fit,
                  new_data = bt_test_prep,
                  type = "prob")

# View the results
head(y_prob)

.pred_high,.pred_low
<dbl>,<dbl>
0.9551282,0.04487179
0.9551282,0.04487179
0.12,0.88
0.1046512,0.89534884
0.1046512,0.89534884
0.1046512,0.89534884


In [57]:
# Store the results in a tibble
dt_results <- tibble(actual = bt_test_prep$medv_class,
                     predicted = y_pred$.pred_class,
                     prob_high = y_prob$.pred_high,
                     prob_low = y_prob$.pred_low)

# View the tibble
head(dt_results)

actual,predicted,prob_high,prob_low
<fct>,<fct>,<dbl>,<dbl>
high,high,0.9551282,0.04487179
high,high,0.9551282,0.04487179
low,low,0.12,0.88
low,low,0.1046512,0.89534884
low,low,0.1046512,0.89534884
low,low,0.1046512,0.89534884


## Step 7. Evaluate the Model

In [None]:
# Get 