In [1]:
# Loading in all necessary libraries.

library(tidyverse)
library(tidymodels)
library(repr)
library(GGally)
# Cap the number of rows shown when a table is displayed in the notebook.
options(repr.matrix.max.rows = 10)

# Fix the random seed so the random steps that follow are reproducible.
set.seed(14)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.0     [32m✔[39m [34mrsample     [39m 1.0.0
[32m✔[39m [34mdials       [39m 1.0.0     [32m✔[39m [34mtune        [39m 1.0.0
[32m✔[39m [34minfer       [39m 1.0.2     [32m✔[39m [34mworkflows   [39m 1.0.0
[32m✔

In [2]:
# Load the Cleveland heart-disease data set from the UCI repository.

URL <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# The file has no header row, so the column names (taken from the UCI
# repository documentation) are supplied while reading.
# Missing values are written as "?" in this file (per the UCI documentation);
# declaring that marker in `na` lets readr parse `ca` and `thal` as numbers
# directly instead of reading them as text and coercing afterwards, which is
# what raised the "NAs introduced by coercion" warnings.
cleveland_data <- read_csv(
    URL,
    col_names = c("age", "sex", "cp", "trestbps",
                  "chol", "fbs", "restecg", "thalach",
                  "exang", "oldpeak", "slope", "ca", "thal", "num"),
    na = c("", "NA", "?"),
    show_col_types = FALSE
) |>
    # Guard: guarantees numeric columns even if an unexpected token slipped
    # through; a no-op when the columns were already parsed as numbers.
    mutate(ca = as.numeric(ca), thal = as.numeric(thal)) |>
    # Keep only the rows in which both `ca` and `thal` are present.
    drop_na(ca, thal)

“NAs introduced by coercion”
“NAs introduced by coercion”


In [3]:
# Derive a binary heart-disease label `hd` from the diagnosis code `num`:
# codes 1 to 4 become "yes", code 0 becomes "no".
cl_have_disease <- cleveland_data |>
    filter(num %in% c(1, 2, 3, 4)) |>
    mutate(hd = as_factor("yes"))
cl_no_disease <- cleveland_data |>
    filter(num %in% 0) |>
    mutate(hd = as_factor("no"))
# Stack the two groups, rows labelled "yes" first.
cl_data_alt <- cl_have_disease |> rbind(cl_no_disease)

In [4]:
# Load the Hungarian heart-disease data set (space-delimited, no header row).

URL2 <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/reprocessed.hungarian.data"

# NOTE(review): the UCI documentation suggests missing values in this file are
# coded as -9; nothing here treats them as missing -- confirm this is intended.
hungarian_data <- read_delim(URL2, delim = " ", col_names = FALSE,
                             show_col_types = FALSE) |>
    setNames(c("age", "sex", "cp", "trestbps",
               "chol", "fbs", "restecg", "thalach",
               "exang", "oldpeak", "slope", "ca", "thal", "num"))

# Collapse the diagnosis code into the binary label `hd`:
# codes 1 to 4 become "yes", code 0 becomes "no".
hung_have_disease <- hungarian_data |>
    filter(num %in% c(1, 2, 3, 4)) |>
    mutate(hd = as_factor("yes"))
hung_no_disease <- hungarian_data |>
    filter(num %in% 0) |>
    mutate(hd = as_factor("no"))
# Stack the two groups, rows labelled "yes" first.
hung_data_alt <- hung_have_disease |> rbind(hung_no_disease)

In [6]:
# Combine both sources into one table: Hungarian rows first, then Cleveland.
heart_data <- hung_data_alt |> rbind(cl_data_alt)

In [7]:
# Class balance of `hd`: number of observations per label and their share of
# the whole data set, in percent.
num_obs_h <- nrow(heart_data)
heart_proportions <- heart_data |>
    group_by(hd) |>
    summarise(count = n(), percentage = count / num_obs_h * 100)
heart_proportions

hd,count,percentage
<fct>,<int>,<dbl>
yes,243,41.11675
no,348,58.88325


In [8]:
# Split into training (75%) and testing sets, stratified by `hd`.
heart_split <- heart_data |>
    initial_split(prop = 0.75, strata = hd)
heart_training <- heart_split |> training()
heart_testing <- heart_split |> testing()

In [9]:
# Forward selection of predictors for a K-nearest-neighbours classifier.
#
# Works on a copy of the training data with the raw diagnosis code `num`
# removed. The copy itself is not standardized here; scaling and centering
# are applied inside each candidate recipe below.
heart_subset <- heart_training |> select(-num)

# Candidate predictors: every column except the response `hd`.
# (This variable masks base::names as a value only; calls to names() still
# resolve to the function.)
names <- colnames(heart_subset |> select(-hd))

# Tibble collecting the best model found at each model size.
accuracies <- tibble(size = integer(),
                     model_string = character(),
                     accuracy = numeric())

# KNN model specification; the number of neighbours is left to be tuned.
knn_FS_spec <- nearest_neighbor(weight_func = "rectangular",
                                neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification")

# 5-fold cross-validation object, stratified by `hd`, reused by every candidate.
heart_vfold <- vfold_cv(heart_subset, v = 5, strata = hd)

# Total number of predictors, i.e. the number of selection rounds.
n_total <- length(names)

# Predictors chosen so far, in the order they were added.
selected <- c()
# seq_len()/seq_along() rather than 1:n so that an empty candidate set runs
# zero rounds instead of iterating over c(1, 0).
for (i in seq_len(n_total)) {
    # Preallocate this round's results: one slot per remaining candidate.
    accs <- numeric(length(names))
    models <- character(length(names))
    for (j in seq_along(names)) {
        # Candidate model: the predictors selected so far plus one new one.
        preds_new <- c(selected, names[[j]])
        model_string <- paste("hd", "~", paste(preds_new, collapse = "+"))
        print(model_string)
        heart_FS_recipe <- recipe(as.formula(model_string),
                                  data = heart_subset) |>
            step_scale(all_predictors()) |>
            step_center(all_predictors())
        # Score the candidate by its best mean cross-validated accuracy over
        # the tuned values of `neighbors`.
        acc <- workflow() |>
            add_recipe(heart_FS_recipe) |>
            add_model(knn_FS_spec) |>
            tune_grid(resamples = heart_vfold, grid = 10) |>
            collect_metrics() |>
            filter(.metric == "accuracy") |>
            summarize(mx = max(mean)) |>
            pull(mx)
        accs[j] <- acc
        models[j] <- model_string
    }
    # Keep the candidate with the highest accuracy and remove it from the pool.
    jstar <- which.max(accs)
    accuracies <- accuracies |>
        add_row(size = i,
                model_string = models[[jstar]],
                accuracy = accs[[jstar]])
    selected <- c(selected, names[[jstar]])
    names <- names[-jstar]
}
accuracies

[1] "hd ~ age"
[1] "hd ~ sex"
[1] "hd ~ cp"
[1] "hd ~ trestbps"
[1] "hd ~ chol"
[1] "hd ~ fbs"
[1] "hd ~ restecg"
[1] "hd ~ thalach"
[1] "hd ~ exang"
[1] "hd ~ oldpeak"
[1] "hd ~ slope"
[1] "hd ~ ca"
[1] "hd ~ thal"
[1] "hd ~ oldpeak+age"
[1] "hd ~ oldpeak+sex"
[1] "hd ~ oldpeak+cp"
[1] "hd ~ oldpeak+trestbps"
[1] "hd ~ oldpeak+chol"
[1] "hd ~ oldpeak+fbs"
[1] "hd ~ oldpeak+restecg"
[1] "hd ~ oldpeak+thalach"
[1] "hd ~ oldpeak+exang"
[1] "hd ~ oldpeak+slope"
[1] "hd ~ oldpeak+ca"
[1] "hd ~ oldpeak+thal"
[1] "hd ~ oldpeak+cp+age"
[1] "hd ~ oldpeak+cp+sex"
[1] "hd ~ oldpeak+cp+trestbps"
[1] "hd ~ oldpeak+cp+chol"
[1] "hd ~ oldpeak+cp+fbs"
[1] "hd ~ oldpeak+cp+restecg"
[1] "hd ~ oldpeak+cp+thalach"
[1] "hd ~ oldpeak+cp+exang"
[1] "hd ~ oldpeak+cp+slope"
[1] "hd ~ oldpeak+cp+ca"
[1] "hd ~ oldpeak+cp+thal"
[1] "hd ~ oldpeak+cp+thal+age"
[1] "hd ~ oldpeak+cp+thal+sex"
[1] "hd ~ oldpeak+cp+thal+trestbps"
[1] "hd ~ oldpeak+cp+thal+chol"
[1] "hd ~ oldpeak+cp+thal+fbs"
[1] "hd ~ oldpeak+cp+thal+

size,model_string,accuracy
<int>,<chr>,<dbl>
1,hd ~ oldpeak,0.7043508
2,hd ~ oldpeak+cp,0.7813449
3,hd ~ oldpeak+cp+thal,0.8130360
4,hd ~ oldpeak+cp+thal+fbs,0.8130110
5,hd ~ oldpeak+cp+thal+fbs+slope,0.8062944
⋮,⋮,⋮
9,hd ~ oldpeak+cp+thal+fbs+slope+sex+restecg+exang+ca,0.8083895
10,hd ~ oldpeak+cp+thal+fbs+slope+sex+restecg+exang+ca+thalach,0.8039451
11,hd ~ oldpeak+cp+thal+fbs+slope+sex+restecg+exang+ca+thalach+age,0.8151061
12,hd ~ oldpeak+cp+thal+fbs+slope+sex+restecg+exang+ca+thalach+age+chol,0.8196011
