In [1]:
# Preliminary exploratory data analysis: attach the packages used by the
# cells below and configure how data frames are displayed.
library(tidyverse)
library(tidymodels)
# Cap the number of rows rendered when a data frame is displayed as cell output.
options(repr.matrix.max.rows = 6)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.0     [32m✔[39m [34mrsample     [39m 1.0.0
[32m✔[39m [34mdials       [39m 1.0.0     [32m✔[39m [34mtune        [39m 1.0.0
[32m✔[39m [34minfer       [39m 1.0.2     [32m✔[39m [34mworkflows   [39m 1.0.0
[32m✔

In [2]:
# Load the land mine data set directly from its GitHub-hosted CSV file
# and display it.
land_mine <- "https://raw.githubusercontent.com/zsong18/land_mine_dataset/main/Land%20mines.csv" |>
  read_csv()
land_mine

[1mRows: [22m[34m338[39m [1mColumns: [22m[34m4[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (4): V, H, S, M

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


V,H,S,M
<dbl>,<dbl>,<dbl>,<dbl>
0.3381568,0.0000000,1,1
0.3202413,0.1818182,1,1
0.2870087,0.2727273,1,1
⋮,⋮,⋮,⋮
0.3534739,0.4545455,6,5
0.3625373,0.7272727,6,5
0.3141990,1.0000000,6,5


In [3]:
# wrangling the data (1)
# Replace the one-letter column headers with descriptive names
# (assigned by position, in the order the columns appear).
new_names <- c("voltage", "height", "soil_type", "mine_type")
names(land_mine) <- new_names
land_mine

voltage,height,soil_type,mine_type
<dbl>,<dbl>,<dbl>,<dbl>
0.3381568,0.0000000,1,1
0.3202413,0.1818182,1,1
0.2870087,0.2727273,1,1
⋮,⋮,⋮,⋮
0.3534739,0.4545455,6,5
0.3625373,0.7272727,6,5
0.3141990,1.0000000,6,5


In [4]:
# wrangling the data (2)
# mine_type is the target variable and is categorical, so convert it
# from a number to a factor.
land_mine <- mutate(land_mine, mine_type = as_factor(mine_type))
land_mine

voltage,height,soil_type,mine_type
<dbl>,<dbl>,<dbl>,<fct>
0.3381568,0.0000000,1,1
0.3202413,0.1818182,1,1
0.2870087,0.2727273,1,1
⋮,⋮,⋮,⋮
0.3534739,0.4545455,6,5
0.3625373,0.7272727,6,5
0.3141990,1.0000000,6,5


In [5]:
# wrangling the data (3)
# Keep only the voltage, height and mine_type columns (soil_type is dropped).
land_mine <- land_mine |>
  select(voltage, height, mine_type)
land_mine

voltage,height,mine_type
<dbl>,<dbl>,<fct>
0.3381568,0.0000000,1
0.3202413,0.1818182,1
0.2870087,0.2727273,1
⋮,⋮,⋮
0.3534739,0.4545455,5
0.3625373,0.7272727,5
0.3141990,1.0000000,5


In [6]:
# Fix the seed so the random split is reproducible, then split the data
# 75% / 25% into training and testing sets, stratified by mine_type.
set.seed(1)
land_mine_split <- initial_split(land_mine, prop = 0.75, strata = mine_type)

land_mine_train <- training(land_mine_split)
land_mine_test <- testing(land_mine_split)

# Display both partitions: training set first, then testing set.
land_mine_train
land_mine_test

voltage,height,mine_type
<dbl>,<dbl>,<fct>
0.3381568,0.0000000,1
0.3202413,0.1818182,1
0.2409665,0.7272727,1
⋮,⋮,⋮
0.3534739,0.4545455,5
0.3625373,0.7272727,5
0.3141990,1.0000000,5


voltage,height,mine_type
<dbl>,<dbl>,<fct>
0.2870087,0.2727273,1
0.2562836,0.4545455,1
0.2628396,0.5454545,1
⋮,⋮,⋮
0.3781568,0.5454545,5
0.3474316,0.6363636,5
0.4441082,0.1818182,5


In [7]:
# Preprocessing recipe: predict mine_type from voltage and height,
# scaling and then centering both predictors using the training data.
set.seed(1)
land_mine_recipe <- recipe(mine_type ~ voltage + height, data = land_mine_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())
land_mine_recipe

Recipe

Inputs:

      role #variables
   outcome          1
 predictor          2

Operations:

Scaling for all_predictors()
Centering for all_predictors()

In [8]:
# K-nearest-neighbours classifier specification. The number of neighbours
# is left as tune() so it can be chosen later by cross-validation;
# "rectangular" weighting gives every neighbour an equal vote.
set.seed(1)
knn_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
  set_engine("kknn") |>
  set_mode("classification")
knn_spec

K-Nearest Neighbor Model Specification (classification)

Main Arguments:
  neighbors = tune()
  weight_func = rectangular

Computational engine: kknn 


In [12]:
# Choose the optimal value of k.
# Build 5-fold cross-validation folds on the training set (stratified by
# mine_type); the seed is fixed first so the fold assignment is reproducible.
set.seed(1)
land_mine_vfold <- vfold_cv(land_mine_train, v = 5, strata = mine_type)

# Candidate neighbour counts to evaluate: 1, 6, 11, ..., 96.
ks <- tibble(neighbors = seq(1, 100, by = 5))

# Combine recipe and model in a workflow, fit it for every candidate k on
# every fold, and collect the resulting cross-validation metrics.
knn_results <- workflow() |>
  add_recipe(land_mine_recipe) |>
  add_model(knn_spec) |>
  tune_grid(resamples = land_mine_vfold, grid = ks) |>
  collect_metrics()
knn_results

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,accuracy,multiclass,0.4398120,5,0.02646056,Preprocessor1_Model01
1,roc_auc,hand_till,0.6433586,5,0.01645443,Preprocessor1_Model01
6,accuracy,multiclass,0.4397983,5,0.02627058,Preprocessor1_Model02
⋮,⋮,⋮,⋮,⋮,⋮,⋮
91,roc_auc,hand_till,0.6763584,5,0.02545807,Preprocessor1_Model19
96,accuracy,multiclass,0.3496043,5,0.04647619,Preprocessor1_Model20
96,roc_auc,hand_till,0.6597358,5,0.02531704,Preprocessor1_Model20


In [None]:
# Keep only the accuracy rows of the tuning results and sort them so the
# best-performing k comes first.
accuracies <- arrange(
  filter(knn_results, .metric == "accuracy"),
  desc(mean)
)

accuracies