In [4]:
# Helper packages
library(tidyverse)

# Modeling process packages
library(rsample) # for resampling procedures
library(caret) # for resampling and model training
library(h2o) # for resampling and model training

# h2o set-up
h2o.no_progress() # turn off h2o progress bars
h2o.init() # launch h2o

ames <- read_csv('data/ames.csv')
attrition <- read_csv('data/attrition.csv')

"package 'tidyverse' was built under R version 4.0.3"
-- [1mAttaching packages[22m ------------------------------------------------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.2     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.4     [32mv[39m [34mdplyr  [39m 1.0.2
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.0

"package 'tibble' was built under R version 4.0.3"
"package 'readr' was built under R version 4.0.3"
-- [1mConflicts[22m ---------------------------------------------------------------------------------- tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

"package 'rsample' was built under R version 4.0.3"
"package 'caret' was built under R version 4.0.3"
Loading required pa


H2O is not running yet, starting it now...

Note:  In case of errors look at the following log files:
    C:\Users\User\AppData\Local\Temp\Rtmpgj6cAT\file304c34586ccd/h2o_User_started_from_r.out
    C:\Users\User\AppData\Local\Temp\Rtmpgj6cAT\file304c2d121bcf/h2o_User_started_from_r.err


Starting H2O JVM and connecting:  Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         6 seconds 775 milliseconds 
    H2O cluster timezone:       America/New_York 
    H2O data parsing timezone:  UTC 
    H2O cluster version:        3.32.0.1 
    H2O cluster version age:    3 months and 28 days !!! 
    H2O cluster name:           H2O_started_from_R_User_osn291 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   3.50 GB 
    H2O cluster total cores:    12 
    H2O cluster allowed cores:  12 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 


"
Your H2O cluster version is too old (3 months and 28 days)!
Please download and install the latest version from http://h2o.ai/download/"






[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  .default = col_character(),
  Lot_Frontage = [32mcol_double()[39m,
  Lot_Area = [32mcol_double()[39m,
  Year_Built = [32mcol_double()[39m,
  Year_Remod_Add = [32mcol_double()[39m,
  Mas_Vnr_Area = [32mcol_double()[39m,
  BsmtFin_SF_1 = [32mcol_double()[39m,
  BsmtFin_SF_2 = [32mcol_double()[39m,
  Bsmt_Unf_SF = [32mcol_double()[39m,
  Total_Bsmt_SF = [32mcol_double()[39m,
  First_Flr_SF = [32mcol_double()[39m,
  Second_Flr_SF = [32mcol_double()[39m,
  Low_Qual_Fin_SF = [32mcol_double()[39m,
  Gr_Liv_Area = [32mcol_double()[39m,
  Bsmt_Full_Bath = [32mcol_double()[39m,
  Bsmt_Half_Bath = [32mcol_double()[39m,
  Full_Bath = [32mcol_double()[39m,
  Half_Bath = [32mcol_double()[39m,
  Bedroom_AbvGr = [32mcol_double()[39m,
  Kitchen_AbvGr = [32mcol_double()[39m,
  TotRms_AbvGrd = [32mcol_

In [5]:
# Set up data as h2o object
## also means converting any factors 


ames.h2o <- as.h2o(ames)
# Job attrition data
churn <- attrition %>%
mutate_if(is.ordered, .funs = factor, ordered = FALSE)
churn.h2o <- as.h2o(churn)

In [6]:
# Different ways to split the data

# Using base R
set.seed(123) # for reproducibility
index_1 <- sample(1:nrow(ames), round(nrow(ames) * 0.7))
train_1 <- ames[index_1, ]
test_1 <- ames[-index_1, ]

In [7]:
# Using caret package
set.seed(123) # for reproducibility
index_2 <- createDataPartition(ames$Sale_Price, p = 0.7,
list = FALSE)
train_2 <- ames[index_2, ]
test_2 <- ames[-index_2, ]

"The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
Convert to a vector.


In [8]:
# Using rsample package
set.seed(123) # for reproducibility
split_1 <- initial_split(ames, prop = 0.7)
train_3 <- training(split_1)
test_3 <- testing(split_1)

In [9]:
# Using h2o package
split_2 <- h2o.splitFrame(ames.h2o, ratios = 0.7,
seed = 123)
train_4 <- split_2[[1]]
test_4 <- split_2[[2]]

In [10]:
# Do stratified sampling

set.seed(123)
split_strat <- initial_split(churn, prop = 0.7,
strata = "Attrition")
train_strat <- training(split_strat)
test_strat <- testing(split_strat)


# Compare train and test
table(train_strat$Attrition) %>% prop.table()
table(test_strat$Attrition) %>% prop.table()


      No      Yes 
0.838835 0.161165 


       No       Yes 
0.8386364 0.1613636 

In [11]:
# Stratified sampling with the rsample package
set.seed(123)
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)

In [14]:
# Specify resampling strategy
cv <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5
)


# Create grid of hyperparameter values
hyper_grid <- expand.grid(k = seq(2, 25, by = 1))

# Tune a knn model using grid search
knn_fit <- train(
Sale_Price ~ .,
data = ames_train,
method = "knn",
trControl = cv,
tuneGrid = hyper_grid,
metric = "RMSE"
)

In [15]:
# Print and plot the CV results
knn_fit

k-Nearest Neighbors 

2053 samples
  80 predictor

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 1848, 1848, 1848, 1847, 1849, 1847, ... 
Resampling results across tuning parameters:

  k   RMSE      Rsquared   MAE     
   2  47850.39  0.6536592  31016.31
   3  45875.81  0.6769848  29784.85
   4  44529.45  0.6949249  28991.93
   5  43944.52  0.7026961  28737.77
   6  43645.34  0.7079746  28553.04
   7  43439.07  0.7129916  28617.80
   8  43657.69  0.7123356  28768.62
   9  43799.09  0.7129019  28904.80
  10  44058.33  0.7108970  29061.31
  11  44304.91  0.7091949  29197.78
  12  44565.82  0.7073437  29320.81
  13  44797.89  0.7056535  29475.03
  14  44966.27  0.7051474  29561.70
  15  45188.96  0.7035986  29731.55
  16  45376.24  0.7027131  29860.77
  17  45558.05  0.7016237  29974.52
  18  45666.30  0.7021351  30018.59
  19  45836.45  0.7013008  30105.59
  20  46044.44  0.6997198  30235.80
  21  46242.59  0.6983978  30367.95
  22  

The question remains: “Is this the best predictive model we can find?” We may
have identified the optimal k-nearest neighbor model for our given data set,
but this doesn’t mean we’ve found the best possible overall model.