In [None]:
# Helper packages
library(tidyverse)
library(visdat) # for additional visualizations
# Feature engineering packages
library(caret) # for various ML tasks
library(recipes) # for feature engineering tasks

library(rsample)

In [None]:
# Read the Ames housing data and hold out 30% of the rows for testing.
# The seed is fixed immediately before the split so the partition is
# repeatable; sampling is stratified on Sale_Price.
ames <- read_csv("data/ames.csv")
set.seed(123)
split <- initial_split(data = ames, prop = 0.7, strata = "Sale_Price")
ames_test <- testing(split)
ames_train <- training(split)

In [None]:
# Base recipe: model Sale_Price on every other column and log-transform
# the outcome. Later cells extend this recipe with further steps.
ames_recipe <- step_log(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)

In [None]:
# Total number of NA cells across the whole raw Ames data set
AmesHousing::ames_raw %>%
  is.na() %>%
  sum()

In [None]:
# Raster map of missingness in the raw Ames data.
# is.na() yields a logical matrix; melting it gives one row per cell with
# columns Var1, Var2 and value, which are mapped to the plot below.
ggplot(
  data = reshape2::melt(is.na(AmesHousing::ames_raw)),
  mapping = aes(x = Var2, y = Var1, fill = value)
) +
  geom_raster() +
  coord_flip() +
  scale_y_continuous(name = NULL, expand = c(0, 0)) +
  scale_fill_grey(name = "", labels = c("Present", "Missing")) +
  labs(x = "Observation") +
  # Small tick labels so the many axis entries stay legible
  theme(axis.text.y = element_text(size = 4))

In [None]:
# Same missingness overview in one call via visdat; cluster = TRUE is
# passed so rows with similar missingness are grouped together.
AmesHousing::ames_raw %>%
  vis_miss(cluster = TRUE)

In [None]:
# Median imputation of Gr_Liv_Area.
# step_medianimpute() was deprecated in recipes 0.1.16 and later made
# defunct; step_impute_median() is its replacement (requires
# recipes >= 0.1.16).
ames_recipe %>%
  step_impute_median(Gr_Liv_Area)

In [None]:
# K-nearest-neighbour imputation of every predictor, using 6 neighbours.
# step_knnimpute() was deprecated in recipes 0.1.16 and later made
# defunct; step_impute_knn() is its replacement (requires
# recipes >= 0.1.16).
ames_recipe %>%
  step_impute_knn(all_predictors(), neighbors = 6)

In [None]:
# Bagged-tree imputation of every predictor.
# step_bagimpute() was deprecated in recipes 0.1.16 and later made
# defunct; step_impute_bag() is its replacement (requires
# recipes >= 0.1.16).
ames_recipe %>%
  step_impute_bag(all_predictors())

In [None]:
# List the training-set columns that caret flags as near-zero variance.
# saveMetrics = TRUE returns the per-column diagnostics (with column names
# as row names), so the names are moved into a regular column before
# keeping only the rows where the nzv flag is TRUE.
ames_train %>%
  caret::nearZeroVar(saveMetrics = TRUE) %>%
  rownames_to_column() %>%
  filter(nzv)

In [None]:
# Yeo-Johnson transformation applied to every numeric column.
# NOTE(review): all_numeric() is not restricted to predictors, so the
# outcome Sale_Price is presumably transformed as well - confirm intent.
ames_train %>%
  recipe(formula = Sale_Price ~ .) %>%
  step_YeoJohnson(all_numeric())

In [None]:
# Standardize the numeric predictors: subtract the mean, then divide by
# the standard deviation. The outcome is excluded from both steps.
ames_recipe %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())

In [None]:
# Frequency of each Neighborhood level in the training data, rarest first
ames_train %>%
  count(Neighborhood) %>%
  arrange(n)

In [None]:
# Frequency of each distinct Screen_Porch value in the training data,
# rarest first
ames_train %>%
  count(Screen_Porch) %>%
  arrange(n)

In [None]:
library(Ecdat)

In [None]:
# Cross-tabulate gashw by prefarea in the Housing data set
# (Housing is expected to come from the Ecdat package).
Housing %>%
  count(gashw, prefarea)

In [None]:
# Collapse infrequent categories into a single catch-all level.
#
# Two features are lumped:
#   * Neighborhood: levels seen in fewer than 1% of rows become "other"
#   * Screen_Porch: values seen in fewer than 10% of rows become ">0"
# NOTE(review): Screen_Porch looks numeric in this data set; recent recipes
# releases may only accept string/factor columns in step_other() - confirm
# this still runs on the installed version.
lumping <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(Neighborhood, threshold = 0.01, other = "other") %>%
  step_other(Screen_Porch, threshold = 0.1, other = ">0")

# Estimate the recipe on the training data and apply it to that same data
apply_2_training <- bake(
  prep(lumping, training = ames_train),
  new_data = ames_train
)

# Neighborhood frequencies after lumping, rarest first
apply_2_training %>%
  count(Neighborhood) %>%
  arrange(n)

In [None]:
# Screen_Porch frequencies after lumping, rarest first
apply_2_training %>%
  count(Screen_Porch) %>%
  arrange(n)

In [None]:
# One-hot encode every nominal column: with one_hot = TRUE each level gets
# its own indicator column instead of dropping a reference level.
ames_train %>%
  recipe(formula = Sale_Price ~ .) %>%
  step_dummy(all_nominal(), one_hot = TRUE)

In [None]:
# MS_SubClass levels and their counts before any encoding
ames_train %>%
  count(MS_SubClass)

In [None]:
# Integer-encode MS_SubClass, then show the counts of the encoded values
# on the training data.
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_integer(MS_SubClass) %>%
  prep(training = ames_train) %>%
  bake(new_data = ames_train) %>%
  count(MS_SubClass)

In [None]:
# Integer-encode Overall_Qual, then show the counts of the encoded values
# on the training data.
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_integer(Overall_Qual) %>%
  prep(training = ames_train) %>%
  bake(new_data = ames_train) %>%
  count(Overall_Qual)

In [None]:
# PCA on the standardized numeric predictors, keeping as many components
# as are needed to retain 95% of the variance.
#
# The outcome is excluded from every step: all_numeric() on its own also
# selects Sale_Price, which would then be centered, scaled and absorbed
# into the principal components, removing the response from the data.
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_pca(all_numeric(), -all_outcomes(), threshold = 0.95)

In [None]:
# Feature-engineering blueprint, applied in this order:
#   1. drop near-zero-variance nominal features
#   2. integer-encode columns whose names match Qual / Cond / QC / Qu
#   3. center and 4. scale the numeric predictors
#   5. replace the numeric predictors with principal components
blueprint <- ames_train %>%
  recipe(formula = Sale_Price ~ .) %>%
  step_nzv(all_nominal()) %>%
  step_integer(matches("Qual|Cond|QC|Qu")) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_pca(all_numeric(), -all_outcomes())

# Display the (still unprepared) recipe
blueprint

In [None]:
# Estimate the blueprint's parameters from the training data only
prepare <- blueprint %>%
  prep(training = ames_train)

# Display the prepared recipe
prepare

In [None]:
# Apply the prepared blueprint to both partitions. The test set is
# transformed with the parameters estimated on the training set.
baked_test <- prepare %>%
  bake(new_data = ames_test)
baked_train <- prepare %>%
  bake(new_data = ames_train)

# Display the transformed training data
baked_train

In [None]:
# Blueprint used for model tuning (overwrites the earlier `blueprint`):
#   1. drop near-zero-variance nominal features
#   2. integer-encode columns whose names match Qual / Cond / QC / Qu
#   3. center and 4. scale the numeric predictors
#   5. one-hot encode the remaining nominal predictors
blueprint <- ames_train %>%
  recipe(formula = Sale_Price ~ .) %>%
  step_nzv(all_nominal()) %>%
  step_integer(matches("Qual|Cond|QC|Qu")) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)

In [None]:
# Resampling scheme: 10-fold cross-validation, repeated 5 times
cv <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Candidate values of k: every value from 2 through 25
hyper_grid <- expand.grid(k = seq(from = 2, to = 25, by = 1))

# Grid search over k for a knn model. The recipe is passed to train()
# directly, so it is applied by caret as part of the resampling; the
# candidates are compared on RMSE.
knn_fit2 <- train(
  blueprint,
  data = ames_train,
  method = "knn",
  metric = "RMSE",
  tuneGrid = hyper_grid,
  trControl = cv
)

In [None]:
# Print the tuned knn model object returned by train().
# Left as a bare expression so the notebook's own display is used.
knn_fit2

In [None]:
# Plot the cross-validation results of the knn grid search
knn_fit2 %>%
  ggplot()