In [None]:
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  # Let install.packages() use every detected core.
  options(Ncpus = parallel::detectCores())
  packages_to_install <- c("caret", "doParallel", "pastecs")
  # system.file() returns "" for a package that is not installed; this avoids
  # the slow scan of the whole library performed by installed.packages().
  is_installed <- vapply(
    packages_to_install,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1)
  )
  missing_packages <- packages_to_install[!is_installed]
  # Skip the call entirely when there is nothing left to install.
  if (length(missing_packages) > 0) {
    install.packages(missing_packages)
  }
}

## 7.1 自動車の停止距離




## 7.2 データの確認

In [None]:
# Attach the packages used in this section.
library(caret)
library(tidyverse)

# Work on a copy of the `cars` data set.
my_data <- cars

In [None]:
# Number of rows and columns.
dim(x = my_data)

In [None]:
# First few rows.
head(x = my_data)

In [None]:
# Print numbers with three significant digits from here on.
options(digits = 3)
# Descriptive statistics for each column.
pastecs::stat.desc(x = my_data)

In [None]:
# Scatter plot of dist against speed.
ggplot(data = my_data, mapping = aes(x = speed, y = dist)) +
  geom_point()

## 7.3 回帰分析

In [None]:
library(tidyverse)

my_data <- cars
# Single point that is highlighted with dotted guide lines.
tmp <- data.frame(speed = 21.5, dist = 67)

ggplot(data = my_data, mapping = aes(x = speed, y = dist)) +
  coord_cartesian(xlim = c(4, 25), ylim = c(0, 120)) +
  geom_point() +
  # Least-squares line fitted by ggplot2 itself.
  stat_smooth(formula = y ~ x, method = "lm") +
  # Vertical dotted segment ending at the highlighted point.
  geom_pointrange(
    data = tmp,
    mapping = aes(ymin = -9, ymax = dist),
    linetype = "dotted"
  ) +
  # Horizontal dotted segment ending at the highlighted point.
  geom_pointrange(
    data = tmp,
    mapping = aes(xmin = 0, xmax = speed),
    linetype = "dotted"
  )

In [None]:
# Attach the packages used in this section.
library(caret)
library(tidyverse)

# Work on a copy of the `cars` data set.
my_data <- cars

In [None]:
# Fit dist as a function of speed with caret's "lm" method.
my_model <- train(
  form = dist ~ speed, # model formula (relation between output and input)
  data = my_data,      # data
  method = "lm"        # method
)

In [None]:
# Coefficients of the underlying fitted model.
coef(object = my_model$finalModel)

In [None]:
# Predict dist for a single new speed value.
tmp <- data.frame(speed = 21.5)
predict(my_model, newdata = tmp)

In [None]:
# Wrap the fitted model as a function of speed so it can be drawn as a curve.
f <- function(x) {
  predict(my_model, newdata = data.frame(speed = x))
}

In [None]:
# Observations and the model curve, distinguished by colour.
ggplot(
  data = my_data,
  mapping = aes(x = speed, y = dist, color = "data")
) +
  geom_point() +
  stat_function(fun = f, mapping = aes(color = "model"))

## 7.4 当てはまりの良さの指標

In [None]:
library(caret)
library(tidyverse)

my_data <- cars
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm"
)

# Observed values and the model's predictions on the same rows.
y <- my_data$dist
y_ <- predict(my_model, newdata = my_data)
# Keep the predictions next to the data for plotting.
my_data[["y_"]] <- y_

In [None]:
# Residual = observed minus predicted.
my_data[["residual"]] <- y - y_
head(x = my_data)

In [None]:
# Observations, the fitted line, and a dotted segment for each residual.
ggplot(data = my_data, mapping = aes(x = speed, y = dist)) +
  geom_point() +
  geom_line(mapping = aes(x = speed, y = y_)) +
  geom_linerange(
    mapping = aes(ymin = y_, ymax = dist),
    linetype = "dotted"
  )

In [None]:
RMSE(pred = y_, obs = y)
# Alternatively: square root of the mean squared residual
mean(my_data$residual^2)^0.5


In [None]:
# The argument name `formula` is spelled out in full; the abbreviation
# `form` only reaches it through partial argument matching.
R2(pred = y_, obs = y,
   formula = "traditional")

In [None]:
# The argument name `formula` is spelled out in full; the abbreviation
# `form` only reaches it through partial argument matching.
R2(pred = y_, obs = y,
   formula = "corr")
# Alternatively
summary(my_model$finalModel)$r.squared

In [None]:
# Evaluate both R2 variants on just the first three rows.
my_test <- my_data[1:3, ]
y  <- my_test$dist
y_ <- predict(my_model, newdata = my_test)

# The argument name `formula` is spelled out in full; the abbreviation
# `form` only reaches it through partial argument matching.
R2(pred = y_, obs = y,
   formula = "traditional")

R2(pred = y_, obs = y,
   formula = "corr")

In [None]:
library(caret)
library(tidyverse)

my_data <- cars
# Row numbers of the six observations used as the sample.
my_idx <- c(2, 11, 27, 34, 39, 44)
my_sample <- my_data[my_idx, , drop = FALSE]

In [None]:
# Fit a degree-5 raw polynomial to the six sampled rows.
# Warnings are silenced for this call only: suppressWarnings() leaves the
# global `warn` option untouched, so the previous setting survives even if
# train() stops with an error.
my_model <- suppressWarnings(
  train(form = dist ~ poly(speed, degree = 5, raw = TRUE),
        data = my_sample,
        method = "lm")
)

y  <- my_sample$dist
y_ <- predict(my_model, newdata = my_sample)

In [None]:
RMSE(pred = y_, obs = y)

# The argument name `formula` is spelled out in full; the abbreviation
# `form` only reaches it through partial argument matching.
R2(pred = y_, obs = y,
   formula = "traditional")

R2(pred = y_, obs = y,
   formula = "corr")

In [None]:
# Wrap the fitted model as a function of speed so it can be drawn as a curve.
f <- function(x) {
  predict(my_model, newdata = data.frame(speed = x))
}

# All observations, the sampled rows, and the model curve.
ggplot(
  data = my_data,
  mapping = aes(x = speed, y = dist, color = "data")
) +
  geom_point() +
  geom_point(data = my_sample, mapping = aes(color = "sample")) +
  stat_function(fun = f, mapping = aes(color = "model")) +
  coord_cartesian(ylim = c(0, 120))

## 7.5 K最近傍法

In [None]:
# Preparation
library(caret)
library(tidyverse)
my_data <- cars

# Training
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "knn"
)

# Preparation for visualisation
f <- function(x) {
  predict(my_model, newdata = data.frame(speed = x))
}

In [None]:
# Observations and the model curve, distinguished by colour.
ggplot(
  data = my_data,
  mapping = aes(x = speed, y = dist, color = "data")
) +
  geom_point() +
  stat_function(fun = f, mapping = aes(color = "model"))

In [None]:
# Observed values and predictions on the training data.
y  <- my_data$dist
y_ <- predict(my_model, newdata = my_data)

RMSE(pred = y_, obs = y)

# The argument name `formula` is spelled out in full; the abbreviation
# `form` only reaches it through partial argument matching.
R2(pred = y_, obs = y,
   formula = "traditional")

R2(pred = y_, obs = y,
   formula = "corr")

## 7.6 検証

In [None]:
library(caret)
library(tidyverse)

my_data <- cars
# No trControl is given, so train() uses its default resampling.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm"
)

my_model[["results"]]

In [None]:
# 5-fold cross-validation.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm",
  trControl = trainControl(method = "cv", number = 5)
)
my_model[["results"]]

In [None]:
# Leave-one-out cross-validation.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm",
  trControl = trainControl(method = "LOOCV")
)
my_model[["results"]]

In [None]:
library(doParallel)
# Start one worker per detected core and register the cluster with
# doParallel.
# NOTE(review): the cluster is never stopped in this file; call
# stopCluster(cl) once parallel work is finished.
cl <- makeCluster(spec = detectCores())
registerDoParallel(cl = cl)

In [None]:
library(caret)
library(tidyverse)

my_data <- cars
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm"
)
# Observed values and predictions on the training data.
y <- my_data$dist
y_ <- predict(my_model, newdata = my_data)

In [None]:
# RMSE (training)
RMSE(pred = y_, obs = y)

# Coefficient of determination 1 (training)
# The argument name `formula` is spelled out in full; the abbreviation
# `form` only reaches it through partial argument matching.
R2(pred = y_, obs = y,
   formula = "traditional")

# Coefficient of determination 6 (training)
R2(pred = y_, obs = y,
   formula = "corr")

In [None]:
# Summary metrics for the predictions in a single call.
postResample(obs = y, pred = y_)

In [None]:
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm"
)
my_model[["results"]]
# From left to right: RMSE (validation), coefficient of determination 6
# (validation), MAE (validation)

In [None]:
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm",
  trControl = trainControl(method = "LOOCV")
)

# Method 1
my_model[["results"]]

# Method 2: recompute from the stored observed and predicted values
y <- my_model$pred$obs
y_ <- my_model$pred$pred
mean((y - y_)^2)^0.5

In [None]:
# Mean of the per-observation square roots of the squared errors.
mean(((y - y_)^2)^0.5)

In [None]:
library(caret)
library(tidyverse)
my_data <- cars

# Linear regression, validated with leave-one-out cross-validation.
my_lm_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm",
  trControl = trainControl(method = "LOOCV")
)

# K-nearest neighbours with k fixed at 5, validated the same way.
my_knn_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "knn",
  tuneGrid = data.frame(k = 5),
  trControl = trainControl(method = "LOOCV")
)

In [None]:
# Validation RMSE of each model.
my_lm_model$results[["RMSE"]]

my_knn_model$results[["RMSE"]]

In [None]:
y <- my_data$dist
y_lm <- my_lm_model$pred$pred
y_knn <- my_knn_model$pred$pred

# Squared error of each stored prediction, one column per model.
my_df <- data.frame(lm = (y - y_lm)^2, knn = (y - y_knn)^2)

head(x = my_df)

In [None]:
# Distribution of the squared errors for each model.
boxplot(x = my_df, ylab = "r^2")

In [None]:
# Two-sided paired t-test on the squared errors of the two models.
t.test(
  x = my_df$lm,
  y = my_df$knn,
  alternative = "two.sided",
  paired = TRUE,
  conf.level = 0.95
)


## 7.7 パラメータチューニング

In [None]:
library(caret)
library(tidyverse)

my_data <- cars
# No tuneGrid is given, so train() picks the candidate values of k itself.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "knn"
)
my_model[["results"]]

In [None]:
# Candidate values of k: 1 through 15.
my_params <- expand.grid(k = 1:15)

my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "knn",
  tuneGrid = my_params,
  trControl = trainControl(method = "LOOCV")
)

In [None]:
# First few rows of the per-k validation results.
head(x = my_model$results)

In [None]:
# Plot the tuning results stored in the train object.
ggplot(data = my_model)

In [None]:
# Parameter value selected by train().
my_model[["bestTune"]]

In [None]:
# Row(s) of the results table with the smallest RMSE.
filter(my_model$results, RMSE == min(RMSE))

In [None]:
# RMSE of the tuned model on the data it was trained on.
y <- my_data$dist
y_ <- predict(my_model, newdata = my_data)
RMSE(pred = y_, obs = y)

In [None]:
library(caret)
library(tidyverse)
my_data <- cars

# Fit k-nearest neighbours with a fixed k under leave-one-out
# cross-validation and report both error figures.
#
# Args:
#   k:    number of neighbours.
#   data: data frame with `speed` and `dist` columns. Defaults to the global
#         `my_data`, so existing calls of the form my_loocv(k) are unchanged.
#
# Returns:
#   A list with `k`, `training` (RMSE of the final model's predictions on
#   `data`) and `validation` (RMSE reported in the model's results table).
my_loocv <- function(k, data = my_data) {
  my_model <- train(form = dist ~ speed, data = data, method = "knn",
                    tuneGrid = data.frame(k = k),
                    trControl = trainControl(method = "LOOCV"))
  y  <- data$dist
  y_ <- predict(my_model, newdata = data)
  list(k = k,
       training = RMSE(pred = y_, obs = y),  # RMSE (training)
       validation = my_model$results$RMSE)   # RMSE (validation)
}

# One result row per k from 1 to 15.
my_results <- 1:15 %>% map_dfr(my_loocv)

In [None]:
# Training and validation RMSE against k, one line per series.
ggplot(
  data = pivot_longer(my_results, cols = -k),
  mapping = aes(x = k, y = value, color = name)
) +
  geom_line() +
  geom_point() +
  labs(x = "#Neighbors", y = "RMSE") +
  # Anchor the legend's bottom-right corner at the bottom-right of the plot.
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0)
  )