# 07

In [None]:
# Environment setup for Google Colaboratory.
# `Ncpus` has to be set before `install.packages()` runs, because the installer
# reads that option to decide how many parallel processes to use.
options(Ncpus = 2)
install.packages(c("caret", "doParallel", "pastecs"))

## 7.1 自動車の停止距離

## 7.2 データの確認

In [None]:
# Attach the modelling (caret) and data-handling/plotting (tidyverse) packages,
# then take a working copy of the built-in `cars` data set.
library(caret)
library(tidyverse)
my_data <- cars

In [None]:
# Size of the data set as (rows, columns).
my_data %>% dim()
#> [1] 50  2

In [None]:
# Preview the first six observations.
my_data %>% head()
#>   speed dist
#> 1     4    2
#> 2     4   10
#> 3     7    4
#> 4     7   22
#> 5     8   16
#> 6     9   10

In [None]:
# Descriptive statistics printed with three significant digits.
# `options(digits = 3)` is a session-wide setting, so the previous value is
# saved and restored right after the table is printed; otherwise every later
# cell would also print with only three digits.
old_opts <- options(digits = 3)
pastecs::stat.desc(my_data)
options(old_opts)
#>                speed    dist
#> nbr.val       50.000   50.00
#> nbr.null       0.000    0.00
#> nbr.na         0.000    0.00
#> min            4.000    2.00
#> max           25.000  120.00
#> range         21.000  118.00
#> sum          770.000 2149.00
#> median        15.000   36.00
#> mean          15.400   42.98
#> SE.mean        0.748    3.64
#> CI.mean.0.95   1.503    7.32
#> var           27.959  664.06
#> std.dev        5.288   25.77
#> coef.var       0.343    0.60

In [None]:
# Scatter plot of stopping distance against speed.
ggplot(data = my_data, mapping = aes(x = speed, y = dist)) +
  geom_point()

## 7.3 回帰分析

### 7.3.1 回帰分析とは何か

### 7.3.2 線形単回帰分析

In [None]:
library(tidyverse)

my_data <- cars
# Reference point (speed = 21.5, dist = 67) highlighted with dotted guide lines.
tmp <- data.frame(speed = 21.5, dist = 67)
ggplot(data = my_data, mapping = aes(x = speed, y = dist)) +
  coord_cartesian(xlim = c(4, 25), ylim = c(0, 120)) +
  geom_point() +
  # Least-squares line through the observations.
  stat_smooth(formula = y ~ x, method = "lm") +
  # Vertical guide: from below the visible area (y = -9) up to the point.
  geom_pointrange(data = tmp, aes(ymin = -9, ymax = dist),  linetype = "dotted") +
  # Horizontal guide: from x = 0 across to the point.
  geom_pointrange(data = tmp, aes(xmin =  0, xmax = speed), linetype = "dotted")

### 7.3.3 回帰分析の実践

#### 7.3.3.1 データの用意

In [None]:
# Data preparation: attach caret and tidyverse, then take a working copy of
# the built-in `cars` data set.
library(caret)
library(tidyverse)
my_data <- cars

#### 7.3.3.2 訓練

In [None]:
# Train a simple linear regression of stopping distance on speed.
my_model <- train(
  form = dist ~ speed, # model formula: output variable ~ input variable
  data = my_data,      # data
  method = "lm"        # learning method
)

In [None]:
# Intercept and slope of the fitted line.
my_model$finalModel %>% coef()
#> (Intercept)       speed
#> -17.579095    3.932409

#### 7.3.3.3 予測

In [None]:
# Predict the stopping distance for a speed of 21.5.
tmp <- data.frame(speed = 21.5)
predict(my_model, newdata = tmp)
#>        1
#> 66.96769

#### 7.3.3.4 モデルの可視化

In [None]:
# Prediction curve for plotting: takes a vector of speeds and returns the
# distances predicted by whatever `my_model` is when the function is called.
f <- function(x) {
  predict(my_model, newdata = data.frame(speed = x))
}

In [None]:
# Observations with the model's prediction curve on top. The constant colour
# aesthetics ("data" / "model") label the two layers in the legend.
ggplot(data = my_data, mapping = aes(x = speed, y = dist, color = "data")) +
  geom_point() +
  stat_function(fun = f, mapping = aes(color = "model"))

## 7.4 当てはまりの良さの指標

### 7.4.1 RMSE

In [None]:
library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm"
)

# Observed outputs and the model's predictions for the same rows.
y  <- my_data$dist
y_ <- predict(my_model, newdata = my_data)
# Keep the predictions next to the observations for the plots below.
my_data[["y_"]] <- y_

In [None]:
# Residual = observed value minus predicted value.
my_data[["residual"]] <- y - y_
my_data %>% head()
#>   speed dist        y_  residual
#> 1     4    2 -1.849460  3.849460
#> 2     4   10 -1.849460 11.849460
#> 3     7    4  9.947766 -5.947766
#> 4     7   22  9.947766 12.052234
#> 5     8   16 13.880175  2.119825
#> 6     9   10 17.812584 -7.812584

In [None]:
# Observations, the fitted line, and one dotted segment per point spanning
# from the prediction to the observation (the residual).
ggplot(data = my_data, mapping = aes(x = speed, y = dist)) +
  geom_point() +
  geom_line(aes(x = speed, y = y_)) +
  geom_linerange(aes(ymin = y_, ymax = dist), linetype = "dotted")

In [None]:
# RMSE on the training data, computed by caret.
RMSE(pred = y_, obs = y)
# Equivalently, straight from the residuals. `sqrt()` replaces the legacy `**`
# operator, which the parser merely translates to `^`.
sqrt(mean(my_data$residual^2))

#> [1] 15.06886

### 7.4.2 決定係数

In [None]:
# Coefficient of determination, "traditional" definition.
# caret's argument is named `formula`; the shorter `form` only worked through
# partial argument matching, so it is spelled out here.
R2(pred = y_, obs = y,
   formula = "traditional")
#> [1] 0.6510794

In [None]:
# Coefficient of determination as the squared correlation between observed
# and predicted values. `formula` is spelled out instead of relying on partial
# matching of `form`.
R2(pred = y_, obs = y,
   formula = "corr")
# Equivalently, from the underlying lm fit:
summary(my_model$finalModel)$r.squared
#> [1] 0.6510794

In [None]:
# Evaluate the model on the first three rows only, where the two definitions
# of the coefficient of determination give very different values.
my_test <- my_data[1:3, ]
y  <- my_test$dist
y_ <- my_model %>% predict(my_test)

# `formula` is the full argument name; `form` relied on partial matching.
R2(pred = y_, obs = y,
   formula = "traditional")
#> [1] -4.498191  # coefficient of determination 1

R2(pred = y_, obs = y,
   formula = "corr")
#> [1] 0.07692308 # coefficient of determination 6

### 7.4.3 当てはまりの良さの指標の問題点

In [None]:
library(caret)
library(tidyverse)
my_data <- cars
# Row numbers of the six observations used as the training sample.
my_idx <- c(2, 11, 27, 34, 39, 44)
# Base subsetting keeps the original row names of the selected rows.
my_sample <- my_data[my_idx, ]

In [None]:
# Fit a degree-5 polynomial to the six sampled points.
# train() emits warnings here (presumably from rank-deficient fits on the
# resamples — confirm). They are silenced for this call only: switching
# `options(warn)` off and back on would leave warnings disabled if train()
# failed, and would reset `warn` to 0 regardless of the user's setting.
my_model <- suppressWarnings(
  train(form = dist ~ poly(speed, degree = 5, raw = TRUE),
        data = my_sample,
        method = "lm"))

y  <- my_sample$dist
y_ <- my_model %>% predict(my_sample)

In [None]:
# Goodness-of-fit on the six training points: the error is essentially zero.
RMSE(pred = y_, obs = y)
#> [1] 1.042275e-10 # RMSE

# `formula` is the full argument name; `form` relied on partial matching.
R2(pred = y_, obs = y,
   formula = "traditional")
#> [1] 1 # coefficient of determination 1

R2(pred = y_, obs = y,
   formula = "corr")
#> [1] 1 # coefficient of determination 6

In [None]:
# Prediction curve of the current `my_model` for a vector of speeds.
f <- function(x) {
  predict(my_model, newdata = data.frame(speed = x))
}

# All observations, the six sampled points and the fitted curve; the y range
# shown is limited to 0-120.
ggplot(data = my_data, mapping = aes(x = speed, y = dist, color = "data")) +
  geom_point() +
  geom_point(data = my_sample, mapping = aes(color = "sample")) +
  stat_function(fun = f, mapping = aes(color = "model")) +
  coord_cartesian(ylim = c(0, 120))

## 7.5 K最近傍法

### 7.5.1 K最近傍法とは何か

### 7.5.2 K最近傍法の実践

In [None]:
# Preparation
library(caret)
library(tidyverse)
my_data <- cars

# Training (K-nearest neighbours)
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "knn"
)

# Preparation for visualisation: prediction curve of the trained model
f <- function(x) {
  predict(my_model, newdata = data.frame(speed = x))
}

In [None]:
# Observations with the K-nearest-neighbours prediction curve on top.
ggplot(data = my_data, mapping = aes(x = speed, y = dist, color = "data")) +
  geom_point() +
  stat_function(fun = f, mapping = aes(color = "model"))

In [None]:
# Goodness-of-fit of the K-nearest-neighbours model on the training data.
y  <- my_data$dist
y_ <- my_model %>% predict(my_data)

RMSE(pred = y_, obs = y)
#> [1] 13.96845 # RMSE

# `formula` is the full argument name; `form` relied on partial matching.
R2(pred = y_, obs = y,
   formula = "traditional")
#> [1] 0.7001789 # coefficient of determination 1

R2(pred = y_, obs = y,
   formula = "corr")
#> [1] 0.7017314 # coefficient of determination 6

## 7.6 検証

### 7.6.1 訓練データ・検証データ・テストデータ

### 7.6.2 検証とは何か

### 7.6.3 検証の実践

In [None]:
library(caret)
library(tidyverse)
my_data <- cars
# No `trControl` is given, so caret's default resampling scheme is used.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm"
)

# Resampling-based performance estimates and their standard deviations.
my_model[["results"]]
#>   intercept    RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
#> 1      TRUE 16.0206 0.6662176 12.14701 2.518604 0.09249158 1.920564

In [None]:
# Same model, validated with 5-fold cross-validation.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm",
  trControl = trainControl(method = "cv", number = 5)
)
my_model[["results"]]
#>   intercept     RMSE  Rsquared      MAE  RMSESD RsquaredSD    MAESD
#> 1      TRUE 15.06708 0.6724501 12.12448 4.75811  0.1848932 3.052435

In [None]:
# Same model, validated with leave-one-out cross-validation.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm",
  trControl = trainControl(method = "LOOCV")
)
my_model[["results"]]
#>   intercept     RMSE  Rsquared      MAE
#> 1      TRUE 15.69731 0.6217139 12.05918

### 7.6.4 検証の並列化

In [None]:
# Create a cluster with one worker per detected core and register it as the
# parallel backend.
# NOTE(review): the cluster is never stopped in this file; presumably
# `stopCluster(cl)` should be called once the parallel work is done — confirm.
library(doParallel)
cl <- makeCluster(detectCores())
registerDoParallel(cl)

### 7.6.5 指標のまとめ

#### 7.6.5.1 準備

In [None]:
library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm"
)
# Observed outputs and predictions on the training data.
y  <- my_data$dist
y_ <- predict(my_model, newdata = my_data)

#### 7.6.5.2 当てはまりの良さの指標

In [None]:
# RMSE (training)
RMSE(pred = y_, obs = y)
#> [1] 15.06886

# Coefficient of determination 1 (training).
# `formula` is the full argument name; `form` relied on partial matching.
R2(pred = y_, obs = y,
   formula = "traditional")
#> [1] 0.6510794

# Coefficient of determination 6 (training)
R2(pred = y_, obs = y,
   formula = "corr")
#> [1] 0.6510794

In [None]:
# RMSE, R-squared and MAE on the training data in a single call.
postResample(
  pred = y_,
  obs = y
)
#>       RMSE   Rsquared        MAE
#> 15.0688560  0.6510794 11.5801191

#### 7.6.5.3 予測性能の指標（簡単に求められるもの）

In [None]:
# Validation metrics are produced as a by-product of training.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm"
)
my_model[["results"]]
#>   intercept     RMSE  Rsquared      MAE ...
#> 1      TRUE 14.88504 0.6700353 11.59226 ...
# From left to right: RMSE (validation), coefficient of determination 6
# (validation), MAE (validation)

#### 7.6.5.4 予測性能の指標（RとPythonで同じ結果を得る）

In [None]:
# Leave-one-out cross-validation, whose results do not depend on random splits.
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
                  trControl = trainControl(method = "LOOCV"))

# Method 1: read the metrics computed by caret.
my_model$results
#>   intercept     RMSE  Rsquared      MAE
#> 1      TRUE 15.69731 0.6217139 12.05918

# Method 2: recompute the RMSE from the held-out predictions.
# `sqrt()` replaces the legacy `**` operator (an alias the parser maps to `^`).
y  <- my_model$pred$obs
y_ <- my_model$pred$pred
sqrt(mean((y - y_)^2))
#> [1] 15.69731

In [None]:
# MAE of the held-out predictions: the mean absolute error, written with
# abs() instead of taking the square root of each squared error.
mean(abs(y - y_))
#> [1] 12.05918

### 7.6.6 補足：検証による手法の比較

In [None]:
library(caret)
library(tidyverse)
my_data <- cars

# Linear regression, validated with LOOCV.
my_lm_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "lm",
  trControl = trainControl(method = "LOOCV")
)

# K-nearest neighbours with k fixed at 5, validated with LOOCV.
my_knn_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "knn",
  tuneGrid = data.frame(k = 5),
  trControl = trainControl(method = "LOOCV")
)

In [None]:
# RMSE (validation) of each method.
my_lm_model$results[["RMSE"]]
#> [1] 15.69731 # linear regression

my_knn_model$results[["RMSE"]]
#> [1] 15.79924 # K-nearest neighbours

In [None]:
# Squared LOOCV error of each method, one row per observation.
# NOTE(review): assumes the rows of `$pred` follow the row order of `my_data`
# — confirm.
y     <- my_data$dist
y_lm  <- my_lm_model$pred$pred
y_knn <- my_knn_model$pred$pred

my_df <- data.frame(lm = (y - y_lm)^2, knn = (y - y_knn)^2)

my_df %>% head()
#>           lm      knn
#> 1  18.913720 108.1600
#> 2 179.215044   0.6400
#> 3  41.034336 175.5625
#> 4 168.490212  49.0000
#> 5   5.085308   9.0000
#> 6  67.615888 112.8906

In [None]:
# Distribution of the squared errors of each method.
boxplot(x = my_df, ylab = "r^2")

In [None]:
# Paired, two-sided t-test on the squared errors of the two methods
# (both columns refer to the same observations).
t.test(
  x = my_df$lm,
  y = my_df$knn,
  alternative = "two.sided",
  paired = TRUE,
  conf.level = 0.95
)

#>  Paired t-test
#>
#> data:  my_df$lm and my_df$knn
#> t = -0.12838, df = 49, p-value = 0.8984
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -53.46930  47.04792
#> sample estimates:
#> mean of the differences
#>               -3.210688

## 7.7 パラメータチューニング

In [None]:
library(caret)
library(tidyverse)
my_data <- cars
# With no tuning grid given, caret evaluates its own candidate values of k.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "knn"
)
my_model[["results"]]
#>   k     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
#> 1 5 15.72114 0.6615765 12.54588 3.013243 0.11043907 2.372245
#> 2 7 16.19047 0.6601173 12.68464 3.165330 0.09990158 2.329326
#> 3 9 16.30276 0.6556700 12.84811 3.367423 0.09645747 2.471620

In [None]:
# Candidate values of the tuning parameter: k = 1, ..., 15.
my_params <- expand.grid(k = 1:15)

# Evaluate every candidate with leave-one-out cross-validation.
my_model <- train(
  form = dist ~ speed,
  data = my_data,
  method = "knn",
  tuneGrid = my_params,
  trControl = trainControl(method = "LOOCV")
)

In [None]:
# Validation metrics for the first six candidate values of k.
my_model$results %>% head()
#>   k     RMSE  Rsquared      MAE
#> 1 1 17.22299 0.5777197 13.84900
#> 2 2 16.81462 0.5936438 13.03469
#> 3 3 16.32874 0.6218866 12.74524
#> 4 4 15.98970 0.6086993 12.27888
#> 5 5 15.79924 0.6169267 11.96067
#> 6 6 15.98720 0.6079396 12.26667

In [None]:
# Plot the resampling results for each candidate value of k.
my_model %>% ggplot()

In [None]:
# Tuning parameter value selected by train().
my_model[["bestTune"]]
#>   k
#> 5 5

In [None]:
# Row(s) of the results table with the smallest validation RMSE.
filter(my_model$results, RMSE == min(RMSE))
#>   k     RMSE  Rsquared      MAE
#> 1 5 15.79924 0.6169267 11.96067

In [None]:
# RMSE of the tuned model on the training data.
y  <- my_data$dist
y_ <- predict(my_model, newdata = my_data)
RMSE(pred = y_, obs = y)
#> [1] 13.96845

### 7.7.1 補足：ハイパーパラメータとRMSE（訓練）

In [None]:
# Attach caret and tidyverse, then take a working copy of the built-in `cars`
# data set.
library(caret)
library(tidyverse)
my_data <- cars

# Train K-nearest neighbours with a fixed number of neighbours and report the
# training error next to the leave-one-out validation error.
#
# Arguments:
#   k    - number of neighbours (a single value).
#   data - data frame with `speed` and `dist` columns. Defaults to the
#          `my_data` visible to the function, which the original body read
#          implicitly, so existing calls `my_loocv(k)` behave as before.
#
# Returns a list with elements `k`, `training` and `validation`.
my_loocv <- function(k, data = my_data) {
  knn_fit <- train(form = dist ~ speed, data = data, method = "knn",
                   tuneGrid = data.frame(k = k),
                   trControl = trainControl(method = "LOOCV"))
  observed  <- data$dist
  predicted <- predict(knn_fit, newdata = data)
  list(k = k,
       training = RMSE(predicted, observed), # RMSE (training)
       validation = knn_fit$results$RMSE)    # RMSE (validation)
}

# Run the evaluation for k = 1, ..., 15 and bind the results into one table.
my_results <- map_dfr(1:15, my_loocv)

In [None]:
# Reshape to one row per (k, metric) pair, then draw training and validation
# RMSE against k, with the legend in the bottom-right corner.
# NOTE(review): a numeric `legend.position` is deprecated as of ggplot2 3.5.0
# (use `legend.position = "inside"` plus `legend.position.inside`); left as is
# because the installed ggplot2 version is not known from this file.
rmse_long <- pivot_longer(my_results, -k)
ggplot(data = rmse_long, mapping = aes(x = k, y = value, color = name)) +
  geom_line() +
  geom_point() +
  xlab("#Neighbors") +
  ylab("RMSE") +
  theme(legend.position = c(1, 0),
        legend.justification = c(1, 0))