In [None]:
# Environment setup for Google Colaboratory.
# The body only runs when "/content" exists; elsewhere this block is a no-op.
if (file.exists("/content")) {
  # Let install.packages() use as many CPUs as are detected.
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("factoextra", "ggbiplot", "igraph")
  # Keep only the packages that are not installed yet.
  missing_packages <- setdiff(packages_to_install, installed_packages)
  # Call install.packages() only when something is actually missing, so the
  # result does not depend on how it treats an empty package vector.
  if (length(missing_packages) > 0) {
    install.packages(missing_packages)
  }
}

## 13.1 主成分分析

In [None]:
library(tidyverse)

# Six observations (rows A-F) on five variables named after school subjects.
my_data <- data.frame(
  language = c(0, 20, 20, 25, 22, 17),
  english = c(0, 20, 40, 20, 24, 18),
  math = c(100, 20, 5, 30, 17, 25),
  science = c(0, 20, 5, 25, 16, 23),
  society = c(0, 20, 30, 0, 21, 17),
  row.names = LETTERS[1:6]
)

# Run the principal component analysis.
my_result <- prcomp(my_data)

In [None]:
# Principal component scores
my_result[["x"]]

In [None]:
# Biplot of the PCA result; points are labelled with the row names of my_data.
ggbiplot::ggbiplot(
  my_result,
  labels = row.names(my_data),
  scale = 0
)

In [None]:
# Transposed rotation matrix of the PCA result
t(my_result$rotation)

In [None]:
# Summary of the PCA result
summary(object = my_result)

In [None]:
# PCA on standardized variables, written in two alternative ways. The second
# assignment overwrites the first; both are kept to show the alternatives.
# The argument is spelled `scale.` (with the trailing dot) so that the call
# does not rely on partial argument matching.
my_result <- prcomp(
  x = my_data,
  scale. = TRUE)      # standardize inside prcomp
# or alternatively
my_result <- prcomp(
  x = scale(my_data)) # pass data that is already standardized

my_result$x # principal component scores

In [None]:
# Centre the columns only (no standardization).
Z <- as.matrix(scale(my_data, scale = FALSE))
# To standardize as well, use this line instead:
# Z <- as.matrix(scale(my_data, scale = TRUE))

n <- nrow(my_data)

# Variance-covariance matrix
S <- var(Z)
# Same result: S <- t(Z) %*% Z / (n - 1)

# Eigenvalues and eigenvectors of S
tmp <- eigen(S)

# Principal component scores (output omitted)
Z %*% tmp$vectors

# Cumulative proportion of variance
cumsum(tmp$values) / sum(tmp$values)

In [None]:
# Singular value decomposition of Z
udv <- svd(Z)
U <- udv[["u"]]
d <- udv[["d"]]
V <- udv[["v"]]
W <- diag(d)

# Sanity checks on the decomposition
c(
  all.equal(Z, U %*% W %*% t(V), check.attributes = FALSE), # check 1: Z = U W V'
  all.equal(t(U) %*% U, diag(ncol(U))),                     # check 2: U'U = I
  all.equal(t(V) %*% V, diag(ncol(V)))                      # check 3: V'V = I
)

# Principal component scores (output omitted)
U %*% W

# Eigenvalues of the variance-covariance matrix
e <- d^2 / (n - 1)

# Cumulative proportion of variance
cumsum(e) / sum(e)

## 13.2 クラスタ分析

In [None]:
library(tidyverse)

# Four points (A-D) in the x-y plane.
my_data <- data.frame(
  x         = c(  0, -16,  10,  10),
  y         = c(  0,   0,  10, -15),
  row.names = c("A", "B", "C", "D"))

# Hierarchical clustering: Euclidean distances, complete linkage.
# Both method values could be omitted (plain dist() and hclust() give the same
# result); they are spelled out here with their documented names.
my_result <- hclust(
  d = dist(my_data, method = "euclidean"),
  method = "complete")

In [None]:
# Dendrogram cut into k = 3 clusters, each marked with a filled rectangle.
factoextra::fviz_dend(
  my_result,
  k = 3,
  rect = TRUE,
  rect_fill = TRUE
)

In [None]:
# Same dendrogram (k = 3) drawn with the "phylogenic" layout.
factoextra::fviz_dend(
  my_result,
  k = 3,
  rect = TRUE,
  rect_fill = TRUE,
  type = "phylogenic"
)

In [None]:
# Cluster membership when the tree is cut into three groups
cutree(my_result, k = 3)

In [None]:
library(tidyverse)

# Six observations (rows A-F) on five variables named after school subjects.
my_data <- data.frame(
  language = c(0, 20, 20, 25, 22, 17),
  english = c(0, 20, 40, 20, 24, 18),
  math = c(100, 20, 5, 30, 17, 25),
  science = c(0, 20, 5, 25, 16, 23),
  society = c(0, 20, 30, 0, 21, 17),
  row.names = LETTERS[1:6]
)

# Heat map of the column-wise standardized data, with explicit label sizes.
# Wrapped in try(silent = TRUE) to get around an error raised under RMarkdown.
try(
  gplots::heatmap.2(scale(my_data), cexRow = 1, cexCol = 1),
  silent = TRUE
)

In [None]:
library(tidyverse)

# Four points (A-D) in the x-y plane.
my_data <- data.frame(
  x = c(0, -16, 10, 10),
  y = c(0, 0, 10, -15),
  row.names = LETTERS[1:4]
)

# k-means clustering into three clusters
my_result <- kmeans(my_data, centers = 3)

In [None]:
# Cluster assigned to each point
my_result[["cluster"]]

In [None]:
library(tidyverse)
library(factoextra)

# iris without its fifth column (Species)
my_data <- iris[, -5]

# One cluster plot per number of clusters k = 2, ..., 5
f <- lapply(2:5, function(k) {
  fit <- kmeans(my_data, centers = k)
  fviz_cluster(fit, data = my_data, geom = "point") +
    ggtitle(sprintf("k = %s", k))
})

# Arrange the four plots in a grid with two columns.
gridExtra::grid.arrange(grobs = f, ncol = 2)

In [None]:
# Plot for choosing the number of k-means clusters, using the "wss" method.
fviz_nbclust(x = my_data, FUNcluster = kmeans, method = "wss")

In [None]:
library(tidyverse)
# Standardized iris data without its fifth column (Species)
my_data <- iris[, -5] %>% scale

# Principal component analysis; keep the scores as a data frame
my_result <- prcomp(my_data)$x %>% as.data.frame

# Non-hierarchical clustering (k-means).
# my_data is already standardized above, so it is not scaled a second time.
my_result$cluster <- (my_data %>% kmeans(3))$cluster %>% as.factor

# Hierarchical clustering alternative
#my_result$cluster <- my_data %>% dist %>% hclust %>% cutree(3) %>% as.factor

my_result %>%
  ggplot(aes(x = PC1, y = PC2, color = cluster)) + # colour encodes the cluster
  geom_point(shape = as.integer(iris$Species)) +   # shape encodes the species (integer codes of the factor)
  theme(legend.position = "none")