In [None]:
# Environment setup for Google Colaboratory.
# The body runs only when the path "/content" exists.
if (file.exists("/content")) {
  # Set the Ncpus option to the number of detected cores.
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret")
  # Install only the requested packages that are not installed yet.
  install.packages(setdiff(packages_to_install, installed_packages))
}

## 5.1 データの読み込み

In [None]:
library(tidyverse)

# Fetch exam.csv with wget: the shell command is assembled with str_c()
# and then handed to system().
str_c("wget https://raw.githubusercontent.com/taroyabuki",
      "/fromzero/master/data/exam.csv") %>%
  system()

In [None]:
# Read exam.csv with read_csv().
my_df <- read_csv("exam.csv")
# Alternatively, read it with read.csv()
# (this assignment replaces the my_df created just above).
my_df <- read.csv("exam.csv",
  stringsAsFactors = FALSE)

# Display the result.
my_df

In [None]:
# Build the URL of exam.csv (split in two only to keep the line short).
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/exam.csv")
# Read the CSV directly from the URL with read_csv().
my_df <- read_csv(my_url)
# Alternatively, use read.csv() (replaces the my_df created just above).
my_df <- read.csv(my_url, stringsAsFactors = FALSE)

In [None]:
# Read exam.csv again, this time passing row.names = 1 so that the
# first column supplies the row names instead of being a data column.
my_df2 <- read.csv(
  file = "exam.csv",
  stringsAsFactors = FALSE,
  row.names = 1)
my_df2

In [None]:
# Save my_df as exam2.csv with write_csv().
write_csv(my_df, "exam2.csv")
# Alternatively, save it with write.csv(), leaving out the row names.
write.csv(my_df, file = "exam2.csv", row.names = FALSE)

In [None]:
# Save my_df2 as exam3.csv (row.names is left at its default here).
write.csv(my_df2, "exam3.csv")

In [None]:
# Read exam.csv, stating the file encoding (UTF-8) explicitly.
my_df <- read_csv(file = "exam.csv",
  locale = locale(encoding = "UTF-8"))
# Alternatively, do the same with read.csv()
# (replaces the my_df created just above).
my_df <- read.csv(file = "exam.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8")

In [None]:
# Save my_df as exam2.csv with write_csv().
write_csv(my_df, "exam2.csv")
# Alternatively, save it with write.csv(), leaving out the row names
# and naming the output encoding (UTF-8) explicitly.
write.csv(my_df, file = "exam2.csv", row.names = FALSE,
          fileEncoding = "UTF-8")

In [None]:
# Fetch the HTML page and extract the tables it contains.
my_url <- "https://taroyabuki.github.io/fromzero/exam.html"
my_tables <- rvest::html_table(xml2::read_html(my_url))

In [None]:
# Display everything that html_table() extracted from the page.
my_tables

In [None]:
# Take the first extracted table and display it.
tmp <- my_tables[[1]]
tmp

In [None]:
# Rename the columns using the values in the first row. The row is
# coerced explicitly to a character vector instead of handing a one-row
# table to `colnames<-` and relying on implicit coercion.
colnames(tmp) <- as.character(tmp[1, ])

# Drop the first row and the first column.
my_data <- tmp[-1, -1]
my_data

In [None]:
library(jsonlite)
# Build the URL of exam.json and parse the JSON found there.
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/exam.json")
my_data <- fromJSON(my_url)
#my_data <- fromJSON("exam.json") # (when reading from a local file)
my_data

In [None]:
library(xml2)
# Build the URL of exam.xml.
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/exam.xml")
my_xml <- read_xml(my_url)      # Read the XML data
#my_xml <- read_xml("exam.xml") # (when reading from a local file)
xml_ns(my_xml)                  # Check the namespaces (d1)

In [None]:
# Select every `record` element (namespace prefix d1) below the root.
my_records <- xml_find_all(my_xml, ".//d1:record")

In [None]:
# Convert one record node into a named character vector.
#
# record: a node accepted by xml_attrs() and xml_children().
# Returns the node's attributes followed by one entry per child element,
# named after the child and holding the child's text. A child whose name
# matches an existing entry overwrites that entry.
f <- function(record) {
  tmp <- xml_attrs(record)            # Start from all attributes,
  children <- xml_children(record)
  # then add each child's name and content. xml_name() and xml_text() are
  # applied to the whole set of children at once, so a single named
  # assignment replaces a per-child update through `<<-`.
  tmp[xml_name(children)] <- xml_text(children)
  tmp
}

In [None]:
# Apply f to every record and bind the results into one table, one row
# per record; then convert the english and math columns to numeric.
my_data <- map_dfr(my_records, f)
my_data$english <- as.numeric(my_data$english)
my_data$math <- as.numeric(my_data$math)
my_data

## 5.2 データの変換

In [None]:
x1 <- c(1, 2, 3)

# Standardize x1 with scale().
z1 <- scale(x1)
# Alternatively, subtract the mean and divide by the standard deviation
# by hand (this assignment replaces the z1 created just above).
z1 <- (x1 - mean(x1)) / sd(x1)

z1

In [None]:
# Mean and standard deviation of the standardized values z1.
c(mean(z1), sd(z1))

In [None]:
# Undo the standardization: multiply by sd(x1) and add mean(x1) back.
z1 * sd(x1) + mean(x1)

In [None]:
x2 <- c(1, 3, 5)
# Note: x2 is standardized with the mean and standard deviation of x1,
# not with its own, so z2 is not expected to have mean 0 and sd 1.
z2 <- (x2 - mean(x1)) / sd(x1)
c(mean(z2), sd(z2))

In [None]:
library(caret)
library(tidyverse)

# Sample data: a numeric id and a factor column holding A, B and C.
my_df <- data.frame(
  id = c(1, 2, 3),
  class = as.factor(
    c("A", "B", "C")))

# Build a dummy-variable encoder from my_df; the formula `~ .` refers
# to all of its columns.
my_enc <- my_df %>%
  dummyVars(formula = ~ .)

# Apply the encoder to the data it was built from.
my_enc %>% predict(my_df)

In [None]:
# New data to encode with the encoder built from my_df.
# NOTE(review): class is given as plain strings here, without
# as.factor(), unlike in my_df.
my_df2 <- data.frame(
  id =    c( 4 ,  5 ,  6 ),
  class = c("B", "C", "B"))
my_enc %>% predict(my_df2)

In [None]:
# Rebuild the encoder with fullRank = TRUE (see caret's dummyVars
# documentation for the full-rank parameterization); this replaces the
# my_enc created earlier.
my_enc <- my_df %>%
  dummyVars(formula = ~ .,
            fullRank = TRUE)
# Encode the original data with the new encoder.
my_enc %>% predict(my_df)

# Encode the new data with the new encoder.
my_enc %>% predict(my_df2)