### 4.1.1 平均・分散・標準偏差

In [None]:
# Sample of five values: 165 to 185 in steps of 5.
x <- seq(from = 165, to = 185, by = 5)

# Arithmetic mean of the sample.
mean(x)
#> [1] 175

In [None]:
n <- length(x) # sample size
# Mean computed from its definition: the total divided by the sample size.
sum(x) / n
#> [1] 175

In [None]:
# Second sample of five values: 173 to 177 in steps of 1.
y <- seq(from = 173, to = 177, by = 1)

# Arithmetic mean of the second sample.
mean(y)
#> [1] 175

In [None]:
# var() returns the sample variance with an n - 1 denominator.
# The recorded means of x and y are both 175, but their spreads differ.
var(x) # variance of x
#> [1] 62.5

var(y) # variance of y
#> [1] 2.5

In [None]:
# Variance from its definition: the sum of squared deviations from the mean,
# divided by n - 1. The recorded result matches var(x).
sum((x - mean(x))^2) / (n - 1) # variance
#> [1] 62.5

In [None]:
# sd() returns the standard deviation, i.e. the square root of var().
sd(x) # standard deviation of x
#> [1] 7.905694

sd(y) # standard deviation of y
#> [1] 1.581139

In [None]:
# The standard deviation is the square root of the variance, so this
# gives the same value as sd(x).
sqrt(var(x)) # standard deviation of x
#> [1] 7.905694

In [None]:
# Summary statistics via the psych package; the result is a data frame.
psych::describe(x)
#>    vars n mean   sd ...
#> X1    1 5  175 7.91 ...

# Summary statistics via the pastecs package; the result is one-dimensional data.
pastecs::stat.desc(x)
#>      nbr.val ...   std.dev ...
#>    5.0000000 ... 7.9056942 ...

In [None]:
# Quartiles of the integers 1 to 9. A separate name is used so that the
# sample `x` defined earlier stays intact: later cells still compute
# var(x) and sd(x) on that sample and record 62.5 and 7.905694.
z <- 1:9
quantile(z)
#>   0%  25%  50%  75% 100% 
#>    1    3    5    7    9 

In [None]:
# Scores of four students in two subjects, one column per subject.
english_scores <- c(60, 90, 70, 90)
math_scores <- c(70, 80, 90, 100)
my_df <- data.frame(english = english_scores, math = math_scores)
rm(english_scores, math_scores) # keep only my_df in the workspace

# Per-column summary statistics.
psych::describe(my_df)
#>         vars n mean    sd ...
#> english    1 4 77.5 15.00 ...
#> math       2 4 85.0 12.91 ...



#### 4.1.1.1 補足：不偏分散とその平方根

In [None]:
# NOTE(review): the recorded values assume x is still c(165, 170, 175, 180, 185)
# and n is 5, as defined at the start of section 4.1.1 - confirm nothing in
# between has reassigned them.

# Unbiased variance (n - 1 denominator), as returned by var().
var(x)
#> [1] 62.5

# Rescaling by (n - 1) / n gives the variance with an n denominator.
var(x) * (n - 1) / n
#> [1] 50

In [None]:
# Square root of the unbiased variance.
sd(x)
#> [1] 7.905694

# Standard deviation with an n (instead of n - 1) denominator:
# sd(x) rescaled by sqrt((n - 1) / n).
sqrt((n - 1) / n) * sd(x)
#> [1] 7.071068

In [None]:
# Standard error of the mean: the standard deviation divided by the
# square root of the sample size.
sd(x) / sqrt(length(x))
#> [1] 3.535534

### 4.1.2 データフレームの統計処理

In [None]:
library(tidyverse)

# Example data: one row per student, with two scores and a gender label.
my_df <- data.frame(
  name = LETTERS[1:4],
  english = c(60, 90, 70, 90),
  math = c(70, 80, 90, 100),
  gender = c("f", "m", "m", "f")
)

#### 4.1.2.1 列ごとの集計

In [None]:
# Variance of a single column, extracted as a plain vector.
var(my_df[["english"]])
#> [1] 225

In [None]:
# Variance of each score column. The columns are selected by name rather
# than by position, and across() applies var() to each of them.
my_df %>%
  summarize(across(c(english, math), var))
#>   english     math
#> 1     225 166.6667

In [None]:
# Standard error of the mean (sd divided by the square root of the number
# of values) for each score column, using an anonymous function.
my_df %>%
  summarize(across(c(english, math), function(x) {
    sd(x) / sqrt(length(x))
  }))
# Alternatively, the same computation with a formula-style lambda.
my_df %>%
  summarize(across(
    c(english, math),
    ~ sd(.x) / sqrt(length(.x))))

#>   english     math
#> 1     7.5 6.454972

In [None]:
# Summary statistics for every column of the data frame. In the recorded
# output the character columns (name, gender) are listed with a trailing `*`.
psych::describe(my_df)
#>         vars n mean    sd ...
#> name*      1 4  2.5  1.29 ...
#> english    2 4 77.5 15.00 ...
#> math       3 4 85.0 12.91 ...
#> gender*    4 4  1.5  0.58 ...

# Descriptive statistics table from pastecs; the recorded excerpt shows the
# two score columns.
pastecs::stat.desc(my_df)
#>                  english        math
#> nbr.val        4.0000000   4.0000000
#> nbr.null       0.0000000   0.0000000
#> nbr.na         0.0000000   0.0000000
#> (remaining rows omitted)

#### 4.1.2.2 分割表

In [None]:
# Frequency of each gender label.
table(my_df[["gender"]])

#> f m 
#> 2 2 

In [None]:
# Cross-tabulate gender against whether the math score is at least 80.
my_df2 <- data.frame(
  gender = my_df[["gender"]],
  excel = my_df[["math"]] >= 80
)
table(my_df2)

#>       excel
#> gender FALSE TRUE
#>      f     1    1
#>      m     0    2

#### 4.1.2.3 グループごとの集計

In [None]:
# Number of rows in each gender group; the count column is named `n`.
my_df %>%
  group_by(gender) %>%
  summarize(n = n())
#> # A tibble: 2 x 2
#>   gender     n
#>   <chr>  <int>
#> 1 f          2
#> 2 m          2

In [None]:
# Mean of each score column within each gender group. The score columns
# are named explicitly, so the name column does not have to be dropped
# by position first.
my_df %>%
  group_by(gender) %>%
  summarize(across(c(english, math), mean))

#> # A tibble: 2 x 3
#>   gender english  math
#>   <chr>    <dbl> <dbl>
#> 1 f           75    85
#> 2 m           80    85

In [None]:
# Standard error of the mean (sd divided by the square root of the group
# size) of each score column within each gender group.
my_df %>%
  group_by(gender) %>%
  summarize(across(
    c(english, math),
    ~ sd(.x) / sqrt(length(.x))))
#> # A tibble: 2 x 3
#>   gender english  math
#>   <chr>    <dbl> <dbl>
#> 1 f           15    15
#> 2 m           10     5