From 367a845e256c321c05331e57b74eae8f8b7998d5 Mon Sep 17 00:00:00 2001 From: "Albert Y. Kim" Date: Fri, 12 Jan 2018 15:30:42 -0800 Subject: [PATCH] Revert "Updated pkgdown site" This reverts commit d362ab74df55d0850c473cc56868cff1c5eeeca8. --- docs/articles/flights_examples.Rmd | 331 ------------ docs/articles/flights_examples.html | 771 +++++++++++++--------------- docs/articles/mtcars_examples.Rmd | 210 -------- docs/articles/mtcars_examples.html | 748 ++++++++++++++------------- docs/authors.html | 2 +- docs/index.html | 2 +- docs/reference/calculate.html | 20 +- docs/reference/generate.html | 14 +- docs/reference/hypothesize.html | 41 +- docs/reference/index.html | 12 +- docs/reference/infer.html | 2 +- docs/reference/pipe.html | 2 +- docs/reference/print.infer.html | 2 +- docs/reference/rep_sample_n-1.png | Bin 57997 -> 0 bytes docs/reference/rep_sample_n.html | 236 --------- docs/reference/specify.html | 41 +- docs/reference/visualize.html | 2 +- 17 files changed, 829 insertions(+), 1607 deletions(-) delete mode 100755 docs/articles/flights_examples.Rmd delete mode 100644 docs/articles/mtcars_examples.Rmd delete mode 100644 docs/reference/rep_sample_n-1.png delete mode 100644 docs/reference/rep_sample_n.html diff --git a/docs/articles/flights_examples.Rmd b/docs/articles/flights_examples.Rmd deleted file mode 100755 index 3b200617..00000000 --- a/docs/articles/flights_examples.Rmd +++ /dev/null @@ -1,331 +0,0 @@ ---- -title: "Randomization Examples using `nycflights13` `flights` data" -author: "Chester Ismay and Andrew bray" -date: "`r Sys.Date()`" -output: - rmarkdown::html_vignette: - df_print: kable -vignette: | - %\VignetteIndexEntry{flights example} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r include=FALSE} -knitr::opts_chunk$set(fig.width = 8, fig.height = 5) -``` - -## Data preparation - -```{r message=FALSE, warning=FALSE} -library(nycflights13) -library(dplyr) -library(ggplot2) -library(stringr) -library(infer) -set.seed(2017) -fli_small <- flights %>% - sample_n(size = 500) %>% - mutate(season = case_when( - month %in% c(10:12, 1:3) ~ "winter", - month %in% c(4:9) ~ "summer" - )) %>% - mutate(day_hour = case_when( - between(hour, 1, 12) ~ "morning", - between(hour, 13, 24) ~ "not morning" - )) %>% - select(arr_delay, dep_delay, season, - day_hour, origin, carrier) %>% - filter(., complete.cases(.)) -``` - -* Two numeric - `arr_delay`, `dep_delay` -* Two categories - - `season` (`"winter"`, `"summer"`), - - `day_hour` (`"morning"`, `"not morning"`) -* Three categories - `origin` (`"EWR"`, `"JFK"`, `"LGA"`) -* Sixteen categories - `carrier` - -*** - -# Hypothesis tests - -### One numerical variable (mean) - -```{r} -x_bar <- fli_small %>% - summarize(mean(dep_delay)) %>% - pull() -null <- fli_small %>% - specify(response = dep_delay) %>% - hypothesize(null = "point", mu = 10) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "mean") -ggplot(null, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = x_bar, color = "red") -null %>% - summarize(p_value = mean(stat > x_bar) * 2) -``` - -### One numerical variable (median) - -```{r} -x_tilde <- fli_small %>% - summarize(median(dep_delay)) %>% - pull() -null <- fli_small %>% - specify(response = dep_delay) %>% - hypothesize(null = "point", med = 0) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "median") -ggplot(null, aes(x = stat)) + - geom_bar() + - geom_vline(xintercept = x_tilde, color = "red") -null %>% - summarize(p_value = mean(stat < x_tilde) * 2) -``` - -### One categorical (2 level) variable - -```{r} -p_hat <- fli_small %>% - summarize(mean(day_hour == "morning")) %>% - pull() -null <- fli_small %>% - specify(response = day_hour, success = "morning") %>% - hypothesize(null = "point", p = .5) %>% - generate(reps = 1000, type = "simulate") %>% - calculate(stat = "prop") -ggplot(null, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = p_hat, color = "red") -null %>% - summarize(p_value = mean(stat < p_hat) * 2) -``` - -### Two categorical (2 level) variables - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(prop = mean(day_hour == "morning")) %>% - summarize(diff(prop)) %>% - pull() -null <- fli_small %>% - specify(day_hour ~ season, success = "morning") %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "diff in props", order = c("summer", "winter")) -ggplot(null, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = d_hat, color = "red") -null %>% - summarize(p_value = mean(stat < d_hat) * 2) -``` - -### One categorical (>2 level) - GoF - -```{r} -Chisq_hat <- chisq.test(table(fli_small$origin))$stat -null <- fli_small %>% - specify(response = origin) %>% - hypothesize(null = "point", p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% - generate(reps = 1000, type = "simulate") %>% - calculate(stat = "Chisq") -ggplot(null, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = Chisq_hat, color = "red") -null %>% - summarize(p_value = mean(stat > Chisq_hat)) -``` - -### Two categorical (>2 level) variables - -```{r} -Chisq_hat <- chisq.test(table(fli_small$day_hour, fli_small$origin))$stat -null <- fli_small %>% - specify(day_hour ~ origin) %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "Chisq") -ggplot(null, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = Chisq_hat, color = "red") -null %>% - summarize(p_value = mean(stat > Chisq_hat)) -``` - -### One numerical variable, one categorical (2 levels) (diff in means) - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(mean_stat = mean(dep_delay)) %>% - summarize(diff(mean_stat)) %>% - pull() -null <- fli_small %>% - specify(dep_delay ~ season) %>% # alt: response = dep_delay, explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "diff in means", order = c("summer", "winter")) -ggplot(null, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = d_hat, color = "red") -null %>% - summarize(p_value = mean(stat > d_hat) * 2) -``` - -### One numerical variable, one categorical (2 levels) (diff in medians) - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(median_stat = median(dep_delay)) %>% - summarize(diff(median_stat)) %>% - pull() -null <- fli_small %>% - specify(dep_delay ~ season) %>% # alt: response = dep_delay, explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "diff in medians", order = c("summer", "winter")) -ggplot(null, aes(x = stat)) + - geom_bar() + - geom_vline(xintercept = d_hat, color = "red") -null %>% - summarize(p_value = mean(stat > d_hat) * 2) -``` - -### One numerical, one categorical (>2 levels) - ANOVA - -```{r} -F_hat <- anova(aov(formula = arr_delay ~ origin, data = fli_small))$`F value`[1] -null <- fli_small %>% - specify(arr_delay ~ origin) %>% # alt: response = arr_delay, explanatory = origin - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "F") -ggplot(null, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = F_hat, color = "red") -null %>% - summarize(p_value = mean(stat > F_hat)) -``` - -### Two numerical vars - SLR - -```{r} -slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% - broom::tidy() %>% - filter(term == "dep_delay") %>% - select(estimate) %>% - pull() -null <- fli_small %>% - specify(arr_delay ~ dep_delay) %>% # alt: response = arr_delay, explanatory = dep_delay - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "slope") -ggplot(null, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = slope_hat, color = "red") -null %>% - summarize(p_value = mean(stat > slope_hat) * 2) -``` - -## Confidence intervals - -### One numerical (one mean) - -```{r} -x_bar <- fli_small %>% - summarize(mean(arr_delay)) %>% - pull() -boot <- fli_small %>% - specify(response = arr_delay) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "mean") %>% - pull() -c(lower = x_bar - 2 * sd(boot), - upper = x_bar + 2 * sd(boot)) -``` - -### One numerical (one median) - -```{r} -x_tilde <- fli_small %>% - summarize(median(arr_delay)) %>% - pull() -boot <- fli_small %>% - specify(response = arr_delay) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "median") %>% - pull() -c(lower = x_tilde - 2 * sd(boot), - upper = x_tilde + 2 * sd(boot)) -``` - -### One categorical (one proportion) - -```{r} -p_hat <- fli_small %>% - summarize(mean(day_hour == "morning")) %>% - pull() -boot <- fli_small %>% - specify(response = day_hour, success = "morning") %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "prop") %>% - pull() -c(lower = p_hat - 2 * sd(boot), - upper = p_hat + 2 * sd(boot)) -``` - -### One numerical variable, one categorical (2 levels) (diff in means) - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(mean_stat = mean(arr_delay)) %>% - summarize(diff(mean_stat)) %>% - pull() -boot <- fli_small %>% - specify(arr_delay ~ season) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "diff in means", order = c("summer", "winter")) %>% - pull() -c(lower = p_hat - 2 * sd(boot), - upper = p_hat + 2 * sd(boot)) -``` - -### Two categorical variables (diff in proportions) - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(prop = mean(day_hour == "morning")) %>% - summarize(diff(prop)) %>% - pull() -boot <- fli_small %>% - specify(day_hour ~ season, success = "morning") %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "diff in props", order = c("summer", "winter")) %>% - pull() -c(lower = d_hat - 2 * sd(boot), - upper = d_hat + 2 * sd(boot)) -``` - -### Two numerical vars - SLR - -```{r} -slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% - broom::tidy() %>% - filter(term == "dep_delay") %>% - select(estimate) %>% - pull() -boot <- fli_small %>% - specify(arr_delay ~ dep_delay) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "slope") %>% - pull() -c(lower = slope_hat - 2 * sd(boot), - upper = slope_hat + 2 * sd(boot)) -``` diff --git a/docs/articles/flights_examples.html b/docs/articles/flights_examples.html index b1638470..e8eea71a 100644 --- a/docs/articles/flights_examples.html +++ b/docs/articles/flights_examples.html @@ -1,81 +1,92 @@ - - - + - - - - - - - - - - - -Randomization Examples using nycflights13 flights data - - - - - - - - - - + + + + +Randomization Examples using <code>nycflights13</code> <code>flights</code> data • infer + + + + - +
+
+ + +
+
+ - - -

Randomization Examples using nycflights13 flights data

-

Chester Ismay and Andrew bray

-

2018-01-05

- - - + + +
-

Data preparation

+

+Data preparation

library(nycflights13)
 library(dplyr)
 library(ggplot2)
@@ -83,456 +94,398 @@ 

Data preparation

library(infer) set.seed(2017) fli_small <- flights %>% - sample_n(size = 500) %>% - mutate(season = case_when( - month %in% c(10:12, 1:3) ~ "winter", - month %in% c(4:9) ~ "summer" + sample_n(size = 500) %>% + mutate(season = case_when( + month %in% c(10:12, 1:3) ~ "winter", + month %in% c(4:9) ~ "summer" )) %>% - mutate(day_hour = case_when( - between(hour, 1, 12) ~ "morning", - between(hour, 13, 24) ~ "not morning" + mutate(day_hour = case_when( + between(hour, 1, 12) ~ "morning", + between(hour, 13, 24) ~ "not morning" )) %>% - select(arr_delay, dep_delay, season, + select(arr_delay, dep_delay, season, day_hour, origin, carrier) %>% - filter(., complete.cases(.))
+ filter(., complete.cases(.))
    -
  • Two numeric - arr_delay, dep_delay
  • +
  • Two numeric - arr_delay, dep_delay +
  • Two categories
      -
    • season ("winter", "summer"),
    • -
    • day_hour ("morning", "not morning")
    • -
  • -
  • Three categories - origin ("EWR", "JFK", "LGA")
  • -
  • Sixteen categories - carrier
  • +
  • +season ("winter", "summer"),
  • +
  • +day_hour ("morning", "not morning")
  • +
+ +
  • Three categories - origin ("EWR", "JFK", "LGA")
  • +
  • Sixteen categories - carrier +
  • -
    +
    -

    Hypothesis tests

    +

    +Hypothesis tests

    -

    One numerical variable (mean)

    +

    +One numerical variable (mean)

    x_bar <- fli_small %>%
    -  summarize(mean(dep_delay)) %>%
    -  pull()
    +  summarize(mean(dep_delay)) %>%
    +  pull()
     null <- fli_small %>%
    -  specify(response = dep_delay) %>%
    -  hypothesize(null = "point", mu = 10) %>% 
    -  generate(reps = 1000, type = "bootstrap") %>% 
    -  calculate(stat = "mean")
    -ggplot(null, aes(x = stat)) +
    -  geom_density() +
    -  geom_vline(xintercept = x_bar, color = "red")
    -

    + specify(response = dep_delay) %>% + hypothesize(null = "point", mu = 10) %>% + generate(reps = 1000, type = "bootstrap") %>% + calculate(stat = "mean") +ggplot(null, aes(x = stat)) + + geom_density() + + geom_vline(xintercept = x_bar, color = "red")
    +

    null %>%
    -  summarize(p_value = mean(stat > x_bar) * 2)
    -
    - - - - - - - - - - - -
    p_value
    0.618
    -
    + summarize(p_value = mean(stat > x_bar) * 2)
    +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1   0.618
    -

    One numerical variable (median)

    +

    +One numerical variable (median)

    x_tilde <- fli_small %>%
    -  summarize(median(dep_delay)) %>%
    -  pull()
    +  summarize(median(dep_delay)) %>%
    +  pull()
     null <- fli_small %>%
    -  specify(response = dep_delay) %>%
    -  hypothesize(null = "point", med = 0) %>% 
    -  generate(reps = 1000, type = "bootstrap") %>% 
    -  calculate(stat = "median")
    -ggplot(null, aes(x = stat)) +
    -  geom_bar() +
    -  geom_vline(xintercept = x_tilde, color = "red")
    -

    + specify(response = dep_delay) %>% + hypothesize(null = "point", med = 0) %>% + generate(reps = 1000, type = "bootstrap") %>% + calculate(stat = "median") +ggplot(null, aes(x = stat)) + + geom_bar() + + geom_vline(xintercept = x_tilde, color = "red")
    +

    null %>%
    -  summarize(p_value = mean(stat < x_tilde) * 2)
    -
    - - - - - - - - - - - -
    p_value
    0
    -
    + summarize(p_value = mean(stat < x_tilde) * 2)
    +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1       0
    -

    One categorical (2 level) variable

    +

    +One categorical (2 level) variable

    p_hat <- fli_small %>%
    -  summarize(mean(day_hour == "morning")) %>%
    -  pull()
    +  summarize(mean(day_hour == "morning")) %>%
    +  pull()
     null <- fli_small %>%
    -  specify(response = day_hour, success = "morning") %>%
    -  hypothesize(null = "point", p = .5) %>% 
    -  generate(reps = 1000, type = "simulate") %>% 
    -  calculate(stat = "prop")
    -ggplot(null, aes(x = stat)) +
    -  geom_density() +
    -  geom_vline(xintercept = p_hat, color = "red")
    -

    + specify(response = day_hour, success = "morning") %>% + hypothesize(null = "point", p = .5) %>% + generate(reps = 1000, type = "simulate") %>% + calculate(stat = "prop") +ggplot(null, aes(x = stat)) + + geom_density() + + geom_vline(xintercept = p_hat, color = "red")
    +

    null %>%
    -  summarize(p_value = mean(stat < p_hat) * 2)
    -
    - - - - - - - - - - - -
    p_value
    0.02
    -
    + summarize(p_value = mean(stat < p_hat) * 2) +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1    0.02
    -

    Two categorical (2 level) variables

    +

    +Two categorical (2 level) variables

    d_hat <- fli_small %>%
    -  group_by(season) %>%
    -  summarize(prop = mean(day_hour == "morning")) %>%
    -  summarize(diff(prop)) %>%
    -  pull()
    +  group_by(season) %>%
    +  summarize(prop = mean(day_hour == "morning")) %>%
    +  summarize(diff(prop)) %>%
    +  pull()
     null <- fli_small %>%
    -  specify(day_hour ~ season, success = "morning") %>%
    -  hypothesize(null = "independence") %>% 
    -  generate(reps = 1000, type = "permute") %>% 
    -  calculate(stat = "diff in props", order = c("summer", "winter"))
    -ggplot(null, aes(x = stat)) +
    -  geom_density() +
    -  geom_vline(xintercept = d_hat, color = "red")
    -

    + specify(day_hour ~ season, success = "morning") %>% + hypothesize(null = "independence") %>% + generate(reps = 1000, type = "permute") %>% + calculate(stat = "diff in props", order = c("summer", "winter")) +ggplot(null, aes(x = stat)) + + geom_density() + + geom_vline(xintercept = d_hat, color = "red")
    +

    null %>%
    -  summarize(p_value = mean(stat < d_hat) * 2)
    -
    - - - - - - - - - - - -
    p_value
    0.592
    -
    + summarize(p_value = mean(stat < d_hat) * 2) +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1   0.592
    -

    One categorical (>2 level) - GoF

    +

    +One categorical (>2 level) - GoF

    Chisq_hat <- chisq.test(table(fli_small$origin))$stat
     null <- fli_small %>%
    -  specify(response = origin) %>%
    -  hypothesize(null = "point", p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% 
    -  generate(reps = 1000, type = "simulate") %>% 
    -  calculate(stat = "Chisq")
    -ggplot(null, aes(x = stat)) +
    -  geom_density() +
    -  geom_vline(xintercept = Chisq_hat, color = "red")
    -

    + specify(response = origin) %>% + hypothesize(null = "point", p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% + generate(reps = 1000, type = "simulate") %>% + calculate(stat = "Chisq") +ggplot(null, aes(x = stat)) + + geom_density() + + geom_vline(xintercept = Chisq_hat, color = "red")
    +

    null %>%
    -  summarize(p_value = mean(stat > Chisq_hat)) 
    -
    - - - - - - - - - - - -
    p_value
    0.019
    -
    + summarize(p_value = mean(stat > Chisq_hat)) +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1   0.019
    -

    Two categorical (>2 level) variables

    +

    +Two categorical (>2 level) variables

    Chisq_hat <- chisq.test(table(fli_small$day_hour, fli_small$origin))$stat
     null <- fli_small %>%
    -  specify(day_hour ~ origin) %>%
    -  hypothesize(null = "independence") %>% 
    -  generate(reps = 1000, type = "permute") %>% 
    -  calculate(stat = "Chisq")
    -ggplot(null, aes(x = stat)) +
    -  geom_density() +
    -  geom_vline(xintercept = Chisq_hat, color = "red")
    -

    + specify(day_hour ~ origin) %>% + hypothesize(null = "independence") %>% + generate(reps = 1000, type = "permute") %>% + calculate(stat = "Chisq") +ggplot(null, aes(x = stat)) + + geom_density() + + geom_vline(xintercept = Chisq_hat, color = "red")
    +

    null %>%
    -  summarize(p_value = mean(stat > Chisq_hat)) 
    -
    - - - - - - - - - - - -
    p_value
    0.057
    -
    + summarize(p_value = mean(stat > Chisq_hat)) +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1   0.057
    -

    One numerical variable, one categorical (2 levels) (diff in means)

    +

    +One numerical variable, one categorical (2 levels) (diff in means)

    d_hat <- fli_small %>% 
    -  group_by(season) %>% 
    -  summarize(mean_stat = mean(dep_delay)) %>% 
    -  summarize(diff(mean_stat)) %>% 
    -  pull()
    +  group_by(season) %>% 
    +  summarize(mean_stat = mean(dep_delay)) %>% 
    +  summarize(diff(mean_stat)) %>% 
    +  pull()
     null <- fli_small %>%
    -  specify(dep_delay ~ season) %>% # alt: response = dep_delay, explanatory = season
    -  hypothesize(null = "independence") %>%
    -  generate(reps = 1000, type = "permute") %>%
    -  calculate(stat = "diff in means", order = c("summer", "winter"))
    -ggplot(null, aes(x = stat)) +
    -  geom_density() +
    -  geom_vline(xintercept = d_hat, color = "red")
    -

    + specify(dep_delay ~ season) %>% # alt: response = dep_delay, explanatory = season + hypothesize(null = "independence") %>% + generate(reps = 1000, type = "permute") %>% + calculate(stat = "diff in means", order = c("summer", "winter")) +ggplot(null, aes(x = stat)) + + geom_density() + + geom_vline(xintercept = d_hat, color = "red")
    +

    null %>%
    -  summarize(p_value = mean(stat > d_hat) * 2)   
    -
    - - - - - - - - - - - -
    p_value
    1.574
    -
    + summarize(p_value = mean(stat > d_hat) * 2) +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1   1.574
    -

    One numerical variable, one categorical (2 levels) (diff in medians)

    +

    +One numerical variable, one categorical (2 levels) (diff in medians)

    d_hat <- fli_small %>% 
    -  group_by(season) %>% 
    -  summarize(median_stat = median(dep_delay)) %>% 
    -  summarize(diff(median_stat)) %>% 
    -  pull()
    +  group_by(season) %>% 
    +  summarize(median_stat = median(dep_delay)) %>% 
    +  summarize(diff(median_stat)) %>% 
    +  pull()
     null <- fli_small %>%
    -  specify(dep_delay ~ season) %>% # alt: response = dep_delay, explanatory = season
    -  hypothesize(null = "independence") %>%
    -  generate(reps = 1000, type = "permute") %>%
    -  calculate(stat = "diff in medians", order = c("summer", "winter"))
    -ggplot(null, aes(x = stat)) +
    -  geom_bar() +
    -  geom_vline(xintercept = d_hat, color = "red")
    -

    + specify(dep_delay ~ season) %>% # alt: response = dep_delay, explanatory = season + hypothesize(null = "independence") %>% + generate(reps = 1000, type = "permute") %>% + calculate(stat = "diff in medians", order = c("summer", "winter")) +ggplot(null, aes(x = stat)) + + geom_bar() + + geom_vline(xintercept = d_hat, color = "red")
    +

    null %>%
    -  summarize(p_value = mean(stat > d_hat) * 2)    
    -
    - - - - - - - - - - - -
    p_value
    0.098
    -
    + summarize(p_value = mean(stat > d_hat) * 2) +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1   0.098
    -

    One numerical, one categorical (>2 levels) - ANOVA

    +

    +One numerical, one categorical (>2 levels) - ANOVA

    F_hat <- anova(aov(formula = arr_delay ~ origin, data = fli_small))$`F value`[1]
     null <- fli_small %>%
    -   specify(arr_delay ~ origin) %>% # alt: response = arr_delay, explanatory = origin
    -   hypothesize(null = "independence") %>%
    -   generate(reps = 1000, type = "permute") %>%
    -   calculate(stat = "F")
    -ggplot(null, aes(x = stat)) +
    -  geom_density() +
    -  geom_vline(xintercept = F_hat, color = "red")  
    -

    + specify(arr_delay ~ origin) %>% # alt: response = arr_delay, explanatory = origin + hypothesize(null = "independence") %>% + generate(reps = 1000, type = "permute") %>% + calculate(stat = "F") +ggplot(null, aes(x = stat)) + + geom_density() + + geom_vline(xintercept = F_hat, color = "red")
    +

    null %>% 
    -  summarize(p_value = mean(stat > F_hat))
    -
    - - - - - - - - - - - -
    p_value
    0.309
    -
    + summarize(p_value = mean(stat > F_hat)) +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1   0.309
    -

    Two numerical vars - SLR

    +

    +Two numerical vars - SLR

    slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% 
    -  broom::tidy() %>% 
    -  filter(term == "dep_delay") %>% 
    -  select(estimate) %>% 
    -  pull()
    +  broom::tidy() %>% 
    +  filter(term == "dep_delay") %>% 
    +  select(estimate) %>% 
    +  pull()
     null <- fli_small %>%
    -   specify(arr_delay ~ dep_delay) %>% # alt: response = arr_delay, explanatory = dep_delay
    -   hypothesize(null = "independence") %>%
    -   generate(reps = 1000, type = "permute") %>%
    -   calculate(stat = "slope")
    -ggplot(null, aes(x = stat)) +
    -  geom_density() +
    -  geom_vline(xintercept = slope_hat, color = "red")  
    -

    + specify(arr_delay ~ dep_delay) %>% # alt: response = arr_delay, explanatory = dep_delay + hypothesize(null = "independence") %>% + generate(reps = 1000, type = "permute") %>% + calculate(stat = "slope") +ggplot(null, aes(x = stat)) + + geom_density() + + geom_vline(xintercept = slope_hat, color = "red")
    +

    null %>% 
    -  summarize(p_value = mean(stat > slope_hat) * 2)   
    -
    - - - - - - - - - - - -
    p_value
    0
    -
    + summarize(p_value = mean(stat > slope_hat) * 2) +
    ## # A tibble: 1 x 1
    +##   p_value
    +##     <dbl>
    +## 1       0
    -

    Confidence intervals

    +

    +Confidence intervals

    -

    One numerical (one mean)

    +

    +One numerical (one mean)

    x_bar <- fli_small %>% 
    -   summarize(mean(arr_delay)) %>% 
    -   pull()
    +   summarize(mean(arr_delay)) %>% 
    +   pull()
     boot <- fli_small %>%
    -   specify(response = arr_delay) %>%
    -   generate(reps = 1000, type = "bootstrap") %>%
    -   calculate(stat = "mean") %>% 
    -   pull()
    +   specify(response = arr_delay) %>%
    +   generate(reps = 1000, type = "bootstrap") %>%
    +   calculate(stat = "mean") %>% 
    +   pull()
     c(lower = x_bar - 2 * sd(boot), 
       upper = x_bar + 2 * sd(boot))
    ##      lower      upper 
     ## -0.4499535  6.5942834
    -

    One numerical (one median)

    +

    +One numerical (one median)

    x_tilde <- fli_small %>% 
    -   summarize(median(arr_delay)) %>% 
    -   pull()
    +   summarize(median(arr_delay)) %>% 
    +   pull()
     boot <- fli_small %>%
    -   specify(response = arr_delay) %>%
    -   generate(reps = 1000, type = "bootstrap") %>%
    -   calculate(stat = "median") %>% 
    -   pull()
    +   specify(response = arr_delay) %>%
    +   generate(reps = 1000, type = "bootstrap") %>%
    +   calculate(stat = "median") %>% 
    +   pull()
     c(lower = x_tilde - 2 * sd(boot), 
       upper = x_tilde + 2 * sd(boot))
    ##      lower      upper 
     ## -10.181214  -3.818786
    -

    One categorical (one proportion)

    +

    +One categorical (one proportion)

    p_hat <- fli_small %>%
    -  summarize(mean(day_hour == "morning")) %>%
    -  pull()
    +  summarize(mean(day_hour == "morning")) %>%
    +  pull()
     boot <- fli_small %>%
    -  specify(response = day_hour, success = "morning") %>%
    -  generate(reps = 1000, type = "bootstrap") %>% 
    -  calculate(stat = "prop") %>%
    -  pull()
    +  specify(response = day_hour, success = "morning") %>%
    +  generate(reps = 1000, type = "bootstrap") %>% 
    +  calculate(stat = "prop") %>%
    +  pull()
     c(lower = p_hat - 2 * sd(boot), 
       upper = p_hat + 2 * sd(boot))
    ##     lower     upper 
     ## 0.4057081 0.4973847
    -

    One numerical variable, one categorical (2 levels) (diff in means)

    +

    +One numerical variable, one categorical (2 levels) (diff in means)

    d_hat <- fli_small %>% 
    -  group_by(season) %>% 
    -  summarize(mean_stat = mean(arr_delay)) %>% 
    -  summarize(diff(mean_stat)) %>% 
    -  pull()
    +  group_by(season) %>% 
    +  summarize(mean_stat = mean(arr_delay)) %>% 
    +  summarize(diff(mean_stat)) %>% 
    +  pull()
     boot <- fli_small %>%
    -   specify(arr_delay ~ season) %>%
    -   generate(reps = 1000, type = "bootstrap") %>%
    -   calculate(stat = "diff in means", order = c("summer", "winter")) %>% 
    -   pull()
    +   specify(arr_delay ~ season) %>%
    +   generate(reps = 1000, type = "bootstrap") %>%
    +   calculate(stat = "diff in means", order = c("summer", "winter")) %>% 
    +   pull()
     c(lower = p_hat - 2 * sd(boot), 
       upper = p_hat + 2 * sd(boot))
    ##     lower     upper 
     ## -6.663335  7.566428
    -

    Two categorical variables (diff in proportions)

    +

    +Two categorical variables (diff in proportions)

    d_hat <- fli_small %>%
    -  group_by(season) %>%
    -  summarize(prop = mean(day_hour == "morning")) %>%
    -  summarize(diff(prop)) %>%
    -  pull()
    +  group_by(season) %>%
    +  summarize(prop = mean(day_hour == "morning")) %>%
    +  summarize(diff(prop)) %>%
    +  pull()
     boot <- fli_small %>%
    -  specify(day_hour ~ season, success = "morning") %>%
    -  generate(reps = 1000, type = "bootstrap") %>% 
    -  calculate(stat = "diff in props", order = c("summer", "winter")) %>%
    -  pull()
    +  specify(day_hour ~ season, success = "morning") %>%
    +  generate(reps = 1000, type = "bootstrap") %>% 
    +  calculate(stat = "diff in props", order = c("summer", "winter")) %>%
    +  pull()
     c(lower = d_hat - 2 * sd(boot), 
       upper = d_hat + 2 * sd(boot))
    ##       lower       upper 
     ## -0.11165849  0.06829114
    -

    Two numerical vars - SLR

    +

    +Two numerical vars - SLR

    slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% 
    -  broom::tidy() %>% 
    -  filter(term == "dep_delay") %>% 
    -  select(estimate) %>% 
    -  pull()
    +  broom::tidy() %>% 
    +  filter(term == "dep_delay") %>% 
    +  select(estimate) %>% 
    +  pull()
     boot <- fli_small %>%
    -   specify(arr_delay ~ dep_delay) %>% 
    -   generate(reps = 1000, type = "bootstrap") %>%
    -   calculate(stat = "slope") %>% 
    -   pull()
    +   specify(arr_delay ~ dep_delay) %>% 
    +   generate(reps = 1000, type = "bootstrap") %>%
    +   calculate(stat = "slope") %>% 
    +   pull()
     c(lower = slope_hat - 2 * sd(boot), 
       upper = slope_hat + 2 * sd(boot))   
    ##     lower     upper 
     ## 0.9541501 1.0735226
    + + + + + + + + - + diff --git a/docs/articles/mtcars_examples.Rmd b/docs/articles/mtcars_examples.Rmd deleted file mode 100644 index 910b9e62..00000000 --- a/docs/articles/mtcars_examples.Rmd +++ /dev/null @@ -1,210 +0,0 @@ ---- -title: "Examples using `mtcars` data" -author: "Chester Ismay and Andrew bray" -date: "`r Sys.Date()`" -output: - rmarkdown::html_vignette -vignette: | - %\VignetteIndexEntry{mtcars example} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r include=FALSE} -knitr::opts_chunk$set(fig.width = 8, fig.height = 5) -``` - -## Data preparation - -```{r message=FALSE, warning=FALSE} -library(nycflights13) -library(dplyr) -library(ggplot2) -library(stringr) -library(infer) -mtcars <- as.data.frame(mtcars) %>% - mutate(cyl = factor(cyl), - vs = factor(vs), - am = factor(am), - gear = factor(gear), - carb = factor(carb)) -``` - - -*** - -One numerical variable (mean) - -```{r} -mtcars %>% - specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", mu = 25) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "mean") -``` - -One numerical variable (median) - -```{r} -mtcars %>% - specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", med = 26) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "median") -``` - -One numerical variable (standard deviation) - -```{r} -mtcars %>% - specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", sigma = 5) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "sd") -``` - -One categorical (2 level) variable - -```{r} -mtcars %>% - specify(response = am, success = "1") %>% # formula alt: am ~ NULL - hypothesize(null = "point", p = .25) %>% - generate(reps = 100, type = "simulate") %>% - calculate(stat = "prop") -``` - -Two categorical (2 level) variables - -```{r} -mtcars %>% - specify(am ~ vs, success = "1") %>% # alt: response = am, explanatory = vs - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in props", order = c("0", "1")) -``` - -One categorical (>2 level) - GoF - -```{r} -mtcars %>% - specify(cyl ~ NULL) %>% # alt: response = cyl - hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>% - generate(reps = 100, type = "simulate") %>% - calculate(stat = "Chisq") -``` - -Two categorical (>2 level) variables - -```{r warning = FALSE} -mtcars %>% - specify(cyl ~ am) %>% # alt: response = cyl, explanatory = am - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "Chisq") -``` - -One numerical variable one categorical (2 levels) (diff in means) - -```{r} -mtcars %>% - specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in means", order = c("0", "1")) -``` - -One numerical variable one categorical (2 levels) (diff in medians) - -```{r} -mtcars %>% - specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in medians", order = c("0", "1")) -``` - -One numerical one categorical (>2 levels) - ANOVA - -```{r} -mtcars %>% - specify(mpg ~ cyl) %>% # alt: response = mpg, explanatory = cyl - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "F") -``` - -Two numerical vars - SLR - -```{r} -mtcars %>% - specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = cyl - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "slope") -``` - -### Confidence intervals - -One numerical (one mean) - -```{r} -mtcars %>% - specify(response = mpg) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "mean") -``` - -One numerical (one median) - -```{r} -mtcars %>% - specify(response = mpg) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "median") -``` - -One numerical (standard deviation) - -```{r} -mtcars %>% - specify(response = mpg) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "sd") -``` - -One categorical (one proportion) - -```{r} -mtcars %>% - specify(response = am, success = "1") %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "prop") -``` - -One numerical variable one categorical (2 levels) (diff in means) - -```{r} -mtcars %>% - specify(mpg ~ am) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "diff in means", order = c("0", "1")) -``` - -Two categorical variables (diff in proportions) - -```{r} -mtcars %>% - specify(am ~ vs, success = "1") %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "diff in props", order = c("0", "1")) -``` - -Two numerical vars - SLR - -```{r} -mtcars %>% - specify(mpg ~ hp) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "slope") -``` - diff --git a/docs/articles/mtcars_examples.html b/docs/articles/mtcars_examples.html index f463e400..02e89ab0 100644 --- a/docs/articles/mtcars_examples.html +++ b/docs/articles/mtcars_examples.html @@ -1,462 +1,488 @@ - - - + - - - - - - - - - - - -Examples using mtcars data - - - - - - - - - - + + + + +Examples using <code>mtcars</code> data • infer + + + + - +
    +
    + + +
    +
    + - - -

    Examples using mtcars data

    -

    Chester Ismay and Andrew bray

    -

    2018-01-05

    - - - + + +
    -

    Data preparation

    +

    +Data preparation

    library(nycflights13)
     library(dplyr)
     library(ggplot2)
     library(stringr)
     library(infer)
     mtcars <- as.data.frame(mtcars) %>%
    -  mutate(cyl = factor(cyl),
    +  mutate(cyl = factor(cyl),
              vs = factor(vs),
              am = factor(am),
              gear = factor(gear),
              carb = factor(carb))
    -
    +

    One numerical variable (mean)

    mtcars %>%
    -  specify(response = mpg) %>% # formula alt: mpg ~ NULL
    -  hypothesize(null = "point", mu = 25) %>% 
    -  generate(reps = 100, type = "bootstrap") %>% 
    -  calculate(stat = "mean")
    + specify(response = mpg) %>% # formula alt: mpg ~ NULL + hypothesize(null = "point", mu = 25) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "mean")
    ## # A tibble: 100 x 2
    -##    replicate  stat
    -##        <int> <dbl>
    -##  1         1  25.5
    -##  2         2  25.1
    -##  3         3  22.4
    -##  4         4  26.4
    -##  5         5  26.1
    -##  6         6  23.5
    -##  7         7  25.1
    -##  8         8  25.1
    -##  9         9  23.7
    -## 10        10  24.7
    +##    replicate     stat
    +##        <int>    <dbl>
    +##  1         1 26.49375
    +##  2         2 24.94063
    +##  3         3 23.82812
    +##  4         4 23.57812
    +##  5         5 24.70312
    +##  6         6 24.53750
    +##  7         7 24.88438
    +##  8         8 25.52812
    +##  9         9 25.24375
    +## 10        10 26.34063
     ## # ... with 90 more rows

    One numerical variable (median)

    mtcars %>%
    -  specify(response = mpg) %>% # formula alt: mpg ~ NULL
    -  hypothesize(null = "point", med = 26) %>% 
    -  generate(reps = 100, type = "bootstrap") %>% 
    -  calculate(stat = "median")
    + specify(response = mpg) %>% # formula alt: mpg ~ NULL + hypothesize(null = "point", med = 26) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "median")
    ## # A tibble: 100 x 2
     ##    replicate  stat
     ##        <int> <dbl>
    -##  1         1  28.2
    -##  2         2  28.2
    -##  3         3  27.8
    -##  4         4  25.5
    -##  5         5  24.5
    -##  6         6  26.0
    -##  7         7  26.0
    -##  8         8  27.8
    -##  9         9  26.2
    -## 10        10  27.8
    +##  1         1 25.75
    +##  2         2 24.75
    +##  3         3 26.00
    +##  4         4 26.25
    +##  5         5 24.60
    +##  6         6 26.50
    +##  7         7 24.60
    +##  8         8 27.80
    +##  9         9 27.15
    +## 10        10 24.60
     ## # ... with 90 more rows

    One numerical variable (standard deviation)

    mtcars %>%
    -  specify(response = mpg) %>% # formula alt: mpg ~ NULL
    -  hypothesize(null = "point", sigma = 5) %>% 
    -  generate(reps = 100, type = "bootstrap") %>% 
    -  calculate(stat = "sd")
    + specify(response = mpg) %>% # formula alt: mpg ~ NULL + hypothesize(null = "point", sigma = 5) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "sd")
    ## # A tibble: 100 x 2
    -##    replicate  stat
    -##        <int> <dbl>
    -##  1         1  6.66
    -##  2         2  5.55
    -##  3         3  6.21
    -##  4         4  4.08
    -##  5         5  4.58
    -##  6         6  6.51
    -##  7         7  7.60
    -##  8         8  6.85
    -##  9         9  5.77
    -## 10        10  6.96
    +##    replicate     stat
    +##        <int>    <dbl>
    +##  1         1 6.418569
    +##  2         2 7.210386
    +##  3         3 6.165721
    +##  4         4 5.539418
    +##  5         5 6.107237
    +##  6         6 6.413927
    +##  7         7 6.000000
    +##  8         8 5.651117
    +##  9         9 6.213025
    +## 10        10 5.016086
     ## # ... with 90 more rows

    One categorical (2 level) variable

    mtcars %>%
    -  specify(response = am, success = "1") %>% # formula alt: am ~ NULL
    -  hypothesize(null = "point", p = .25) %>% 
    -  generate(reps = 100, type = "simulate") %>% 
    -  calculate(stat = "prop")
    + specify(response = am, success = "1") %>% # formula alt: am ~ NULL + hypothesize(null = "point", p = .25) %>% + generate(reps = 100, type = "simulate") %>% + calculate(stat = "prop")
    ## # A tibble: 100 x 2
    -##    replicate   stat
    -##    <fctr>     <dbl>
    -##  1 1         0.188 
    -##  2 2         0.0625
    -##  3 3         0.281 
    -##  4 4         0.344 
    -##  5 5         0.219 
    -##  6 6         0.250 
    -##  7 7         0.312 
    -##  8 8         0.250 
    -##  9 9         0.281 
    -## 10 10        0.156 
    +##    replicate    stat
    +##       <fctr>   <dbl>
    +##  1         1 0.12500
    +##  2         2 0.34375
    +##  3         3 0.28125
    +##  4         4 0.37500
    +##  5         5 0.18750
    +##  6         6 0.15625
    +##  7         7 0.31250
    +##  8         8 0.31250
    +##  9         9 0.31250
    +## 10        10 0.15625
     ## # ... with 90 more rows

    Two categorical (2 level) variables

    mtcars %>%
    -  specify(am ~ vs, success = "1") %>% # alt: response = am, explanatory = vs
    -  hypothesize(null = "independence") %>%
    -  generate(reps = 100, type = "permute") %>%
    -  calculate(stat = "diff in props", order = c("0", "1"))
    + specify(am ~ vs, success = "1") %>% # alt: response = am, explanatory = vs + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "diff in props", order = c("0", "1"))
    ## # A tibble: 100 x 2
    -##    replicate    stat
    -##        <int>   <dbl>
    -##  1         1  0.0873
    -##  2         2 -0.0397
    -##  3         3  0.317 
    -##  4         4 -0.0952
    -##  5         5  0.230 
    -##  6         6 -0.0238
    -##  7         7  0.183 
    -##  8         8  0.0714
    -##  9         9 -0.238 
    -## 10        10  0.0952
    +##    replicate        stat
    +##        <int>       <dbl>
    +##  1         1  0.23015873
    +##  2         2 -0.22222222
    +##  3         3  0.26190476
    +##  4         4  0.19841270
    +##  5         5  0.19047619
    +##  6         6 -0.23015873
    +##  7         7 -0.07936508
    +##  8         8  0.03174603
    +##  9         9 -0.02380952
    +## 10        10  0.24603175
     ## # ... with 90 more rows

    One categorical (>2 level) - GoF

    mtcars %>%
    -  specify(cyl ~ NULL) %>% # alt: response = cyl
    -  hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>%
    -  generate(reps = 100, type = "simulate") %>%
    -  calculate(stat = "Chisq")
    + specify(cyl ~ NULL) %>% # alt: response = cyl + hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>% + generate(reps = 100, type = "simulate") %>% + calculate(stat = "Chisq")
    ## # A tibble: 100 x 2
    -##    replicate  stat
    -##    <fctr>    <dbl>
    -##  1 1         2.75 
    -##  2 2         1.69 
    -##  3 3         1.00 
    -##  4 4         4.19 
    -##  5 5         0.688
    -##  6 6         1.69 
    -##  7 7         1.69 
    -##  8 8         3.69 
    -##  9 9         2.00 
    -## 10 10        0.188
    +##    replicate   stat
    +##       <fctr>  <dbl>
    +##  1         1 1.6875
    +##  2         2 5.1875
    +##  3         3 0.6875
    +##  4         4 2.7500
    +##  5         5 4.1875
    +##  6         6 0.7500
    +##  7         7 0.1875
    +##  8         8 1.6875
    +##  9         9 0.7500
    +## 10        10 8.0000
     ## # ... with 90 more rows

    Two categorical (>2 level) variables

    mtcars %>%
    -  specify(cyl ~ am) %>% # alt: response = cyl, explanatory = am
    -  hypothesize(null = "independence") %>%
    -  generate(reps = 100, type = "permute") %>%
    -  calculate(stat = "Chisq")
    + specify(cyl ~ am) %>% # alt: response = cyl, explanatory = am + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "Chisq")
    ## # A tibble: 100 x 2
    -##    replicate  stat
    -##    <fctr>    <dbl>
    -##  1 1         5.73 
    -##  2 2         0.513
    -##  3 3         1.36 
    -##  4 4         4.16 
    -##  5 5         1.26 
    -##  6 6         0.134
    -##  7 7         0.172
    -##  8 8         0.164
    -##  9 9         0.592
    -## 10 10        0.296
    +##    replicate      stat
    +##       <fctr>     <dbl>
    +##  1         1 5.7764925
    +##  2         2 2.1933694
    +##  3         3 0.5917753
    +##  4         4 0.5120494
    +##  5         5 1.0815204
    +##  6         6 1.8346026
    +##  7         7 3.5720069
    +##  8         8 1.0145644
    +##  9         9 1.2482972
    +## 10        10 1.3305690
     ## # ... with 90 more rows

    One numerical variable one categorical (2 levels) (diff in means)

    mtcars %>%
    -  specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am
    -  hypothesize(null = "independence") %>%
    -  generate(reps = 100, type = "permute") %>%
    -  calculate(stat = "diff in means", order = c("0", "1"))
    + specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "diff in means", order = c("0", "1"))
    ## # A tibble: 100 x 2
    -##    replicate   stat
    -##        <int>  <dbl>
    -##  1         1  2.17 
    -##  2         2  0.344
    -##  3         3  1.67 
    -##  4         4  0.376
    -##  5         5 -1.47 
    -##  6         6 -2.03 
    -##  7         7  0.615
    -##  8         8  0.153
    -##  9         9  1.40 
    -## 10        10 -0.872
    +##    replicate       stat
    +##        <int>      <dbl>
    +##  1         1 -0.5708502
    +##  2         2  1.2311741
    +##  3         3  2.1793522
    +##  4         4 -0.3882591
    +##  5         5  0.7809717
    +##  6         6 -2.4145749
    +##  7         7 -0.3178138
    +##  8         8  1.4218623
    +##  9         9 -1.1582996
    +## 10        10 -0.2595142
     ## # ... with 90 more rows

    One numerical variable one categorical (2 levels) (diff in medians)

    mtcars %>%
    -  specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am
    -  hypothesize(null = "independence") %>%
    -  generate(reps = 100, type = "permute") %>%
    -  calculate(stat = "diff in medians", order = c("0", "1"))
    + specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "diff in medians", order = c("0", "1"))
    ## # A tibble: 100 x 2
    -##    replicate   stat
    -##        <int>  <dbl>
    -##  1         1  0.600
    -##  2         2  0    
    -##  3         3 -3.20 
    -##  4         4 -1.90 
    -##  5         5 -1.10 
    -##  6         6  0    
    -##  7         7 -1.10 
    -##  8         8 -1.80 
    -##  9         9 -3.90 
    -## 10        10 -2.40 
    +##    replicate  stat
    +##        <int> <dbl>
    +##  1         1  -2.9
    +##  2         2  -1.0
    +##  3         3  -2.9
    +##  4         4  -0.5
    +##  5         5  -0.8
    +##  6         6   1.7
    +##  7         7   1.9
    +##  8         8  -2.2
    +##  9         9   1.9
    +## 10        10  -0.5
     ## # ... with 90 more rows

    One numerical one categorical (>2 levels) - ANOVA

    mtcars %>%
    -  specify(mpg ~ cyl) %>% # alt: response = mpg, explanatory = cyl
    -  hypothesize(null = "independence") %>%
    -  generate(reps = 100, type = "permute") %>%
    -  calculate(stat = "F")
    + specify(mpg ~ cyl) %>% # alt: response = mpg, explanatory = cyl + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "F")
    ## # A tibble: 100 x 2
    -##    replicate  stat
    -##        <int> <dbl>
    -##  1         1 0.129
    -##  2         2 2.33 
    -##  3         3 1.82 
    -##  4         4 0.628
    -##  5         5 0.235
    -##  6         6 0.378
    -##  7         7 0.431
    -##  8         8 1.24 
    -##  9         9 0.988
    -## 10        10 0.642
    +##    replicate      stat
    +##        <int>     <dbl>
    +##  1         1 2.0431650
    +##  2         2 1.1407430
    +##  3         3 0.4150877
    +##  4         4 1.2534262
    +##  5         5 0.3304550
    +##  6         6 0.6924692
    +##  7         7 2.7294339
    +##  8         8 1.8232691
    +##  9         9 2.3305664
    +## 10        10 1.0098759
     ## # ... with 90 more rows

    Two numerical vars - SLR

    mtcars %>%
    -  specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = cyl
    -  hypothesize(null = "independence") %>%
    -  generate(reps = 100, type = "permute") %>%
    -  calculate(stat = "slope")
    + specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = cyl + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "slope")
    ## # A tibble: 100 x 2
    -##    replicate     stat
    -##        <int>    <dbl>
    -##  1         1 -0.00473
    -##  2         2 -0.00982
    -##  3         3  0.00359
    -##  4         4  0.00231
    -##  5         5 -0.00980
    -##  6         6 -0.0200 
    -##  7         7  0.0128 
    -##  8         8  0.00150
    -##  9         9 -0.0149 
    -## 10        10 -0.0187 
    +##    replicate          stat
    +##        <int>         <dbl>
    +##  1         1  0.0033946209
    +##  2         2 -0.0033518611
    +##  3         3 -0.0113786922
    +##  4         4  0.0200871493
    +##  5         5  0.0138259072
    +##  6         6 -0.0029685242
    +##  7         7  0.0003615068
    +##  8         8  0.0146026171
    +##  9         9  0.0081265467
    +## 10        10  0.0083522686
     ## # ... with 90 more rows
    -

    Confidence intervals

    +

    +Confidence intervals

    One numerical (one mean)

    mtcars %>%
    -  specify(response = mpg) %>%
    -  generate(reps = 100, type = "bootstrap") %>%
    -  calculate(stat = "mean")
    + specify(response = mpg) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "mean")
    ## # A tibble: 100 x 2
    -##    replicate  stat
    -##        <int> <dbl>
    -##  1         1  20.9
    -##  2         2  19.6
    -##  3         3  21.4
    -##  4         4  21.2
    -##  5         5  17.9
    -##  6         6  20.9
    -##  7         7  17.9
    -##  8         8  20.5
    -##  9         9  21.5
    -## 10        10  19.2
    +##    replicate     stat
    +##        <int>    <dbl>
    +##  1         1 20.48750
    +##  2         2 18.10312
    +##  3         3 20.52812
    +##  4         4 19.68125
    +##  5         5 19.86875
    +##  6         6 21.58750
    +##  7         7 20.64688
    +##  8         8 19.93437
    +##  9         9 19.70625
    +## 10        10 20.38750
     ## # ... with 90 more rows

    One numerical (one median)

    mtcars %>%
    -  specify(response = mpg) %>%
    -  generate(reps = 100, type = "bootstrap") %>%
    -  calculate(stat = "median")
    + specify(response = mpg) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "median")
    ## # A tibble: 100 x 2
     ##    replicate  stat
     ##        <int> <dbl>
    -##  1         1  19.7
    -##  2         2  18.6
    -##  3         3  21.4
    -##  4         4  17.7
    -##  5         5  17.7
    -##  6         6  20.4
    -##  7         7  21.0
    -##  8         8  19.0
    -##  9         9  17.3
    -## 10        10  20.4
    +##  1         1 19.45
    +##  2         2 21.40
    +##  3         3 21.00
    +##  4         4 18.65
    +##  5         5 17.30
    +##  6         6 18.95
    +##  7         7 20.10
    +##  8         8 18.10
    +##  9         9 22.80
    +## 10        10 21.00
     ## # ... with 90 more rows

    One numerical (standard deviation)

    mtcars %>%
    -  specify(response = mpg) %>%
    -  generate(reps = 100, type = "bootstrap") %>%
    -  calculate(stat = "sd")
    + specify(response = mpg) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "sd")
    ## # A tibble: 100 x 2
    -##    replicate  stat
    -##        <int> <dbl>
    -##  1         1  5.86
    -##  2         2  6.20
    -##  3         3  6.41
    -##  4         4  5.67
    -##  5         5  6.12
    -##  6         6  5.80
    -##  7         7  6.58
    -##  8         8  5.02
    -##  9         9  6.55
    -## 10        10  5.96
    +##    replicate     stat
    +##        <int>    <dbl>
    +##  1         1 5.289589
    +##  2         2 6.427662
    +##  3         3 4.288402
    +##  4         4 5.657596
    +##  5         5 5.825472
    +##  6         6 5.651573
    +##  7         7 6.822696
    +##  8         8 5.566228
    +##  9         9 4.253651
    +## 10        10 5.004489
     ## # ... with 90 more rows

    One categorical (one proportion)

    mtcars %>%
    -  specify(response = am, success = "1") %>%
    -  generate(reps = 100, type = "bootstrap") %>%
    -  calculate(stat = "prop")
    + specify(response = am, success = "1") %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "prop")
    ## # A tibble: 100 x 2
    -##    replicate  stat
    -##        <int> <dbl>
    -##  1         1 0.531
    -##  2         2 0.250
    -##  3         3 0.375
    -##  4         4 0.344
    -##  5         5 0.406
    -##  6         6 0.594
    -##  7         7 0.531
    -##  8         8 0.562
    -##  9         9 0.531
    -## 10        10 0.469
    +##    replicate    stat
    +##        <int>   <dbl>
    +##  1         1 0.50000
    +##  2         2 0.34375
    +##  3         3 0.34375
    +##  4         4 0.25000
    +##  5         5 0.46875
    +##  6         6 0.31250
    +##  7         7 0.56250
    +##  8         8 0.40625
    +##  9         9 0.40625
    +## 10        10 0.25000
     ## # ... with 90 more rows

    One numerical variable one categorical (2 levels) (diff in means)

    mtcars %>%
    -  specify(mpg ~ am) %>%
    -  generate(reps = 100, type = "bootstrap") %>%
    -  calculate(stat = "diff in means", order = c("0", "1"))
    + specify(mpg ~ am) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "diff in means", order = c("0", "1"))
    ## # A tibble: 100 x 2
    -##    replicate   stat
    -##        <int>  <dbl>
    -##  1         1 - 7.36
    -##  2         2 - 9.03
    -##  3         3 -10.8 
    -##  4         4 - 7.36
    -##  5         5 - 3.03
    -##  6         6 - 5.74
    -##  7         7 - 9.80
    -##  8         8 - 6.70
    -##  9         9 - 7.71
    -## 10        10 - 7.26
    +##    replicate       stat
    +##        <int>      <dbl>
    +##  1         1 -10.128986
    +##  2         2  -6.725000
    +##  3         3  -9.396667
    +##  4         4  -8.416667
    +##  5         5  -4.953333
    +##  6         6  -7.823529
    +##  7         7  -8.548333
    +##  8         8  -7.686364
    +##  9         9  -8.550649
    +## 10        10  -5.170000
     ## # ... with 90 more rows

    Two categorical variables (diff in proportions)

    mtcars %>%
    -  specify(am ~ vs, success = "1") %>%
    -  generate(reps = 100, type = "bootstrap") %>%
    -  calculate(stat = "diff in props", order = c("0", "1"))
    + specify(am ~ vs, success = "1") %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "diff in props", order = c("0", "1"))
    ## # A tibble: 100 x 2
    -##    replicate    stat
    -##        <int>   <dbl>
    -##  1         1 -0.217 
    -##  2         2 -0.0688
    -##  3         3  0.203 
    -##  4         4 -0.125 
    -##  5         5 -0.151 
    -##  6         6  0.0405
    -##  7         7  0.0635
    -##  8         8 -0.116 
    -##  9         9 -0.188 
    -## 10        10 -0.0931
    +##    replicate        stat
    +##        <int>       <dbl>
    +##  1         1 -0.11372549
    +##  2         2  0.07058824
    +##  3         3 -0.40000000
    +##  4         4 -0.48181818
    +##  5         5 -0.38823529
    +##  6         6 -0.43750000
    +##  7         7  0.15584416
    +##  8         8 -0.25000000
    +##  9         9 -0.37500000
    +## 10        10 -0.57575758
     ## # ... with 90 more rows

    Two numerical vars - SLR

    mtcars %>%
    -  specify(mpg ~ hp) %>% 
    -  generate(reps = 100, type = "bootstrap") %>%
    -  calculate(stat = "slope")
    + specify(mpg ~ hp) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "slope")
    ## # A tibble: 100 x 2
    -##    replicate    stat
    -##        <int>   <dbl>
    -##  1         1 -0.107 
    -##  2         2 -0.0727
    -##  3         3 -0.0439
    -##  4         4 -0.0947
    -##  5         5 -0.0860
    -##  6         6 -0.0714
    -##  7         7 -0.0633
    -##  8         8 -0.0821
    -##  9         9 -0.0865
    -## 10        10 -0.0593
    +##    replicate        stat
    +##        <int>       <dbl>
    +##  1         1 -0.05481123
    +##  2         2 -0.07938540
    +##  3         3 -0.07645180
    +##  4         4 -0.07007124
    +##  5         5 -0.06984628
    +##  6         6 -0.06554539
    +##  7         7 -0.06940029
    +##  8         8 -0.05066387
    +##  9         9 -0.06348753
    +## 10        10 -0.05931265
     ## # ... with 90 more rows
    + + + + + + + + - + diff --git a/docs/authors.html b/docs/authors.html index a09421f7..b265d5e7 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -52,7 +52,7 @@