Merge pull request #82 from rudeboybert/master

Export rep_sample_n()
tidymodels · Jan 12, 2018 · 41f4648 · 41f4648
2 parents 852d1e6 + 367a845
commit 41f4648
Show file tree

Hide file tree

Showing 4 changed files with 131 additions and 27 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,7 @@ export("%>%")
 export(calculate)
 export(generate)
 export(hypothesize)
+export(rep_sample_n)
 export(specify)
 export(visualize)
 importFrom(dplyr,as.tbl)

diff --git a/R/generate.R b/R/generate.R
@@ -142,30 +142,3 @@ get_par_levels <- function(x) {
   par_names <- names(attr(x, "params"))
   return(gsub("^.\\.", "", par_names))
 }
-
-#' @importFrom dplyr as_tibble pull data_frame inner_join
-
-# Modified oilabs::rep_sample_n() with attr added
-rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) {
-  n <- nrow(tbl)
-
-  # assign non-uniform probabilities
-  # there should be a better way!!
-  # prob needs to be nrow(tbl) -- not just number of factor levels
-  if (!is.null(prob)) {
-    if (length(prob) != n) stop("The argument prob must have length nrow(tbl).")
-    df_lkup <- dplyr::data_frame(vals = levels(dplyr::pull(tbl, 1)))
-    names(df_lkup) <- names(tbl)
-    df_lkup$probs <- prob
-    tbl_wgt <- dplyr::inner_join(tbl, df_lkup)
-    prob <- tbl_wgt$probs
-  }
-
-  i <- unlist(replicate(reps, sample.int(n, size, replace = replace, prob = prob),
-                        simplify = FALSE))
-  rep_tbl <- cbind(replicate = rep(1:reps, rep(size, reps)),
-                   tbl[i, ])
-  rep_tbl <- dplyr::as_tibble(rep_tbl)
-  names(rep_tbl)[-1] <- names(tbl)
-  dplyr::group_by(rep_tbl, replicate)
-}
diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R
@@ -0,0 +1,72 @@
+#' Perform repeated sampling
+#' 
+#' Perform repeated sampling of samples of size n. Useful for creating
+#' sampling distributions
+#' 
+#' @param tbl data frame of population from which to sample
+#' @param size sample size of each sample
+#' @param replace should sampling be with replacement?
+#' @param reps number of samples of size n = \code{size} to take
+#' @param prob a vector of probability weights for obtaining the elements of 
+#' the vector being sampled.
+#' @return A tibble of size \code{rep} times \code{size} rows corresponding to 
+#' \code{rep} samples of size n = \code{size} from \code{tbl}.
+#' @importFrom dplyr data_frame
+#' @importFrom dplyr pull
+#' @importFrom dplyr inner_join
+#' @importFrom dplyr as_tibble
+#' @importFrom dplyr group_by
+#' 
+#' @export
+#' @examples
+#' library(dplyr)
+#' library(ggplot2)
+#' 
+#' # Create a virtual population of N = 2400 balls, of which 900 are red and the
+#' # rest are white
+#' N <- 2400
+#' population <- data_frame(
+#'   ball_ID = 1:N,
+#'   color = c(rep("red", 900), rep("white", N - 900))
+#' )
+#' population
+#' 
+#' # Take samples of size n = 50 balls; do this 1000 times
+#' samples <- population %>%
+#'   rep_sample_n(size = 50, reps = 1000)
+#' samples
+#'
+#' # Compute p_hats for all 1000 samples = proportion red
+#' p_hats <- samples %>% 
+#'   group_by(replicate) %>% 
+#'   summarize(prop_red = mean(color == "red"))
+#' p_hats
+#' 
+#' # Plot sampling distribution
+#' ggplot(p_hats, aes(x = prop_red)) + 
+#'   geom_histogram(binwidth = 0.05) + 
+#'   labs(x = "p_hat", y = "Number of samples", 
+#'   title = "Sampling distribution of p_hat based on 1000 samples of size n = 50")
+rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) {
+  n <- nrow(tbl)
+
+  # assign non-uniform probabilities
+  # there should be a better way!!
+  # prob needs to be nrow(tbl) -- not just number of factor levels
+  if (!is.null(prob)) {
+    if (length(prob) != n) stop("The argument prob must have length nrow(tbl).")
+    df_lkup <- dplyr::data_frame(vals = levels(dplyr::pull(tbl, 1)))
+    names(df_lkup) <- names(tbl)
+    df_lkup$probs <- prob
+    tbl_wgt <- dplyr::inner_join(tbl, df_lkup)
+    prob <- tbl_wgt$probs
+  }
+
+  i <- unlist(replicate(reps, sample.int(n, size, replace = replace, prob = prob),
+                        simplify = FALSE))
+  rep_tbl <- cbind(replicate = rep(1:reps, rep(size, reps)),
+                   tbl[i, ])
+  rep_tbl <- dplyr::as_tibble(rep_tbl)
+  names(rep_tbl)[-1] <- names(tbl)
+  dplyr::group_by(rep_tbl, replicate)
+}
diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd