Skip to content

Commit

Permalink
Merge pull request #82 from rudeboybert/master
Browse files Browse the repository at this point in the history
Export rep_sample_n()
  • Loading branch information
andrewpbray committed Jan 12, 2018
2 parents 852d1e6 + 367a845 commit 41f4648
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 27 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export("%>%")
export(calculate)
export(generate)
export(hypothesize)
export(rep_sample_n)
export(specify)
export(visualize)
importFrom(dplyr,as.tbl)
Expand Down
27 changes: 0 additions & 27 deletions R/generate.R
Original file line number Diff line number Diff line change
Expand Up @@ -142,30 +142,3 @@ get_par_levels <- function(x) {
par_names <- names(attr(x, "params"))
return(gsub("^.\\.", "", par_names))
}

#' @importFrom dplyr as_tibble pull data_frame inner_join

# Modified oilabs::rep_sample_n() with attr added
rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) {
n <- nrow(tbl)

# assign non-uniform probabilities
# there should be a better way!!
# prob needs to be nrow(tbl) -- not just number of factor levels
if (!is.null(prob)) {
if (length(prob) != n) stop("The argument prob must have length nrow(tbl).")
df_lkup <- dplyr::data_frame(vals = levels(dplyr::pull(tbl, 1)))
names(df_lkup) <- names(tbl)
df_lkup$probs <- prob
tbl_wgt <- dplyr::inner_join(tbl, df_lkup)
prob <- tbl_wgt$probs
}

i <- unlist(replicate(reps, sample.int(n, size, replace = replace, prob = prob),
simplify = FALSE))
rep_tbl <- cbind(replicate = rep(1:reps, rep(size, reps)),
tbl[i, ])
rep_tbl <- dplyr::as_tibble(rep_tbl)
names(rep_tbl)[-1] <- names(tbl)
dplyr::group_by(rep_tbl, replicate)
}
72 changes: 72 additions & 0 deletions R/rep_sample_n.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#' Perform repeated sampling
#'
#' Perform repeated sampling of samples of size n. Useful for creating
#' sampling distributions
#'
#' @param tbl data frame of population from which to sample
#' @param size sample size of each sample
#' @param replace should sampling be with replacement?
#' @param reps number of samples of size n = \code{size} to take
#' @param prob a vector of probability weights for obtaining the elements of
#' the vector being sampled.
#' @return A tibble of size \code{rep} times \code{size} rows corresponding to
#' \code{rep} samples of size n = \code{size} from \code{tbl}.
#' @importFrom dplyr data_frame
#' @importFrom dplyr pull
#' @importFrom dplyr inner_join
#' @importFrom dplyr as_tibble
#' @importFrom dplyr group_by
#'
#' @export
#' @examples
#' library(dplyr)
#' library(ggplot2)
#'
#' # Create a virtual population of N = 2400 balls, of which 900 are red and the
#' # rest are white
#' N <- 2400
#' population <- data_frame(
#' ball_ID = 1:N,
#' color = c(rep("red", 900), rep("white", N - 900))
#' )
#' population
#'
#' # Take samples of size n = 50 balls; do this 1000 times
#' samples <- population %>%
#' rep_sample_n(size = 50, reps = 1000)
#' samples
#'
#' # Compute p_hats for all 1000 samples = proportion red
#' p_hats <- samples %>%
#' group_by(replicate) %>%
#' summarize(prop_red = mean(color == "red"))
#' p_hats
#'
#' # Plot sampling distribution
#' ggplot(p_hats, aes(x = prop_red)) +
#' geom_histogram(binwidth = 0.05) +
#' labs(x = "p_hat", y = "Number of samples",
#' title = "Sampling distribution of p_hat based on 1000 samples of size n = 50")
rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) {
n <- nrow(tbl)

# assign non-uniform probabilities
# there should be a better way!!
# prob needs to be nrow(tbl) -- not just number of factor levels
if (!is.null(prob)) {
if (length(prob) != n) stop("The argument prob must have length nrow(tbl).")
df_lkup <- dplyr::data_frame(vals = levels(dplyr::pull(tbl, 1)))
names(df_lkup) <- names(tbl)
df_lkup$probs <- prob
tbl_wgt <- dplyr::inner_join(tbl, df_lkup)
prob <- tbl_wgt$probs
}

i <- unlist(replicate(reps, sample.int(n, size, replace = replace, prob = prob),
simplify = FALSE))
rep_tbl <- cbind(replicate = rep(1:reps, rep(size, reps)),
tbl[i, ])
rep_tbl <- dplyr::as_tibble(rep_tbl)
names(rep_tbl)[-1] <- names(tbl)
dplyr::group_by(rep_tbl, replicate)
}
58 changes: 58 additions & 0 deletions man/rep_sample_n.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 41f4648

Please sign in to comment.