diff --git a/NAMESPACE b/NAMESPACE index 09aa459e..c6ca82a3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export("%>%") export(calculate) export(generate) export(hypothesize) +export(rep_sample_n) export(specify) export(visualize) importFrom(dplyr,as.tbl) diff --git a/R/generate.R b/R/generate.R index b5eff3bc..27fafbe3 100644 --- a/R/generate.R +++ b/R/generate.R @@ -142,30 +142,3 @@ get_par_levels <- function(x) { par_names <- names(attr(x, "params")) return(gsub("^.\\.", "", par_names)) } - -#' @importFrom dplyr as_tibble pull data_frame inner_join - -# Modified oilabs::rep_sample_n() with attr added -rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { - n <- nrow(tbl) - - # assign non-uniform probabilities - # there should be a better way!! - # prob needs to be nrow(tbl) -- not just number of factor levels - if (!is.null(prob)) { - if (length(prob) != n) stop("The argument prob must have length nrow(tbl).") - df_lkup <- dplyr::data_frame(vals = levels(dplyr::pull(tbl, 1))) - names(df_lkup) <- names(tbl) - df_lkup$probs <- prob - tbl_wgt <- dplyr::inner_join(tbl, df_lkup) - prob <- tbl_wgt$probs - } - - i <- unlist(replicate(reps, sample.int(n, size, replace = replace, prob = prob), - simplify = FALSE)) - rep_tbl <- cbind(replicate = rep(1:reps, rep(size, reps)), - tbl[i, ]) - rep_tbl <- dplyr::as_tibble(rep_tbl) - names(rep_tbl)[-1] <- names(tbl) - dplyr::group_by(rep_tbl, replicate) -} diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R new file mode 100644 index 00000000..7f4a75c2 --- /dev/null +++ b/R/rep_sample_n.R @@ -0,0 +1,72 @@ +#' Perform repeated sampling +#' +#' Perform repeated sampling of samples of size n. Useful for creating +#' sampling distributions +#' +#' @param tbl data frame of population from which to sample +#' @param size sample size of each sample +#' @param replace should sampling be with replacement? +#' @param reps number of samples of size n = \code{size} to take +#' @param prob a vector of probability weights for obtaining the elements of +#' the vector being sampled. +#' @return A tibble of size \code{rep} times \code{size} rows corresponding to +#' \code{rep} samples of size n = \code{size} from \code{tbl}. +#' @importFrom dplyr data_frame +#' @importFrom dplyr pull +#' @importFrom dplyr inner_join +#' @importFrom dplyr as_tibble +#' @importFrom dplyr group_by +#' +#' @export +#' @examples +#' library(dplyr) +#' library(ggplot2) +#' +#' # Create a virtual population of N = 2400 balls, of which 900 are red and the +#' # rest are white +#' N <- 2400 +#' population <- data_frame( +#' ball_ID = 1:N, +#' color = c(rep("red", 900), rep("white", N - 900)) +#' ) +#' population +#' +#' # Take samples of size n = 50 balls; do this 1000 times +#' samples <- population %>% +#' rep_sample_n(size = 50, reps = 1000) +#' samples +#' +#' # Compute p_hats for all 1000 samples = proportion red +#' p_hats <- samples %>% +#' group_by(replicate) %>% +#' summarize(prop_red = mean(color == "red")) +#' p_hats +#' +#' # Plot sampling distribution +#' ggplot(p_hats, aes(x = prop_red)) + +#' geom_histogram(binwidth = 0.05) + +#' labs(x = "p_hat", y = "Number of samples", +#' title = "Sampling distribution of p_hat based on 1000 samples of size n = 50") +rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { + n <- nrow(tbl) + + # assign non-uniform probabilities + # there should be a better way!! + # prob needs to be nrow(tbl) -- not just number of factor levels + if (!is.null(prob)) { + if (length(prob) != n) stop("The argument prob must have length nrow(tbl).") + df_lkup <- dplyr::data_frame(vals = levels(dplyr::pull(tbl, 1))) + names(df_lkup) <- names(tbl) + df_lkup$probs <- prob + tbl_wgt <- dplyr::inner_join(tbl, df_lkup) + prob <- tbl_wgt$probs + } + + i <- unlist(replicate(reps, sample.int(n, size, replace = replace, prob = prob), + simplify = FALSE)) + rep_tbl <- cbind(replicate = rep(1:reps, rep(size, reps)), + tbl[i, ]) + rep_tbl <- dplyr::as_tibble(rep_tbl) + names(rep_tbl)[-1] <- names(tbl) + dplyr::group_by(rep_tbl, replicate) +} diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd new file mode 100644 index 00000000..ff373ed9 --- /dev/null +++ b/man/rep_sample_n.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rep_sample_n.R +\name{rep_sample_n} +\alias{rep_sample_n} +\title{Perform repeated sampling} +\usage{ +rep_sample_n(tbl, size, replace = FALSE, reps = 1, prob = NULL) +} +\arguments{ +\item{tbl}{data frame of population from which to sample} + +\item{size}{sample size of each sample} + +\item{replace}{should sampling be with replacement?} + +\item{reps}{number of samples of size n = \code{size} to take} + +\item{prob}{a vector of probability weights for obtaining the elements of +the vector being sampled.} +} +\value{ +A tibble of size \code{rep} times \code{size} rows corresponding to +\code{rep} samples of size n = \code{size} from \code{tbl}. +} +\description{ +Perform repeated sampling of samples of size n. Useful for creating +sampling distributions +} +\examples{ +library(dplyr) +library(ggplot2) + +# Create a virtual population of N = 2400 balls, of which 900 are red and the +# rest are white +N <- 2400 +population <- data_frame( + ball_ID = 1:N, + color = c(rep("red", 900), rep("white", N - 900)) +) +population + +# Take samples of size n = 50 balls; do this 1000 times +samples <- population \%>\% + rep_sample_n(size = 50, reps = 1000) +samples + +# Compute p_hats for all 1000 samples = proportion red +p_hats <- samples \%>\% + group_by(replicate) \%>\% + summarize(prop_red = mean(color == "red")) +p_hats + +# Plot sampling distribution +ggplot(p_hats, aes(x = prop_red)) + + geom_histogram(binwidth = 0.05) + + labs(x = "p_hat", y = "Number of samples", + title = "Sampling distribution of p_hat based on 1000 samples of size n = 50") +}