tidymodels · ismayc · Jul 7, 2018 · Jun 13, 2018 · Jun 13, 2018 · Jun 13, 2018
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: infer
 Type: Package
 Title: Tidy Statistical Inference
-Version: 0.2.1
+Version: 0.3.0
 Authors@R: c(
     person("Andrew", "Bray", email = "abray@reed.edu", role = c("aut", "cre")),
     person("Chester", "Ismay", email = "chester.ismay@gmail.com", role = "aut"),
@@ -36,8 +36,7 @@ Suggests:
     nycflights13,
     stringr,
     testthat,
-    covr,
-    okcupiddata
+    covr
 URL: https://github.com/andrewpbray/infer
 BugReports: https://github.com/andrewpbray/infer/issues
 RoxygenNote: 6.0.1

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,8 +5,13 @@ export("%>%")
 export(calculate)
 export(chisq_stat)
 export(chisq_test)
+export(conf_int)
 export(generate)
+export(get_ci)
+export(get_confidence_interval)
+export(get_pvalue)
 export(hypothesize)
+export(p_value)
 export(rep_sample_n)
 export(specify)
 export(t_stat)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,19 @@
 # infer 0.2.1
 
-- Implemented chi-square Goodness of Fit observed stat depending on `params` being set in `hypothesize` with `specify() %>% calculate()` shortcut
+- Added `conf_int` logical argument and `conf_level` argument to `t_test()`
+- Switched `shade_color` argument in `visualize()` to be `pvalue_fill` instead
+since fill color for confidence intervals is also added now
+- Shading for Confidence Intervals in `visualize()` 
+    - Green is default color for CI and red for p-values
+    - `direction = "between"` to get the green shading
+    - Currently working only for simulation-based methods
+- Implemented `conf_int()` function for computing a confidence interval from a simulation-based distribution stored in a `stat` column
+    - `get_ci()` and `get_confidence_interval()` are aliases for `conf_int()`
+    - Converted longer confidence interval calculation code in vignettes to use `get_ci()` instead    
+- Implemented `p_value()` function for computing a p-value from a simulation-based distribution stored in a `stat` column
+    - `get_pvalue()` is an alias for `p_value()`
+    - Converted longer p-value calculation code in vignettes to use `get_pvalue()` instead
+- Implemented Chi-square Goodness of Fit observed stat depending on `params` being set in `hypothesize` with `specify() %>% calculate()` shortcut
 - Removed "standardized" slope $t$ since its formula is different than "standardized" correlation and there is no way currently to give one over the other
 - Implemented correlation with bootstrap CI and permutation hypothesis test
 - Filled the `type` argument automatically in `generate()` based

diff --git a/R/calculate.R b/R/calculate.R
@@ -125,18 +125,10 @@ calculate <- function(x,
       "Your choice of `stat` is invalid for the ",
       "types of variables `specify`ed."
     ))
-  else
-    class(result) <- append("infer", class(result))
+#  else
+#    class(result) <- append("infer", class(result))
 
-  attr(result, "response") <- attr(x, "response")
-  attr(result, "success") <- attr(x, "success")
-  attr(result, "explanatory") <- attr(x, "explanatory")
-  attr(result, "response_type") <- attr(x, "response_type")
-  attr(result, "explanatory_type") <- attr(x, "explanatory_type")
-  attr(result, "params") <- attr(x, "params")
-  attr(result, "distr_param") <- attr(x, "distr_param")
-  attr(result, "distr_param2") <- attr(x, "distr_param2")
-  attr(result, "theory_type") <- attr(x, "theory_type")
+  result <- set_attributes(to = result, from = x)
   attr(result, "stat") <- stat
 
   # For returning a 1x1 observed statistic value
@@ -152,7 +144,7 @@ calc_impl <-
 
 
 calc_impl.mean <- function(stat, x, order, ...) {
-  col <- setdiff(names(x), "replicate")
+  col <- base::setdiff(names(x), "replicate")
 
   x %>%
     dplyr::group_by(replicate) %>%
@@ -161,23 +153,23 @@ calc_impl.mean <- function(stat, x, order, ...) {
 }
 
 calc_impl.median <- function(stat, x, order, ...) {
-  col <- setdiff(names(x), "replicate")
+  col <- base::setdiff(names(x), "replicate")
 
   x %>%
     dplyr::group_by(replicate) %>%
     dplyr::summarize(stat = stats::median(!!(sym(col)), ...))
 }
 
 calc_impl.sd <- function(stat, x, order, ...) {
-  col <- setdiff(names(x), "replicate")
+  col <- base::setdiff(names(x), "replicate")
 
   x %>%
     dplyr::group_by(replicate) %>%
     dplyr::summarize(stat = stats::sd(!!(sym(col)), ...))
 }
 
 calc_impl.prop <- function(stat, x, order, ...) {
-  col <- setdiff(names(x), "replicate")
+  col <- base::setdiff(names(x), "replicate")
 
   ## No longer needed with implementation of `check_point_params()`
   # if(!is.factor(x[[col]])){
@@ -227,7 +219,7 @@ calc_impl.slope <- function(stat, x, order, ...) {
 
 calc_impl.correlation <- function(stat, x, order, ...) {
   x %>% 
-    dplyr::summarize(stat = cor(!!attr(x, "explanatory"), 
+    dplyr::summarize(stat = stats::cor(!!attr(x, "explanatory"), 
                                 !!attr(x, "response")))
 }
 

diff --git a/R/conf_int.R b/R/conf_int.R
@@ -0,0 +1,84 @@
+#' Compute the confidence interval for (currently only) simulation-based methods
+#' 
+#' \code{get_confidence_interval()} and \code{get_ci()} are both aliases of \code{conf_int()}
+#' @param x data frame of calculated statistics or containing attributes
+#' of theoretical distribution values. Currently, dependent on statistics being stored in \code{stat} column as created in \code{calculate()} function.
+#' @param level a numerical value between 0 and 1 giving the confidence level. Default value is 0.95.
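+#' @param type a string giving which method should be used for creating the confidence interval. The default is \code{"percentile"}. The other option, \code{"se"}, gives \code{point_estimate} plus or minus (multiplier * standard error), where the multiplier is the standard normal quantile matching \code{level} (about 1.96 for 0.95).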
+#' @param point_estimate a numeric value or a 1x1 data frame set to NULL by default. Needed to be provided if \code{type = "se"}.
+#'
+#' @return a 1 x 2 tibble with columns corresponding to the lower and upper values of the confidence interval
+#' @export
+#' @rdname get_ci
+#' @examples
+#' mtcars_df <- mtcars %>%
+#'     dplyr::mutate(am = factor(am))
+#' d_hat <- mtcars_df %>%
+#'     specify(mpg ~ am) %>%
+#'     calculate(stat = "diff in means", order = c("1", "0"))
+#' bootstrap_distn <- mtcars_df %>%
+#'     specify(mpg ~ am) %>%
+#'     generate(reps = 100) %>%
+#'    calculate(stat = "diff in means", order = c("1", "0"))
+#' bootstrap_distn %>% conf_int(level = 0.9)
+#' bootstrap_distn %>% conf_int(type = "se", point_estimate = d_hat)
+
+conf_int <- function(x, level = 0.95, type = "percentile", 
+                     point_estimate = NULL){
+  # Stops with an error if any argument is invalid (see `check_ci_args()`).
+  check_ci_args(x, level, type, point_estimate)
+  # Percentile method: quantiles cutting (1 - level) / 2 off each tail of `stat`.
+  if(type == "percentile") {
+    ci_vec <- stats::quantile(x[["stat"]], 
+                      probs = c((1 - level) / 2, level + (1 - level) / 2))
+    # One-row tibble whose column names are the names of `ci_vec`.
+    ci <- tibble::tibble(ci_vec[1], ci_vec[2])
+    names(ci) <- names(ci_vec)
+  } else {
+    point_estimate <- check_obs_stat(point_estimate)
+    multiplier <- stats::qnorm(1 - (1 - level) / 2)
+    ci <- tibble::tibble(
+      lower = point_estimate - multiplier * stats::sd(x[["stat"]]),
+      upper = point_estimate + multiplier * stats::sd(x[["stat"]]))
+  }
+
+  return(ci)
+}
+
+# Validate the arguments of `conf_int()`, stopping on the first invalid one.
+# `point_estimate` may be NULL, numeric, or a data frame; `x` must be a data
+# frame; `level` must be numeric and strictly between 0 and 1; `type` must be
+# "percentile" or "se", and "se" additionally requires a `point_estimate`.
+check_ci_args <- function(x, level, type, point_estimate){
+
+  # A data frame `point_estimate` is accepted as-is; anything else that is
+  # supplied has to be numeric.
+  if(!is.null(point_estimate) && !is.data.frame(point_estimate))
+    assertive::assert_is_numeric(point_estimate)
+
+  assertive::assert_is_data.frame(x)
+  assertive::assert_is_numeric(level)
+  if(level <= 0 || level >= 1){
+    stop(paste("The value of `level` must be between 0 and 1",
+                  "non-inclusive."))
+  }
+
+  if(!(type %in% c("percentile", "se"))){
+    stop(paste('The options for `type` are "percentile" or "se".'))
+  }
+
+  if(type == "se" && is.null(point_estimate))
+    stop(paste('A numeric value needs to be given for `point_estimate`',
+               'for `type = "se"`'))
+}
+
+
+#' @export
+#' @rdname get_ci
+
+get_ci <- conf_int
+
+#' @export
+#' @rdname get_ci
+
+get_confidence_interval <- conf_int
diff --git a/R/hypothesize.R b/R/hypothesize.R
@@ -5,6 +5,7 @@
 #' @return A tibble containing the response (and explanatory, if specified) 
 #' variable data with parameter information stored as well
 #' @importFrom dplyr as.tbl
+#' @return a data frame with attributes set
 #' @export
 #' @examples
 #' # Permutation test similar to ANOVA

diff --git a/R/infer.R b/R/infer.R
@@ -18,4 +18,5 @@ if(getRversion() >= "2.15.1")
                            "statistic", ".", "parameter", "p.value",
                            "xmin", "xmax", "density", "denom",
                            "diff_prop", "group_num", "n1", "n2",
-                           "num_suc", "p_hat", "total_suc", "explan", "probs"))
+                           "num_suc", "p_hat", "total_suc", "explan", "probs",
+                           "conf.low", "conf.high"))
diff --git a/R/p_value.R b/R/p_value.R
@@ -0,0 +1,122 @@
+#' Compute the p-value for (currently only) simulation-based methods
+#' @description \code{get_pvalue()} is an alias of \code{p_value()}.
+#' @param x data frame of calculated statistics or containing attributes
+#' of theoretical distribution values
+#' @param obs_stat a numeric value or a 1x1 data frame giving the observed statistic; the p-value is the proportion of values in \code{x} as extreme or more extreme than this
+#' @param direction a character string. Options are "less", "greater", or "two_sided".
+#' Can also specify "left", "right", or "both".
+#'
+#' @return a 1x1 data frame with value between 0 and 1
+#' @export
+#' @rdname get_pvalue
+#' @examples
+#' mtcars_df <- mtcars %>%
+#'     dplyr::mutate(am = factor(am))
+#' d_hat <- mtcars_df %>%
+#'     specify(mpg ~ am) %>%
+#'     calculate(stat = "diff in means", order = c("1", "0"))
+#' null_distn <- mtcars_df %>%
+#'     specify(mpg ~ am) %>% 
+#'     hypothesize(null = "independence") %>%
+#'     generate(reps = 100) %>%
+#'     calculate(stat = "diff in means", order = c("1", "0"))
+#' null_distn %>% 
+#'     p_value(obs_stat = d_hat, direction = "right")
+
+p_value <- function(x, obs_stat, direction){
+
+  assertive::assert_is_data.frame(x)
+  obs_stat <- check_obs_stat(obs_stat)
+  check_direction(direction)
+
+  # Simulation-based input is recognised by a TRUE "generate" attribute.
+  is_simulation_based <- !is.null(attr(x, "generate")) &&
+    attr(x, "generate")
+
+  # Only the simulation-based p-value is implemented. Stop with an explicit
+  # message for anything else; otherwise `pvalue` is never assigned and the
+  # final `return(pvalue)` fails with "object 'pvalue' not found".
+  if(!is_simulation_based)
+    stop(paste("`p_value()` currently only supports simulation-based",
+               "methods; the \"generate\" attribute of `x` is not TRUE."))
+
+  pvalue <- simulation_based_p_value(x = x, obs_stat = obs_stat,
+                                     direction = direction)
+
+  ## Planned theory-based p-value (not yet implemented):
+  # if(!("stat" %in% names(x))){
+  #  which_distribution(x,
+  #                     theory_type <- attr(x, "theory_type"),
+  #                     obs_stat = obs_stat,
+  #                     direction = direction)
+  # }
+
+  return(pvalue)
+}
+
+# Simulation-based p-value: the share of values in the `stat` column of `x`
+# lying in the tail(s) selected by `direction`, relative to `obs_stat`.
+simulation_based_p_value <- function(x, obs_stat, direction){
+  # Left tail: share of `stat` values at or below the observed statistic.
+  if(direction %in% c("less", "left")){
+    return(dplyr::summarize(x, p_value = mean(stat <= obs_stat)))
+  }
+
+  # Right tail: share of `stat` values at or above the observed statistic.
+  if(direction %in% c("greater", "right")){
+    return(dplyr::summarize(x, p_value = mean(stat >= obs_stat)))
+  }
+
+  # Every other direction is handled as a two-sided test.
+  two_sided_p_value(x, obs_stat = obs_stat)
+}
+
+two_sided_p_value <- function(x, obs_stat){
+  # Adds two tails: one at `obs_stat`, one at its mirror image about the median.
+  if(stats::median(x$stat) >= obs_stat){
+    basic_p_value <- get_percentile(x$stat, obs_stat) +
+      (1 - get_percentile(x$stat, stats::median(x$stat) + 
+                       stats::median(x$stat) - obs_stat))
+  } else {
+    basic_p_value <- 1 - get_percentile(x$stat, obs_stat) +
+      (get_percentile(x$stat, stats::median(x$stat) + 
+                            stats::median(x$stat) - obs_stat))
+  }
+  # NOTE(review): `get_percentile()` is defined elsewhere; confirm its return.
+  if(basic_p_value >= 1)
+    # Adding both tails can give a value above 1, so the result is
+    # capped at 1. TODO: cover that scenario with a test instead of
+    # relying on this `>=` catch-all.
+    return(tibble::tibble(p_value = 1))
+  else
+    return(tibble::tibble(p_value = basic_p_value))
+}
+
+#' @export
+#' @rdname get_pvalue
+get_pvalue <- p_value
+
+# which_distribution <- function(x, theory_type, obs_stat, direction){
+#   
+#   param <- attr(x, "distr_param")
+#   if(!is.null(attr(x, "distr_param2")))
+#     param2 <- attr(x, "distr_param2")
+#   
+#   if(theory_type == "Two sample t")
+#     return(pt(q = obs_stat,
+#                              df = param,
+#                              lower.tail = set_lower_tail(direction))
+#          )
+# 
+# }
+
+#theory_t_pvalue <- 
+
+# set_lower_tail <- function(direction){
+#   if(direction %in% c("greater", "right"))
+#     lower_tail <- FALSE
+#   else
+#     lower_tail <- TRUE
+#   
+#   lower_tail
+# }  
diff --git a/R/pipe.R b/R/pipe.R
@@ -1,7 +1,7 @@
 #' Pipe
 #'
-#' Like \code{dplyr}, \code{infer} also uses the pipe function, \code{\%>\%} to turn
-#' function composition into a series of imperative statements.
+#' Like \code{dplyr}, \code{infer} also uses the pipe function, \code{\%>\%}
+#' to turn function composition into a series of imperative statements.
 #'
 #' @importFrom magrittr %>%
 #' @name %>%