man/PPC-distributions.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ppc-distributions.R
\name{PPC-distributions}
\alias{PPC-distributions}
\alias{ppc_data}
\alias{ppc_dens_overlay}
\alias{ppc_dens_overlay_grouped}
\alias{ppc_ecdf_overlay}
\alias{ppc_ecdf_overlay_grouped}
\alias{ppc_dens}
\alias{ppc_hist}
\alias{ppc_freqpoly}
\alias{ppc_freqpoly_grouped}
\alias{ppc_boxplot}
\alias{ppc_dots}
\alias{ppc_violin_grouped}
\alias{ppc_pit_ecdf}
\alias{ppc_pit_ecdf_grouped}
\title{PPC distributions}
\usage{
ppc_data(y, yrep, group = NULL)

ppc_dens_overlay(
  y,
  yrep,
  ...,
  size = 0.25,
  alpha = 0.7,
  trim = FALSE,
  bw = "nrd0",
  adjust = 1,
  kernel = "gaussian",
  n_dens = 1024
)

ppc_dens_overlay_grouped(
  y,
  yrep,
  group,
  ...,
  size = 0.25,
  alpha = 0.7,
  trim = FALSE,
  bw = "nrd0",
  adjust = 1,
  kernel = "gaussian",
  n_dens = 1024
)

ppc_ecdf_overlay(
  y,
  yrep,
  ...,
  discrete = FALSE,
  pad = TRUE,
  size = 0.25,
  alpha = 0.7
)

ppc_ecdf_overlay_grouped(
  y,
  yrep,
  group,
  ...,
  discrete = FALSE,
  pad = TRUE,
  size = 0.25,
  alpha = 0.7
)

ppc_dens(y, yrep, ..., trim = FALSE, size = 0.5, alpha = 1)

ppc_hist(
  y,
  yrep,
  ...,
  binwidth = NULL,
  bins = NULL,
  breaks = NULL,
  freq = TRUE
)

ppc_freqpoly(
  y,
  yrep,
  ...,
  binwidth = NULL,
  bins = NULL,
  freq = TRUE,
  size = 0.5,
  alpha = 1
)

ppc_freqpoly_grouped(
  y,
  yrep,
  group,
  ...,
  binwidth = NULL,
  bins = NULL,
  freq = TRUE,
  size = 0.5,
  alpha = 1
)

ppc_boxplot(y, yrep, ..., notch = TRUE, size = 0.5, alpha = 1)

ppc_dots(y, yrep, ..., binwidth = NA, quantiles = NA, freq = TRUE)

ppc_violin_grouped(
  y,
  yrep,
  group,
  ...,
  probs = c(0.1, 0.5, 0.9),
  size = 1,
  alpha = 1,
  y_draw = c("violin", "points", "both"),
  y_size = 1,
  y_alpha = 1,
  y_jitter = 0.1
)

ppc_pit_ecdf(
  y,
  yrep,
  ...,
  pit = NULL,
  K = NULL,
  prob = 0.99,
  plot_diff = FALSE,
  interpolate_adj = NULL
)

ppc_pit_ecdf_grouped(
  y,
  yrep,
  group,
  ...,
  K = NULL,
  pit = NULL,
  prob = 0.99,
  plot_diff = FALSE,
  interpolate_adj = NULL
)
}
\arguments{
\item{y}{A vector of observations. See \strong{Details}.}

\item{yrep}{An \code{S} by \code{N} matrix of draws from the posterior (or prior)
predictive distribution. The number of rows, \code{S}, is the size of the
posterior (or prior) sample used to generate \code{yrep}. The number of columns,
\code{N}, is the number of predicted observations (\code{length(y)}). The columns of
\code{yrep} should be in the same order as the data points in \code{y} for the plots
to make sense. See the \strong{Details} and \strong{Plot Descriptions} sections for
additional advice specific to particular plots.}

\item{group}{A grouping variable of the same length as \code{y}.
Will be coerced to \link[base:factor]{factor} if not already a factor.
Each value in \code{group} is interpreted as the group level pertaining
to the corresponding observation.}

\item{...}{For dot plots, optional additional arguments to pass to \code{\link[ggdist:stat_dots]{ggdist::stat_dots()}}.}

\item{size, alpha}{Passed to the appropriate geom to control the appearance of
the predictive distributions.}

\item{trim}{A logical scalar passed to \code{\link[ggplot2:geom_density]{ggplot2::geom_density()}}.}

\item{bw, adjust, kernel, n_dens}{Optional arguments passed to
\code{\link[stats:density]{stats::density()}} to override default kernel density estimation
parameters. \code{n_dens} defaults to \code{1024}.}

\item{discrete}{For \code{ppc_ecdf_overlay()}, should the data be treated as
discrete? The default is \code{FALSE}, in which case \code{geom="line"} is
passed to \code{\link[ggplot2:stat_ecdf]{ggplot2::stat_ecdf()}}. If \code{discrete} is set to
\code{TRUE} then \code{geom="step"} is used.}

\item{pad}{A logical scalar passed to \code{\link[ggplot2:stat_ecdf]{ggplot2::stat_ecdf()}}.}

\item{binwidth}{Passed to \code{\link[ggplot2:geom_histogram]{ggplot2::geom_histogram()}} to override
the default binwidth.}

\item{bins}{Passed to \code{\link[ggplot2:geom_histogram]{ggplot2::geom_histogram()}} to override
the default number of bins.}

\item{breaks}{Passed to \code{\link[ggplot2:geom_histogram]{ggplot2::geom_histogram()}} as an
alternative to \code{binwidth}.}

\item{freq}{For histograms, \code{freq=TRUE} (the default) puts count on the
y-axis. Setting \code{freq=FALSE} puts density on the y-axis. (For many
plots the y-axis text is off by default. To view the count or density
labels on the y-axis see the \code{\link[=yaxis_text]{yaxis_text()}} convenience
function.)}

\item{notch}{For the box plot, a logical scalar passed to
\code{\link[ggplot2:geom_boxplot]{ggplot2::geom_boxplot()}}. Note: unlike \code{geom_boxplot()}, the default is
\code{notch=TRUE}.}

\item{quantiles}{For dot plots, an optional integer passed to
\code{\link[ggdist:stat_dots]{ggdist::stat_dots()}} specifying the number of quantiles to use for a
quantile dot plot. If \code{quantiles} is \code{NA} (the default) then all data
points are plotted.}

\item{probs}{A numeric vector passed to \code{\link[ggplot2:geom_violin]{ggplot2::geom_violin()}}'s
\code{draw_quantiles} argument to specify at which quantiles to draw
horizontal lines. Set to \code{NULL} to remove the lines.}

\item{y_draw}{For \code{ppc_violin_grouped()}, a string specifying how to draw
\code{y}: \code{"violin"} (default), \code{"points"} (jittered points), or \code{"both"}.}

\item{y_jitter, y_size, y_alpha}{For \code{ppc_violin_grouped()}, if \code{y_draw} is
\code{"points"} or \code{"both"} then \code{y_size}, \code{y_alpha}, and \code{y_jitter} are passed
to the \code{size}, \code{alpha}, and \code{width} arguments of \code{\link[ggplot2:geom_jitter]{ggplot2::geom_jitter()}}
to control the appearance of \code{y} points. The default is \code{y_jitter = 0.1};
setting \code{y_jitter = NULL} will let \strong{ggplot2} determine the amount of jitter.}

\item{pit}{An optional vector of probability integral transformed values for
which the ECDF is to be drawn. If \code{NULL}, PIT values are computed for \code{y} with
respect to the corresponding values in \code{yrep}.}

\item{K}{An optional integer defining the number of equally spaced evaluation
points for the PIT-ECDF. Reducing K when using \code{interpolate_adj = FALSE}
makes computing the confidence bands faster. For \code{ppc_pit_ecdf} and
\code{ppc_pit_ecdf_grouped}, if PIT values are supplied, defaults to
\code{length(pit)}, otherwise \code{yrep} determines the maximum accuracy of the
estimated PIT values and \code{K} is set to \code{min(nrow(yrep) + 1, 1000)}. For
\code{mcmc_rank_ecdf}, defaults to the number of iterations per chain in \code{x}.}

\item{prob}{The desired simultaneous coverage level of the bands around the
ECDF. A value in (0,1).}

\item{plot_diff}{A boolean defining whether to plot the difference between
the observed PIT-ECDF and the theoretical expectation for uniform PIT
values rather than plotting the regular ECDF. The default is \code{FALSE}, but
for large samples we recommend setting \code{plot_diff=TRUE} as the difference
plot will visually show a more dynamic range.}

\item{interpolate_adj}{A boolean defining if the simultaneous confidence
bands should be interpolated based on precomputed values rather than
computed exactly. Computing the bands may be computationally intensive and
the approximation gives a fast method for assessing the ECDF trajectory.
The default is to use interpolation if \code{K} is greater than 200.}
}
\value{
The plotting functions return a ggplot object that can be further
customized using the \strong{ggplot2} package. The functions with suffix
\verb{_data()} return the data that would have been drawn by the plotting
function.
}
\description{
Compare the empirical distribution of the data \code{y} to the distributions of
simulated/replicated data \code{yrep} from the posterior predictive distribution.
See the \strong{Plot Descriptions} section, below, for details.
}
\details{
For Binomial data, the plots may be more useful if
the input contains the "success" \emph{proportions} (not discrete
"success" or "failure" counts).
}
\section{Plot Descriptions}{

\describe{
\item{\verb{ppc_hist(), ppc_freqpoly(), ppc_dens(), ppc_boxplot()}}{
A separate histogram, shaded frequency polygon, smoothed kernel density
estimate, or box and whiskers plot is displayed for \code{y} and each
dataset (row) in \code{yrep}. For these plots \code{yrep} should therefore
contain only a small number of rows. See the \strong{Examples} section.
}
\item{\code{ppc_dots()}}{
A dot plot is displayed for \code{y} and each dataset (row) in \code{yrep}.
For these plots \code{yrep} should therefore contain only a small number of rows.
See the \strong{Examples} section. This function requires the \strong{ggdist} package to be installed (see \link[ggdist:stat_dots]{ggdist::stat_dots}).
}
\item{\code{ppc_freqpoly_grouped()}}{
A separate frequency polygon is plotted for each level of a grouping
variable for \code{y} and each dataset (row) in \code{yrep}. For this plot
\code{yrep} should therefore contain only a small number of rows. See the
\strong{Examples} section.
}
\item{\code{ppc_ecdf_overlay()}, \code{ppc_dens_overlay()},
\code{ppc_ecdf_overlay_grouped()}, \code{ppc_dens_overlay_grouped()}}{
Kernel density or empirical CDF estimates of each dataset (row) in
\code{yrep} are overlaid, with the distribution of \code{y} itself on top
(and in a darker shade). When using \code{ppc_ecdf_overlay()} with discrete
data, set the \code{discrete} argument to \code{TRUE} for better results.
For an example of \code{ppc_dens_overlay()} also see Gabry et al. (2019).
}
\item{\code{ppc_violin_grouped()}}{
The density estimate of \code{yrep} within each level of a grouping
variable is plotted as a violin with horizontal lines at notable
quantiles. \code{y} is overlaid on the plot either as a violin, points, or
both, depending on the \code{y_draw} argument.
}
\item{\code{ppc_pit_ecdf()}, \code{ppc_pit_ecdf_grouped()}}{
The PIT-ECDF of the empirical PIT values of \code{y} computed with respect to
the corresponding \code{yrep} values. \code{100 * prob}\% central simultaneous
confidence intervals are provided to assess if \code{y} and \code{yrep} originate
from the same distribution. The PIT values can also be provided directly
as \code{pit}.
See Säilynoja et al. (2021) for more details.}
}
}

\examples{
color_scheme_set("brightblue")
y <- example_y_data()
yrep <- example_yrep_draws()
group <- example_group_data()
dim(yrep)

ppc_dens_overlay(y, yrep[1:25, ])
\donttest{
# ppc_ecdf_overlay with continuous data (set discrete=TRUE if discrete data)
ppc_ecdf_overlay(y, yrep[sample(nrow(yrep), 25), ])

# PIT-ECDF and PIT-ECDF difference plot of the PIT values of y compared to
# yrep with 99\% simultaneous confidence bands.
ppc_pit_ecdf(y, yrep, prob = 0.99, plot_diff = FALSE)
ppc_pit_ecdf(y, yrep, prob = 0.99, plot_diff = TRUE)
}

# for ppc_hist,dens,freqpoly,boxplot,dots definitely use a subset of yrep rows so
# only a few (instead of nrow(yrep)) histograms are plotted
ppc_hist(y, yrep[1:8, ])
\donttest{
color_scheme_set("red")
ppc_boxplot(y, yrep[1:8, ])

# wizard hat plot
color_scheme_set("blue")
ppc_dens(y, yrep[200:202, ])

# dot plot
ppc_dots(y, yrep[1:8, ])
}

\donttest{
# frequency polygons
ppc_freqpoly(y, yrep[1:3, ], alpha = 0.1, size = 1, binwidth = 5)

ppc_freqpoly_grouped(y, yrep[1:3, ], group) + yaxis_text()

# if groups are different sizes then the 'freq' argument can be useful
ppc_freqpoly_grouped(y, yrep[1:3, ], group, freq = FALSE) + yaxis_text()
}

# density and distribution overlays by group
ppc_dens_overlay_grouped(y, yrep[1:25, ], group = group)

ppc_ecdf_overlay_grouped(y, yrep[1:25, ], group = group)

\donttest{
# PIT-ECDF plots of the PIT values by group
# with 99\% simultaneous confidence bands.
ppc_pit_ecdf_grouped(y, yrep, group=group, prob=0.99)
}

\donttest{
# don't need to use only a small number of rows for ppc_violin_grouped
# (as it pools yrep draws within groups)
color_scheme_set("gray")
ppc_violin_grouped(y, yrep, group, size = 1.5)
ppc_violin_grouped(y, yrep, group, alpha = 0)

# change how y is drawn
ppc_violin_grouped(y, yrep, group, alpha = 0, y_draw = "points", y_size = 1.5)
ppc_violin_grouped(y, yrep, group,
  alpha = 0, y_draw = "both",
  y_size = 1.5, y_alpha = 0.5, y_jitter = 0.33
)
}
}
\references{
Gabry, J., Simpson, D., Vehtari, A., Betancourt, M. and
Gelman, A. (2019), Visualization in Bayesian workflow.
\emph{J. R. Stat. Soc. A}, 182: 389-402. doi:10.1111/rssa.12378.
(\href{https://rss.onlinelibrary.wiley.com/doi/full/10.1111/rssa.12378}{journal version},
\href{https://arxiv.org/abs/1709.01449}{arXiv preprint},
\href{https://github.com/jgabry/bayes-vis-paper}{code on GitHub})

Säilynoja, T., Bürkner, P., Vehtari, A.
(2021). Graphical Test for Discrete Uniformity and its Applications in
Goodness of Fit Evaluation and Multiple Sample Comparison. \href{https://arxiv.org/abs/2103.10522}{arXiv preprint}.

Gelman, A., Carlin, J. B., Stern, H. S., Dunson, D. B., Vehtari,
A., and Rubin, D. B. (2013). \emph{Bayesian Data Analysis.} Chapman & Hall/CRC
Press, London, third edition. (Ch. 6)
}
\seealso{
Other PPCs: 
\code{\link{PPC-censoring}},
\code{\link{PPC-discrete}},
\code{\link{PPC-errors}},
\code{\link{PPC-intervals}},
\code{\link{PPC-loo}},
\code{\link{PPC-overview}},
\code{\link{PPC-scatterplots}},
\code{\link{PPC-test-statistics}}
}
\concept{PPCs}