Skip to content

Commit

Permalink
add weights argument (#29)
Browse files Browse the repository at this point in the history
* remove unused variable (#26)

* fix edf computation for deg = 2

* bump version + update docs

* remove package cache on appveyor

* weighted bandwidth selection

* weighted fitting

* adjust R interface

* add unit tests

* a few more sanity checks

* update docs
  • Loading branch information
tnagler committed Oct 8, 2018
1 parent 10b08bd commit e9a9a54
Show file tree
Hide file tree
Showing 14 changed files with 198 additions and 74 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Imports:
qrng,
stats,
utils
RoxygenNote: 6.0.1
RoxygenNote: 6.1.0
Suggests:
testthat
URL: https://github.com/tnagler/kde1d
Expand Down
9 changes: 5 additions & 4 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
#' @return `An Rcpp::List` containing the fitted density values on a grid and
#' additional information.
#' @noRd
fit_kde1d_cpp <- function(x, bw, xmin, xmax, deg) {
.Call('_kde1d_fit_kde1d_cpp', PACKAGE = 'kde1d', x, bw, xmin, xmax, deg)
fit_kde1d_cpp <- function(x, bw, xmin, xmax, deg, weights) {
.Call('_kde1d_fit_kde1d_cpp', PACKAGE = 'kde1d', x, bw, xmin, xmax, deg, weights)
}

#' computes the pdf of a kernel density estimate by interpolation.
Expand Down Expand Up @@ -48,9 +48,10 @@ qkde1d_cpp <- function(x, R_object) {
#' @param x vector of observations
#' @param grid_size number of equally-spaced points over which binning is
#' performed to obtain kernel functional approximation
#' @param weights vector of weights for each observation (can be empty).
#' @return the selected bandwidth
#' @noRd
select_bw_cpp <- function(x, bw, mult, discrete) {
.Call('_kde1d_select_bw_cpp', PACKAGE = 'kde1d', x, bw, mult, discrete)
select_bw_cpp <- function(x, bw, mult, discrete, weights) {
.Call('_kde1d_select_bw_cpp', PACKAGE = 'kde1d', x, bw, mult, discrete, weights)
}

15 changes: 10 additions & 5 deletions R/kde1d.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,25 +83,30 @@
#' @importFrom cctools cont_conv
#' @importFrom stats na.omit
#' @export
kde1d <- function(x, xmin = NaN, xmax = NaN, mult = 1, bw = NA, deg = 2) {
kde1d <- function(x, xmin = NaN, xmax = NaN, mult = 1, bw = NA, deg = 2,
weights = numeric(0)) {
x <- na.omit(x)
# sanity checks
check_arguments(x, mult, xmin, xmax, bw, deg)
check_arguments(x, mult, xmin, xmax, bw, deg, weights)

# jittering for discrete variables
x <- cctools::cont_conv(x)

# bandwidth selection
bw <- select_bw_cpp(boundary_transform(x, xmin, xmax), bw, mult,
length(attr(x, "i_disc")) == 1)
bw <- select_bw_cpp(boundary_transform(x, xmin, xmax),
bw,
mult,
length(attr(x, "i_disc")) == 1,
weights)

# fit model
fit <- fit_kde1d_cpp(x, bw, xmin, xmax, deg)
fit <- fit_kde1d_cpp(x, bw, xmin, xmax, deg, weights)

# add info
fit$jitter_info <- attributes(x)
fit$var_name <- colnames(x)
fit$nobs <- length(x)
fit$weights <- weights

# return as kde1d object
class(fit) <- "kde1d"
Expand Down
4 changes: 3 additions & 1 deletion R/kde1d_methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,12 @@ pkde1d <- function(q, obj) {
p <- pkde1d_cpp(q, obj)
} else {
if (!is.ordered(q))

q <- ordered(q, obj$jitter_info$levels$x)
x_all <- as.ordered(obj$jitter_info$levels$x)
p_all <- dkde1d(x_all, obj)
p <- sapply(q, function(y) sum(p_all[x_all <= y]))
p_total <- sum(p_all)
p <- sapply(q, function(y) sum(p_all[x_all <= y] / p_total))
p <- pmin(pmax(p, 0), 1)
}

Expand Down
18 changes: 17 additions & 1 deletion R/tools.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,21 @@ check_boundary_violations <- function(x, xmin, xmax) {

#' check and pre-process arguments passed to kde1d()
#' @noRd
check_arguments <- function(x, mult, xmin, xmax, bw, deg) {
check_arguments <- function(x, mult, xmin, xmax, bw, deg, weights) {
stopifnot(NCOL(x) == 1)
stopifnot(length(mult) == 1)
stopifnot(length(xmin) == 1)
stopifnot(length(xmax) == 1)
stopifnot(length(bw) == 1)
stopifnot(length(deg) == 1)

stopifnot(is.numeric(mult))
stopifnot(mult > 0)
stopifnot(is.numeric(xmin))
stopifnot(is.numeric(xmax))
stopifnot(is.numeric(xmax))
stopifnot(is.na(bw) | is.numeric(bw))
stopifnot(is.numeric(deg))

if (!is.ordered(x) & is.factor(x))
stop("Factors not allowed; use kdevine::kdevine() or cctools::cckde().")
Expand All @@ -34,6 +47,9 @@ check_arguments <- function(x, mult, xmin, xmax, bw, deg) {

if (!(deg %in% 0:2))
stop("deg must be either 0, 1, or 2.")

if ((length(weights) > 0) && (length(weights) != length(x)))
stop("x and weights must have same length.")
}

#' adjusts observations and evaluation points for boundary effects
Expand Down
44 changes: 30 additions & 14 deletions inst/include/dpik.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
//! @param a lower bound
//! @param b upper bound
//! @param m number of bins
//' @param weights vector of weights for each observation.
//! @return bin counts
inline Eigen::VectorXd linbin(const Eigen::VectorXd& x,
double a, double b, size_t m)
double a, double b, size_t m,
const Eigen::VectorXd& weights)
{
Eigen::VectorXd gcnts = Eigen::VectorXd::Zero(m);
double rem, lxi, delta;
Expand All @@ -23,11 +25,12 @@ inline Eigen::VectorXd linbin(const Eigen::VectorXd& x,
size_t li = static_cast<size_t>(lxi);
rem = lxi - li;
if (li >= 1 && li < m) {
gcnts(li - 1) += 1 - rem;
gcnts(li) += rem;
gcnts(li - 1) += (1 - rem) * weights(i);
gcnts(li) += rem * weights(i);
}
}
return(gcnts);

return gcnts;
}

// Binned Kernel Functional Estimate
Expand Down Expand Up @@ -83,41 +86,54 @@ double bkfe(const Eigen::VectorXd& x,

// Bandwidth for Kernel Density Estimation
//! @param x vector of observations
//! @param weights vector of weights for each observation (can be empty).
//! @param grid_size number of equally-spaced points over which binning is
//! performed to obtain kernel functional approximation
//! @return the selected bandwidth
inline double dpik(const Eigen::VectorXd& x, size_t grid_size = 401) {
inline double dpik(const Eigen::VectorXd& x,
Eigen::VectorXd weights = Eigen::VectorXd(),
size_t grid_size = 401)
{
if (weights.size() > 0 && (weights.size() != x.size()))
throw std::runtime_error("x and weights must have the same size");

double n = static_cast<double>(x.size());
if (weights.size() == 0) {
weights = Eigen::VectorXd::Constant(x.size(), 1.0);
} else {
weights = weights * x.size() / weights.sum();
}

double n = static_cast<double>(x.size());
double a = x.minCoeff();
double b = x.maxCoeff();

double m_x = x.mean();
double m_x = x.cwiseProduct(weights).mean();
Eigen::VectorXd sx = (x - Eigen::VectorXd::Constant(x.size(), m_x));
double sd_x = std::sqrt(sx.cwiseAbs2().sum()/(n - 1));
double sd_x = std::sqrt(sx.cwiseAbs2().cwiseProduct(weights).sum()/(n - 1));
Eigen::VectorXd q_x(2);
q_x(0) = 0.75;
q_x(1) = 0.25;
q_x = stats::quantile(x, q_x);
q_x = stats::quantile(x, q_x, weights);
double scale = std::min((q_x(0) - q_x(1))/1.349, sd_x);

// use effective sample size for pilot bandwidths
double effn = std::pow(weights.sum(), 2) / weights.cwiseAbs2().sum();
double bw = 0.0;
try {
sx /= scale;
double sa = (a - m_x) / scale;
double sb = (b - m_x) / scale;
auto x2 = linbin(sx, sa, sb, grid_size);
auto x2 = linbin(sx, sa, sb, grid_size, weights);

double alpha = std::pow(std::pow(2.0, 11.0 / 2.0)/(7.0 * n), 1.0 / 9.0);
double alpha = std::pow(std::pow(2.0, 11.0 / 2.0)/(7.0 * effn), 1.0 / 9.0);
double psi6hat = bkfe(x2, 6, alpha, sa, sb);
alpha = std::pow(-3.0 * std::sqrt(2.0 / M_PI)/(psi6hat * n), 1.0 / 7.0);
alpha = std::pow(-3.0 * std::sqrt(2.0 / M_PI)/(psi6hat * effn), 1.0 / 7.0);
double psi4hat = bkfe(x2, 4, alpha, sa, sb);

double del0 = 1.0 / std::pow(4.0 * M_PI, 1.0 / 10.0);
bw = scale * del0 * std::pow(1.0 / (psi4hat * n), 1.0 / 5.0);
bw = scale * del0 * std::pow(1.0 / (psi4hat * effn), 1.0 / 5.0);
} catch (...) {
bw = 4.0 * 1.06 *scale * std::pow(1.0 / n, 1.0 / 5.0);
bw = 4.0 * 1.06 * scale * std::pow(1.0 / effn, 1.0 / 5.0);
}

return(bw);
Expand Down
35 changes: 26 additions & 9 deletions inst/include/lpdens.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ class LPDens1d {
public:
// constructors
LPDens1d() {}
LPDens1d(Eigen::VectorXd x, double bw, double xmin, double xmax, size_t p);
LPDens1d(Eigen::VectorXd x, double bw, double xmin, double xmax, size_t p,
const Eigen::VectorXd& weights = Eigen::VectorXd());

// getters
Eigen::VectorXd get_values() const {return grid_.get_values();}
Expand All @@ -35,12 +36,14 @@ class LPDens1d {
// private methods
Eigen::VectorXd kern_gauss(const Eigen::VectorXd& x);
Eigen::MatrixXd fit_lp(const Eigen::VectorXd& x_ev,
const Eigen::VectorXd& x);
const Eigen::VectorXd& x,
const Eigen::VectorXd& weights);
double calculate_infl(const size_t& n,
const double& f0,
const double& b,
const double& bw,
const double& s);
const double& s,
const double& weight);
Eigen::VectorXd boundary_transform(const Eigen::VectorXd& x,
bool inverse = false);
Eigen::VectorXd boundary_correct(const Eigen::VectorXd& x,
Expand All @@ -58,16 +61,21 @@ class LPDens1d {
//! @param xmax upper bound for the support of the density, `NaN` means no
//! boundary.
//! @param p order of the local polynomial.
//! @param weights vector of weights for each observation (can be empty).
inline LPDens1d::LPDens1d(Eigen::VectorXd x,
double bw,
double xmin,
double xmax,
size_t deg) :
size_t deg,
const Eigen::VectorXd& weights) :
bw_(bw),
xmin_(xmin),
xmax_(xmax),
deg_(deg)
{
if (weights.size() > 0 && (weights.size() != x.size()))
throw std::runtime_error("x and weights must have the same size");

// construct equally spaced grid on original domain
Eigen::VectorXd grid_points = construct_grid_points(x);

Expand All @@ -76,7 +84,7 @@ inline LPDens1d::LPDens1d(Eigen::VectorXd x,
x = boundary_transform(x);

// fit model and evaluate in transformed domain
Eigen::MatrixXd fitted = fit_lp(grid_points, x);
Eigen::MatrixXd fitted = fit_lp(grid_points, x, weights);

// back-transform grid to original domain
grid_points = boundary_transform(grid_points, true);
Expand Down Expand Up @@ -123,10 +131,12 @@ inline Eigen::VectorXd LPDens1d::kern_gauss(const Eigen::VectorXd& x)
//! function on a user-supplied grid.
//! @param x_ev evaluation points.
//! @param x observations.
//! @param weights vector of weights for each observation (can be empty).
//! @return a two-column matrix containing the density estimate in the first
//! and the influence function in the second column.
inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev,
const Eigen::VectorXd& x)
const Eigen::VectorXd& x,
const Eigen::VectorXd& weights)
{
Eigen::MatrixXd res(x_ev.size(), 2);
size_t n = x.size();
Expand All @@ -140,6 +150,8 @@ inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev,
// classical (local constant) kernel density estimate
xx = (x.array() - x_ev(k)) / bw_;
kernels = kern_gauss(xx) / bw_;
if (weights.size() > 0)
kernels = kernels.cwiseProduct(weights);
f0 = kernels.mean();
res(k, 0) = f0;

Expand Down Expand Up @@ -168,7 +180,11 @@ inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev,
}

// influence function estimate
res(k, 1) = calculate_infl(n, f0, b, bw_, s);
if (weights.size() > 0) {
res(k, 1) = calculate_infl(n, f0, b, bw_, s, weights(k));
} else {
res(k, 1) = calculate_infl(n, f0, b, bw_, s, 1.0);
}
}

return res;
Expand All @@ -180,7 +196,8 @@ inline double LPDens1d::calculate_infl(const size_t &n,
const double& f0,
const double& b,
const double& bw,
const double& s)
const double& s,
const double& weight)
{
Eigen::MatrixXd M;
double bw2 = std::pow(bw, 2);
Expand Down Expand Up @@ -208,7 +225,7 @@ inline double LPDens1d::calculate_infl(const size_t &n,
}

double infl = kern_gauss(Eigen::VectorXd::Zero(1))(0) / bw;
infl *= M.inverse()(0, 0) / static_cast<double>(n);
infl *= M.inverse()(0, 0) * weight / static_cast<double>(n);
return infl;
}

Expand Down
45 changes: 44 additions & 1 deletion inst/include/stats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ inline Eigen::MatrixXd qnorm(const Eigen::MatrixXd& x)
//! @param x data.
//! @param q evaluation points.
//! @return vector of quantiles.
inline Eigen::VectorXd quantile(const Eigen::VectorXd &x,
inline Eigen::VectorXd quantile(const Eigen::VectorXd& x,
const Eigen::VectorXd& q)
{
double n = static_cast<double>(x.size());
Expand All @@ -66,4 +66,47 @@ inline Eigen::VectorXd quantile(const Eigen::VectorXd &x,
return res;
}

//! empirical quantiles
//! @param x data.
//! @param q evaluation points.
//! @param w vector of weights.
//! @return vector of quantiles.
inline Eigen::VectorXd quantile(const Eigen::VectorXd& x,
const Eigen::VectorXd& q,
const Eigen::VectorXd& w)
{
if (w.size() != x.size())
throw std::runtime_error("x and w must have the same size");
double n = static_cast<double>(x.size());
size_t m = q.size();
Eigen::VectorXd res(m);

// map to std::vector and sort
std::vector<size_t> ind(n);
for (size_t i = 0; i < n; ++i)
ind[i] = i;
std::sort(ind.begin(), ind.end(),
[&x] (size_t i, size_t j) { return x(i) < x(j); });

auto x2 = x;
auto wcum = w;
auto wrank = Eigen::VectorXd::Constant(n, 0.0);
double wacc = 0.0;
for (size_t i = 0; i < n; ++i) {
x2(i) = x(ind[i]);
wcum(i) = wacc;
wacc += w(ind[i]);
}

double wsum = w.sum() - w(ind[n - 1]);;
for (size_t j = 0; j < m; ++j) {
size_t i = 1;
while (wcum(i) < q(j) * wsum)
i++;
res(j) = x2(i - 1) + (x2(i) - x2(i - 1)) * (q(j) * wsum - wcum(i - 1));
}

return res;
}

}
1 change: 0 additions & 1 deletion man/dkde1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/kde1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e9a9a54

Please sign in to comment.