-
Notifications
You must be signed in to change notification settings - Fork 2
/
lpdens.hpp
382 lines (342 loc) · 13.6 KB
/
lpdens.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#pragma once
#include "interpolation.hpp"
#include "stats.hpp"
#include "tools.hpp"
#include <functional>
#include <cmath>
//! Local-polynomial density estimation in 1-d.
class LPDens1d {
public:
// constructors
LPDens1d() {}
LPDens1d(Eigen::VectorXd x, double bw, double xmin, double xmax, size_t p,
const Eigen::VectorXd& weights = Eigen::VectorXd());
// getters
Eigen::VectorXd get_values() const {return grid_.get_values();}
Eigen::VectorXd get_grid_points() const {return grid_.get_grid_points();}
double get_bw() const {return bw_;}
double get_p() const {return deg_;}
double get_xmin() const {return xmin_;}
double get_xmax() const {return xmax_;}
double get_edf() const {return edf_;}
double get_loglik() const {return loglik_;}
private:
// data members
InterpolationGrid1d grid_;
double bw_;
double xmin_;
double xmax_;
size_t deg_;
double loglik_;
double edf_;
static constexpr double K0_ = 0.3989425;
// private methods
Eigen::VectorXd kern_gauss(const Eigen::VectorXd& x);
Eigen::MatrixXd fit_lp(const Eigen::VectorXd& x_ev,
const Eigen::VectorXd& x,
const Eigen::VectorXd& weights);
double calculate_infl(const size_t& n,
const double& f0,
const double& b,
const double& bw,
const double& s,
const double& weight);
Eigen::VectorXd boundary_transform(const Eigen::VectorXd& x,
bool inverse = false);
Eigen::VectorXd boundary_correct(const Eigen::VectorXd& x,
const Eigen::VectorXd& fhat);
Eigen::VectorXd construct_grid_points(const Eigen::VectorXd& x,
const Eigen::VectorXd& weights);
Eigen::VectorXd finalize_grid(Eigen::VectorXd& grid_points);
Eigen::VectorXd without_boundary_ext(const Eigen::VectorXd& grid_points);
};
//! constructor for fitting the density estimate.
//! @param x vector of observations
//! @param bw positive bandwidth parameter
//! @param xmin lower bound for the support of the density, `NaN` means no
//! boundary.
//! @param xmax upper bound for the support of the density, `NaN` means no
//! boundary.
//! @param p order of the local polynomial.
//! @param weights vector of weights for each observation (can be empty).
inline LPDens1d::LPDens1d(Eigen::VectorXd x,
double bw,
double xmin,
double xmax,
size_t deg,
const Eigen::VectorXd& weights) :
bw_(bw),
xmin_(xmin),
xmax_(xmax),
deg_(deg)
{
if (weights.size() > 0 && (weights.size() != x.size()))
throw std::runtime_error("x and weights must have the same size");
// construct grid on original domain
Eigen::VectorXd grid_points = construct_grid_points(x, weights);
// transform in case of boundary correction
grid_points = boundary_transform(grid_points);
x = boundary_transform(x);
// fit model and evaluate in transformed domain
Eigen::MatrixXd fitted = fit_lp(grid_points, x, weights);
// back-transform grid to original domain
grid_points = boundary_transform(grid_points, true);
x = boundary_transform(x, true);
// correct estimated density for transformation
Eigen::VectorXd values = boundary_correct(grid_points, fitted.col(0));
// move boundary points to xmin/xmax
grid_points = finalize_grid(grid_points);
// construct interpolation grid
// (3 iterations for normalization to a proper density)
grid_ = InterpolationGrid1d(grid_points, values, 3);
// store normalized values
values = grid_.get_values();
// calculate log-likelihood of final estimate
loglik_ = grid_.interpolate(x).array().log().sum();
// calculate effective degrees of freedom
InterpolationGrid1d infl_grid(without_boundary_ext(grid_points),
without_boundary_ext(fitted.col(1)),
0);
edf_ = infl_grid.interpolate(x).sum();
}
//! Gaussian kernel (truncated at +/- 5).
//! @param x vector of evaluation points.
inline Eigen::VectorXd LPDens1d::kern_gauss(const Eigen::VectorXd& x)
{
auto f = [] (double xx) {
// truncate at +/- 5
if (std::fabs(xx) > 5.0)
return 0.0;
// otherwise calculate normal pdf (orrect for truncation)
return stats::dnorm(Eigen::VectorXd::Constant(1, xx))(0) / 0.999999426;
};
return x.unaryExpr(f);
}
//! (analytically) evaluates the kernel density estimate and its influence
//! function on a user-supplied grid.
//! @param x_ev evaluation points.
//! @param x observations.
//! @param weights vector of weights for each observation (can be empty).
//! @return a two-column matrix containing the density estimate in the first
//! and the influence function in the second column.
inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev,
const Eigen::VectorXd& x,
const Eigen::VectorXd& weights)
{
Eigen::MatrixXd res(x_ev.size(), 2);
size_t n = x.size();
double f0, f1, b;
double s = bw_;
double w0 = 1.0;
Eigen::VectorXd xx(x.size());
Eigen::VectorXd xx2(x.size());
Eigen::VectorXd kernels(x.size());
for (size_t k = 0; k < x_ev.size(); k++) {
// classical (local constant) kernel density estimate
xx = (x.array() - x_ev(k)) / bw_;
kernels = kern_gauss(xx) / bw_;
if (weights.size() > 0)
kernels = kernels.cwiseProduct(weights);
f0 = kernels.mean();
res(k, 0) = f0;
// Before continuing with higher-order polynomials, check
// (local constant) influence. If it is close to one, there is only one
// observation contributing to the estimate and it is evaluated close to
// it. To avoid numerical issues in this case, we just use the
// local constant estimate.
if (weights.size()) {
// find weight corresponding to observation closest to x_ev(k)
w0 = weights(tools::find_min_index(xx.array().abs()));
}
res(k, 1) = K0_ * w0 / (n * bw_) / f0;
if (res(k, 1) > 0.95) {
continue;
}
if (deg_ > 0) {
// calculate b for local linear
xx /= bw_;
f1 = xx.cwiseProduct(kernels).mean(); // first order derivative
b = f1 / f0;
if (deg_ == 2) {
// more calculations for local quadratic
xx2 = xx.cwiseProduct(kernels) / (f0 * static_cast<double>(n));
b *= std::pow(bw_, 2);
s = 1.0 / (std::pow(bw_, 4) * xx.transpose() * xx2 - b*b);
res(k, 0) *= bw_ * std::sqrt(s);
}
// final estimate
res(k, 0) *= std::exp(-0.5 * std::pow(b, 2) * s);
res(k, 1) = calculate_infl(n, f0, b, bw_, s, w0);
if (std::isnan(res(k)) | std::isinf(res(k))) {
// inverse operation might go wrong due to rounding when
// true value is equal or close to zero
res(k, 0) = 0.0;
res(k, 1) = 0.0;
}
}
}
return res;
}
//! calculate influence for data point for density estimate based on
//! quantities pre-computed in `fit_lp()`.
inline double LPDens1d::calculate_infl(const size_t &n,
const double& f0,
const double& b,
const double& bw,
const double& s,
const double& weight)
{
Eigen::MatrixXd M;
double bw2 = std::pow(bw, 2);
double b2 = std::pow(b, 2);
if (deg_ == 0) {
M = Eigen::MatrixXd::Constant(1, 1, f0);
} else if (deg_ == 1) {
M = Eigen::MatrixXd(2, 2);
M(0, 0) = f0;
M(0, 1) = bw2 * b * f0;
M(1, 0) = M(0, 1);
M(1, 1) = f0 * bw2 + f0 * bw2 * bw2 * b2;
} else if (deg_ == 2) {
M = Eigen::MatrixXd(3, 3);
M(0, 0) = f0;
M(0, 1) = f0 * b;
M(1, 0) = M(0, 1);
M(1, 1) = f0 * bw2 + f0 * b2;
M(1, 2) = 0.5 * f0 * (3.0 / s * b + b * b2);
M(2, 1) = M(1, 2);
M(2, 2) = 0.25 * f0;
M(2, 2) *= 3.0 / std::pow(s, 2) + 6.0 / s * b2 + b2 * b2;
M(0, 2) = M(2, 2);
M(2, 0) = M(2, 2);
}
return K0_ * weight / (n * bw) * M.inverse()(0, 0);
}
//! transformations for density estimates with bounded support.
//! @param x evaluation points.
//! @param inverse whether the inverse transformation should be applied.
//! @return the transformed evaluation points.
inline Eigen::VectorXd LPDens1d::boundary_transform(const Eigen::VectorXd& x,
bool inverse)
{
Eigen::VectorXd x_new = x;
if (!inverse) {
if (!std::isnan(xmin_) & !std::isnan(xmax_)) {
// two boundaries -> probit transform
x_new = (x.array() - xmin_ + 5e-5) / (xmax_ - xmin_ + 1e-4);
x_new = stats::qnorm(x_new);
} else if (!std::isnan(xmin_)) {
// left boundary -> log transform
x_new = (1e-3 + x.array() - xmin_).log();
} else if (!std::isnan(xmax_)) {
// right boundary -> negative log transform
x_new = (1e-3 + xmax_ - x.array()).log();
} else {
// no boundary -> no transform
}
} else {
if (!std::isnan(xmin_) & !std::isnan(xmax_)) {
// two boundaries -> probit transform
x_new = stats::pnorm(x).array() + xmin_ - 5e-5;
x_new *= (xmax_ - xmin_ + 1e-4);
} else if (!std::isnan(xmin_)) {
// left boundary -> log transform
x_new = x.array().exp() + xmin_ - 1e-3;
} else if (!std::isnan(xmax_)) {
// right boundary -> negative log transform
x_new = -(x.array().exp() - xmax_ - 1e-3);
} else {
// no boundary -> no transform
}
}
return x_new;
}
//! corrects the density estimate for a preceding boundary transformation of
//! the data.
//! @param x evaluation points (in original domain).
//! @param fhat the density estimate evaluated in the transformed domain.
//! @return corrected density estimates at `x`.
inline Eigen::VectorXd LPDens1d::boundary_correct(const Eigen::VectorXd& x,
const Eigen::VectorXd& fhat)
{
Eigen::VectorXd corr_term(fhat.size());
if (!std::isnan(xmin_) & !std::isnan(xmax_)) {
// two boundaries -> probit transform
corr_term = (x.array() - xmin_ + 5e-5) / (xmax_ - xmin_ + 1e-4);
corr_term = stats::dnorm(stats::qnorm(corr_term));
corr_term /= (xmax_ - xmin_ + 1e-4);
corr_term = 1.0 / corr_term.array().max(1e-6);
} else if (!std::isnan(xmin_)) {
// left boundary -> log transform
corr_term = 1.0 / (1e-3 + x.array() - xmin_);
} else if (!std::isnan(xmax_)) {
// right boundary -> negative log transform
corr_term = 1.0 / (1e-3 + xmax_ - x.array());
} else {
// no boundary -> no transform
corr_term.fill(1.0);
}
return fhat.array() * corr_term.array();
}
//! constructs a grid that is later used for interpolation.
//! @param x vector of observations.
//! @return a grid of size 50.
inline Eigen::VectorXd LPDens1d::construct_grid_points(
const Eigen::VectorXd& x,
const Eigen::VectorXd& weights)
{
// hybrid grid: quantiles and two equally spaced points between them
auto qgrid = stats::quantile(
x, Eigen::VectorXd::LinSpaced(14, 0, 1), weights);
size_t grid_size = 53;
Eigen::VectorXd inner_grid(grid_size);
for (unsigned i = 0; i < qgrid.size() - 1; i++) {
inner_grid.segment(i * 4, 5) =
Eigen::VectorXd::LinSpaced(5, qgrid(i), qgrid(i + 1));
}
// extend grid where there's no boundary
double range = inner_grid.maxCoeff() - inner_grid.minCoeff();
Eigen::VectorXd lowr_ext, uppr_ext;
if (std::isnan(xmin_)) {
// no left boundary -> add a few points to the left
lowr_ext = Eigen::VectorXd(2);
lowr_ext[1] = inner_grid[0] - 1 * bw_;
lowr_ext[0] = inner_grid[0] - 2 * bw_;
}
if (std::isnan(xmax_)) {
// no right boundary -> add a few points to the right
uppr_ext = Eigen::VectorXd(2);
uppr_ext[0] = inner_grid[grid_size - 1] + 1 * bw_;
uppr_ext[1] = inner_grid[grid_size - 1] + 2 * bw_;
}
Eigen::VectorXd grid_points(grid_size + uppr_ext.size() + lowr_ext.size());
grid_points << lowr_ext, inner_grid, uppr_ext;
return grid_points;
}
//! moves the boundary points of the grid to xmin/xmax (if non-NaN).
//! @param grid_points the grid points.
inline Eigen::VectorXd LPDens1d::finalize_grid(Eigen::VectorXd& grid_points)
{
if (!std::isnan(xmin_))
grid_points(0) = xmin_;
if (!std::isnan(xmax_))
grid_points(grid_points.size() - 1) = xmax_;
return grid_points;
}
//! removes the boundary extension from the grid_points (see
//! `construct_grid_points`).
//! @param grid_points the grid points.
inline Eigen::VectorXd LPDens1d::without_boundary_ext(
const Eigen::VectorXd& grid_points)
{
size_t grid_start = 0;
size_t grid_size = grid_points.size();
// (grid extension has length 2)
if (std::isnan(xmin_)) {
grid_start += 1;
grid_size -= 2;
}
if (std::isnan(xmax_))
grid_size -= 2;
return grid_points.segment(grid_start, grid_size);
}