forked from mongodb/mongo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpercentiles_approx.js
150 lines (130 loc) · 5.05 KB
/
percentiles_approx.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/**
* Tests for the approximate percentile accumulator semantics.
* @tags: [
* requires_fcv_70,
* ]
*/
import {
testLargeUniformDataset,
testLargeUniformDataset_Decimal,
testLargeUniformDataset_WithInfinities,
testWithMultipleGroups,
testWithSingleGroup
} from "jstests/aggregation/libs/percentiles_util.js";
const coll = db[jsTestName()];

/**
 * Single-group correctness checks. Every group owns a separate accumulator, so exercising one
 * group is enough to validate the basic $percentile functionality.
 */

// Builds a $percentile spec that uses the approximate method; 'input' defaults to the "$x" field.
const approxPercentileSpec = (p, input = "$x") => ({$percentile: {p, input, method: "approximate"}});

// Document factories: each test case receives freshly constructed documents so that no objects are
// shared between cases.
const makeZeroOneTwoDocs = () => [{x: 0}, {x: 1}, {x: 2}];
const makeUnsortedDocs = () => [{x: 10}, {x: 5}, {x: 27}];
const makeNonNumericDocs = () =>
    [{x: "non-numeric"}, {no_x: 0}, {x: new Date()}, {x: [42, 43]}, {x: null}, {x: NaN}];

// The cases are executed in the order listed here.
const singleGroupCases = [
    {
        docs: [{x: 0}, {x: "non-numeric"}, {x: 1}, {no_x: 0}, {x: 2}],
        percentileSpec: approxPercentileSpec([0.5]),
        expectedResult: [1],
        msg: "Non-numeric data should be ignored",
    },
    {
        docs: makeNonNumericDocs(),
        percentileSpec: approxPercentileSpec([0.5]),
        expectedResult: [null],
        msg: "Single percentile of completely non-numeric data",
    },
    {
        docs: makeNonNumericDocs(),
        percentileSpec: approxPercentileSpec([0.5, 0.9]),
        expectedResult: [null, null],
        msg: "Multiple percentiles of completely non-numeric data",
    },
    {
        docs: makeUnsortedDocs(),
        percentileSpec: approxPercentileSpec([0]),
        expectedResult: [5],
        msg: "Minimum percentile",
    },
    {
        docs: makeUnsortedDocs(),
        percentileSpec: approxPercentileSpec([1]),
        expectedResult: [27],
        msg: "Maximum percentile",
    },
    {
        docs: makeZeroOneTwoDocs(),
        percentileSpec: approxPercentileSpec([0.5, 0.9, 0.1]),
        expectedResult: [1, 2, 0],
        msg: "Multiple percentiles",
    },
    {
        docs: makeZeroOneTwoDocs(),
        percentileSpec: approxPercentileSpec("$$ps"),
        letSpec: {ps: [0.5, 0.9, 0.1]},
        expectedResult: [1, 2, 0],
        msg: "Multiple percentiles using variable in the percentile spec for the whole array",
    },
    {
        docs: makeZeroOneTwoDocs(),
        percentileSpec: approxPercentileSpec(["$$p90"]),
        letSpec: {p90: 0.9},
        expectedResult: [2],
        msg: "Single percentile using variable in the percentile spec for the array elements",
    },
    {
        docs: makeZeroOneTwoDocs(),
        percentileSpec: approxPercentileSpec({$concatArrays: [[0.1, 0.5], ["$$p90"]]}),
        letSpec: {p90: 0.9},
        expectedResult: [0, 1, 2],
        msg: "Multiple percentiles using const expression in the percentile spec",
    },
    {
        docs: makeZeroOneTwoDocs(),
        percentileSpec: approxPercentileSpec("$$ps", {$add: [42, "$x"]}),
        letSpec: {ps: [0.5, 0.9, 0.1]},
        // The input expression shifts every sample by 42, so the percentiles shift by 42 as well.
        expectedResult: [1, 2, 0].map((unshifted) => 42 + unshifted),
        msg: "Multiple percentiles using expression as input",
    },
];

for (const singleGroupCase of singleGroupCases) {
    testWithSingleGroup({coll, ...singleGroupCase});
}
/**
 * Grouped correctness check: documents are grouped on $k and the percentile is computed on $x.
 */
const groupedDocs = [
    {k: 0, x: 0},
    {k: 0, x: 1},
    {k: 1, x: 2},
    {k: 2},
    {k: 0, x: "str"},
    {k: 1, x: 0},
];
const groupedPercentileSpec = {
    $percentile: {p: [0.9], input: "$x", method: "approximate"},
};
// One expected entry per value of k, in the order k:0, k:1, k:2.
const groupedExpectedResult = [
    /* k:0 */[1],
    /* k:1 */[2],
    /* k:2 */[null],
];
testWithMultipleGroups({
    coll,
    docs: groupedDocs,
    percentileSpec: groupedPercentileSpec,
    expectedResult: groupedExpectedResult,
    msg: "Multiple groups",
});
/**
* The tests above use tiny datasets, for which t-digest creates a centroid per sample and is
* therefore always precise. The following tests use more data. We generate the data with
* Random.rand(), which produces a uniform distribution in [0.0, 1.0) (for testing with other data
* distributions, see the C++ unit tests for TDigest).
*/
// t-digest is expected to be more accurate for the extreme percentiles, but these tests apply one
// uniform error bound: on a uniform distribution with the seed chosen below, the error happens to
// be very low across the board.
// The seed itself is arbitrary; the accuracy error was determined empirically from the samples
// generated with _this_ seed, so the two values must be kept in sync.
Random.setRandomSeed(20230328);
const accuracyError = 0.001;

// Draw the samples one by one, in index order, from the seeded generator.
const samples = Array.from({length: 10000}, () => Random.rand());

// Sort a copy numerically so that 'samples' keeps its original generation order.
const sortedSamples = samples.slice().sort((lhs, rhs) => lhs - rhs);

const p = [0.0, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999, 1.0];

// Run the same dataset through each helper in turn: the plain variant, the variant with
// infinities, and finally the variant using the Decimal128 type.
const largeDatasetTests = [
    testLargeUniformDataset,
    testLargeUniformDataset_WithInfinities,
    testLargeUniformDataset_Decimal,
];
for (const runLargeDatasetTest of largeDatasetTests) {
    runLargeDatasetTest(coll, samples, sortedSamples, p, accuracyError, "approximate");
}