New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix Qtree quantileBounds off-by-one error #472
Changes from 3 commits
315f9e5
c8fb5aa
ab9ac69
e50179c
407da82
6579841
d510be2
ab80526
99910d6
4a71905
b96b1df
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,20 +43,20 @@ object QTree { | |
* level gives a bin size of 2^level. By default the bin size is 1/65536 (level = -16) | ||
*/ | ||
def apply[A](kv: (Double, A), level: Int = DefaultLevel): QTree[A] = | ||
QTree(math.floor(kv._1 / math.pow(2.0, level)).toLong, | ||
level, | ||
1, | ||
kv._2, | ||
None, | ||
None) | ||
QTree(offset = math.floor(kv._1 / math.pow(2.0, level)).toLong, | ||
level = level, | ||
count = 1, | ||
sum = kv._2, | ||
lowerChild = None, | ||
upperChild = None) | ||
|
||
def apply[A](kv: (Long, A)): QTree[A] = | ||
QTree(kv._1, | ||
0, | ||
1, | ||
kv._2, | ||
None, | ||
None) | ||
QTree(offset = kv._1, | ||
level = 0, | ||
count = 1, | ||
sum = kv._2, | ||
lowerChild = None, | ||
upperChild = None) | ||
|
||
/** | ||
* The common case of wanting an offset and sum for the same value | ||
|
@@ -197,36 +197,38 @@ case class QTree[A]( | |
def quantileBounds(p: Double): (Double, Double) = { | ||
require(p >= 0.0 && p <= 1.0, "The given percentile must be of the form 0 <= p <= 1.0") | ||
|
||
val rank = math.floor(count * p).toLong | ||
// get is safe below, because findRankLowerBound only returns | ||
// None if rank > count, but due to construction rank <= count | ||
(findRankLowerBound(rank).get, findRankUpperBound(rank).get) | ||
// rank is 0-indexed and 0 <= rank < count | ||
val rank = math.floor(count * p).toLong.min(count - 1) | ||
|
||
findRankBounds(rank) | ||
} | ||
|
||
private def findRankLowerBound(rank: Long): Option[Double] = | ||
if (rank > count) | ||
None | ||
else { | ||
val childCounts = mapChildrenWithDefault(0L)(_.count) | ||
val parentCount = count - childCounts._1 - childCounts._2 | ||
lowerChild.flatMap { _.findRankLowerBound(rank - parentCount) } | ||
.orElse { | ||
val newRank = rank - childCounts._1 - parentCount | ||
if (newRank <= 0) | ||
Some(lowerBound) | ||
else | ||
upperChild.flatMap{ _.findRankLowerBound(newRank) } | ||
} | ||
} | ||
/** | ||
* Precondition: if 0 <= rank < count | ||
*/ | ||
private def findRankBounds(rank: Long): (Double, Double) = { | ||
require(0 <= rank && rank < count) | ||
|
||
private def findRankUpperBound(rank: Long): Option[Double] = { | ||
if (rank > count) | ||
None | ||
else { | ||
lowerChild.flatMap{ _.findRankUpperBound(rank) }.orElse { | ||
val lowerCount = lowerChild.map{ _.count }.getOrElse(0L) | ||
upperChild.flatMap{ _.findRankUpperBound(rank - lowerCount) }.orElse(Some(upperBound)) | ||
} | ||
val (leftCount, rightCount) = mapChildrenWithDefault(0L)(_.count) | ||
val parentCount = count - leftCount - rightCount | ||
|
||
if (0 <= rank && rank < leftCount) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 0 <= rank is redundant since we'll always run the require validation? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, that's just for clarity. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can comment There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's more confusing actually. If the code looks like it handles the case where rank < 0, a reasonable person may (initially) infer that it's possible for that to be true. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a good point. I've fixed this in the latest commit. |
||
|
||
// lowerChild.get is safe because | ||
// leftCount > 0, so lowerChild is not None. | ||
lowerChild.get.findRankBounds(rank) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we use lowerChildNullable and upperChildNullable in this method? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If its called from the 'hot path' i.e. in plus or sumOption then yes worth doing it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's only called in quantileBounds, so optimizing it is probably not necessary There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I wouldn't bother there, best to keep the less functional stuff out of the non-critical speed paths. -- That said, it could be worth adding a benchmark on how the results are usually extracted from QTree, all the benchmarks now are for combinations. We had some big perf woes in HLL around some of them. So might be worth the experiment |
||
|
||
} else if (leftCount <= rank && rank < leftCount + parentCount) { | ||
(lowerBound, upperBound) | ||
} else { | ||
// so leftCount + parentCount <= rank < count | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMO just the first and last comment lines here are sufficient |
||
|
||
// upperChild.get is safe because | ||
// leftCount + parentCount < count, | ||
// so leftCount + (count - leftCount - rightCount) < count, | ||
// so count - rightCount < count, | ||
// so rightCount > 0, so upperChild is not None. | ||
upperChild.get.findRankBounds(rank - leftCount - parentCount) | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,6 +68,21 @@ class QTreeTest extends WordSpec with Matchers { | |
def trueRangeSum(list: Seq[Double], from: Double, to: Double) = | ||
list.filter{ _ >= from }.filter{ _ < to }.sum | ||
|
||
for (k <- Seq(3, 11, 51, 101)) { | ||
s"QTree with elements (1 to $k)" should { | ||
val trueMedian = (1 + k) / 2 | ||
s"have median $trueMedian" in { | ||
implicit val sg = new QTreeSemigroup[Double](k) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated in latest commit |
||
|
||
val list = (1 to k).map(_.toDouble) | ||
val qtree = sg.sumOption(list.map(QTree(_))).get | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated in latest commit |
||
|
||
val (lower, upper) = qtree.quantileBounds(0.5) | ||
assert(lower <= trueMedian && trueMedian <= upper) | ||
} | ||
} | ||
} | ||
|
||
for (k <- (1 to 6)) | ||
("QTree with sizeHint 2^" + k) should { | ||
"always contain the true quantile within its bounds" in { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO remove the 'if' here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed