Skip to content

Commit

Permalink
Add HLL method to do error-based Aggregator
Browse files Browse the repository at this point in the history
  • Loading branch information
johnynek committed May 6, 2015
1 parent f2c52cb commit 743d8ac
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,45 @@ object HyperLogLog {

}

/**
 * Standard deviation of the estimate for an HLL built with the given
 * number of bits; the true error is distributed like a Gaussian with
 * this standard deviation.
 *
 * Let m = 2^bits. The size of the HLL is m bytes.
 *
 * bits | size | error
 * 9 512 0.0460
 * 10 1024 0.0325
 * 11 2048 0.0230
 * 12 4096 0.0163
 * 13 8192 0.0115
 * 14 16384 0.0081
 * 15 32768 0.0057
 * 16 65536 0.0041
 * 17 131072 0.0029
 * 18 262144 0.0020
 * 19 524288 0.0014
 * 20 1048576 0.0010
 *
 * Remember that N distinct longs can be stored exactly in only 8N bytes.
 * See SetSizeAggregator for an approach that keeps an exact set while
 * the cardinality is small and switches to HLL once there are enough
 * items. Ideally the exact set would be kept until the HLL becomes the
 * smaller representation (although, because the HLL is stored using
 * sparse vectors, a small HLL takes a lot less than the size above).
 *
 * @param bits base-2 logarithm of the number of HLL registers
 * @return 1.04 / sqrt(2^bits)
 */
def error(bits: Int): Double = {
  val m = twopow(bits)
  1.04 / scala.math.sqrt(m)
}

/**
 * Gives the smallest number of bits whose standard error, 1.04 / sqrt(2^bits),
 * does not exceed the requested error.
 *
 * @param err desired standard error; must satisfy 0.00003 <= err < 1.0
 * @return the number of bits to use for the HLL
 * @throws IllegalArgumentException if err is outside [0.00003, 1.0)
 *         (this includes NaN, which fails both comparisons)
 */
def bitsForError(err: Double): Int = {
  // If the error is less than 0.00003, the HLL needs more than 1 << 31 bytes
  // which means the array size cannot be stored in an Int, which will cause
  // problems. If you need that precise a count, use a different approach.
  // The lower bound is inclusive, so the message uses a half-open interval.
  require(err >= 0.00003 && err < 1.0, s"Error must be in [0.00003, 1.0): $err")
  // Solve 1.04 / sqrt(2^bits) <= err for bits: bits >= 2 * log2(1.04 / err).
  math.ceil(2.0 * math.log(1.04 / err) / math.log(2.0)).toInt
}
}

sealed abstract class HLL extends java.io.Serializable {
Expand Down Expand Up @@ -562,8 +600,35 @@ object HyperLogLogAggregator {
new HyperLogLogAggregator(monoid)
}

/**
 * Builds an Aggregator whose presented result is the estimated size
 * rather than the HLL approximate data structure itself. This is
 * convenient, but the resulting number cannot be combined later with
 * another unique count the way an HLL could.
 *
 * @param bits the log (base 2) of the size of the HLL. See
 *             HyperLogLog.error for the size and error of each setting.
 */
def sizeAggregator(bits: Int): MonoidAggregator[Array[Byte], HLL, Double] =
  apply(bits).andThenPresent { hll => hll.estimatedSize }

/**
 * Gives a HyperLogLog Aggregator that has (at most) the given standard
 * error. It is up to you, using HyperLogLog.bitsForError, to check that
 * the resulting size is still practical for your application.
 *
 * Approximate error and size:
 *
 * 0.016 (1.6%), 4 KB
 * 0.006 (0.6%), 32 KB
 * 0.002 (0.2%), 256 KB
 * 0.001 (0.1%), 1024 KB
 *
 * The requested error is rounded up to a whole number of bits, so asking
 * for an error just below what a size achieves selects the next size up.
 *
 * Cutting the error in half takes 4x the size.
 *
 * @param err the desired standard error, validated by HyperLogLog.bitsForError
 */
def withError(err: Double): HyperLogLogAggregator = {
  val bits = HyperLogLog.bitsForError(err)
  apply(bits)
}
/**
 * Gives an approximate set size (not the HLL) based on inputs of
 * Array[Byte], using enough bits to reach the requested standard error.
 * See HyperLogLog.error for a size table, and HyperLogLog.bitsForError
 * for how the error is turned into a number of bits.
 *
 * @param err the desired standard error, validated by HyperLogLog.bitsForError
 */
def sizeWithError(err: Double): MonoidAggregator[Array[Byte], HLL, Double] = {
  val aggregator = withError(err)
  aggregator.andThenPresent { hll => hll.estimatedSize }
}
}

case class HyperLogLogAggregator(val hllMonoid: HyperLogLogMonoid) extends MonoidAggregator[Array[Byte], HLL, HLL] {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package com.twitter.algebird
import org.scalatest._

import org.scalatest.prop.PropertyChecks
import org.scalacheck.{ Gen, Arbitrary }
import org.scalacheck.{ Gen, Arbitrary, Prop }

import scala.collection.BitSet

Expand Down Expand Up @@ -51,6 +51,12 @@ class HyperLogLogLaws extends CheckProperties {
monoidLawsEq[HLL]{ _.toDenseHLL == _.toDenseHLL }
}

// bitsForError must return the smallest bit count that meets the error:
// the chosen bits are good enough, and one bit fewer is not.
property("bitsForError and error match") {
  val errors = Gen.choose(0.0001, 0.999)
  Prop.forAll(errors) { err =>
    val bits = HyperLogLog.bitsForError(err)
    val meetsTarget = HyperLogLog.error(bits) <= err
    meetsTarget && (HyperLogLog.error(bits - 1) > err)
  }
}
}

/* Ensure jRhoW matches referenceJRhoW */
Expand Down

0 comments on commit 743d8ac

Please sign in to comment.