Skip to content

Commit

Permalink
Merge pull request #426 from Bekbolatov/develop
Browse files Browse the repository at this point in the history
A couple of performance optimizations: HyperLogLog and BloomFilter
  • Loading branch information
johnynek committed May 6, 2015
2 parents db7eb5c + ed39056 commit e9aabba
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ sealed abstract class BF extends java.io.Serializable {

/** Returns a filter to which the single item `other` has been added. */
def +(other: String): BF

/**
 * Adds `item` and reports its prior membership in one call.
 *
 * @param item the element to look up and insert
 * @return the filter with `item` added, paired with whether `item` was
 *         (approximately) contained in this filter before the insertion
 */
def checkAndAdd(item: String): (BF, ApproximateBoolean)

/** Tests whether `item` is in the set; the answer is an [[ApproximateBoolean]], not a plain Boolean. */
def contains(item: String): ApproximateBoolean

// Estimates the cardinality of the set of elements that have been
Expand All @@ -129,6 +131,8 @@ case class BFZero(hashes: BFHash, width: Int) extends BF {

/** Adding one item to the empty filter yields a single-item filter with the same hashes and width. */
def +(other: String) = BFItem(item = other, hashes = hashes, width = width)

/**
 * Adds `other` to the empty filter.
 *
 * @return the filter containing `other`, paired with an exact `false`
 *         (this filter reports every item as absent)
 */
def checkAndAdd(other: String): (BF, ApproximateBoolean) = {
  val withOther = this + other
  (withOther, ApproximateBoolean.exactFalse)
}

/** Always an exact `false`, whatever `item` is. */
def contains(item: String) = ApproximateBoolean.exactFalse

/** The size is reported as exactly zero. */
def size = Approximate.exact[Long](0)
Expand All @@ -150,6 +154,14 @@ case class BFItem(item: String, hashes: BFHash, width: Int) extends BF {

/** Wraps `other` in its own single-item filter and merges it into this one with `++`. */
def +(other: String) = {
  val single = BFItem(other, hashes, width)
  this ++ single
}

/**
 * Adds `other` to this single-item filter.
 *
 * @return `(this, exactTrue)` when `other` equals the stored item (nothing to add),
 *         otherwise `(this + other, exactFalse)`
 */
def checkAndAdd(other: String): (BF, ApproximateBoolean) =
  if (other != item) {
    // A different element: it cannot have been present, so the answer is exact.
    val grown = this + other
    (grown, ApproximateBoolean.exactFalse)
  } else {
    (this, ApproximateBoolean.exactTrue)
  }

/** Exact answer: true only when `x` equals the single stored item. */
def contains(x: String) = ApproximateBoolean.exact(item == x)

/** The size is reported as exactly one. */
def size = Approximate.exact[Long](1)
Expand Down Expand Up @@ -187,6 +199,8 @@ case class BFSparse(hashes: BFHash, bits: CBitSet, width: Int) extends BF {
width)
}

/** Delegates to `dense.checkAndAdd`; `dense` is defined outside this excerpt (presumably the dense form of this filter — confirm). */
def checkAndAdd(other: String): (BF, ApproximateBoolean) = dense.checkAndAdd(other)

/** Delegates the membership test to `dense.contains`. */
def contains(item: String): ApproximateBoolean = dense.contains(item)

/** Delegates the size estimate to `dense.size`. */
def size: Approximate[Long] = dense.size
Expand Down Expand Up @@ -218,19 +232,36 @@ case class BFInstance(hashes: BFHash, bits: BitSet, width: Int) extends BF {
}

/**
 * Adds `item` to this filter.
 *
 * The item is hashed exactly once and the resulting positions are handed to the
 * hash-based `+` overload, which sets the corresponding bits.
 *
 * @param item the element to insert
 * @return a new filter with the bits for `item` activated
 */
def +(item: String): BFInstance = {
  val itemHashes = hashes(item)
  this.+(itemHashes: _*)
}

/**
 * Adds an already-hashed item.
 *
 * @param itemHashes bit positions to activate
 * @return a new filter whose bit set is the union of the current bits and `itemHashes`
 */
private def +(itemHashes: Int*): BFInstance = {
  val merged: BitSet = bits ++ BitSet(itemHashes: _*)
  BFInstance(hashes, merged, width)
}

def bitSetContains(bs: BitSet, il: Int*): Boolean = {
il.map{ i: Int => bs.contains(i) }.reduce{ _ && _ }
/**
 * Looks up and inserts `item`, hashing it only once.
 *
 * @param item the element to look up and insert
 * @return the filter with `item` added, paired with the membership answer
 *         computed against this filter (i.e. before the insertion)
 */
def checkAndAdd(item: String): (BF, ApproximateBoolean) = {
  val hashed = hashes(item)
  // The lookup runs against the current bits, so it reflects the state before the add.
  val wasPresent = contains(hashed: _*)
  val updated = this.+(hashed: _*)
  (updated, wasPresent)
}

/**
 * Checks that every index in `il` is set in `bs`.
 *
 * Stops at the first missing index. An empty `il` yields `true`.
 *
 * @param bs the bit set to probe
 * @param il the bit positions that must all be present
 * @return `true` if `bs` contains every position in `il`
 */
private def bitSetContains(bs: BitSet, il: Int*): Boolean =
  // `forall` short-circuits like the early exit did, without a non-local `return`
  // (which is implemented by throwing a control exception out of the lambda).
  il.forall(i => bs.contains(i))

/**
 * Membership test for `item`: hashes it and defers to the hash-based `contains` overload.
 *
 * @param item the element to look up
 * @return the approximate membership answer
 */
def contains(item: String): ApproximateBoolean =
  contains(hashes(item): _*)

def contains(item: String) = {
if (bitSetContains(bits, hashes(item): _*)) {
private[algebird] def contains(itemHashes: Int*): ApproximateBoolean = {
if (bitSetContains(bits, itemHashes: _*)) {
// The false positive probability (the probability that the Bloom filter erroneously
// claims that an element x is in the set when x is not) is roughly
// p = (1 - e^(-numHashes * setCardinality / width))^numHashes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ object HyperLogLog {
/**
 * Hashes `input` with `MurmurHash128` under a fixed seed and packs the two
 * resulting Longs into a byte array via `pairLongs2Bytes`.
 *
 * @param input the bytes to hash
 * @return the hash as a byte array
 */
def hash(input: Array[Byte]): Array[Byte] = {
  val seed = 12345678
  MurmurHash128(seed)(input) match {
    case (first, second) => pairLongs2Bytes(first, second)
  }
}

private[algebird] def pairLongs2Bytes(l0: Long, l1: Long): Array[Byte] = {
val buf = new Array[Byte](16)
ByteBuffer
.wrap(buf)
Expand Down Expand Up @@ -547,7 +551,16 @@ class HyperLogLogMonoid(val bits: Int) extends Monoid[HLL] {
}

/**
 * Builds an HLL for a single example.
 *
 * The example is hashed exactly once; the hash bytes are then passed to
 * `createFromHashBytes`.
 *
 * @param example the raw bytes of the element
 * @return the HLL produced by `createFromHashBytes` for the hash of `example`
 */
def create(example: Array[Byte]): HLL = {
  val hashBytes = hash(example)
  createFromHashBytes(hashBytes)
}

/**
 * Builds an HLL from a hash supplied as two Longs.
 *
 * The pair is packed into bytes with `pairLongs2Bytes` and passed to
 * `createFromHashBytes`, so callers that already hold the hash can skip `hash`.
 *
 * @param l0 first Long of the hash
 * @param l1 second Long of the hash
 * @return the HLL for that hash
 */
def createFromHashLongs(l0: Long, l1: Long): HLL =
  createFromHashBytes(pairLongs2Bytes(l0, l1))

/**
 * Builds an HLL from bytes that are already a hash (no further hashing is applied).
 *
 * @param hashed the hash bytes, as consumed by `jRhoW`
 * @return a `SparseHLL` holding the single entry that `jRhoW` derives from `hashed`
 */
def createFromHashBytes(hashed: Array[Byte]): HLL = {
  // jRhoW yields the map key and the value to be wrapped in Max.
  val (index, rho) = jRhoW(hashed, bits)
  val singleEntry = Map(index -> Max(rho))
  SparseHLL(bits, singleEntry)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,4 +207,32 @@ class BloomFilterTest extends WordSpec with Matchers {
assert(index >= 0)
}
}

"BloomFilter method `checkAndAdd`" should {

"be identical to method `+`" in {
// Repeat with freshly randomised filter parameters and entries.
(0 to 100).foreach { _ =>
  val bfMonoid = new BloomFilterMonoid(RAND.nextInt(5) + 1, RAND.nextInt(64) + 32, SEED)
  val numEntries = 5
  val entries = (0 until numEntries).map(_ => RAND.nextInt.toString)
  val bf = bfMonoid.create(entries: _*)

  // Grow two filters in lock-step: one through `+`, one through `checkAndAdd`.
  // At every step the membership answer returned by `checkAndAdd` must equal
  // what `contains` reports on the `+`-built filter before the insertion.
  val (_, viaCheckAndAdd) =
    entries.foldLeft((bfMonoid.zero, bfMonoid.zero)) {
      case ((left, leftAlt), entry) =>
        val (newLeftAlt, contained) = leftAlt.checkAndAdd(entry)
        left.contains(entry) shouldBe contained
        (left + entry, newLeftAlt)
    }

  entries.foreach { entry =>
    assert(bf.contains(entry).isTrue)
    // The filter built through `checkAndAdd` must also hold every inserted entry.
    assert(viaCheckAndAdd.contains(entry).isTrue)
  }
}
}
}

}

0 comments on commit e9aabba

Please sign in to comment.