Skip to content

Commit

Permalink
Merge pull request #21 from johnynek/feature/hll-ser
Browse files Browse the repository at this point in the history
add serialization to hyperloglog
  • Loading branch information
singhala committed Aug 28, 2012
2 parents efe579a + 9c2e6ee commit f2da9f2
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 9 deletions.
42 changes: 37 additions & 5 deletions src/main/scala/com/twitter/algebird/HyperLogLog.scala
Expand Up @@ -75,6 +75,40 @@ object HyperLogLog {
(onBits.filter { _ < bits }.map { 1 << _ }.sum, (onBits.filter { _ < bits }.map { 1 << _ }.sum,
(onBits.filter { _ >= bits }.min - bits + 1).toByte) (onBits.filter { _ >= bits }.min - bits + 1).toByte)
} }

/**
 * Serializes an HLL into a byte array whose first byte is a tag naming the variant:
 *   - HLLZero     => the single byte 0
 *   - HLLItem     => 10 bytes: tag 1, then size (4-byte int), j (4-byte int), rhow (1 byte),
 *                    written in ByteBuffer's default (big-endian) order
 *   - HLLInstance => tag 2 followed by the bytes of v
 *
 * @param h the HLL value to serialize
 * @return the tagged byte representation, readable by fromBytes
 */
def toBytes(h : HLL) : Array[Byte] = h match {
  case HLLZero => Array[Byte](0)
  case HLLItem(size, j, rhow) =>
    // 1 tag byte + two 4-byte ints + 1 rhow byte
    val out = java.nio.ByteBuffer.allocate(10)
    out.put(1 : Byte) //Indicator of HLLItem
    out.putInt(size)
    out.putInt(j)
    out.put(rhow)
    out.array
  case HLLInstance(registers) => Array[Byte](2) ++ registers
}

/**
 * Decodes an HLL from the tagged byte layout produced by toBytes, so that
 * fromBytes(toBytes(x)) == x.
 *
 * The first byte selects the variant: 0 => HLLZero, 1 => HLLItem (followed by two
 * 4-byte ints and one byte), 2 => HLLInstance (every remaining byte is the vector).
 * Bytes beyond what tags 0 and 1 need are ignored.
 *
 * @param bytes the serialized form; must contain at least the tag byte
 * @return the decoded HLL
 * @throws IllegalArgumentException if bytes is empty, or if an HLLItem payload is
 *         shorter than the 10 bytes it requires
 * @throws Exception if the tag byte is not 0, 1 or 2
 */
def fromBytes(bytes : Array[Byte]) : HLL = {
  // Fail with a descriptive message instead of an opaque BufferUnderflowException.
  require(bytes.length > 0, "Cannot decode an HLL from an empty byte array")
  val bb = java.nio.ByteBuffer.wrap(bytes)
  bb.get.toInt match {
    case 0 => HLLZero
    case 1 => {
      // tag + size (4 bytes) + j (4 bytes) + rhow (1 byte)
      require(bytes.length >= 10,
        "Truncated HLLItem: expected at least 10 bytes but got " + bytes.length)
      // Arguments are evaluated left to right, matching the order toBytes wrote them.
      HLLItem(bb.getInt, bb.getInt, bb.get)
    }
    case 2 => {
      // Everything after the tag byte is the vector of the HLLInstance.
      HLLInstance(bytes.toIndexedSeq.tail)
    }
    case _ => {
      throw new Exception("Unrecognized HLL type: " + bytes(0))
    }
  }
}
} }


sealed abstract class HLL extends java.io.Serializable { sealed abstract class HLL extends java.io.Serializable {
Expand All @@ -97,10 +131,7 @@ case class HLLItem(size : Int, j : Int, rhow : Byte) extends HLL {
} }
else { else {
//They are certainly different //They are certainly different
val vect = Vector.fill(size)(0 : Byte) HLLInstance(toHLLInstance.v.updated(oJ, oRhow))
.updated(oJ, oRhow)
.updated(j,rhow)
HLLInstance(vect)
} }
} }
case HLLInstance(ov) => { case HLLInstance(ov) => {
Expand All @@ -115,6 +146,8 @@ case class HLLItem(size : Int, j : Int, rhow : Byte) extends HLL {
} }
} }
} }

// This item expanded to an HLLInstance: a vector of `size` zero bytes with index j set to rhow.
lazy val toHLLInstance : HLLInstance = {
  val zeroed = Vector.fill(size)(0 : Byte)
  HLLInstance(zeroed.updated(j, rhow))
}
} }


/** /**
Expand All @@ -136,7 +169,6 @@ case class HLLInstance(v : IndexedSeq[Byte]) extends HLL {
} }
} }
} }

// Named from the parameter in the paper, probably never useful to anyone // Named from the parameter in the paper, probably never useful to anyone
// except HyperLogLogMonoid // except HyperLogLogMonoid
lazy val z : Double = 1.0 / (v.map { mj => HyperLogLog.twopow(-mj) }.sum) lazy val z : Double = 1.0 / (v.map { mj => HyperLogLog.twopow(-mj) }.sum)
Expand Down
14 changes: 10 additions & 4 deletions src/test/scala/com/twitter/algebird/HyperLogLogTest.scala
Expand Up @@ -66,10 +66,6 @@ class HyperLogLogTest extends Specification {
} }


"HyperLogLog" should { "HyperLogLog" should {
"count with 4-bits" in {
test(4)
testLong(4)
}
"count with 5-bits" in { "count with 5-bits" in {
test(5) test(5)
testLong(5) testLong(5)
Expand All @@ -82,6 +78,10 @@ class HyperLogLogTest extends Specification {
test(7) test(7)
testLong(7) testLong(7)
} }
"count with 10-bits" in {
test(10)
testLong(10)
}
"count intersections of 2" in { testLongIntersection(10,2) } "count intersections of 2" in { testLongIntersection(10,2) }
"count intersections of 3" in { testLongIntersection(10,3) } "count intersections of 3" in { testLongIntersection(10,3) }
"count intersections of 4" in { testLongIntersection(10,4) } "count intersections of 4" in { testLongIntersection(10,4) }
Expand All @@ -93,5 +93,11 @@ class HyperLogLogTest extends Specification {
val smaller = smallMon(1) // uses implicit int2Bytes to make 4 byte array val smaller = smallMon(1) // uses implicit int2Bytes to make 4 byte array
(larger + smaller) must throwA[AssertionError] (larger + smaller) must throwA[AssertionError]
} }
"Correctly serialize" in {
val mon = new HyperLogLogMonoid(10)
fromBytes(toBytes(HLLZero)) must be_==(HLLZero)
fromBytes(toBytes(mon(12))) must be_==(mon(12))
fromBytes(toBytes(mon(12) + mon(13))) must be_==(mon(12) + mon(13))
}
} }
} }

0 comments on commit f2da9f2

Please sign in to comment.