Merge pull request #354 from miguno/GH-345

GH-345: Parameterize CMS to CMS[K] and decouple counting/querying from heavy hitters
twitter · Nov 19, 2014 · 6ed1356 · 6ed1356
2 parents 5b34a8b + ba498d6
commit 6ed1356
Show file tree

Hide file tree

Showing 8 changed files with 1,608 additions and 365 deletions.
diff --git a/algebird-caliper/README.md b/algebird-caliper/README.md
@@ -0,0 +1,52 @@
+[Caliper](https://code.google.com/p/caliper/)-based Benchmarks for Algebird data structures.
+
+# Usage
+
+Run the following commands from the top-level Algebird directory:
+
+    $ ./sbt   # <<< enter sbt REPL
+    > project algebird-caliper
+
+Now you can run the following commands from within the sbt REPL:
+
+    # List available benchmarks
+    > show cappi::benchmarks
+
+    # Run a particular benchmark
+    > cappi::benchmarkOnly com.twitter.algebird.caliper.HLLBenchmark
+
+    # Debug a particular benchmark (shows e.g. number of repetitions that will be run)
+    > cappi::benchmarkOnly --debug com.twitter.algebird.caliper.HLLBenchmark
+
+    # Run all benchmarks (apparently this is broken, see https://github.com/softprops/cappi/issues/1)
+    > cappi::benchmarks
+
+You can find further details in the documentation of [cappi](https://github.com/softprops/cappi), the sbt plugin
+we use to run the Caliper benchmarks.
+
+Example output for [CMSBenchmark](src/test/scala/com/twitter/algebird/caliper/CMSBenchmark.scala):
+
+    > cappi::benchmarkOnly com.twitter.algebird.caliper.CMSBenchmark
+    [info] Running com.google.caliper.Runner com.twitter.algebird.caliper.CMSBenchmark
+    [info]  0% Scenario{vm=java, trial=0, benchmark=PlusOfFirstHundredIntegersWithLongCms, delta=0.0000001, eps=0.1, heavyHittersPct=0.2, maxBits=2048, operations=100} 292576.31 ns; σ=1271.12 ns @ 3 trials
+    [info] 17% Scenario{vm=java, trial=0, benchmark=PlusOfFirstHundredIntegersWithBigIntCms, delta=0.0000001, eps=0.1, heavyHittersPct=0.2, maxBits=2048, operations=100} 830195.29 ns; σ=7349.10 ns @ 3 trials
+    [info] 33% Scenario{vm=java, trial=0, benchmark=PlusOfRandom2048BitNumbersWithBigIntCms, delta=0.0000001, eps=0.1, heavyHittersPct=0.2, maxBits=2048, operations=100} 3362751.81 ns; σ=104683.16 ns @ 10 trials
+    [info] 50% Scenario{vm=java, trial=0, benchmark=PlusOfFirstHundredIntegersWithLongCms, delta=0.0000001, eps=0.005, heavyHittersPct=0.2, maxBits=2048, operations=100} 384133.61 ns; σ=41211.47 ns @ 10 trials
+    [info] 67% Scenario{vm=java, trial=0, benchmark=PlusOfFirstHundredIntegersWithBigIntCms, delta=0.0000001, eps=0.005, heavyHittersPct=0.2, maxBits=2048, operations=100} 1018308.55 ns; σ=43285.12 ns @ 10 trials
+    [info] 83% Scenario{vm=java, trial=0, benchmark=PlusOfRandom2048BitNumbersWithBigIntCms, delta=0.0000001, eps=0.005, heavyHittersPct=0.2, maxBits=2048, operations=100} 3610991.09 ns; σ=195033.95 ns @ 10 trials
+    [info]
+    [info]                               benchmark   eps   us linear runtime
+    [info]   PlusOfFirstHundredIntegersWithLongCms   0.1  293 ==
+    [info]   PlusOfFirstHundredIntegersWithLongCms 0.005  384 ===
+    [info] PlusOfFirstHundredIntegersWithBigIntCms   0.1  830 ======
+    [info] PlusOfFirstHundredIntegersWithBigIntCms 0.005 1018 ========
+    [info] PlusOfRandom2048BitNumbersWithBigIntCms   0.1 3363 ===========================
+    [info] PlusOfRandom2048BitNumbersWithBigIntCms 0.005 3611 ==============================
+    [info]
+    [info] vm: java
+    [info] trial: 0
+    [info] delta: 0.0000001
+    [info] heavyHittersPct: 0.2
+    [info] maxBits: 2048
+    [info] operations: 100
+    [success] Total time: 74 s, completed Oct 12, 2014 2:36:04 PM
diff --git a/algebird-caliper/src/test/scala/com/twitter/algebird/caliper/CMSBenchmark.scala b/algebird-caliper/src/test/scala/com/twitter/algebird/caliper/CMSBenchmark.scala
@@ -0,0 +1,86 @@
+package com.twitter.algebird.caliper
+
+import com.google.caliper.{ Param, SimpleBenchmark }
+import com.twitter.algebird.{ TopPctCMS, TopCMS, CMSHasherImplicits, TopPctCMSMonoid }
+
+/**
+ * Benchmarks the Count-Min sketch implementation in Algebird.
+ *
+ * We benchmark different `K` types as well as different input data streams.
+ */
+// Once we can convince cappi (https://github.com/softprops/cappi) -- the sbt plugin we use to run
+// caliper benchmarks -- to work with the latest caliper 1.0-beta-1, we would:
+//     - Let `CMSBenchmark` extend `Benchmark` (instead of `SimpleBenchmark`)
+//     - Annotate the `timePlus*` methods with `@MacroBenchmark`.
+class CMSBenchmark extends SimpleBenchmark {
+
+  // The `@Param` fields below receive the values listed in their annotations (they show up as
+  // scenario parameters in the benchmark output); the initializers are only placeholders.
+
+  /** Passed as `eps` to `TopPctCMS.monoid` in `setUp`; each listed value is a separate scenario. */
+  @Param(Array("0.1", "0.005"))
+  val eps: Double = 0.0
+
+  /** Passed as `delta` to `TopPctCMS.monoid` in `setUp`. */
+  @Param(Array("0.0000001" /* 1E-7 */ ))
+  val delta: Double = 0.0
+
+  /** Passed as the heavy hitters percentage to `TopPctCMS.monoid` in `setUp`. */
+  @Param(Array("0.2"))
+  val heavyHittersPct: Double = 0.0
+
+  /** Number of operations per benchmark repetition (cf. `reps`). */
+  @Param(Array("100"))
+  val operations: Int = 0
+
+  /** Bit length of the random numbers drawn via `BigInt(maxBits, random)` in case B.2. */
+  @Param(Array("2048"))
+  val maxBits: Int = 0
+
+  // Initialized in `setUp`, i.e. after the `@Param` fields carry their scenario values.
+  var random: scala.util.Random = _
+  var cmsLongMonoid: TopPctCMSMonoid[Long] = _
+  var cmsBigIntMonoid: TopPctCMSMonoid[BigInt] = _
+
+  /** Creates the monoids and the random number generator used by the `time*` methods. */
+  override def setUp(): Unit = {
+    // Required import of implicit values (e.g. for BigInt- or Long-backed CMS instances)
+    import CMSHasherImplicits._
+
+    // Both monoids are built from the same seed and the same scenario parameters, so that the
+    // key type `K` is the only difference between them.
+    val seed = 1
+    cmsLongMonoid = TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+    cmsBigIntMonoid = TopPctCMS.monoid[BigInt](eps, delta, seed, heavyHittersPct)
+
+    random = new scala.util.Random
+  }
+
+  // NOTE(review): in all `time*` methods the result of the fold is discarded and only the
+  // repetition counter is returned.  If the JIT were able to prove the fold unused it could skip
+  // work; consider deriving the return value from the fold result -- TODO confirm.
+
+  /**
+   * Case A (K=Long): We count the integers [1, operations], i.e. the first hundred integers with
+   * the default setting of `operations`.
+   *
+   * @param reps number of repetitions of the measured fold
+   * @return the number of repetitions that were executed
+   */
+  def timePlusOfFirstHundredIntegersWithLongCms(reps: Int): Int = {
+    var dummy = 0
+    while (dummy < reps) {
+      (1 to operations).view.foldLeft(cmsLongMonoid.zero)((cms, i) => cms ++ cmsLongMonoid.create(i))
+      dummy += 1
+    }
+    dummy
+  }
+
+  /**
+   * Case B.1 (K=BigInt): We count the integers [1, operations], i.e. the first hundred integers
+   * with the default setting of `operations`.
+   *
+   * @param reps number of repetitions of the measured fold
+   * @return the number of repetitions that were executed
+   */
+  def timePlusOfFirstHundredIntegersWithBigIntCms(reps: Int): Int = {
+    var dummy = 0
+    while (dummy < reps) {
+      (1 to operations).view.foldLeft(cmsBigIntMonoid.zero)((cms, i) => cms ++ cmsBigIntMonoid.create(i))
+      dummy += 1
+    }
+    dummy
+  }
+
+  /**
+   * Case B.2 (K=BigInt): We draw `operations` numbers randomly from a 2^maxBits address space.
+   * The "2048" in the method name corresponds to the only configured value of `maxBits`.
+   *
+   * @param reps number of repetitions of the measured fold
+   * @return the number of repetitions that were executed
+   */
+  def timePlusOfRandom2048BitNumbersWithBigIntCms(reps: Int): Int = {
+    var dummy = 0
+    while (dummy < reps) {
+      // The range only determines how many numbers are drawn; its elements are not counted.
+      (1 to operations).view.foldLeft(cmsBigIntMonoid.zero)((cms, _) => {
+        val n = scala.math.BigInt(maxBits, random)
+        cms ++ cmsBigIntMonoid.create(n)
+      })
+      dummy += 1
+    }
+    dummy
+  }
+
+}