Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support sortedTake in beam runner #1949

Merged
merged 3 commits into from
Sep 26, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ import com.twitter.scalding.typed._
import com.twitter.scalding.typed.functions.{
FilterKeysToFilter,
FlatMapValuesToFlatMap,
MapValuesToMap
MapValuesToMap,
ScaldingPriorityQueueMonoid
}

object BeamPlanner {
Expand Down Expand Up @@ -65,7 +66,12 @@ object BeamPlanner {
config.getMapSideAggregationThreshold match {
case None => op
case Some(count) =>
op.mapSideAggregator(count, sg)
// Semigroup is invariant on T. We cannot pattern match as it is a Semigroup[PriorityQueue[T]]
if (sg.isInstanceOf[ScaldingPriorityQueueMonoid[_]]) {
op
} else {
op.mapSideAggregator(count, sg)
}
}
case (ReduceStepPipe(ir @ IdentityReduce(_, _, _, _, _)), rec) =>
def go[K, V1, V2](ir: IdentityReduce[K, V1, V2]): BeamOp[(K, V2)] = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
package com.twitter.scalding.beam_backend

import com.twitter.algebird.Semigroup
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.scalding.Config
import com.twitter.scalding.beam_backend.BeamFunctions._
import com.twitter.scalding.typed.functions.ComposedFunctions.ComposedMapGroup
import com.twitter.scalding.typed.functions.{EmptyGuard, MapValueStream, SumAll}
import com.twitter.scalding.typed.functions.{
EmptyGuard,
MapValueStream,
ScaldingPriorityQueueMonoid,
SumAll
}
import com.twitter.scalding.typed.{CoGrouped, TypedSource}
import java.lang
import java.util.PriorityQueue
import java.util.{Comparator, PriorityQueue}
import org.apache.beam.sdk.Pipeline
import org.apache.beam.sdk.coders.{Coder, IterableCoder, KvCoder}
import org.apache.beam.sdk.transforms.DoFn.ProcessElement
import org.apache.beam.sdk.transforms.Top.TopCombineFn
import org.apache.beam.sdk.transforms._
import org.apache.beam.sdk.transforms.join.{
CoGbkResult,
Expand Down Expand Up @@ -52,6 +57,10 @@ sealed abstract class BeamOp[+A] {
parDo(FlatMapFn(f))
}

// Wraps an arbitrary Comparator so it can be captured by a Beam transform
// (used with TopCombineFn below): a case class extends scala.Serializable,
// whereas a plain Comparator need not.
// NOTE(review): the wrapped `comp` must itself be serializable at runtime for
// this wrapper to survive serialization — confirm at call sites (here it is
// built from `pqm.ordering.reverse`).
// Fix: removed review-UI text that had been scraped into the middle of this
// definition and made it uncompilable.
private final case class SerializableComparator[T](comp: Comparator[T]) extends Comparator[T] {
  // Pure delegation: ordering semantics are entirely those of `comp`.
  override def compare(o1: T, o2: T): Int = comp.compare(o1, o2)
}

object BeamOp extends Serializable {
implicit private def fakeClassTag[A]: ClassTag[A] = ClassTag(classOf[AnyRef]).asInstanceOf[ClassTag[A]]

Expand All @@ -61,19 +70,24 @@ object BeamOp extends Serializable {
)(implicit ordK: Ordering[K], kryoCoder: KryoCoder): PCollection[KV[K, java.lang.Iterable[U]]] = {
reduceFn match {
case ComposedMapGroup(f, g) => planMapGroup(planMapGroup(pcoll, f), g)
case EmptyGuard(MapValueStream(SumAll(pqm: PriorityQueueMonoid[V]))) =>
pcoll.apply(MapElements.via(
new SimpleFunction[KV[K, java.lang.Iterable[V]], KV[K, java.lang.Iterable[U]]]() {
override def apply(input: KV[K, lang.Iterable[V]]): KV[K, java.lang.Iterable[U]] = {
// We are not using plus method defined in PriorityQueueMonoid as it is mutating
// input Priority Queues. We create a new PQ from the individual ones.
// We didn't use Top PTransformation in beam as it is not needed, also
// we cannot access `max` defined in PQ monoid.
val flattenedValues = input.getValue.asScala.flatMap { value =>
value.asInstanceOf[PriorityQueue[V]].iterator().asScala
}
val mergedPQ = pqm.build(flattenedValues)
KV.of(input.getKey, Iterable(mergedPQ.asInstanceOf[U]).asJava)
case EmptyGuard(MapValueStream(SumAll(pqm: ScaldingPriorityQueueMonoid[v]))) =>
val vCollection = pcoll.asInstanceOf[PCollection[KV[K, java.lang.Iterable[PriorityQueue[v]]]]]

vCollection.apply(MapElements.via(
new SimpleFunction[KV[K, java.lang.Iterable[PriorityQueue[v]]], KV[K, java.lang.Iterable[U]]]() {
override def apply(input: KV[K, lang.Iterable[PriorityQueue[v]]]): KV[K, java.lang.Iterable[U]] = {

val topCombineFn = new TopCombineFn[v, SerializableComparator[v]](
nownikhil marked this conversation as resolved.
Show resolved Hide resolved
pqm.count,
SerializableComparator[v](pqm.ordering.reverse)
)

@inline def flattenedValues: Stream[v] =
input.getValue.asScala.toStream.flatMap(_.asScala.toStream)

val outputs: java.util.List[v] = topCombineFn.apply(flattenedValues.asJava)
val pqs = pqm.build(outputs.asScala)
nownikhil marked this conversation as resolved.
Show resolved Hide resolved
KV.of(input.getKey, Iterable(pqs.asInstanceOf[U]).asJava)
}
})
).setCoder(KvCoder.of(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package com.twitter.scalding.beam_backend

import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.algebird.{AveragedValue, Semigroup}
import com.twitter.scalding.{Config, TextLine, TypedPipe}
import java.io.File
import java.nio.file.Paths
import java.util.PriorityQueue
import org.apache.beam.sdk.options.{PipelineOptions, PipelineOptionsFactory}
import org.scalatest.{BeforeAndAfter, FunSuite}
import scala.io.Source
Expand Down Expand Up @@ -113,6 +115,18 @@ class BeamBackendTests extends FunSuite with BeforeAndAfter {
)
}

test("bufferedTake"){
beamMatchesSeq(
TypedPipe
.from(1 to 50)
.groupAll
.bufferedTake(100)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just a note: this is going to be really bad in a real job without map-side aggregation: the key is Unit so there is only one key, so this would have each mapper send 100, then have the reducers pick 100 of those.

But with no mapside aggregation, all the data will be sent to the reducers, and they will throw away all but 100.

But we can add an issue and come back and address this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened a ticket for this.
#1952

.map(_._2),
1 to 50,
Config(Map("cascading.aggregateby.threshold" -> "100"))
)
}

test("SumByLocalKeys"){
beamMatchesSeq(
TypedPipe
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import com.twitter.algebird.{
Aggregator
}

import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.scalding.typed.functions.ScaldingPriorityQueueMonoid

import java.util.PriorityQueue

Expand Down Expand Up @@ -391,7 +391,7 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ
def sortedTake[T](f: (Fields, Fields), k: Int)(implicit conv: TupleConverter[T], ord: Ordering[T]): Self = {

assert(f._2.size == 1, "output field size must be 1")
implicit val mon: PriorityQueueMonoid[T] = new PriorityQueueMonoid[T](k)
implicit val mon: ScaldingPriorityQueueMonoid[T] = new ScaldingPriorityQueueMonoid[T](k)
mapPlusMap(f) { (tup: T) => mon.build(tup) } {
(lout: PriorityQueue[T]) => lout.iterator.asScala.toList.sorted
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ limitations under the License.
package com.twitter.scalding.typed

import com.twitter.algebird.Semigroup
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.scalding.typed.functions._
import com.twitter.scalding.typed.functions.ComposedFunctions.ComposedMapGroup
import scala.collection.JavaConverters._
Expand Down Expand Up @@ -659,7 +658,7 @@ final case class UnsortedIdentityReduce[K, V1, V2](
// If you care which items you take, you should sort by a random number
// or the value itself.
val fakeOrdering: Ordering[V1] = Ordering.by { v: V1 => v.hashCode }
implicit val mon: PriorityQueueMonoid[V1] = new PriorityQueueMonoid[V1](n)(fakeOrdering)
implicit val mon: ScaldingPriorityQueueMonoid[V1] = new ScaldingPriorityQueueMonoid[V1](n)(fakeOrdering)
// Do the heap-sort on the mappers:
val pretake: TypedPipe[(K, V1)] = mapped.mapValues { v: V1 => mon.build(v) }
.sumByLocalKeys
Expand Down Expand Up @@ -745,7 +744,7 @@ final case class IdentityValueSortedReduce[K, V1, V2](
// This means don't take anything, which is legal, but strange
filterKeys(Constant(false))
} else {
implicit val mon: PriorityQueueMonoid[V1] = new PriorityQueueMonoid[V1](n)(valueSort)
implicit val mon: ScaldingPriorityQueueMonoid[V1] = new ScaldingPriorityQueueMonoid[V1](n)(valueSort)
// Do the heap-sort on the mappers:
val pretake: TypedPipe[(K, V1)] = mapped.mapValues { v: V1 => mon.build(v) }
.sumByLocalKeys
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import java.io.Serializable
import scala.collection.JavaConverters._

import com.twitter.algebird.{ Fold, Semigroup, Ring, Aggregator }
import com.twitter.algebird.mutable.PriorityQueueMonoid

import com.twitter.scalding.typed.functions._

Expand Down Expand Up @@ -79,7 +78,7 @@ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Se
// If you care which items you take, you should sort by a random number
// or the value itself.
val fakeOrdering: Ordering[T] = Ordering.by { v: T => v.hashCode }
implicit val mon = new PriorityQueueMonoid(n)(fakeOrdering)
implicit val mon = new ScaldingPriorityQueueMonoid(n)(fakeOrdering)
mapValues(mon.build(_))
// Do the heap-sort on the mappers:
.sum
Expand Down Expand Up @@ -213,7 +212,7 @@ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Se
* to fit in memory.
*/
def sortedTake[U >: T](k: Int)(implicit ord: Ordering[U]): This[K, Seq[U]] = {
val mon = new PriorityQueueMonoid[U](k)(ord)
val mon = new ScaldingPriorityQueueMonoid[U](k)(ord)
mapValues(mon.build(_))
.sum(mon) // results in a PriorityQueue
// scala can't infer the type, possibly due to the view bound on TypedPipe
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package com.twitter.scalding.typed.functions

import com.twitter.algebird.mutable.PriorityQueueMonoid

/**
 * A [[com.twitter.algebird.mutable.PriorityQueueMonoid]] that exposes its bound
 * (`count`) and its `ordering` as public vals.
 *
 * The base class does not expose these, so backends that receive the monoid as a
 * plain `Semigroup` cannot recover them. Declaring this scalding-owned subclass
 * lets such code (e.g. the Beam planner/backend) detect a sortedTake-style
 * aggregation via `isInstanceOf[ScaldingPriorityQueueMonoid[_]]` and read back
 * `count` and `ordering` to plan the combine.
 *
 * @param count    maximum number of elements retained by the priority queue
 * @param ordering ordering used to rank elements; also passed to the base monoid
 */
class ScaldingPriorityQueueMonoid[K](
  val count: Int
)(implicit val ordering: Ordering[K]) extends PriorityQueueMonoid[K](count)(ordering)