Skip to content

Commit

Permalink
Merge branch 'develop' into rubanm/generic_lzo_scheme
Browse files Browse the repository at this point in the history
  • Loading branch information
rubanm committed May 15, 2015
2 parents 925291c + 20fb806 commit 92cd15e
Show file tree
Hide file tree
Showing 58 changed files with 1,966 additions and 299 deletions.
26 changes: 13 additions & 13 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,47 +7,47 @@ script:
matrix:
include:
#BASE TESTS
- scala: 2.10.4
- scala: 2.10.5
env: BUILD="base" TEST_TARGET="scalding-args scalding-date"
script: "scripts/run_test.sh"

- scala: 2.11.5
env: BUILD="base" TEST_TARGET="scalding-args scalding-date"
script: "scripts/run_test.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="base" TEST_TARGET="scalding-avro scalding-hraven scalding-commons"
script: "scripts/run_test.sh"

- scala: 2.11.5
env: BUILD="base" TEST_TARGET="scalding-avro scalding-hraven scalding-commons"
script: "scripts/run_test.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="base" TEST_TARGET="scalding-core"
script: "scripts/run_test.sh"

- scala: 2.11.5
env: BUILD="base" TEST_TARGET="scalding-core"
script: "scripts/run_test.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="base" TEST_TARGET="scalding-hadoop-test"
script: "scripts/run_test.sh"

- scala: 2.11.5
env: BUILD="base" TEST_TARGET="scalding-hadoop-test"
script: "scripts/run_test.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="base" TEST_TARGET="scalding-jdbc scalding-json"
script: "scripts/run_test.sh"

- scala: 2.11.5
env: BUILD="base" TEST_TARGET="scalding-jdbc scalding-json"
script: "scripts/run_test.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="base" TEST_TARGET="scalding-macros"
script: "scripts/run_test.sh"

Expand All @@ -56,27 +56,27 @@ matrix:
script: "scripts/run_test.sh"

# not committed yet
# - scala: 2.10.4
# - scala: 2.10.5
# env: BUILD="base" TEST_TARGET="scalding-commons-macros"
# script: "scripts/run_test.sh"

# - scala: 2.11.5
# env: BUILD="base" TEST_TARGET="scalding-commons-macros"
# script: "scripts/run_test.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="base" TEST_TARGET="scalding-parquet scalding-parquet-scrooge"
script: "scripts/run_test.sh"

- scala: 2.11.5
env: BUILD="base" TEST_TARGET="scalding-parquet scalding-parquet-scrooge"
script: "scripts/run_test.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="base" TEST_TARGET="scalding-repl"
script: "scripts/run_test.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="test tutorials"
script:
- "scripts/build_assembly_no_test.sh scalding"
Expand All @@ -88,7 +88,7 @@ matrix:
- "scripts/build_assembly_no_test.sh scalding"
- "scripts/test_tutorials.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="test matrix tutorials"
script:
- "scripts/build_assembly_no_test.sh scalding"
Expand All @@ -100,7 +100,7 @@ matrix:
- "scripts/build_assembly_no_test.sh scalding"
- "scripts/test_matrix_tutorials.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="test repl and typed tutorials"
script:
- "scripts/build_assembly_no_test.sh scalding-repl"
Expand All @@ -114,7 +114,7 @@ matrix:
- "scripts/build_assembly_no_test.sh scalding-core"
- "scripts/test_typed_tutorials.sh"

- scala: 2.10.4
- scala: 2.10.5
env: BUILD="test execution tutorials"
script:
- "scripts/build_assembly_no_test.sh execution-tutorial"
Expand Down
46 changes: 46 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,51 @@
# Scalding #

### Version 0.14.0 ###
* add .unit to Execution object #1189
* Override hashCode for Args #1190
* Put a value in an exception message #1191
* Add an exclusiveUpper method to DateRange #1194
* Convert LzoTextDelimited to Cascading scheme. #1179
* Remove Travis IRC notifications #1200
* add LookupJoin and LookupJoinTest changes from summingbird #1199
* Add a new ExecutionApp tutorial #1196
* Move main simple example to be the typed API, and put the .'s at the sta... #1193
* Add Execution.withArgs #1205
* Config/Cascading updater #1197
* Remove algebird serializers #1206
* remove warnings in CumulativeSum #1215
* Implicit execution context / easier switching between modes #1113
* add row l1 normalize #1214
* provide Args as an implicit val #1219
* call sourceConfInit when reading from taps in local mode #1228
* Add distinctCount and distinctValues helper methods to KeyedList. #1232
* import hygiene: remove unused imports and remove JavaConversions use #1239
* Swap hash and filename for filename-extension-sensitive code #1243
* Remove more unused imports #1240
* Provide useHdfsLocalMode for an easy switch to mapreduce local mode #1244
* upgrade scalacheck and scalatest #1246
* Optimize string and (hopefully) number comparisons a bit #1241
* Note the active FlowProcess for Joiners #1235
* Make sure Executions are executed at most once #1253
* Fix Config.getUniqueIDs #1254
* Add MustHasReducers trait. #1252
* Make sure the EvalCache thread isDaemon #1255
* Use non-regex split function #1251
* make InputSizeReducerEstimator work for any CompositeTap #1256
* TimePathedSource helper methods #1257
* Fix for reducer estimation not working correctly if withReducers is set to 1 reducer #1263
* Add make(dest) to TypedPipe #1217
* Fix SimpleDateFormat caching by default #1265
* upgrade sbt and sbt launcher script #1270
* Add TypedPipeDiff for comparing typed pipes #1266
* Change separator from \1 to \u0001 #1271
* Disable reducer estimation for map-only steps #1276
* Local sources support multiple paths #1275
* fix the spelling of the cumulativeSumTest file #1281
* Hydrate both sides of sampledCounts in skewJoinWithSmaller #1278
* Bijection 0.8.0, algebird 0.10.0, chill 0.6.0, scala 2.10.5 #1287
* Remove some deprecated items #1288

### Version 0.13.1 ###
* Back out 4 changes to be binary compatible: https://github.com/twitter/scalding/pull/1187
* Use java.util.Random instead of scala.util.Random: https://github.com/twitter/scalding/pull/1186
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Scalding is a Scala library that makes it easy to specify Hadoop MapReduce jobs.

![Scalding Logo](https://raw.github.com/twitter/scalding/develop/logo/scalding.png)

Current version: `0.13.1`
Current version: `0.14.0`

## Word Count

Expand Down
44 changes: 30 additions & 14 deletions project/Build.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,23 @@ object ScaldingBuild extends Build {
}
def isScala210x(scalaVersion: String) = scalaBinaryVersion(scalaVersion) == "2.10"

val algebirdVersion = "0.9.0"
val algebirdVersion = "0.10.0"
val avroVersion = "1.7.4"
val bijectionVersion = "0.7.2"
val bijectionVersion = "0.8.0"
val cascadingAvroVersion = "2.1.2"
val chillVersion = "0.5.2"
val chillVersion = "0.6.0"
val dfsDatastoresVersion = "1.3.4"
val elephantbirdVersion = "4.7"
val elephantbirdVersion = "4.8"
val hadoopLzoVersion = "0.4.16"
val hadoopVersion = "1.2.1"
val hbaseVersion = "0.94.10"
val hravenVersion = "0.9.13"
val jacksonVersion = "2.4.2"
val json4SVersion = "3.2.11"
val paradiseVersion = "2.0.1"
val parquetVersion = "1.6.0rc4"
val protobufVersion = "2.4.1"
val quasiquotesVersion = "2.0.1"
val scalaCheckVersion = "1.12.2"
val scalaTestVersion = "2.2.4"
val scalameterVersion = "0.6"
Expand All @@ -47,9 +49,9 @@ object ScaldingBuild extends Build {
val sharedSettings = Project.defaultSettings ++ assemblySettings ++ scalariformSettings ++ Seq(
organization := "com.twitter",

scalaVersion := "2.10.4",
scalaVersion := "2.10.5",

crossScalaVersions := Seq("2.10.4", "2.11.5"),
crossScalaVersions := Seq("2.10.5", "2.11.5"),

ScalariformKeys.preferences := formattingPreferences,

Expand Down Expand Up @@ -224,7 +226,7 @@ object ScaldingBuild extends Build {
Some(subProj)
.filterNot(unreleasedModules.contains(_))
.map {
s => "com.twitter" % ("scalding-" + s + "_2.10") % "0.13.0"
s => "com.twitter" % ("scalding-" + s + "_2.10") % "0.14.0"
}

def module(name: String) = {
Expand Down Expand Up @@ -270,7 +272,7 @@ object ScaldingBuild extends Build {
"org.slf4j" % "slf4j-api" % slf4jVersion,
"org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided"
)
).dependsOn(scaldingArgs, scaldingDate, maple)
).dependsOn(scaldingArgs, scaldingDate, scaldingSerialization, maple)

lazy val scaldingCommons = module("commons").settings(
libraryDependencies ++= Seq(
Expand Down Expand Up @@ -303,17 +305,20 @@ object ScaldingBuild extends Build {
).dependsOn(scaldingCore)

lazy val scaldingParquet = module("parquet").settings(
libraryDependencies ++= Seq(
libraryDependencies <++= (scalaVersion) { scalaVersion => Seq(
// see https://issues.apache.org/jira/browse/PARQUET-143 for exclusions
"com.twitter" % "parquet-cascading" % parquetVersion
exclude("com.twitter", "parquet-pig")
exclude("com.twitter.elephantbird", "elephant-bird-pig")
exclude("com.twitter.elephantbird", "elephant-bird-core"),
"org.apache.thrift" % "libthrift" % "0.7.0",
"org.slf4j" % "slf4j-api" % slf4jVersion,
"org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided"
)
).dependsOn(scaldingCore)
"org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided",
"org.scala-lang" % "scala-reflect" % scalaVersion,
"com.twitter" %% "bijection-macros" % bijectionVersion
) ++ (if(isScala210x(scalaVersion)) Seq("org.scalamacros" %% "quasiquotes" % quasiquotesVersion) else Seq())
}, addCompilerPlugin("org.scalamacros" % "paradise" % paradiseVersion cross CrossVersion.full))
.dependsOn(scaldingCore, scaldingHadoopTest)

def scaldingParquetScroogeDeps(version: String) = {
if (isScala210x(version))
Expand Down Expand Up @@ -387,6 +392,17 @@ object ScaldingBuild extends Build {
run <<= (run in Unprovided)
)

// zero dependency serialization module
lazy val scaldingSerialization = module("serialization")
lazy val scaldingSerializationMacros = module("serialization-macros").settings(
libraryDependencies <++= (scalaVersion) { scalaVersion => Seq(
"org.scala-lang" % "scala-library" % scalaVersion,
"org.scala-lang" % "scala-reflect" % scalaVersion
) ++ (if(isScala210x(scalaVersion)) Seq("org.scalamacros" %% "quasiquotes" % "2.0.1") else Seq())
},
addCompilerPlugin("org.scalamacros" % "paradise" % "2.0.1" cross CrossVersion.full)
).dependsOn(scaldingSerialization)

lazy val scaldingJson = module("json").settings(
libraryDependencies <++= (scalaVersion) { scalaVersion => Seq(
"org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided",
Expand Down Expand Up @@ -423,9 +439,9 @@ object ScaldingBuild extends Build {
"org.scala-lang" % "scala-library" % scalaVersion,
"org.scala-lang" % "scala-reflect" % scalaVersion,
"com.twitter" %% "bijection-macros" % bijectionVersion
) ++ (if(isScala210x(scalaVersion)) Seq("org.scalamacros" %% "quasiquotes" % "2.0.1") else Seq())
) ++ (if(isScala210x(scalaVersion)) Seq("org.scalamacros" %% "quasiquotes" % quasiquotesVersion) else Seq())
},
addCompilerPlugin("org.scalamacros" % "paradise" % "2.0.1" cross CrossVersion.full)
addCompilerPlugin("org.scalamacros" % "paradise" % paradiseVersion cross CrossVersion.full)
).dependsOn(scaldingCore, scaldingHadoopTest)

// This one uses a different naming convention
Expand Down
2 changes: 1 addition & 1 deletion scalding-core/src/main/scala/com/twitter/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ package object scalding {
/**
* Make sure this is in sync with version.sbt
*/
val scaldingVersion: String = "0.13.1"
val scaldingVersion: String = "0.14.0"

object RichPathFilter {
implicit def toRichPathFilter(f: PathFilter) = new RichPathFilter(f)
Expand Down
34 changes: 0 additions & 34 deletions scalding-core/src/main/scala/com/twitter/scalding/Job.scala
Original file line number Diff line number Diff line change
Expand Up @@ -470,40 +470,6 @@ abstract class ExecutionJob[+T](args: Args) extends Job(args) {
}
}

/*
* this allows you to use ExecutionContext style, but wrap it in a job
* val ecFn = { (implicit ec: ExecutionContext) =>
* // do stuff here
* };
* class MyClass(args: Args) extends ExecutionContextJob(args) {
* def job = ecFn
* }
* Now you can run it with Tool as a standard Job-framework style.
* Only use this if you have an existing ExecutionContext style function
* you want to run as a Job
*/
@deprecated("Use ExecutionJob", "2014-07-29")
abstract class ExecutionContextJob[+T](args: Args) extends Job(args) {
/**
* This can be assigned from a Function1:
* def job = (ectxJob: (ExecutionContext => T))
*/
def job: Reader[ExecutionContext, T]
/**
* This is the result of calling the job on the context for this job
* you should NOT call this in the job Reader (or reference this class at all
* in reader
*/
@transient final lazy val result: Try[T] = ec.map(job(_)) // mutate the flowDef with the job

private[this] final def ec: Try[ExecutionContext] =
Config.tryFrom(config).map { conf => ExecutionContext.newContext(conf)(flowDef, mode) }

override def buildFlow: Flow[_] = {
val forcedResult = result.get // make sure we have applied job once
super.buildFlow
}
}

/*
* Run a list of shell commands through bash in the given order. Return success
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,6 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ
hyperLogLogMap[T, HLL](f, errPercent) { hll => hll }
}

@deprecated("use of approximateUniqueCount is preferred.", "0.8.3")
def approxUniques(f: (Fields, Fields), errPercent: Double = 1.0) = {
// Legacy (pre-bijection) approximate unique count that uses in.toString.getBytes to
// obtain a long hash code. We specify the kludgy CTuple => Array[Byte] bijection
// explicitly.
implicit def kludgeHasher(in: CTuple) = in.toString.getBytes("UTF-8")
hyperLogLogMap[CTuple, Double](f, errPercent) { _.estimatedSize }
}

private[this] def hyperLogLogMap[T <% Array[Byte]: TupleConverter, U: TupleSetter](f: (Fields, Fields), errPercent: Double = 1.0)(fn: HLL => U) = {
//bits = log(m) == 2 *log(104/errPercent) = 2log(104) - 2*log(errPercent)
def log2(x: Double) = scala.math.log(x) / scala.math.log(2.0)
Expand Down Expand Up @@ -320,17 +311,6 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ
def sum[T](fs: Symbol*)(implicit sg: Semigroup[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self =
sum[T](fs -> fs)(sg, tconv, tset)

@deprecated("Use sum", "0.9.0")
def plus[T](fd: (Fields, Fields))(implicit sg: Semigroup[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self =
sum[T](fd)(sg, tconv, tset)
/**
* The same as `plus(fs -> fs)`
* Assumed to be a commutative operation. If you don't want that, use .forceToReducers
*/
@deprecated("Use sum", "0.9.0")
def plus[T](fs: Symbol*)(implicit sg: Semigroup[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self =
sum[T](fs -> fs)(sg, tconv, tset)

/**
* Returns the product of all the items in this grouping
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ class KryoHadoop(config: Config) extends KryoInstantiator {
newK.register(classOf[com.twitter.algebird.HyperLogLogMonoid], new HLLMonoidSerializer)
newK.register(classOf[com.twitter.algebird.Moments], new MomentsSerializer)
newK.addDefaultSerializer(classOf[com.twitter.algebird.HLL], new HLLSerializer)
// Don't serialize Boxed instances using Kryo.
newK.addDefaultSerializer(classOf[com.twitter.scalding.serialization.Boxed[_]], new ThrowingSerializer)
/**
* AdaptiveVector is IndexedSeq, which picks up the chill IndexedSeq serializer
* (which is its own bug), force using the fields serializer here
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,17 @@ import com.esotericsoftware.kryo.io.{ Input, Output }

import com.twitter.scalding._

/**
* This is a runtime check for types we should never be serializing
*/
/**
 * A Kryo serializer that always fails. It is registered as the default
 * serializer for types (e.g. Boxed instances) that must never be routed
 * through Kryo; reaching either method at runtime indicates a
 * serialization configuration bug rather than a recoverable condition.
 */
class ThrowingSerializer[T] extends KSerializer[T] {
  override def write(kryo: Kryo, output: Output, t: T): Unit = {
    sys.error(s"Kryo should never be used to serialize an instance: $t")
  }
  override def read(kryo: Kryo, input: Input, t: Class[T]): T =
    // Fix: the original literal lacked the `s` interpolator, so the message
    // printed the raw text "$t" instead of the offending class.
    sys.error(s"Kryo should never be used to serialize an instance, class: $t")
}

/**
* *
* Below are some serializers for objects in the scalding project.
Expand Down
Loading

0 comments on commit 92cd15e

Please sign in to comment.