New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add an Executor to run flows without a Job #915
Changes from 6 commits
993ff28
86f7fd9
66a7489
fd095a6
a4e1f76
ed56e5d
298b116
afcc833
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
/* | ||
Copyright 2014 Twitter, Inc. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
package com.twitter.scalding.cascading_interop; | ||
|
||
import cascading.flow.FlowListener; | ||
import cascading.flow.Flow; | ||
import scala.concurrent.Promise$; | ||
import scala.concurrent.Promise; | ||
import scala.concurrent.Future; | ||
|
||
/* | ||
* The cascading API uses a raw type here which is difficult to | ||
* deal with in scala | ||
*/ | ||
public class FlowListenerPromise { | ||
  /* | ||
   * Adds a FlowListener to the flow, starts the flow, and returns a Future | ||
   * that is completed from the listener callbacks: | ||
   *  - onThrowable fails the Future with the reported Throwable | ||
   *  - onCompleted applies the mapping function fn to the flow, in the same | ||
   *    thread in which completion happens, and succeeds the Future with the | ||
   *    result (or fails it if fn throws), unless it is already completed | ||
   * | ||
   * Only the try* methods of Promise are used to complete it, so a second | ||
   * completion attempt is ignored instead of throwing IllegalStateException | ||
   * from inside a listener callback. | ||
   */ | ||
  public static <Config, T> Future<T> start(Flow<Config> flow, final scala.Function1<Flow<Config>, T> fn) { | ||
    final Promise<T> result = Promise$.MODULE$.<T>apply(); | ||
    flow.addListener(new FlowListener() { | ||
      public void onStarting(Flow f) { } // ignore | ||
      public void onStopping(Flow f) { } // ignore | ||
      public void onCompleted(Flow f) { | ||
        // This is always called, but onThrowable is called first | ||
        if(!result.isCompleted()) { | ||
          // checking isCompleted first (rather than relying on trySuccess alone) | ||
          // avoids calling fn when the promise has already been failed | ||
          try { | ||
            T toPut = fn.apply(f); | ||
            result.trySuccess(toPut); | ||
          } | ||
          catch(Throwable t) { | ||
            result.tryFailure(t); | ||
          } | ||
        } | ||
      } | ||
      public boolean onThrowable(Flow f, Throwable t) { | ||
        // tryFailure: a repeated report must not throw out of the callback | ||
        result.tryFailure(t); | ||
        // The exception is handled by the owner of the promise and should not be rethrown | ||
        return true; | ||
      } | ||
    }); | ||
    flow.start(); | ||
    return result.future(); | ||
  } | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,8 +31,9 @@ import java.security.MessageDigest | |
/** | ||
* This is a wrapper class on top of Map[String, String] | ||
*/ | ||
case class Config(toMap: Map[String, String]) { | ||
trait Config { | ||
import Config._ // get the constants | ||
def toMap: Map[String, String] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are some non-String keys allowed. Not a ton, but they're allowed. Do we just not care? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What are they? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Looks like we need to special case that one in Config. Dammit. |
||
|
||
def get(key: String): Option[String] = toMap.get(key) | ||
def +(kv: (String, String)): Config = Config(toMap + kv) | ||
|
@@ -44,6 +45,7 @@ case class Config(toMap: Map[String, String]) { | |
case (None, r) => (r, this - k) | ||
} | ||
|
||
def getCascadingAppName: Option[String] = get(CascadingAppName) | ||
def setCascadingAppName(name: String): Config = | ||
this + (CascadingAppName -> name) | ||
|
||
|
@@ -133,7 +135,7 @@ case class Config(toMap: Map[String, String]) { | |
this + (Config.ScaldingVersion -> scaldingVersion) | ||
|
||
/* | ||
* This is *required* is you are using counters. You must use | ||
* This is *required* if you are using counters. You must use | ||
* the same UniqueID as you used when defining your jobs. | ||
*/ | ||
def setUniqueId(u: UniqueID): Config = | ||
|
@@ -182,7 +184,11 @@ object Config { | |
.setSerialization(Right(classOf[serialization.KryoHadoop])) | ||
.setScaldingVersion | ||
|
||
implicit def from(m: Map[String, String]): Config = Config(m) | ||
def apply(m: Map[String, String]): Config = new Config { def toMap = m } | ||
/* | ||
* Implicits cannot collide in name, so making apply implicit is a bad idea | ||
*/ | ||
implicit def from(m: Map[String, String]): Config = apply(m) | ||
|
||
/** | ||
* Returns all the non-string keys on the left, the string keys/values on the right | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
/* | ||
Copyright 2014 Twitter, Inc. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
package com.twitter.scalding | ||
|
||
import com.twitter.algebird.monad.Reader | ||
import com.twitter.scalding.cascading_interop.FlowListenerPromise | ||
|
||
import scala.concurrent.{ Future, Promise } | ||
import scala.util.{ Failure, Success, Try } | ||
import cascading.flow.{ FlowDef, Flow, FlowListener } | ||
|
||
import java.util.UUID | ||
|
||
/* | ||
* This has all the state needed to build a single flow | ||
* This is used with the implicit-arg-as-dependency-injection | ||
* style and with the Reader-as-dependency-injection | ||
*/ | ||
trait ExecutionContext { | ||
  /* The FlowDef carried by this context */ | ||
  def flowDef: FlowDef | ||
  /* The Mode carried by this context */ | ||
  def mode: Mode | ||
  /* The UniqueID carried by this context */ | ||
  def uniqueId: UniqueID | ||
} | ||
|
||
/* | ||
* import ExecutionContext._ | ||
* is generally needed to use the ExecutionContext as the single | ||
* dependency injected. For instance, TypedPipe needs FlowDef and Mode | ||
* in many cases, so if you have an implicit ExecutionContext, you need | ||
* modeFromImplicit, etc... below. | ||
*/ | ||
object ExecutionContext { | ||
  /* | ||
   * Packs the implicit FlowDef, Mode and UniqueID into an ExecutionContext. | ||
   * Inside of a Job, where those implicits are available, | ||
   *   implicit val ec = ExecutionContext.newContext | ||
   * gives you an ExecutionContext to hand to any function that | ||
   * requires one implicitly. | ||
   */ | ||
  def newContext(implicit fd: FlowDef, m: Mode, u: UniqueID): ExecutionContext = | ||
    new ExecutionContext { | ||
      def uniqueId: UniqueID = u | ||
      def flowDef: FlowDef = fd | ||
      def mode: Mode = m | ||
    } | ||
  /* | ||
   * The reverse direction: each component of an implicit | ||
   * ExecutionContext is itself made available as an implicit | ||
   */ | ||
  implicit def uniqueIdFromContext(implicit ec: ExecutionContext): UniqueID = ec.uniqueId | ||
  implicit def flowDefFromContext(implicit ec: ExecutionContext): FlowDef = ec.flowDef | ||
  implicit def modeFromContext(implicit ec: ExecutionContext): Mode = ec.mode | ||
} | ||
|
||
object Execution { | ||
/* | ||
* Here is the recommended way to run scalding as a library | ||
* Put all your logic in calls like this: | ||
* import ExecutionContext._ | ||
* | ||
* Reader(implicit ec: ExecutionContext => | ||
* //job here | ||
* ) | ||
* you can compose these readers in flatMaps: | ||
* for { | ||
* firstPipe <- job1 | ||
* secondPipe <- job2 | ||
* } yield firstPipe.group.join(secondPipe.join) | ||
* | ||
* Note that the only config considered is in conf. | ||
* The caller is responsible for setting up the Config | ||
* completely | ||
*/ | ||
def buildFlow[T](mode: Mode, conf: Config)(op: Reader[ExecutionContext, T]): (T, Try[Flow[_]]) = { | ||
val newFlowDef = new FlowDef | ||
conf.getCascadingAppName.foreach(newFlowDef.setName) | ||
// Set up the uniqueID, which is used to access counters | ||
val uniqueId = UniqueID(UUID.randomUUID.toString) | ||
val finalConf = conf.setUniqueId(uniqueId) | ||
val ec = ExecutionContext.newContext(newFlowDef, mode, uniqueId) | ||
try { | ||
val resultT = op(ec) | ||
|
||
// op(ec) mutates newFlowDef as a side effect, so the newFlowDef is ready now. :( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The reason is that wildcards are not interchangeable. The Flow[_] in the return type is different from the Flow[_] here. One solution is to make them both Flow[Any]. Another solution is to add a type parameter that does nothing. Another solution is to curse cascading's dumb use of wildcards. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think if you do a type parameter C and pass it to the return type, it will just get filled in with Nothing and then erased. This should work fine, no? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But, yes, I know the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But that's also a lie, since we don't really know or ever care what On Thu, Jun 26, 2014 at 11:20 AM, Jonathan Coveney <notifications@github.com
Oscar Boykin :: @posco :: http://twitter.com/posco There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh it's definitely an error in cascading's design. Thinking on why it's an error... that's a good point. Hm. |
||
// For some horrible reason, using Try( ) instead of the below gets me stuck: | ||
// [error] | ||
// /Users/oscar/workspace/scalding/scalding-core/src/main/scala/com/twitter/scalding/Execution.scala:92: | ||
// type mismatch; | ||
// [error] found : cascading.flow.Flow[_] | ||
// [error] required: cascading.flow.Flow[?0(in method buildFlow)] where type ?0(in method | ||
// buildFlow) | ||
// [error] Note: Any >: ?0, but Java-defined trait Flow is invariant in type Config. | ||
// [error] You may wish to investigate a wildcard type such as `_ >: ?0`. (SLS 3.2.10) | ||
// [error] (resultT, Try(mode.newFlowConnector(finalConf).connect(newFlowDef))) | ||
|
||
val tryFlow = try { | ||
val flow = mode.newFlowConnector(finalConf).connect(newFlowDef) | ||
Success(flow) | ||
} | ||
catch { | ||
case err: Throwable => Failure(err) | ||
} | ||
(resultT, tryFlow) | ||
} finally { | ||
FlowStateMap.clear(newFlowDef) | ||
} | ||
} | ||
|
||
/* | ||
* If you want scalding to fail if the sources cannot be validated, then | ||
* use this. | ||
* Alternatively, in your Reader, call Source.validateTaps(Mode) to | ||
* control which sources individually need validation | ||
* Suggested use: | ||
* for { | ||
* result <- job | ||
* mightErr <- validate | ||
* } yield mightErr.map(_ => result) | ||
*/ | ||
def validate: Reader[ExecutionContext, Try[Unit]] = | ||
  Reader { context => | ||
    // Try turns a non-fatal exception thrown by validation into a Failure | ||
    Try { | ||
      FlowStateMap.validateSources(context.flowDef, context.mode) | ||
    } | ||
  } | ||
|
||
/* | ||
 * Builds the flow for op and, if that worked, starts it. The value | ||
 * produced by op is returned immediately; the JobStats arrive in the | ||
 * Future, which is already failed when the flow could not be built. | ||
 */ | ||
def run[T](mode: Mode, conf: Config)(op: Reader[ExecutionContext, T]): (T, Future[JobStats]) = { | ||
  val (result, builtFlow) = buildFlow(mode, conf)(op) | ||
  val stats: Future[JobStats] = builtFlow match { | ||
    case Failure(err) => Future.failed(err) | ||
    case Success(flow) => run(flow) | ||
  } | ||
  (result, stats) | ||
} | ||
|
||
/* | ||
* This runs a Flow using Cascading's built-in threads. The resulting JobStats | ||
* are put into a promise when they are ready | ||
*/ | ||
def run[C](flow: Flow[C]): Future[JobStats] = { | ||
  // Mapping applied to the flow to produce the JobStats put in the Future | ||
  val toStats: Flow[C] => JobStats = { completed => JobStats(completed.getFlowStats) } | ||
  // The listener is in Java because of the cascading API's raw types on FlowListener | ||
  FlowListenerPromise.start(flow, toStats) | ||
} | ||
|
||
/* | ||
 * Builds the flow for op and, if that worked, blocks on it. A failure | ||
 * to build the flow is passed through as the Failure in the result. | ||
 */ | ||
def waitFor[T](mode: Mode, conf: Config)(op: Reader[ExecutionContext, T]): (T, Try[JobStats]) = { | ||
  val (result, builtFlow) = buildFlow(mode, conf)(op) | ||
  val stats = builtFlow.flatMap { flow => waitFor(flow) } | ||
  (result, stats) | ||
} | ||
/* | ||
* This blocks the current thread until the job completes with either success or | ||
* failure. | ||
*/ | ||
def waitFor[C](flow: Flow[C]): Try[JobStats] = | ||
  // complete() blocks; the stats are only read if it returned normally | ||
  Try(flow.complete()).map { _ => JobStats(flow.getStats) } | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -143,7 +143,7 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { | |
* Keep 100k tuples in memory by default before spilling | ||
* Turn this up as high as you can without getting OOM. | ||
* | ||
* This is ignored if there is a value set in the incoming mode.config | ||
* This is ignored if there is a value set in the incoming jobConf on Hadoop | ||
*/ | ||
def defaultSpillThreshold: Int = 100 * 1000 | ||
|
||
|
@@ -163,6 +163,8 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { | |
* | ||
* Tip: override this method, call super, and ++ your additional | ||
* map to add or overwrite more options | ||
* | ||
* TODO: Should we bite the bullet and return Config here? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is where using github tags would be nice...it'd be good to accrue some of the bigger breaking changes we'd like before 1.0. Like this. |
||
*/ | ||
def config: Map[AnyRef, AnyRef] = { | ||
val base = Config.empty | ||
|
@@ -174,12 +176,14 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { | |
System.setProperty(AppProps.APP_FRAMEWORKS, | ||
String.format("scalding:%s", scaldingVersion)) | ||
|
||
val (nonStrings, modeConf) = Config.stringsFrom(mode.config) | ||
// All the above are string keys, so overwrite and ++ will work | ||
val modeConf = mode match { | ||
case h: HadoopMode => Config.fromHadoop(h.jobConf) | ||
case _ => Config.empty | ||
} | ||
|
||
val init = base ++ modeConf | ||
|
||
Config.overwrite(nonStrings, | ||
defaultComparator.map(init.setDefaultComparator) | ||
defaultComparator.map(init.setDefaultComparator) | ||
.getOrElse(init) | ||
.setSerialization(Right(classOf[serialization.KryoHadoop]), ioSerializations) | ||
.setScaldingVersion | ||
|
@@ -188,7 +192,8 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { | |
.setScaldingFlowClass(getClass) | ||
.setArgs(args) | ||
.setUniqueId(uniqueId) | ||
.maybeSetSubmittedTimestamp()._2) | ||
.maybeSetSubmittedTimestamp()._2 | ||
.toMap.toMap //second to lift to AnyRef, AnyRef | ||
} | ||
|
||
def skipStrategy: Option[FlowSkipStrategy] = None | ||
|
@@ -199,7 +204,9 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { | |
* combine the config, flowDef and the Mode to produce a flow | ||
*/ | ||
def buildFlow: Flow[_] = { | ||
val flow = mode.newFlowConnector(config).connect(flowDef) | ||
val (nonStrings, conf) = Config.stringsFrom(config.mapValues(_.toString)) | ||
assert(nonStrings.isEmpty, "Non-string keys are not supported") | ||
val flow = mode.newFlowConnector(conf).connect(flowDef) | ||
listeners.foreach { flow.addListener(_) } | ||
stepListeners.foreach { flow.addStepListener(_) } | ||
skipStrategy.foreach { flow.setFlowSkipStrategy(_) } | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need this fn? Would it not suffice to return a Future[Flow[Config]]?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need it, but I didn't want to pass on scala.concurrent.ExecutionContext just to access the stats, which clearly is not an expensive operation.
So, this is a map tacked on the end, but logically, you are right. Since this is an adapter class, I'd say it is okay, but there is some code smell here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, cool, fair enough. I haven't really used Scala futures enough to know when those are needed and when they aren't.