Commit 7fd4b15

Create DistributedCacheFile, a pleasant API for managing files to be distributed via hadoop's DistributedCache

Dealing with hadoop's filecache.DistributedCache is kind of awkward.
This gives users an easy way of setting up paths to be added to the
distributed cache, and hides the details of consistent-symlink-naming
and such. This is a prerequisite for loading our indexes in hadoop.
slyphon committed Jun 4, 2013
1 parent b9d18b9 commit 7fd4b15
Showing 5 changed files with 192 additions and 1 deletion.
3 changes: 2 additions & 1 deletion project/Build.scala
@@ -22,7 +22,8 @@ object ScaldingBuild extends Build {

  libraryDependencies ++= Seq(
    "org.scalacheck" %% "scalacheck" % "1.10.0" % "test",
-   "org.scala-tools.testing" %% "specs" % "1.6.9" % "test"
+   "org.scala-tools.testing" %% "specs" % "1.6.9" % "test",
+   "org.mockito" % "mockito-all" % "1.8.5" % "test"
  ),

resolvers ++= Seq(
@@ -0,0 +1,35 @@
package com.twitter.scalding.filecache

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.filecache.{DistributedCache => HDistributedCache}
import org.apache.hadoop.fs.Path

trait DistributedCache {
  def createSymlink(conf: Configuration)
  def addCacheFile(uri: URI, conf: Configuration)
  def makeQualified(path: String, conf: Configuration): URI
  def makeQualified(uri: URI, conf: Configuration): URI
  def makeQualified(p: Path, conf: Configuration): URI
}

// used to supply the implicit cache argument to UncachedFile, allows us to stub this in tests
class HadoopDistributedCache extends DistributedCache {
  def createSymlink(conf: Configuration) {
    HDistributedCache.createSymlink(conf)
  }

  def addCacheFile(uri: URI, conf: Configuration) {
    HDistributedCache.addCacheFile(uri, conf)
  }

  def makeQualified(path: String, conf: Configuration): URI =
    makeQualified(new Path(path), conf)

  def makeQualified(uri: URI, conf: Configuration): URI =
    makeQualified(new Path(uri.toString), conf) // uri.toString because hadoop 0.20.2 doesn't take a URI

  def makeQualified(p: Path, conf: Configuration): URI =
    p.makeQualified(p.getFileSystem(conf)).toUri // make sure we have a fully-qualified URI
}

@@ -0,0 +1,11 @@
package com.twitter.scalding.filecache

trait DistributedCacheContextLike {
  implicit val distributedCache: DistributedCache
}

trait DistributedCacheContext extends DistributedCacheContextLike {
  @transient
  implicit lazy val distributedCache: DistributedCache = new HadoopDistributedCache
}
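A quick sketch of how this is meant to be used (the class name and path are illustrative; any class that mixes in DistributedCacheContext gets the implicit cache in scope):

import com.twitter.scalding.filecache.{DistributedCacheContext, DistributedCacheFile}

// mixing in DistributedCacheContext supplies the implicit DistributedCache
// required by DistributedCacheFile.apply (defined in the next file below)
class MyJobSetup extends DistributedCacheContext {
  val indexFile = DistributedCacheFile("hdfs://namenode/some/index.dat")
}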

@@ -0,0 +1,95 @@
package com.twitter.scalding.filecache

import com.google.common.hash.Hashing
import java.io.File
import java.net.URI
import org.apache.hadoop.conf.Configuration


object DistributedCacheFile {
  // TODO: make this pluggable
  private val HashFunc = Hashing.md5()

  /**
   * Create an object that can be used to register a given URI (representing an HDFS file)
   * that should be added to the DistributedCache.
   *
   * @param uri The fully qualified URI that points to the HDFS file to add
   * @return An UncachedFile whose add() method must be called with the job's
   *         Configuration before use.
   */
  def apply(uri: URI)(implicit distCache: DistributedCache): UncachedFile =
    UncachedFile(Right(uri))

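  /**
   * As above, but takes a String path, which will be fully qualified against
   * the path's FileSystem (see HadoopDistributedCache.makeQualified).
   */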
  def apply(path: String)(implicit distCache: DistributedCache): UncachedFile =
    UncachedFile(Left(path))

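  // For example, "hdfs://host/path/thefile.dat" becomes "thefile.dat-<md5 of the full URI>",
  // giving each cached file a symlink name that's unique within the job.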
  def symlinkNameFor(uri: URI): String = {
    val hexsum = HashFunc.hashString(uri.toString).toString
    val fileName = new File(uri.toString).getName

    Seq(fileName, hexsum).mkString("-")
  }

  def symlinkedUriFor(sourceUri: URI): URI =
    new URI(sourceUri.getScheme, sourceUri.getSchemeSpecificPart, symlinkNameFor(sourceUri))
}


/**
 * The distributed cache is simply hadoop's mechanism for giving each node local access to a
 * specific file. The file must be registered with the job's Configuration at job-configuration
 * time, not from a mapper or reducer. Additionally, the name used for the node-local access
 * path must be unique across the cluster to prevent collisions. This class provides both.
 *
 * In the configuration phase, the file URI is used to construct an UncachedFile instance. The
 * name of the symlink to use on the mappers is only available after calling the add() method,
 * which registers the file, computes the unique symlink name, and returns a CachedFile
 * instance. The CachedFile instance is Serializable; it's designed to be assigned to a val
 * and accessed later from the mappers.
 *
 * The local symlink is available through .file or .path, depending on which type you need.
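 *
 * A sketch of typical usage (the path here is illustrative):
 *
 * {{{
 * implicit val distCache = new HadoopDistributedCache
 *
 * // during job configuration (conf is the job's Configuration):
 * val cachedFile = DistributedCacheFile("hdfs://namenode/some/index.dat").add(conf)
 *
 * // later, on a mapper or reducer:
 * val localCopy: java.io.File = cachedFile.file
 * }}}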
*/
sealed abstract class DistributedCacheFile {
  def isDefined: Boolean

  def add(conf: Configuration): CachedFile
}

// the reason we use an implicit here is that we don't want to concern our users with
// the DistributedCache class, which is a hack for wrapping the actual Hadoop DistributedCache
// object to allow for stubbing during tests.
//
final case class UncachedFile private[scalding] (source: Either[String, URI])(implicit cache: DistributedCache)
    extends DistributedCacheFile {

  import DistributedCacheFile._

  def isDefined = false

  def add(conf: Configuration): CachedFile = {
    cache.createSymlink(conf)

    val sourceUri =
      source match {
        case Left(strPath) => cache.makeQualified(strPath, conf)
        case Right(uri)    => cache.makeQualified(uri, conf)
      }

    cache.addCacheFile(symlinkedUriFor(sourceUri), conf)
    CachedFile(sourceUri)
  }
}

final case class CachedFile private[scalding] (sourceUri: URI) extends DistributedCacheFile {

  import DistributedCacheFile._

  def path: String =
    Seq("./", symlinkNameFor(sourceUri)).mkString("")

  def file: File =
    new File(path)

  def isDefined = true
  def add(conf: Configuration) = this
}
@@ -0,0 +1,49 @@
package com.twitter.scalding.filecache

import com.google.common.hash.Hashing
import java.io.File
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.specs.mock.Mockito
import org.specs.Specification

class DistributedCacheFileSpec extends Specification with Mockito {
  implicit val distCache = smartMock[DistributedCache]
  val conf = smartMock[Configuration]
  val uriString = "hdfs://foo.example:1234/path/to/the/stuff/thefilename.blah"
  val md5Hex = Hashing.md5().hashString(uriString).toString
  val hashedFilename = "thefilename.blah-" + md5Hex
  val uri = new URI(uriString)

  distCache.makeQualified(uri, conf) returns uri
  distCache.makeQualified(uriString, conf) returns uri

  "DistributedCacheFile" should {
    "symlinkNameFor must return a hashed name" in {
      DistributedCacheFile.symlinkNameFor(uri) must_== hashedFilename
    }
  }

  "UncachedFile" should {
    "not be defined" in {
      DistributedCacheFile(uri).isDefined must beFalse
    }
  }

  "UncachedFile.add" should {
    "register the uri with the cache and return the appropriate CachedFile" in {
      val expectedUri = new URI("%s#%s".format(uriString, hashedFilename))

      val dcf = new UncachedFile(Right(uri))
      val cf = dcf.add(conf)

      there was one(distCache).createSymlink(conf)
      there was one(distCache).addCacheFile(expectedUri, conf)

      val cachedPath = "./" + hashedFilename
      cf.path must_== cachedPath
      cf.file must_== (new File(cachedPath))
    }
  }
}

3 comments on commit 7fd4b15

@wli12 commented on 7fd4b15, Jun 18, 2014
Dear slyphon, I read about "Using the distributed cache" and Maxmind's geoip LookupService, but I am still very confused. It seems that the LookupService is highly domain-specific and would not fit other projects. Could you show us a few lines of sample data (GeoLiteCity.dat) that can be used with the distributed cache, and explain how I can share an arbitrary table (like a map[String, Int, Int]) across the cluster via the distributed cache?

@slyphon (Contributor, Author) commented:
Yeah, the LookupService is only an example of what is essentially a custom index implemented as a binary file (a database, more or less).

The DistributedCacheFile is a thin wrapper around functionality provided in hadoop that allows you to register files that should be present on the local disk on each of your mappers (and I believe reducers as well). You tell hadoop "I need file X on the mappers at location ./x" and hadoop takes care of copying it and providing you access at the name you specify. That's it. The DistributedCacheFile takes care of registering a unique name for you and hiding the details so you can avoid collisions and focus on solving the problem at hand.

What you put in that file is up to you. If you want a general-purpose index, you would either need to implement your own or, somewhat more sensibly, use one of a number of key-value stores available for Java.

So the workflow looks like the sketch below:

  • Generate a custom distributable (embedded) index
  • Copy that index into HDFS at a known location
  • Register that location using DistributedCacheFile as part of your job
  • Access the path at map/reduce time and load the index
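In code, that might look something like this (the paths, method names, and index-loading details are illustrative assumptions, not part of this commit):

import java.io.File
import org.apache.hadoop.conf.Configuration
import com.twitter.scalding.filecache.{CachedFile, DistributedCacheFile, HadoopDistributedCache}

implicit val distCache = new HadoopDistributedCache

// step 3: during job configuration, register the index that was copied into HDFS
def registerIndex(conf: Configuration): CachedFile =
  DistributedCacheFile("hdfs://namenode/indexes/my-index.dat").add(conf)

// step 4: at map/reduce time, open the node-local symlink and load the index
def loadIndex(cachedIndex: CachedFile): Unit = {
  val localCopy: File = cachedIndex.file
  // ... read your embedded index / key-value store from localCopy ...
}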

@gerashegalov (Contributor) commented:

I worked on similar functionality for HashJoin in Cascading (gerashegalov/cascading@dd880d3). There we found that changing a file name's extension can make the path unusable for some InputFormats. I therefore suggest changing the renaming scheme here as well, so that the file name extension is preserved, to be safe.
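For example, a variant of symlinkNameFor that keeps the extension might look like this (a sketch of the suggestion, not what the commit currently does):

import java.io.File
import java.net.URI
import com.google.common.hash.Hashing

// insert the hash before the extension instead of appending it after, so
// "thefile.dat" becomes "thefile-<md5>.dat" rather than "thefile.dat-<md5>"
def symlinkNameFor(uri: URI): String = {
  val hexsum = Hashing.md5().hashString(uri.toString).toString
  val fileName = new File(uri.toString).getName
  fileName.lastIndexOf('.') match {
    case -1  => fileName + "-" + hexsum
    case dot => fileName.substring(0, dot) + "-" + hexsum + fileName.substring(dot)
  }
}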
