-
Notifications
You must be signed in to change notification settings - Fork 703
/
DistributedCacheFile.scala
253 lines (212 loc) · 7.95 KB
/
DistributedCacheFile.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
package com.twitter.scalding.filecache
import com.twitter.algebird.MurmurHash128
import com.twitter.scalding._
import java.io.File
import java.net.URI
import java.nio.ByteBuffer
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.mapreduce.filecache.{DistributedCache => HDistributedCache}
import org.apache.hadoop.fs.Path
object URIHasher {

  // Fixed seed so symlink names are stable across JVMs and job submissions.
  private[this] final val HashFunc = MurmurHash128(1L)

  /**
   * Converts a signed byte to its unsigned value in [0, 255].
   *
   * Bug fix: the previous implementation used `if (b < 0) b + 0xff else b`, which is off by one
   * for negative bytes (-1 mapped to 254 instead of 255) and collided -128 with 127 (both mapped
   * to 127). Masking with 0xff is the standard, collision-free conversion (equivalent to
   * java.lang.Byte.toUnsignedInt).
   */
  private[this] def deSign(b: Byte): Int =
    b & 0xff

  def apply(stringUri: String): String =
    apply(new URI(stringUri))

  /**
   * generates hashes of hdfs URIs using algebird's MurmurHash128
   * @param uri
   *   the URI to generate a hash for
   * @return
   *   a hex-encoded string of the bytes of the 128 bit hash. The results are zero padded on the left, so this
   *   string will always be 32 characters long.
   */
  def apply(uri: URI): String = {
    val (h1, h2) = HashFunc(uri.toASCIIString)
    // 16 bytes = two big-endian longs -> 32 hex chars, each byte zero-padded to two digits.
    val bytes = ByteBuffer.allocate(16).putLong(h1).putLong(h2).array()
    bytes.map(deSign).map("%02x".format(_)).mkString // lifted gently from com.twitter.util.U64
  }
}
/**
* The distributed cache is simply hadoop's method for allowing each node local access to a specific file. The
* registration of that file must be called with the Configuration of the job, and not when it's on a mapper
* or reducer. Additionally, a unique name for the node-local access path must be used to prevent collisions
* in the cluster. This class provides this functionality.
*
* In the configuration phase, the file URI is used to construct an UncachedFile instance. The name of the
* symlink to use on the mappers is only available after calling the add() method, which registers the file
* and computes the unique symlink name and returns a CachedFile instance. The CachedFile instance is
* Serializable, it's designed to be assigned to a val and accessed later.
*
* The local symlink is available through .file or .path depending on what type you need.
*
* example:
*
* {{{
* class YourJob(args: Args) extends Job(args) {
* val theCachedFile = DistributedCacheFile("/path/to/your/file.txt")
*
* def somethingThatUsesTheCachedFile() {
* doSomethingWith(theCachedFile.path) // or theCachedFile.file
* }
* }
* }}}
*
* example with Execution:
*
* {{{
* object YourExecJob extends ExecutionApp {
* override def job =
* DistributedCacheFile
* .execution("/path/to/your/file.txt") { file =>
* doSomething(file.path)
* }
* }
*
* example with Execution and multiple files:
*
* object YourExecJob extends ExecutionApp {
* override def job =
* DistributedCacheFile.execution("/path/to/your/one.txt") { one =>
* DistributedCacheFile.execution("/path/to/your/second.txt") { second =>
* doSomething(one.path, second.path)
* }
* }
* }
*
* }}}
*/
object DistributedCacheFile {

  /**
   * Registers the given hdfs URI with the DistributedCache and returns a serializable handle to
   * the node-local symlink.
   *
   * @param uri
   *   The fully qualified URI that points to the hdfs file to add
   * @return
   *   A CachedFile instance
   */
  def apply(uri: URI)(implicit mode: Mode): CachedFile = {
    val handle = UncachedFile(Right(uri)).cached(mode)
    addCachedFile(handle, mode)
    handle
  }

  /** String-path variant of the URI-based apply; same registration semantics. */
  def apply(path: String)(implicit mode: Mode): CachedFile = {
    val handle = UncachedFile(Left(path)).cached(mode)
    addCachedFile(handle, mode)
    handle
  }

  /** Resolves a URI against the mode without registering it (used by [[execution]]). */
  private[scalding] def cachedFile(uri: URI, mode: Mode): CachedFile =
    UncachedFile(Right(uri)).cached(mode)

  /** Resolves a string path against the mode without registering it. */
  private[scalding] def cachedFile(path: String, mode: Mode): CachedFile =
    UncachedFile(Left(path)).cached(mode)

  /** Registers a hadoop-backed file on the job conf; no-op for local/test modes. */
  private[scalding] def addCachedFile(cachedFile: CachedFile, mode: Mode): Unit =
    (cachedFile, mode) match {
      case (hadoopFile: HadoopCachedFile, hadoopMode: HadoopMode) =>
        HDistributedCache.addCacheFile(symlinkedUriFor(hadoopFile.sourceUri), hadoopMode.jobConf)
      case _ => // local and test modes need no DistributedCache registration
    }

  /** Unique node-local name: "<murmur-hash>-<file name>", preventing cluster-wide collisions. */
  def symlinkNameFor(uri: URI): String =
    URIHasher(uri) + "-" + new File(uri.toString).getName

  /** Source URI with the symlink name attached as the fragment, as DistributedCache expects. */
  def symlinkedUriFor(sourceUri: URI): URI =
    new URI(sourceUri.getScheme, sourceUri.getSchemeSpecificPart, symlinkNameFor(sourceUri))

  /**
   * Make a file available to an Execution
   */
  def execution[A](path: String)(use: CachedFile => Execution[A]): Execution[A] =
    Execution.getMode.flatMap { m =>
      val handle = cachedFile(path, m)
      Execution.withConfig(use(handle))(addDistributedCacheFiles(_, handle))
    }

  /**
   * Add files to be localized to the config. Intended to be used by user code.
   * @param cachedFiles
   *   CachedFiles to be added
   * @return
   *   new Config with cached files
   */
  def addDistributedCacheFiles(config: Config, cachedFiles: CachedFile*): Config =
    cachedFiles.foldLeft(config) { (conf, cached) =>
      cached match {
        case hadoopFile: HadoopCachedFile =>
          // Mirrors the basic logic of
          // org.apache.hadoop.mapreduce.filecache.DistributedCache.addCacheFile:
          // append to the comma-separated CACHE_FILES entry, creating it if absent.
          val entry = DistributedCacheFile.symlinkedUriFor(hadoopFile.sourceUri).toString
          val joined = conf.get(MRJobConfig.CACHE_FILES) match {
            case Some(existing) => existing + "," + entry
            case None           => entry
          }
          conf + (MRJobConfig.CACHE_FILES -> joined)
        case _ =>
          conf // non-hadoop files carry no config entry
      }
    }

  /**
   * Get cached files from config
   */
  def getDistributedCachedFiles(config: Config): Seq[CachedFile] = {
    val entries = config
      .get(MRJobConfig.CACHE_FILES)
      .toSeq
      .flatMap(_.split(","))
    for (entry <- entries if entry.nonEmpty) yield {
      val symlinked = new URI(entry)
      // Drop the fragment (the symlink name) to recover the qualified source URI.
      HadoopCachedFile(new URI(symlinked.getScheme, symlinked.getSchemeSpecificPart, null))
    }
  }
}
/**
 * A file reference (string path or URI) that has not yet been resolved against a Mode. Calling
 * [[cached]] produces the mode-appropriate CachedFile handle.
 */
final case class UncachedFile private[scalding] (source: Either[String, URI]) {

  /**
   * Resolves this file against the given mode.
   *
   * Hdfs and HadoopTest modes qualify the path against the cluster's FileSystem; Local and Test
   * modes use the file in place. Any other mode is an error.
   */
  def cached(mode: Mode): CachedFile =
    mode match {
      case Hdfs(_, conf)       => addHdfs(conf)
      case HadoopTest(conf, _) => addHdfs(conf)
      case Local(_)            => addLocal()
      case Test(_)             => addLocal()
      case _                   => throw new RuntimeException("unhandled mode: %s".format(mode))
    }

  /** Local/test runs read the file directly; no symlink machinery is required. */
  private[this] def addLocal(): CachedFile = {
    val localPath = source.fold(identity, _.getPath)
    LocallyCachedFile(localPath)
  }

  /** Fully qualifies the source against the configured FileSystem and wraps it for hadoop. */
  private[this] def addHdfs(conf: Configuration): CachedFile = {
    // Qualify a Path with the FileSystem's scheme/authority and working directory.
    def qualify(p: Path): URI = {
      val fs = p.getFileSystem(conf)
      p.makeQualified(fs.getUri, fs.getWorkingDirectory).toUri
    }
    val qualified = source match {
      case Left(strPath) => qualify(new Path(strPath))
      // uri.toString because hadoop 0.20.2 doesn't take a URI
      case Right(uri)    => qualify(new Path(uri.toString))
    }
    HadoopCachedFile(qualified)
  }
}
/**
 * A serializable handle to a file registered in the distributed cache. Instances are created via
 * DistributedCacheFile.apply at configuration time; hold one in a val and read `.path` or `.file`
 * from mappers/reducers.
 */
sealed abstract class CachedFile {

  /** The path to the cached file on disk (the symlink registered at configuration time) */
  def path: String

  /** The path to the cached file on disk as a File object. */
  def file: File
}
/**
 * A cached file in local/test mode: the canonicalized source path itself — no symlink is
 * involved, the file is simply read in place.
 */
final case class LocallyCachedFile private[scalding] (sourcePath: String) extends CachedFile {

  /** The source path resolved to a canonical File. */
  def file: File = new File(sourcePath).getCanonicalFile

  /** Canonical path string, derived from [[file]]. */
  def path: String = file.getCanonicalPath
}
/**
 * A cached file in hadoop mode: resolves to the task-local symlink "./<hash>-<name>" that the
 * DistributedCache materializes in the task's working directory.
 */
final case class HadoopCachedFile private[scalding] (sourceUri: URI) extends CachedFile {
  import DistributedCacheFile._

  /** Relative symlink path, e.g. "./<murmur-hash>-file.txt". */
  def path: String = "./" + symlinkNameFor(sourceUri)

  /** The symlink as a File, relative to the task working directory. */
  def file: File = new File(path)
}