
Add archiving job for Airflow
* This job was previously included in Oozie.
* Its goal is to archive a directory to a specific location on HDFS.
* Code is now shared with MediawikiHistoryDumper.

Bug: T300039
Change-Id: I9636c44f860fa0a7211551a3484cf3eb79430438
aquwikimedia committed Apr 6, 2022
1 parent fa0ea90 commit fce97f8
Showing 17 changed files with 385 additions and 32 deletions.
ConfigHelper.scala
@@ -7,6 +7,7 @@ import profig._
import cats.syntax.either._
import io.circe.CursorOp.DownField
import io.circe.{Decoder, DecodingFailure}
import org.apache.hadoop.fs.Path

import scala.language.experimental.macros
import scala.reflect.macros.blackbox
@@ -152,6 +153,9 @@ trait ConfigHelper {
)
}

// implicit conversion from string to hadoop.fs.Path
implicit def stringToHadoopFsPath(s: String): Path = new Path(s)
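For context, a minimal standalone sketch of what this implicit enables: a plain String (such as the job's source_directory config value) can be passed wherever a hadoop.fs.Path is expected. The object and path below are hypothetical, for illustration only.

import org.apache.hadoop.fs.Path
import scala.language.implicitConversions

object PathConversionExample {
    // Mirrors the implicit above: builds a Path from its String form on demand.
    implicit def stringToHadoopFsPath(s: String): Path = new Path(s)

    def parentOf(p: Path): Path = p.getParent

    def main(args: Array[String]): Unit = {
        // The String argument is converted to a Path implicitly.
        println(parentOf("/tmp/bob/source/data.gz")) // prints /tmp/bob/source
    }
}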

// Support implicit DateTime conversion from string to DateTime
// The opt can either be given in integer hours ago, or
as an ISO-8601 formatted date time.
Expand Down
HDFSArchiver.scala (new file)
@@ -0,0 +1,236 @@
package org.wikimedia.analytics.refinery.job

import scala.collection.mutable.ListBuffer
import scala.collection.immutable.ListMap
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}
import org.wikimedia.analytics.refinery.core.LogHelper
import org.wikimedia.analytics.refinery.core.config._

/**
 * Job to archive a file on HDFS.
 * The source file:
 *   - ends with a specific string,
 *   - is unique, but may sit beside an empty flag file,
 *   - and is not empty.
 * The target file is written with specific permissions.
 */
object HDFSArchiver extends LogHelper with ConfigHelper {

    /**
     * Config class for use with config files and CLI args.
     */
    case class Config(
        source_directory: String,
        archive_file: String,
        archive_parent_umask: String = "022",
        archive_perms: String = "644",
        expected_filename_ending: String = ".gz",
        check_done: Boolean = false,
        done_file: String = "_SUCCESS"
    )

    def loadConfig(args: Array[String]): Config = {
        val config = try {
            configureArgs[Config](args)
        } catch {
            case e: ConfigHelperException =>
                log.fatal(e.getMessage + ". Aborting.")
                sys.exit(1)
        }
        log.info("Loaded configuration:\n" + prettyPrint(config))
        config
    }

    object Config {
        // This is just used to ease generating the help message with default values.
        // Required configs are set to dummy values.
        val default = Config("", "")

        val propertiesDoc: ListMap[String, String] = ListMap(
            "source_directory" -> "Path of the directory where the source file is located.",
            "archive_file" -> "Path of the archive file, where to put the source file.",
            "archive_parent_umask" ->
                s"""Umask for the archive directory permission.
                   | default: ${default.archive_parent_umask}""",
            "archive_perms" ->
                s"""Permissions given to the archive file.
                   | default: ${default.archive_perms}""",
            "expected_filename_ending" ->
                s"""The ending of the source file name.
                   | default: ${default.expected_filename_ending}""",
            "check_done" -> s"Check for a done file flag. default: ${default.check_done}",
            "done_file" -> s"Name of the done file flag. default: ${default.done_file}"
        )

        val usage: String =
            """
              |Job to archive a file on HDFS.
              |
              |Example:
              | java -cp refinery-job.jar:$(/usr/bin/hadoop classpath) org.wikimedia.analytics.refinery.job.HDFSArchiver \
              | --source_directory=/tmp/bob/source \
              | --archive_file=/tmp/bob/public_archive.gz
              |"""
    }
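As a quick illustration of how these properties are consumed, here is a hedged sketch of loading the flags from the usage example above into a Config. It assumes ConfigHelper's configureArgs parses --key=value pairs and falls back to the declared defaults; the paths are placeholders.

import org.wikimedia.analytics.refinery.job.HDFSArchiver

object LoadConfigExample {
    def main(args: Array[String]): Unit = {
        val config: HDFSArchiver.Config = HDFSArchiver.loadConfig(Array(
            "--source_directory=/tmp/bob/source",
            "--archive_file=/tmp/bob/public_archive.gz",
            "--check_done=true"
        ))
        // Unset fields keep their declared defaults, e.g.:
        println(config.done_file)     // _SUCCESS
        println(config.archive_perms) // 644
    }
}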

    /**
     * Entry point to run this job.
     *
     * @param args CLI arguments, parsed into a Config
     */
    def main(args: Array[String]): Unit = {
        if (args.contains("--help")) {
            println(help(Config.usage, Config.propertiesDoc))
            sys.exit(0)
        }

        val config = loadConfig(args)

        // Make sure to log to the console when launched from Airflow.
        addConsoleLogAppender()

        val statusCode: Int = if (apply(
            config.source_directory,
            config.expected_filename_ending,
            config.check_done,
            Path.mergePaths(config.source_directory, config.done_file),
            config.archive_file,
            config.archive_parent_umask,
            config.archive_perms
        )) 0 else 1 // Exit with 0 on success, 1 on failure.
        System.exit(statusCode)
    }

    /**
     * The heart of the HDFS Archiver job.
     *
     * @param sourceDirectory directory containing the file to archive
     * @param expectedFilenameEnding expected ending of the source file name (e.g. ".gz")
     * @param checkDone whether to check for a done-file flag in the source directory
     * @param doneFilePath path of the done-file flag
     * @param archiveFile target path of the archive file
     * @param archiveParentUmask umask applied when creating the archive parent directory
     * @param archivePerms permissions set on the archive file
     * @return true in case of success
     */
    def apply(
        sourceDirectory: Path,
        expectedFilenameEnding: String,
        checkDone: Boolean,
        doneFilePath: Path,
        archiveFile: Path,
        archiveParentUmask: String,
        archivePerms: String
    ): Boolean = {
        val conf: Configuration = new Configuration
        conf.set("fs.permissions.umask-mode", archiveParentUmask)
        val fs: FileSystem = sourceDirectory.getFileSystem(conf)
        identifySourceFile(fs, sourceDirectory, expectedFilenameEnding, checkDone, doneFilePath) match {
            case None => false
            case Some(file) =>
                createParentFolder(fs, archiveFile) &&
                    archiveSource(fs, file, archiveFile, archivePerms)
        }
    }
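When called programmatically rather than from the CLI (as MediawikiHistoryDumper does further down in this commit), the job reduces to this single apply call. The paths below are hypothetical, for illustration only.

import org.apache.hadoop.fs.Path
import org.wikimedia.analytics.refinery.job.HDFSArchiver

object ArchiveCallExample {
    def main(args: Array[String]): Unit = {
        val archived: Boolean = HDFSArchiver(
            sourceDirectory = new Path("/tmp/bob/source"),
            expectedFilenameEnding = ".gz",
            checkDone = true,
            doneFilePath = new Path("/tmp/bob/source/_SUCCESS"),
            archiveFile = new Path("/tmp/bob/public_archive.gz"),
            archiveParentUmask = "022",
            archivePerms = "644"
        )
        // true means the source was renamed to the archive path, its directory
        // deleted, and the requested permissions applied.
        println(archived)
    }
}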

    /**
     * identifySourceFile validates that the source file meets all conditions
     * and, if it does, returns the Path to that file.
     * It checks:
     *   - that the source directory exists
     *   - that the done file exists (e.g. _SUCCESS), when checkDone is set
     *   - that the directory contains exactly one non-empty file besides the optional done file
     */
    def identifySourceFile(
        fs: FileSystem,
        sourceDirectory: Path,
        expectedFilenameEnding: String,
        checkDone: Boolean,
        doneFilePath: Path
    ): Option[Path] = {
        if (directoryExists(fs, sourceDirectory)) {
            if (checkDone && !fs.exists(doneFilePath)) {
                log.error(s"Done file ${doneFilePath.toString} is not present.")
                None
            } else {
                getSourceFileFromDirectory(fs, sourceDirectory, expectedFilenameEnding, checkDone)
            }
        } else {
            None
        }
    }
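To make the checks above concrete, a hedged sketch against a local file system; the directory layout and names are hypothetical test fixtures.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.wikimedia.analytics.refinery.job.HDFSArchiver

object IdentifySourceFileExample {
    def main(args: Array[String]): Unit = {
        // Assumed layout:
        //   /tmp/archiver-test/source/_SUCCESS  (empty done flag)
        //   /tmp/archiver-test/source/data.gz   (non-empty payload)
        val fs = FileSystem.getLocal(new Configuration())
        val found = HDFSArchiver.identifySourceFile(
            fs,
            new Path("/tmp/archiver-test/source"),
            expectedFilenameEnding = ".gz",
            checkDone = true,
            doneFilePath = new Path("/tmp/archiver-test/source/_SUCCESS")
        )
        println(found) // Some(.../data.gz) if all checks pass, None otherwise
    }
}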

    /**
     * Checks that a specific path exists on the file system and is a directory.
     */
    def directoryExists(fs: FileSystem, path: Path): Boolean = {
        if (!fs.exists(path)) {
            log.error(s"Dir ${path.toString} does not exist.")
            false
        } else if (!fs.isDirectory(path)) {
            log.error(s"Dir ${path.toString} is not a directory.")
            false
        } else {
            true
        }
    }

    /**
     * Tries to get the Path to the source file, checking that there is a single
     * non-empty file in the source directory (besides the done file).
     */
    def getSourceFileFromDirectory(
        fs: FileSystem,
        sourceDirectory: Path,
        expectedFilenameEnding: String,
        checkDone: Boolean
    ): Option[Path] = {
        val files = listFilesInDir(fs, sourceDirectory)
        if ((checkDone && files.length != 2) || (!checkDone && files.length != 1)) {
            log.error(s"Wrong file count in ${sourceDirectory.toString}")
            None
        } else {
            val sourceFile: Option[LocatedFileStatus] = files
                .find {
                    _.getPath.getName.endsWith(expectedFilenameEnding)
                }
            if (sourceFile.isEmpty) {
                log.error(s"Missing source in ${sourceDirectory.toString} (ending in: $expectedFilenameEnding)")
                None
            } else if (sourceFile.get.getLen == 0) {
                log.error(s"Empty source in ${sourceDirectory.toString} (ending in: $expectedFilenameEnding)")
                None
            } else {
                Some(sourceFile.get.getPath)
            }
        }
    }

    /**
     * Lists the files in a directory (recursively), converting the iterator into a list for convenience.
     */
    def listFilesInDir(fs: FileSystem, dir: Path): List[LocatedFileStatus] = {
        val result = ListBuffer[LocatedFileStatus]()
        val iterator = fs.listFiles(dir, true)
        while (iterator.hasNext) result += iterator.next
        result.toList
    }

    def createParentFolder(fs: FileSystem, file: Path): Boolean = {
        // This import has to happen after setting the umask mode.
        import org.apache.hadoop.fs.permission.FsPermission
        fs.mkdirs(file.getParent, new FsPermission("777")) // Only restrict through umask.
    }
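The mkdirs call above requests 777 and relies on the fs.permissions.umask-mode value set in apply to restrict the effective mode. A small sketch of that arithmetic, assuming the standard POSIX-style masking HDFS applies (effective = requested & ~umask):

object UmaskExample {
    def main(args: Array[String]): Unit = {
        val requested = Integer.parseInt("777", 8)
        val umask = Integer.parseInt("022", 8)
        val effective = requested & ~umask
        // With the default archive_parent_umask of 022, parent directories end up as 755.
        println(Integer.toOctalString(effective)) // prints 755
    }
}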

    def archiveSource(fs: FileSystem, sourceFile: Path, archiveFile: Path, archivePerms: String): Boolean = {
        import org.apache.hadoop.fs.permission.FsPermission
        fs.delete(archiveFile, false) // Remove any pre-existing archive file.
        val result = fs.rename(sourceFile, archiveFile) &&
            fs.delete(sourceFile.getParent, true) && {
                // setPermission returns Unit, so wrap it to keep chaining booleans.
                fs.setPermission(archiveFile, new FsPermission(archivePerms))
                true
            }
        if (result) {
            log.info(s"Archive created: ${archiveFile.toString}")
        }
        result
    }
}
MediawikiHistoryDumper.scala
@@ -1,20 +1,16 @@

package org.wikimedia.analytics.refinery.job.mediawikihistory

import java.util.{TimeZone, Calendar}

import java.util.{Calendar, TimeZone}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
import org.apache.spark.SparkConf
import org.wikimedia.analytics.refinery.job.mediawikihistory.denormalized.{
MediawikiEvent,
MediawikiEventUserDetails,
MediawikiEventPageDetails
}
import org.wikimedia.analytics.refinery.job.HDFSArchiver
import org.wikimedia.analytics.refinery.job.mediawikihistory.denormalized.{MediawikiEvent, MediawikiEventPageDetails, MediawikiEventUserDetails}
import scopt.OptionParser

/**
@@ -25,18 +21,17 @@ import scopt.OptionParser
* Size of the time bucket varies depending on the size of the wiki.
* For example, big wikis might be split in 1-month buckets, while
* medium wikis might be split in 1-year buckets, and small wikis
* might be outputed as a single file.
* might be output as a single file.
*
* Parameters:
* snapshot Mediawiki snapshot to dump (usually YYYY-MM).
* inputBasePath HDFS base path where to read data from.
* tempDirectory HDFS temporary directory for intermediate files.
* tempPartitions Number of partitions to rehash data with (internal).
* outputBasePath HDFS base path where to write the dump.
* - snapshot Mediawiki snapshot to dump (usually YYYY-MM).
* - inputBasePath HDFS base path where to read data from.
* - tempDirectory HDFS temporary directory for intermediate files.
* - tempPartitions Number of partitions to rehash data with (internal).
* - outputBasePath HDFS base path where to write the dump.
*
* Example of usage:
*
* sudo -u analytics spark2-submit \
* {{{ sudo -u analytics spark2-submit \
* --master yarn \
* --deploy-mode cluster \
* --executor-memory 32G \
@@ -49,7 +44,7 @@ import scopt.OptionParser
* --input-base-path /wmf/data/wmf/mediawiki/history \
* --temp-directory /tmp/mforns/mediawiki_history_dumps_12345 \
* --temp-partitions 256 \
* --output-base-path /wmf/data/archive/mediawiki/history
* --output-base-path /wmf/data/archive/mediawiki/history}}}
*
*/

@@ -186,11 +181,11 @@ object MediawikiHistoryDumper {
flatMap(r => {
val event = MediawikiEvent.fromRow(r)
eventTimeBucket(event)
// We can get the value of event_timestamp as None are filtered out in eventTimeBucket function
.map(timeBucket => Seq(Row.fromTuple((event.wikiDb, timeBucket, event.eventTimestamp.get.getTime, event.toTSVLine))))
.getOrElse(Seq.empty[Row])
// The first 3 dimensions are used for proper partitioning
// and ordering of the data, only the tsv line will be output.
// We can get the value of event_timestamp as None are filtered out in eventTimeBucket function
.map(timeBucket => Seq(Row.fromTuple((event.wikiDb, timeBucket, event.eventTimestamp.get.getTime, event.toTSVLine))))
.getOrElse(Seq.empty[Row])
// The first 3 dimensions are used for proper partitioning
// and ordering of the data, only the tsv line will be output.
})(partitionedDatasetRowEncoder).
// The following line applies the repartitioning. It redistributes
// the data among tempPartitions partitions. And makes sure that
@@ -269,20 +264,18 @@ object MediawikiHistoryDumper {
val wiki = wikiDirectory.getPath.getName.substring(5)
// The substring removes Hive partition prefix (time_bucket=).
val timeBucket = timeDirectory.getPath.getName.substring(12)

val dataFiles = fs.listStatus(timeDirectory.getPath)
if (dataFiles.length > 1) {
// This should not happen.
// Just making sure that we do not leave out any file.
throw new RuntimeException("More than one file per folder generated.")
}
val sourcePath = dataFiles(0).getPath

val destinationDirectory = s"$outputBasePath/$snapshot/$wiki"
fs.mkdirs(new Path(destinationDirectory))
val destinationFile = s"$snapshot.$wiki.$timeBucket.tsv.bz2"
val destinationPath = new Path(s"$destinationDirectory/$destinationFile")
fs.rename(sourcePath, destinationPath)
HDFSArchiver(
sourceDirectory = timeDirectory.getPath,
expectedFilenameEnding = "",
checkDone = false,
doneFilePath = new Path(""),
archiveFile = destinationPath,
archiveParentUmask = "022",
archivePerms = "644"
)
}
}
}
Empty file.
Empty file.
Empty file.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
blob
@@ -0,0 +1 @@
blob
Empty file.
@@ -0,0 +1 @@
blob
Empty file.
Empty file.
@@ -0,0 +1 @@
blob
Empty file.
