Skip to content

Commit

Permalink
Write GC summary file using UTF-8
Browse files Browse the repository at this point in the history
Tested by adding a unit test for writeJsonSummary.

Fixes #4644.
  • Loading branch information
arielshaqed committed Dec 13, 2022
1 parent c33ba5f commit 0b22406
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -661,9 +661,12 @@ object GarbageCollector {

val stream = dstFS.create(dstPath)
try {
stream.writeChars(compact(render(jsonSummary)))
val bytes = compact(render(jsonSummary)).getBytes("UTF-8")
stream.write(bytes)
} finally {
stream.close()
}
}

// Test-only hook: exposes writeJsonSummary as a function value (the trailing `_`
// eta-expands the method) so that specs can invoke it as
// GarbageCollector.writeJsonSummaryForTesting(...).
def writeJsonSummaryForTesting = writeJsonSummary _
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
package io.treeverse.clients

import scala.collection.JavaConverters._
import org.scalatest._
import matchers.should._
import funspec._

import io.treeverse.lakefs.catalog

import org.apache.commons.io.FileUtils
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.sql.{Dataset, SparkSession}
import io.treeverse.lakefs.catalog

import org.json4s._
import org.json4s.native.JsonMethods

import java.nio.file.{Files, Path, Paths}

/** Test mixin that lends out a scratch directory on the local filesystem. */
trait TempDirectory {

  /** Creates a fresh temporary directory (name prefix "test-gc"), passes it to
    * `testMethod`, and recursively deletes it afterwards.
    *
    * Cleanup runs in a `finally` block, so the directory is removed even when
    * `testMethod` throws (for example on a failed assertion) instead of being
    * leaked on disk.
    *
    * @param testMethod test body that receives the directory; its result is discarded
    */
  def withTempDirectory(testMethod: (Path) => Any): Unit = {
    val tempDir = Files.createTempDirectory("test-gc")

    try {
      testMethod(tempDir)
      () // the test body's result is intentionally ignored
    } finally {
      FileUtils.deleteDirectory(tempDir.toFile)
    }
  }
}

class ARangeGetter(
val repo: String,
Expand Down Expand Up @@ -182,6 +199,38 @@ class GarbageCollectorSpec extends AnyFunSpec with Matchers with SparkSessionSet
}
}

/** Exercises GarbageCollector.writeJsonSummaryForTesting against a temporary
  * directory and checks the single summary file it leaves behind.
  */
class GarbageCollectorJsonOutputSpec
    extends AnyFunSpec
    with Matchers
    with SparkSessionSetup
    with TempDirectory {
  describe("writeJsonSummary") {
    it("should write a summary") {
      withSparkSession { spark =>
        withTempDirectory { tempDir =>
          // The mapper is built from an empty broadcast list of key/value pairs.
          val configMapper =
            new ConfigMapper(spark.sparkContext.broadcast(Array[(String, String)]()))
          val dstRoot = tempDir.resolve("writeJsonSummary/")

          // Values expected to come back out of the written summary.
          val numDeletedObjects = 2906
          val gcRules = "gobble gobble"
          val time = "I always will remember, 'Twas a year ago November"

          GarbageCollector.writeJsonSummaryForTesting(
            configMapper,
            dstRoot.toAbsolutePath.toString,
            numDeletedObjects,
            gcRules,
            time
          )

          // Walk the output tree recursively, skipping anything ending in ".crc"
          // (presumably checksum side files from the filesystem layer -- TODO confirm).
          val written = FileUtils
            .listFiles(dstRoot.toFile, null, true)
            .asScala
            .iterator
            .filterNot(_.toString.endsWith(".crc"))
            .toSeq
          written.size should be(1)

          val summaryFile = written.head
          val actualBytes = Files.readAllBytes(Paths.get(summaryFile.toString))
          // Decode explicitly as UTF-8: the parse below only works if the file
          // really was written in that encoding.
          val decoded = new String(actualBytes, "UTF-8")
          Console.out.println("GOT: " + decoded)

          val actual = JsonMethods.parse(decoded)
          (actual \ "gc_rules") should be(JString(gcRules))
          (actual \ "num_deleted_objects") should be(JInt(numDeletedObjects))
          // TODO(ariels): Verify dt=${time} in path.
        }
      }
    }
  }
}

trait SparkSessionSetup {
def withSparkSession(testMethod: (SparkSession) => Any) {
val conf = new SparkConf()
Expand Down

0 comments on commit 0b22406

Please sign in to comment.