# Chargement des données

In [6]:
/**
 * One record of the first-names dataset.
 *
 * The field meanings below are inferred from the field names only
 * (TODO confirm against the data source documentation).
 *
 * @param sexe     sex indicator, kept as the raw string found in the file
 * @param prenom   first name
 * @param annee    year
 * @param codeDept department code
 * @param nombre   count for this (sexe, prenom, annee, codeDept) combination
 */
case class Prenom(
    sexe: String,
    prenom: String,
    annee: Int,
    codeDept: Int,
    nombre: Int
)

In [7]:
// Load the raw text file and keep only usable data lines: the header line
// (it starts with "sexe") and every line containing the "XX" placeholder are dropped.
// NOTE(review): "XX" presumably marks an unknown year/department -- confirm against the data source.
val prenomsRDD = sc.textFile("prenoms.txt").filter { line =>
    !(line.startsWith("sexe") || line.contains("XX"))
}

In [8]:
// Brings in the implicit conversions from RDDs to DataFrames (needed by `toDS()` / `toDF()`).
val sparkRO = spark // workaround so that this works in the notebook (not needed otherwise)
import sparkRO.implicits._

In [12]:
// Parse each tab-separated line into a Prenom and expose the result as a Dataset.
val prenoms = prenomsRDD.map { line =>
    val cols = line.split('\t')
    // The last column is parsed as a Double before truncation to Int, so values
    // written in decimal notation are accepted as well (presumably present in the file).
    Prenom(cols(0), cols(1), cols(2).toInt, cols(3).toInt, cols(4).toDouble.toInt)
}.toDS()
// Mark the Dataset for caching: it is reused by several actions afterwards.
prenoms.cache()

[sexe: string, prenom: string ... 3 more fields]

# Sauvegarde dans les différents formats

In [7]:
import java.nio.file._
import java.nio.file.attribute.BasicFileAttributes
import java.util.concurrent.atomic.AtomicLong

/**
 * Computes the cumulated size, in bytes, of every file found under `path`.
 *
 * The tree is walked recursively with `Files.walkFileTree`; when `path` is a
 * regular file, the result is simply the size of that file. Every visited file
 * is counted, whatever its name.
 *
 * @param path root directory (or single file) to measure
 * @return the sum of the sizes reported by the file attributes, in bytes
 * @throws java.io.IOException if the walk fails (e.g. `path` does not exist)
 */
def pathSize(path: Path): Long = {
    // The reference is never reassigned, so it is a `val`; a mutable holder is still
    // needed because the total is updated from inside the anonymous visitor.
    val totalBytes = new AtomicLong(0L)

    Files.walkFileTree(path, new SimpleFileVisitor[Path] {
        override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
            totalBytes.addAndGet(attrs.size())
            FileVisitResult.CONTINUE
        }
    })
    totalBytes.get()
}

In [1]:
// Compression codecs to benchmark for each output format (format name -> codec names).
val formats = {
    // csv and json are tried with the same set of codecs.
    val textCodecs = List("uncompressed", "bzip2", "deflate", "gzip")
    Map(
        "csv" -> textCodecs,
        "json" -> textCodecs,
        "parquet" -> List("uncompressed", "gzip", "snappy"),
        "orc" -> List("uncompressed", "snappy", "zlib")
    )
}

In [9]:
// For every (format, codec) pair: overwrite the "prenoms" output directory with the
// dataset written in that format/codec, then print "format, codec, size-in-bytes".
// The size is whatever pathSize reports for the directory, i.e. the sum of all files in it.
for ((format, codecList) <- formats; codec <- codecList) {
    prenoms.write.mode("overwrite").option("compression", codec).format(format).save("prenoms")
    val writtenBytes = pathSize(Paths.get("prenoms"))
    println(s"$format, $codec, $writtenBytes")
}

Sauvegarde au format csv compressé en uncompressed : 

Name: org.apache.spark.SparkException
Message: Job aborted.
StackTrace:   at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:215)
  at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:173)
  at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:173)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
  at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:173)
  at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:145)
  at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
  at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
  at org.apache.spark.sql.execution.com

# Affichage des résultats

In [25]:
// Measured sizes, one "format, codec, size in bytes" record per line.
// NOTE(review): apparently pasted from the console output of the save loop -- confirm.
// Most lines carry trailing padding spaces inside the literal; they are harmless
// because each column is trimmed when the lines are parsed.
val resultats = """csv, uncompressed, 68804952                                                     
csv, bzip2, 10676889                                                            
csv, deflate, 11998839          
csv, gzip, 11998875                                                             
json, uncompressed, 238735998                                                   
json, bzip2, 9491007                                                            
json, deflate, 14662059                                                         
json, gzip, 14662095                                                            
parquet, uncompressed, 8344008                                                  
parquet, gzip, 4820150                                                          
parquet, snappy, 6476035                                 
orc, uncompressed, 13265926                                                     
orc, snappy, 5810818                                                            
orc, zlib, 4377612"""

// One array element per record line (padding still attached at this point).
val sizes = resultats.split("\n")

In [26]:
// Index the measured sizes by (format, codec) so they can be looked up when printing tables.
// A `val` is enough: the map itself is mutable and the reference is never reassigned.
val data = scala.collection.mutable.Map[(String, String), Long]()
// Blank lines (e.g. a trailing newline in the pasted results) are skipped instead of
// failing with an ArrayIndexOutOfBoundsException.
for (line <- sizes if line.trim.nonEmpty) {
    // Columns are "format, codec, size"; trim removes the padding around each value.
    val cols = line.split(",").map(_.trim)
    data((cols(0), cols(1))) = cols(2).toLong
}
println(data)

Map((orc,snappy) -> 5810818, (json,deflate) -> 14662059, (json,bzip2) -> 9491007, (csv,uncompressed) -> 68804952, (parquet,snappy) -> 6476035, (csv,deflate) -> 11998839, (csv,gzip) -> 11998875, (json,uncompressed) -> 238735998, (parquet,gzip) -> 4820150, (orc,uncompressed) -> 13265926, (csv,bzip2) -> 10676889, (json,gzip) -> 14662095, (parquet,uncompressed) -> 8344008, (orc,zlib) -> 4377612)


## Taille (en Mio, soit 1024 × 1024 octets)

In [50]:
// Rows of the table (file formats) and its columns (compression codecs).
val fileFormats = List("csv", "json", "parquet", "orc")
val codecs = List("uncompressed", "bzip2", "deflate", "gzip", "snappy", "zlib")

// Print one row per format, one cell per codec. Sizes are converted from bytes by
// dividing by 1024 * 1024; a (format, codec) pair with no measurement is shown as 0.00.
println("Format | uncomp.  | bzip2    | def.     | gzip     | snappy   | zlib     |")
fileFormats.foreach { fmt =>
    val cells = codecs.map { codec =>
        val size = data.getOrElse((fmt, codec), 0L) / (1024.0 * 1024)
        f"$size%9.2f |"
    }
    println(f"$fmt%-7s|" + cells.mkString)
}

Format | uncomp.  | bzip2    | def.     | gzip     | snappy   | zlib     |
csv    |    65.62 |    10.18 |    11.44 |    11.44 |     0.00 |     0.00 |
json   |   227.68 |     9.05 |    13.98 |    13.98 |     0.00 |     0.00 |
parquet|     7.96 |     0.00 |     0.00 |     4.60 |     6.18 |     0.00 |
orc    |    12.65 |     0.00 |     0.00 |     0.00 |     5.54 |     4.17 |
