# 11.1 Dataset을 사용할 시기
- DataFrame 기능만으로는 수행할 연산을 표현할 수 없는 경우
- 성능 저하를 감수하더라도 타입 안전성(type safety)을 보장하는 데이터 타입을 사용하고 싶은 경우

# 11.2 Dataset 생성

## 11.2.1 자바 : Encoders

## 11.2.2 스칼라 : 케이스 클래스
- 케이스 클래스
    - 불변성
    - 패턴 매칭으로 분해 가능
    - 참조값 대신 클래스 구조를 기반으로 비교
    - 사용하기 쉽고 다루기 편함

In [4]:
// Evaluate `spark`; the recorded output shows it is an org.apache.spark.sql.SparkSession.
spark

res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5450c6e8


In [5]:
/** One row of the flight summary data, used as the element type of a typed Dataset.
  *
  * The field names are spelled exactly like the columns of the source data so that
  * `DataFrame.as[Flight]` can bind columns to fields by name.
  *
  * @param DEST_COUNTRY_NAME   value of the destination-country column
  * @param ORIGIN_COUNTRY_NAME value of the origin-country column
  * @param count               value of the `count` column
  */
case class Flight(
  DEST_COUNTRY_NAME: String,
  ORIGIN_COUNTRY_NAME: String,
  count: BigInt
)

defined class Flight


In [6]:
// Load the 2010 flight summary from a local Parquet directory as an untyped DataFrame.
val flightDF = spark
  .read
  .parquet("/Users/choyubin/Downloads/Spark-The-Definitive-Guide-master/data/flight-data/parquet/2010-summary.parquet/")

// View the same rows as a typed Dataset[Flight]. This needs an implicit Encoder[Flight]
// in scope (presumably from spark.implicits._, which is not shown in this excerpt).
val flights = flightDF.as[Flight]

flightDF: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]
flights: org.apache.spark.sql.Dataset[Flight] = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


# 11.3 액션

In [8]:
// Print the leading rows as a text table; 20 is the row count the no-argument show() uses.
flights.show(20)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [9]:
// Fetch the first Flight to the driver and read its destination through the typed field.
flights.head().DEST_COUNTRY_NAME

res2: String = United States


# 11.4 트랜스포메이션
- DataFrame의 모든 트랜스포메이션은 Dataset에서 사용 가능

## 11.4.1 필터링

In [10]:
/** Tells whether a flight record starts and ends in the same country.
  *
  * @param flight_row the flight record to inspect
  * @return `true` when `ORIGIN_COUNTRY_NAME` equals `DEST_COUNTRY_NAME`, `false` otherwise
  */
def originIsDestination(flight_row: Flight): Boolean =
  flight_row.ORIGIN_COUNTRY_NAME == flight_row.DEST_COUNTRY_NAME

originIsDestination: (flight_row: Flight)Boolean


In [13]:
// Pull every Flight to the driver as an Array, then keep only the rows whose origin and
// destination match. The filter here is the local Array.filter, not Dataset.filter.
flights.collect().filter(originIsDestination)

res4: Array[Flight] = Array(Flight(United States,United States,348113))


## 11.4.2 매핑
- 필터링은 단순한 트랜스포메이션
- 때로는 특정 값을 다른 값으로 매핑해야 할 때가 있음

In [14]:
// Project each Flight to its destination country name, producing a Dataset[String].
val destinations = flights.map(_.DEST_COUNTRY_NAME)

destinations: org.apache.spark.sql.Dataset[String] = [value: string]


In [15]:
// Bring the first five destination names back to the driver as a local Array.
// NOTE(review): the recorded run of this cell aborted with a ClassCastException between
// two class loaders (ExecutorClassLoader vs. the REPL's TranslatingClassLoader). That looks
// like a notebook/REPL class-loading issue rather than a problem with take(5) — confirm.
val localDestinations = destinations.take(5)

23/03/24 20:03:45 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 5)
java.lang.ClassCastException: class $line9.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw cannot be cast to class $line9.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw ($line9.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw is in unnamed module of loader org.apache.spark.repl.ExecutorClassLoader @526c5c28; $line9.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw is in unnamed module of loader scala.tools.nsc.interpreter.IMain$TranslatingClassLoader @3874601d)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.deserializetoobject_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution

org.apache.spark.SparkException:  Job aborted due to stage failure: Task 0 in stage 5.0 failed 1 times, most recent failure: Lost task 0.0 in stage 5.0 (TID 5) (192.168.0.2 executor driver): java.lang.ClassCastException: class $iw cannot be cast to class $iw ($iw is in unnamed module of loader org.apache.spark.repl.ExecutorClassLoader @526c5c28; $iw is in unnamed module of loader scala.tools.nsc.interpreter.IMain$TranslatingClassLoader @3874601d)

# 11.5 조인

In [18]:
/** Extra data attached to flights, matched to `Flight` rows through the `count` field.
  *
  * @param count      value matched against a flight's `count` when joining
  * @param randomData arbitrary payload carried alongside `count`
  */
case class FlightMetadata(
  count: BigInt,
  randomData: BigInt
)

defined class FlightMetadata


In [19]:
// Build 500 (id, random long) pairs from spark.range, name the two tuple columns
// "count" and "randomData", and view the result as a typed Dataset[FlightMetadata].
val flightsMeta = spark
  .range(500)
  .map(id => (id, scala.util.Random.nextLong()))
  .toDF("count", "randomData")
  .as[FlightMetadata]

flightsMeta: org.apache.spark.sql.Dataset[FlightMetadata] = [count: bigint, randomData: bigint]


In [21]:
// Join on equal `count` values. joinWith keeps both sides whole, so each result row is a
// (Flight, FlightMetadata) pair instead of one flattened row.
val flights2 = flights.joinWith(
  flightsMeta,
  flights("count") === flightsMeta("count")
)

flights2: org.apache.spark.sql.Dataset[(Flight, FlightMetadata)] = [_1: struct<DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field>, _2: struct<count: bigint, randomData: bigint>]


In [22]:
// Reach into the left element (_1) of each joined pair and select its DEST_COUNTRY_NAME;
// the result is an untyped DataFrame with that single column.
flights2.selectExpr("_1.DEST_COUNTRY_NAME")

res5: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string]


In [23]:
// Fetch the first two joined (Flight, FlightMetadata) pairs to the driver as an Array.
flights2.head(2)

res6: Array[(Flight, FlightMetadata)] = Array((Flight(United States,Uganda,1),FlightMetadata(1,999576434334245861)), (Flight(United States,French Guiana,1),FlightMetadata(1,999576434334245861)))


In [24]:
// Equi-join on the shared "count" column. A regular join returns an untyped DataFrame
// in which "count" appears only once.
val flights2 = flights.join(flightsMeta, "count")

flights2: org.apache.spark.sql.DataFrame = [count: bigint, DEST_COUNTRY_NAME: string ... 2 more fields]


In [25]:
// Same join on "count", but flightsMeta is first converted to a DataFrame with toDF(),
// so this joins a Dataset with a DataFrame.
// NOTE(review): the val is spelled `fights2`, while the neighbouring cells use `flights2`
// — presumably a typo; confirm nothing relies on this name before renaming.
val fights2 = flights.join(flightsMeta.toDF(), Seq("count"))

fights2: org.apache.spark.sql.DataFrame = [count: bigint, DEST_COUNTRY_NAME: string ... 2 more fields]


# 11.6 그룹화와 집계

In [27]:
// Count rows per destination country using the untyped, column-name based groupBy,
// then print the result table.
flights
  .groupBy("DEST_COUNTRY_NAME")
  .count()
  .show()

+--------------------+-----+
|   DEST_COUNTRY_NAME|count|
+--------------------+-----+
|            Anguilla|    1|
|              Russia|    1|
|            Paraguay|    1|
|             Senegal|    1|
|              Sweden|    1|
|            Kiribati|    1|
|              Guyana|    1|
|         Philippines|    1|
|            Malaysia|    1|
|           Singapore|    1|
|                Fiji|    1|
|              Turkey|    1|
|             Germany|    1|
|         Afghanistan|    1|
|              Jordan|    1|
|               Palau|    1|
|Turks and Caicos ...|    1|
|              France|    1|
|              Greece|    1|
|              Taiwan|    1|
+--------------------+-----+
only showing top 20 rows



In [29]:
// Typed grouping: the key comes from a function over Flight, and count() yields a
// Dataset of (key, count) pairs.
flights.groupByKey(_.DEST_COUNTRY_NAME).count()

res10: org.apache.spark.sql.Dataset[(String, Long)] = [key: string, count(1): bigint]
