# 6.2 스파크 데이터 타입으로 변환하기
- lit 함수를 사용
- lit 함수 : 다른 언어의 데이터 타입을 스파크 데이터 타입에 맞게 변환

In [1]:
// Evaluate the pre-built SparkSession so the REPL echoes it.
spark

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.2:4040
SparkContext available as 'sc' (version = 3.3.2, master = local[*], app id = local-1681990673327)
SparkSession available as 'spark'


res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5450c6e8


In [2]:
import org.apache.spark.sql.functions.lit

// Read one day of the retail CSV data; the first line is treated as a header
// and column types are inferred by Spark.
val df = spark.read.format("csv")
.option("header", "true")
.option("inferSchema", "true")
.load("sample_data/retail-data/by-day/2010-12-01.csv")

// lit turns native Scala values (Int, String, Double) into Spark literal Columns.
// No action is called here, so only the resulting DataFrame's schema is echoed.
df.select(lit(5), lit("five"), lit(5.0))

import org.apache.spark.sql.functions.lit
df: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]
res1: org.apache.spark.sql.DataFrame = [5: int, five: string ... 1 more field]


# 6.3 불리언 데이터 타입 다루기

In [3]:
import org.apache.spark.sql.functions.col

// Keep rows whose InvoiceNo equals 536365 (Column.equalTo form) and print the first
// 5 rows of two columns; the `false` argument disables truncation of long values.
df.where(col("InvoiceNo").equalTo(536365)).select("InvoiceNo", "Description").show(5,false)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



import org.apache.spark.sql.functions.col


In [4]:
import org.apache.spark.sql.functions.col

// Equality filter written with the === Column operator instead of equalTo.
df.where(col("InvoiceNo") === 536365)
.select("InvoiceNo", "Description")
.show(5,false)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



import org.apache.spark.sql.functions.col


In [6]:
// Use boolean expressions as filter conditions.
// Each filter is a Column expression that can be combined with other filters later.
val priceFilter = col("UnitPrice") >600
val descripFilter = col("Description").contains("POSTAGE")

// Rows must have StockCode "DOT" and then satisfy (price filter OR description filter).
df.where(col("StockCode").isin("DOT")).where(priceFilter.or(descripFilter)).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



priceFilter: org.apache.spark.sql.Column = (UnitPrice > 600)
descripFilter: org.apache.spark.sql.Column = contains(Description, POSTAGE)


In [7]:
// Filter the DataFrame using a boolean column.
val DOTCodeFilter = col("StockCode") === "DOT"
val priceFilter = col("UnitPrice") > 600
val descripFilter = col("Description").contains("POSTAGE")

// Materialize the combined condition as the boolean column "isExpensive",
// keep only the rows where it is true, and show the price next to the flag.
df.withColumn("isExpensive", DOTCodeFilter.and(priceFilter.or(descripFilter)))
.where("isExpensive")
.select("unitPrice", "isExpensive").show(5)

+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



DOTCodeFilter: org.apache.spark.sql.Column = (StockCode = DOT)
priceFilter: org.apache.spark.sql.Column = (UnitPrice > 600)
descripFilter: org.apache.spark.sql.Column = contains(Description, POSTAGE)


# 6.4 수치형 데이터 타입 다루기

In [9]:
import org.apache.spark.sql.functions.{expr, pow}

// pow: build the Column expression (Quantity * UnitPrice)^2 + 5.
val fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
// Show it next to CustomerId under the alias "realQuantity".
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show()

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
|   17850.0|          418.7156|
|   17850.0|          418.7156|
|   17850.0|239.09000000000003|
|   17850.0|            655.25|
|   17850.0|128.21000000000004|
|   17850.0|128.21000000000004|
|   13047.0|2929.6463999999996|
|   13047.0|163.76000000000005|
|   13047.0|163.76000000000005|
|   13047.0|             905.0|
|   13047.0|103.00999999999998|
|   13047.0|            655.25|
|   13047.0|225.52250000000004|
|   13047.0|401.00999999999993|
|   13047.0|323.62250000000006|
|   13047.0|323.62250000000006|
|   13047.0|           1016.24|
+----------+------------------+
only showing top 20 rows



import org.apache.spark.sql.functions.{expr, pow}
fabricatedQuantity: org.apache.spark.sql.Column = (POWER((Quantity * UnitPrice), 2.0) + 5)


In [10]:
// round rounds to the given number of decimal places (here 1), e.g. 2.55 -> 2.6.
// NOTE(review): bround is imported but not used in this cell; per the Spark docs it
// rounds with HALF_EVEN mode rather than always rounding down — confirm before relying on it.
import org.apache.spark.sql.functions.{round, bround}

df.select(round(col("UnitPrice"), 1).alias("rounded"), col("UnitPrice")).show(5)

+-------+---------+
|rounded|UnitPrice|
+-------+---------+
|    2.6|     2.55|
|    3.4|     3.39|
|    2.8|     2.75|
|    3.4|     3.39|
|    3.4|     3.39|
+-------+---------+
only showing top 5 rows



import org.apache.spark.sql.functions.{round, bround}


In [12]:
// Pearson correlation coefficient between Quantity and UnitPrice, computed two ways.
import org.apache.spark.sql.functions.{corr}

// 1) Through the DataFrame statistics helper.
df.stat.corr("Quantity", "UnitPrice")
// 2) Through the corr function inside a select, printed with show().
df.select(corr("Quantity", "UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



import org.apache.spark.sql.functions.corr


In [13]:
// Summary statistics (count, mean, stddev, min, max) for the DataFrame's columns.
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

# 6.5 문자열 데이터 타입 다루기

In [14]:
// Case conversion.
// initcap capitalizes the first letter of every whitespace-separated word in the string.
import org.apache.spark.sql.functions.{initcap}

df.select(initcap(col("Description"))).show(2, false)

+----------------------------------+
|initcap(Description)              |
+----------------------------------+
|White Hanging Heart T-light Holder|
|White Metal Lantern               |
+----------------------------------+
only showing top 2 rows



import org.apache.spark.sql.functions.initcap


In [15]:
// lower / upper: convert the whole string to lowercase / uppercase.
import org.apache.spark.sql.functions.{lower, upper}

// Show the original value, its lowercase form, and the lowercase form uppercased again.
df.select(col("Description"),
          lower(col("Description")),
          upper(lower(col("Description")))).show(2)

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
+--------------------+--------------------+-------------------------+
only showing top 2 rows



import org.apache.spark.sql.functions.{lower, upper}


## 6.5.1 정규 표현식
- 정규 표현식을 사용해 문자열에서 값을 추출하거나 다른 값으로 치환하는 데 필요한 규칙 모음을 정의할 수 있음

In [16]:
import org.apache.spark.sql.functions.regexp_replace

// Build the alternation pattern "BLACK|WHITE|RED|GREEN|BLUE" from the color names.
val simpleColors = Seq("black", "white", "red", "green", "blue")
val regexString = simpleColors.map(_.toUpperCase).mkString("|")

// Replace every match of the pattern in Description with the literal "COLOR"
// and show the cleaned value next to the original.
df.select(
    regexp_replace(col("Description"), regexString, "COLOR").alias("color_clean"),
    col("Description")).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



import org.apache.spark.sql.functions.regexp_replace
simpleColors: Seq[String] = List(black, white, red, green, blue)
regexString: String = BLACK|WHITE|RED|GREEN|BLUE


In [19]:
import org.apache.spark.sql.functions.translate

// translate replaces characters one-for-one by position: L -> 1, E -> 3, T -> 7.
df.select(translate(col("Description"), "LEET", "1337"), col("Description")).show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



import org.apache.spark.sql.functions.translate


# 6.6 날짜와 타임스탬프 데이터 타입 다루기

In [20]:
// Get today's date and the current timestamp.
import org.apache.spark.sql.functions.{current_date, current_timestamp}

// 10-row DataFrame (ids from spark.range) with a "today" date column and a "now" timestamp column.
val dateDF = spark.range(10)
.withColumn("today", current_date())
.withColumn("now", current_timestamp())
// Register it as a temp view so it can also be queried by name from SQL.
dateDF.createOrReplaceTempView("dateTable")

import org.apache.spark.sql.functions.{current_date, current_timestamp}
dateDF: org.apache.spark.sql.DataFrame = [id: bigint, today: date ... 1 more field]


In [21]:
// Using dateDF, compute the dates 5 days before and 5 days after today.
// date_sub subtracts days; date_add adds days.
import org.apache.spark.sql.functions.{date_add, date_sub}

dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2023-04-15|        2023-04-25|
+------------------+------------------+
only showing top 1 row



import org.apache.spark.sql.functions.{date_add, date_sub}


In [24]:
// Number of days between two dates (datediff) and number of months between two dates
// (months_between). Only datediff is used in this cell.
import org.apache.spark.sql.functions.{datediff, months_between, to_date}

// week_ago is 7 days before today, so datediff(week_ago, today) is -7.
dateDF.withColumn("week_ago", date_sub(col("today"), 7))
.select(datediff(col("week_ago"), col("today"))).show(1)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row



import org.apache.spark.sql.functions.{datediff, months_between, to_date}


In [25]:
// Parse two fixed date strings with to_date, then compute the number of months between them.
// "start" precedes "end", so the result is negative.
dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))
        .select(months_between(col("start"), col("end"))).show(1)

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                    -16.67741935|
+--------------------------------+
only showing top 1 row



In [26]:
// Specify an explicit date format.
import org.apache.spark.sql.functions.to_date

// Year-day-month pattern: "2017-12-11" is read as day 12 of month 11.
val dateFormat = "yyyy-dd-MM"
val cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))
// Register the result as a temp view so it can also be queried by name from SQL.
cleanDateDF.createOrReplaceTempView("dateTable2")

import org.apache.spark.sql.functions.to_date
dateFormat: String = yyyy-dd-MM
cleanDateDF: org.apache.spark.sql.DataFrame = [date: date, date2: date]


In [27]:
import org.apache.spark.sql.functions.to_timestamp

// Convert the "date" column to a timestamp, passing the same format string.
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()

+------------------------------+
|to_timestamp(date, yyyy-dd-MM)|
+------------------------------+
|           2017-11-12 00:00:00|
+------------------------------+



import org.apache.spark.sql.functions.to_timestamp


# 6.7 null 값 다루기

# 6.9 복합 데이터 타입 다루기

## 6.9.1 구조체
- DataFrame 내부의 DataFrame

In [29]:
// Alternative (commented-out) selectExpr forms that build a struct column named "complex":
//df.selectExpr("(Description, InvoiceNo) as complex", "*")
//df.selectExpr("struct(Description, InvoiceNo) as complex", "*")

import org.apache.spark.sql.functions.struct

// Pack Description and InvoiceNo into a single struct column named "complex".
val complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
// Register it as a temp view so it can also be queried by name from SQL.
complexDF.createOrReplaceTempView("complexDF")

import org.apache.spark.sql.functions.struct
complexDF: org.apache.spark.sql.DataFrame = [complex: struct<Description: string, InvoiceNo: string>]


## 6.9.2 배열

In [30]:
// Convert the Description column to an array with the split function (split on a single space).
import org.apache.spark.sql.functions.split

df.select(split(col("Description"), " ")).show(2)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [WHITE, METAL, LA...|
+-------------------------+
only showing top 2 rows



import org.apache.spark.sql.functions.split


In [31]:
// Name the split result "array_col", then read its first element with SQL index syntax.
df.select(split(col("Description"), " ").alias("array_col")).selectExpr("array_col[0]").show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



### 배열의 길이

In [32]:
import org.apache.spark.sql.functions.size

// size returns the number of elements in the array produced by split.
df.select(size(split(col("Description"), " "))).show(2)

+-------------------------------+
|size(split(Description,  , -1))|
+-------------------------------+
|                              5|
|                              3|
+-------------------------------+
only showing top 2 rows



import org.apache.spark.sql.functions.size


### array_contains

In [33]:
import org.apache.spark.sql.functions.array_contains

// array_contains checks whether the split array contains the element "WHITE".
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows



import org.apache.spark.sql.functions.array_contains


### explode
- 배열 타입의 컬럼을 입력받아, 배열의 각 요소를 별도의 로우로 펼침 (나머지 컬럼 값은 각 로우에 복제됨)

In [34]:
import org.apache.spark.sql.functions.{split, explode}

// Split Description into an array ("splitted"), then explode it so that each array
// element becomes its own row ("exploded") while the other columns are repeated.
df.withColumn("splitted", split(col("Description"), " "))
.withColumn("exploded", explode(col("splitted")))
.select("Description", "InvoiceNo", "exploded").show(2)

+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| HANGING|
+--------------------+---------+--------+
only showing top 2 rows



import org.apache.spark.sql.functions.{split, explode}


## 6.9.3 맵

In [35]:
import org.apache.spark.sql.functions.map

// Build a map column per row with Description as the key and InvoiceNo as the value.
df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map")).show(2)

+--------------------+
|         complex_map|
+--------------------+
|{WHITE HANGING HE...|
|{WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



import org.apache.spark.sql.functions.map


In [36]:
// Look up data in the map using a matching key.
// Rows whose map does not contain the key yield null.
df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map"))
.selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
+--------------------------------+
only showing top 2 rows



# 6.10 JSON 다루기

In [37]:
// Create a one-row DataFrame with a single string column "jsonString" holding a JSON document.
val jsonDF = spark.range(1).selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

jsonDF: org.apache.spark.sql.DataFrame = [jsonString: string]


In [38]:
// Query the JSON string inline with get_json_object.
import org.apache.spark.sql.functions.{get_json_object, json_tuple}

// get_json_object extracts the value at a JSON path (here index 1 of the array, i.e. 2);
// json_tuple extracts the value stored under the top-level key "myJSONKey".
jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]") as "column",
    json_tuple(col("jsonString"), "myJSONKey")).show(2)

+------+--------------------+
|column|                  c0|
+------+--------------------+
|     2|{"myJSONValue":[1...|
+------+--------------------+



import org.apache.spark.sql.functions.{get_json_object, json_tuple}


In [39]:
// Convert a StructType column to a JSON string.
import org.apache.spark.sql.functions.to_json

// Build the struct column "myStruct" from two columns, then serialize it with to_json.
// No action is called here, so only the resulting DataFrame's schema is echoed.
df.selectExpr("(InvoiceNo, Description) as myStruct")
.select(to_json(col("myStruct")))

import org.apache.spark.sql.functions.to_json
res33: org.apache.spark.sql.DataFrame = [to_json(myStruct): string]


# 6.11 사용자 정의 함수

In [40]:
// Five-row DataFrame with a single bigint column renamed to "num", used as UDF sample input.
val udfExampleDF = spark.range(5).toDF("num")
/** Raises a number to the third power.
  *
  * @param number the value to cube
  * @return `number` multiplied by itself three times
  */
def power3(number: Double): Double = {
  // Square first, then multiply once more — same left-to-right evaluation order
  // as a plain three-factor product.
  val squared = number * number
  squared * number
}
// Quick sanity check of the plain Scala function: 2.0 cubed is 8.0.
power3(2.0)

udfExampleDF: org.apache.spark.sql.DataFrame = [num: bigint]
power3: (number: Double)Double
res34: Double = 8.0
