In [1]:
# [+] Configure the SparkSession
from pyspark.sql import SparkSession

# Build (or reuse) a session with master 'local' and application name 'udf'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('udf')
    .getOrCreate()
)

In [7]:
# 샘플 데이터
age_category = [
    ('안현', 39),
    ('김태훈', 24),
    ('강동원', 43),
    ('박효신', 43),
    ('태연', 35),
    ('한소희', 30),
    ('최민식', 63)
]

In [9]:
# [+] 데이터프레임 생성
df = spark.createDataFrame(age_category, schema)

In [8]:
# [+] 스키마 정의
schema = ['name', 'age']

In [10]:
# [+] Display the DataFrame contents
df.show()

+------+---+
|  name|age|
+------+---+
|  안현| 39|
|김태훈| 24|
|강동원| 43|
|박효신| 43|
|  태연| 35|
|한소희| 30|
|최민식| 63|
+------+---+



In [12]:
# [+] Print the DataFrame schema
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [13]:
# [+] Register the DataFrame as a temporary view named 'age' so it can be queried with spark.sql
df.createOrReplaceTempView('age')

In [18]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [19]:
def age_category(age):
    """Map an age to a coarse category label.

    Args:
        age: Age as a number, or None when the value is missing.

    Returns:
        'young' for ages below 35, 'adult' for ages from 35 up to (but not
        including) 60, 'senior' for 60 and above, and None when ``age`` is
        None.
    """
    # The 'age' column is nullable, and comparing None with an int raises
    # TypeError in Python 3; propagate the missing value instead of failing.
    if age is None:
        return None
    if age < 35:
        return 'young'
    if age < 60:
        return 'adult'
    return 'senior'

In [20]:
# Wrap the plain Python function as a Spark UDF that returns a string column.
age_category_udf = udf(f=age_category, returnType=StringType())

In [21]:
# Add an 'age_category' column computed by the UDF from 'age', then display it.
(
    df
    .withColumn('age_category', age_category_udf(df['age']))
    .show()
)

+------+---+------------+
|  name|age|age_category|
+------+---+------------+
|  안현| 39|       adult|
|김태훈| 24|       young|
|강동원| 43|       adult|
|박효신| 43|       adult|
|  태연| 35|       adult|
|한소희| 30|       young|
|최민식| 63|      senior|
+------+---+------------+



In [22]:
# Same categorisation expressed in Spark SQL against the 'age' temp view.
# The query result is assigned first and shown in a separate statement:
# DataFrame.show() returns None, so chaining it onto the assignment would
# leave `result` bound to None instead of the DataFrame.
# NOTE: a NULL age matches neither WHEN branch and is labelled 'senior' by ELSE.
result = spark.sql("""
    SELECT name,
           CASE
               WHEN age < 35 THEN 'young'
               WHEN age >= 35 AND age < 60 THEN 'adult'
               ELSE 'senior'
           END AS age_category
    FROM age
""")
result.show()


+------+------------+
|  name|age_category|
+------+------------+
|  안현|       adult|
|김태훈|       young|
|강동원|       adult|
|박효신|       adult|
|  태연|       adult|
|한소희|       young|
|최민식|      senior|
+------+------------+

