In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse, if one already exists) the Spark session used by every
# cell below: local master, application name 'udf_age'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('udf_age')
    .getOrCreate()
)

In [3]:
# Sample rows for the demo: one (name, age-in-years) tuple per person.
# These become the rows of the DataFrame built further down.
age = [
    ('이예은', 22),
    ('안현', 39),
    ('사쿠라', 26),
    ('김양순', 82),
    ('신승옥', 49),
    ('한석규', 60),
    ('김민정', 23),
    ('김소연', 44),
    ('안성기', 72),
    ('최동오', 19)
]

In [4]:
# Column names, in the same order as the fields of each tuple in `age`.
schema = ['name', 'age']

In [5]:
# Turn the in-memory rows into a Spark DataFrame; column names come from
# `schema`, column types are inferred from the Python values.
df = spark.createDataFrame(age, schema)

In [6]:
# Print the DataFrame rows as a text table to check the data loaded correctly.
df.show()

+------+---+
|  name|age|
+------+---+
|이예은| 22|
|  안현| 39|
|사쿠라| 26|
|김양순| 82|
|신승옥| 49|
|한석규| 60|
|김민정| 23|
|김소연| 44|
|안성기| 72|
|최동오| 19|
+------+---+



In [7]:
# Inspect the inferred column types and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [8]:
# Expose the DataFrame to Spark SQL under the view name 'categories' so it
# can be referenced in the FROM clause of spark.sql(...) queries.
df.createOrReplaceTempView('categories')

In [9]:
def age_category(age):
    """Map an age in years to a coarse age-group label.

    Args:
        age: Age in years, or None. The `age` column is nullable, and a
            null value reaches a Python UDF as None.

    Returns:
        "young" when age < 35, "adult" when 35 <= age <= 59, and "senior"
        for anything else. Returns None when `age` is None so that a
        missing age yields a missing category.
    """
    # Comparing None with a number raises TypeError in Python 3, which would
    # fail the whole query; propagate the missing value instead.
    if age is None:
        return None
    if age < 35:
        return "young"
    if 35 <= age <= 59:
        return "adult"
    return "senior"

In [10]:
# Register the Python function under the SQL name 'age_category' so it can
# be called from queries run through spark.sql(...).
spark.udf.register('age_category', age_category)

<function __main__.age_category(age)>

In [11]:
# Call the registered UDF on each row's age from SQL and display the
# resulting name / age / category table.
spark.sql('SELECT name, age, age_category(age) FROM categories').show()

+------+---+-----------------+
|  name|age|age_category(age)|
+------+---+-----------------+
|이예은| 22|            young|
|  안현| 39|            adult|
|사쿠라| 26|            young|
|김양순| 82|           senior|
|신승옥| 49|            adult|
|한석규| 60|           senior|
|김민정| 23|            young|
|김소연| 44|            adult|
|안성기| 72|           senior|
|최동오| 19|            young|
+------+---+-----------------+

