## Setting Environment Variables

In [2]:
import os
import sys
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

## Creating Data Frame

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .appName("UDF")\
        .getOrCreate()

In [5]:
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")]

df = spark.createDataFrame(data=data,schema=columns)

## UDF

In [11]:
# Create a Python Function
def convertCase(str):
    resStr = ''
    arr = str.split()
    for x in arr :
        resStr = resStr + x[0:1].upper() + x[1:len(x)] + " "
    return resStr

### udf() function

In [12]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
convertUdf = udf(lambda z : convertCase(z), StringType())

In [13]:
from pyspark.sql.functions import col
df.select(col("Seqno"),\
         convertUdf("Name").alias("Name"))\
    .show()

+-----+-------------+
|Seqno|         Name|
+-----+-------------+
|    1|  John Jones |
|    2|Tracey Smith |
|    3| Amy Sanders |
+-----+-------------+



### UDF() withColumn()

In [14]:
def upperCase(str):
    return str.upper()

In [15]:
upperCaseUDF = udf(lambda z : upperCase(z))
df.withColumn("Curerated Name : ", upperCaseUDF(col("Name")))\
.show()

+-----+------------+-----------------+
|Seqno|        Name|Curerated Name : |
+-----+------------+-----------------+
|    1|  john jones|       JOHN JONES|
|    2|tracey smith|     TRACEY SMITH|
|    3| amy sanders|      AMY SANDERS|
+-----+------------+-----------------+



### Registering PySpark UDF

In [19]:
spark.udf.register("convertUDF", convertCase,StringType())
df.createOrReplaceTempView("TAB")
ud = spark.sql(
    """
        SELECT Seqno, convertUDF(Name) as Name from TAB
    """
)
ud.show()

+-----+-------------+
|Seqno|         Name|
+-----+-------------+
|    1|  John Jones |
|    2|Tracey Smith |
|    3| Amy Sanders |
+-----+-------------+



### Creating UDF using Annotation

In [21]:
@udf(returnType = StringType())
def upperCase(str):
    return str.upper()

In [23]:
df.withColumn("Cureated Name", upperCase("Name")).show()

+-----+------------+-------------+
|Seqno|        Name|Cureated Name|
+-----+------------+-------------+
|    1|  john jones|   JOHN JONES|
|    2|tracey smith| TRACEY SMITH|
|    3| amy sanders|  AMY SANDERS|
+-----+------------+-------------+



In [24]:
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
    ('4',None)]

df2 = spark.createDataFrame(data=data,schema=columns)
df2.show(truncate=False)
df2.createOrReplaceTempView("NAME_TABLE2")

spark.sql("select convertUDF(Name) from NAME_TABLE2") \
     .show(truncate=False)

# Note that from the above snippet, record with “Seqno 4” has value “None” for “name” column. 
# Since we are not handling null with UDF function, using this on DataFrame returns below error.
# Note that in Python None is considered null.

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
|4    |null        |
+-----+------------+



PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\Users\Yateesh Chandra\AppData\Local\Temp\ipykernel_26956\900373106.py", line 4, in convertCase
AttributeError: 'NoneType' object has no attribute 'split'
