In [1]:
import sys
import os

In [2]:
os.environ.get('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

In [5]:
spark = SparkSession.builder.config("spark.sql.warehouse.dir", "temp").appName("SparkSQL").getOrCreate()

In [6]:
columns = ["id", "name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")
]
df = spark.createDataFrame(data=data, schema=columns)
df.show(truncate=False)

+---+------------+
|id |name        |
+---+------------+
|1  |john jones  |
|2  |tracey smith|
|3  |amy sanders |
+---+------------+



In [7]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



## UDF FUNCTION

In [8]:
def convertSentCase(sent):
    arr = []
    for word in sent.split(" "):
        arr.append(f"{word[0:1].upper()}{word[1:]}")
    return " ".join(arr)

In [9]:
convertSentCase("ali jones")

'Ali Jones'

In [10]:
sent_case_udf = udf(lambda x: convertSentCase(x))

In [11]:
df.select(col("id").cast("int"),
          sent_case_udf(col("name")).alias("user")).show(truncate=False)

+---+------------+
|id |user        |
+---+------------+
|1  |John Jones  |
|2  |Tracey Smith|
|3  |Amy Sanders |
+---+------------+



## UDF SQL FUNCTION

In [12]:
spark.udf.register("sentenceCaseUDF", convertSentCase, StringType())

<function __main__.convertSentCase(sent)>

In [13]:
df.createOrReplaceTempView("users_tbl")
spark.sql("SELECT id, sentenceCaseUDF(name) AS users from users_tbl").show(truncate=False)

+---+------------+
|id |users       |
+---+------------+
|1  |John Jones  |
|2  |Tracey Smith|
|3  |Amy Sanders |
+---+------------+



## NULL SAFE UDF

In [14]:
columns = ["id", "name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
    ("4", None)
]
df = spark.createDataFrame(data=data, schema=columns)
df.show(truncate=False)

+---+------------+
|id |name        |
+---+------------+
|1  |john jones  |
|2  |tracey smith|
|3  |amy sanders |
|4  |null        |
+---+------------+



In [15]:
spark.udf.register("nullSafeUDF", lambda x: convertSentCase(x) if x is not None else "", StringType())

<function __main__.<lambda>(x)>

In [16]:
spark.sql("SELECT nullSafeUDF(name) FROM users_tbl").show(truncate=False)

+-----------------+
|nullSafeUDF(name)|
+-----------------+
|John Jones       |
|Tracey Smith     |
|Amy Sanders      |
+-----------------+

