In [24]:
import sys
import os

In [25]:
# Look up JAVA_HOME in the process environment; .get() yields None when it is unset.
os.environ.get('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [26]:
# import findspark
# findspark.init()

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

In [28]:
# Get or create the notebook's SparkSession, named "SparkSQL", with the
# SQL warehouse directory setting pointed at "temp".
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", "temp")
    .appName("SparkSQL")
    .getOrCreate()
)

In [29]:
# Tiny in-memory sample; every value (including id) is a Python str.
columns = ["id", "name"]
data = [("1", "john jones"), ("2", "tracey smith"), ("3", "amy sanders")]
df = spark.createDataFrame(data, columns)
df.show(truncate=False)

+---+------------+
|id |name        |
+---+------------+
|1  |john jones  |
|2  |tracey smith|
|3  |amy sanders |
+---+------------+



In [30]:
# Print the DataFrame's schema tree (column names, types, nullability).
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



## UDF WITH THE DATAFRAME API

In [31]:
def convertSentCase(sent):
    """Upper-case the first character of every space-separated word.

    Only the leading character of each word is changed; the remaining
    characters are left exactly as given. Splitting is done on the single
    space character, so runs of spaces (which yield empty "words") are
    preserved in the result.

    Args:
        sent: The string to convert. It must be a ``str``; ``None`` raises
            ``AttributeError`` because ``.split`` is called on it directly.

    Returns:
        The converted string, re-joined with single spaces between the
        pieces produced by the split.
    """
    return " ".join(word[:1].upper() + word[1:] for word in sent.split(" "))

In [32]:
# Sanity-check the plain Python function on a sample string.
convertSentCase("ali jones")

'Ali Jones'

In [33]:
# Wrap the Python function as a column-level UDF that returns strings.
# An equivalent spelling is udf(lambda x: convertSentCase(x)).
sent_case_udf = udf(convertSentCase, returnType=StringType())

In [34]:
# Project id cast to int alongside the UDF-converted name, exposed as "user".
(
    df.select(
        col("id").cast("int"),
        sent_case_udf(col("name")).alias("user"),
    )
    .show(truncate=False)
)

+---+------------+
|id |user        |
+---+------------+
|1  |John Jones  |
|2  |Tracey Smith|
|3  |Amy Sanders |
+---+------------+



## UDF IN SPARK SQL

In [35]:
# Make convertSentCase callable from Spark SQL under the name sentenceCaseUDF.
spark.udf.register(
    name="sentenceCaseUDF",
    f=convertSentCase,
    returnType=StringType(),
)

<function __main__.convertSentCase(sent)>

In [19]:
# Register df as the temporary view "users_tbl" so it can be queried with spark.sql.
df.createOrReplaceTempView("users_tbl")

In [None]:
# Call the registered sentenceCaseUDF from SQL on the name column of users_tbl.
spark.sql("SELECT id, sentenceCaseUDF(name) AS users from users_tbl").show(truncate=False)

## NULL-SAFE UDF

In [None]:
# Same sample rows plus one whose name is NULL, to exercise null handling in a UDF.
columns = ["id", "name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
    ("4", None)
]
df = spark.createDataFrame(data=data, schema=columns)
# A temp view stays bound to the DataFrame it was created from; rebinding the
# Python name `df` does not update it. Re-register the view so SQL queries
# against users_tbl see the row with the NULL name.
df.createOrReplaceTempView("users_tbl")
df.show(truncate=False)

In [None]:
# SQL-callable variant that returns "" for a None input instead of handing
# None to convertSentCase.
spark.udf.register(
    "nullSafeUDF",
    lambda x: "" if x is None else convertSentCase(x),
    StringType(),
)

In [None]:
# Apply nullSafeUDF to the name column via SQL; the rows come from whichever
# DataFrame users_tbl was most recently registered from.
spark.sql("SELECT nullSafeUDF(name) FROM users_tbl").show(truncate=False)