In [0]:
# This exercise demonstrates how to use User Defined Functions (UDFs)
# Based on the Databricks documentation: https://docs.databricks.com/spark/latest/spark-sql/udf-python.html

In [0]:
# Register a function as a UDF
def squared(s):
  """Return the square of ``s``, passing ``None`` through unchanged.

  This function is used as a Spark UDF. A NULL column value reaches a
  Python UDF as ``None``; without the guard below ``None * None`` raises
  ``TypeError`` and the whole query fails.

  Args:
    s: A value that can be multiplied by itself (e.g. int or float),
      or ``None``.

  Returns:
    ``s * s``, or ``None`` when ``s`` is ``None``.
  """
  if s is None:
    return None
  return s * s
# Declare the return type explicitly: without it the registered UDF
# defaults to a string result, so id_squared would come back as text
# instead of a number.
from pyspark.sql.types import LongType

# Expose `squared` to Spark SQL under the name "squaredWithPython".
spark.udf.register("squaredWithPython", squared, LongType())

Out[17]: <function __main__.squared(s)>

In [0]:
# Create a temporary view named "test" holding the 19 ids 1 through 19
# (the upper bound of spark.range is exclusive), then print its contents.
spark.range(1, 20).createOrReplaceTempView("test")
# Query through the SparkSession, like the rest of the notebook, rather
# than the legacy sqlContext entry point.
spark.sql("select * from test").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [0]:
# Alternative way of showing the content of the table "test"

In [0]:
%sql SELECT * FROM test

id
1
2
3
4
5
6
7
8
9
10


In [0]:
# Call the UDF in Spark SQL

In [0]:
%sql SELECT id, squaredWithPython(id) AS id_squared FROM test

id,id_squared
1,1
2,4
3,9
4,16
5,25
6,36
7,49
8,64
9,81
10,100


In [0]:
# Use UDF with DataFrames
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

# Wrap the plain Python function as a DataFrame UDF. The return type is
# declared explicitly because udf() otherwise defaults to a string result,
# which would make id_squared a string column rather than a numeric one.
squared_udf = udf(squared, LongType())
# Read back the temporary view named "test".
df = spark.table("test")
# Show each id next to its square, computed row by row through the UDF.
display(df.select("id", squared_udf("id").alias("id_squared")))

id,id_squared
1,1
2,4
3,9
4,16
5,25
6,36
7,49
8,64
9,81
10,100
