In [2]:
import sys
# Extend the module search path with the workshop directory before importing from
# `helpers` below (presumably that is where the package lives — confirm for your environment).
sys.path.append('/home/iceberg/notebooks/PyCon_LT_Workshop')

from helpers.utils import get_spark_session, get_yellow_taxi_data, get_dim_data
from pyspark.sql import functions as f
# Obtain the Spark session used throughout this notebook
# ("udfs" is presumably the application name — verify in helpers.utils).
spark = get_spark_session("udfs")

# Trip data; the later cells filter it on `fare_amount` and join it on `VendorID`.
yellow_taxi_data = get_yellow_taxi_data(spark=spark)

# Dimension data, unpacked into four objects; only `dim_vendor` is used in the cells shown below.
dim_taxi_zones, dim_rates, dim_payments, dim_vendor = get_dim_data(spark)

24/02/16 15:25:53 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# UDF

Shorthand for User Defined Function. Useful when we need more complex logic
that we can't express with the built-in Spark functions. Its biggest flaw is that it takes a lot
of time to serialize data between Python objects and the JVM.

Two types of UDF are available:

1. Generic UDF
2. Pandas UDF (this one works fast in Spark 3.0)

# Generic UDFs

In [4]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf, col

Let's create a simple function that returns the length of a string.

The `@udf` decorator wraps the function as a user defined function; its declared return type
is `IntegerType`.

In [5]:
@udf(returnType=IntegerType())
def vendor_name_length(name):
    """Return the length of ``name``.

    Wrapped by ``@udf`` with an ``IntegerType`` return type so it can be applied
    to a DataFrame column. No null handling is done here: a ``None`` input makes
    ``len`` raise ``TypeError``.
    """
    name_length = len(name)
    return name_length

In [10]:
# Keep only trips with a fare above 10, then full-outer-join them to the vendor
# dimension; rows without a match on either side are kept with nulls.
joined = (
    yellow_taxi_data
    .filter("fare_amount>10")
    .join(
        dim_vendor,
        on=yellow_taxi_data.VendorID == dim_vendor.vendor_id,
        how="full",
    )
)

In [12]:
# Apply the UDF to every vendor_name and store the result in a new column.
joined = joined.withColumn(
    "vendor_name_length",
    vendor_name_length(col("vendor_name")),
)
# Show up to 10 rows for VendorID 6.
joined.filter("VendorID==6").show(n=10)

24/02/16 15:28:21 ERROR Executor: Exception in task 0.0 in stage 20.0 (TID 21)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/ipykernel_1622/3848275267.py", line 3, in vendor_name_length
TypeError: object of type 'NoneType' has no len()

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_1622/3848275267.py", line 3, in vendor_name_length
TypeError: object of type 'NoneType' has no len()


What happened?

Apparently we have null values in the `vendor_name` column, and Python's `len` function doesn't handle them. (There is no entry for key 6 in our dimension table, so the full join leaves `vendor_name` null for those rows.)
So we can fix it:

In [15]:
@udf(returnType=IntegerType())
def vendor_name_length(name):
    """Null-safe length of ``name``.

    Returns 0 for any falsy input (``None`` or an empty string) instead of
    raising; otherwise returns ``len(name)``.
    """
    if not name:
        return 0
    return len(name)

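The same UDF can also be created without the decorator, via `spark.udf.register` (which additionally makes it callable from Spark SQL):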
```python
from pyspark.sql.types import IntegerType

def vendor_name_length(name):
    """Return the length of `name`, or 0 when it is null or empty."""
    return len(name) if name else 0

# register() makes the function callable from Spark SQL under the given name and
# returns the wrapped UDF. Use that returned object in the DataFrame API: calling the
# plain Python function on a Column would run len() on the Column object itself and fail.
vendor_name_length_udf = spark.udf.register("vendor_name_length", vendor_name_length, IntegerType())
df = df.withColumn("vendor_name_length", vendor_name_length_udf(col("vendor_name")))
df.show(5, False)
```

In [14]:
# Recompute the length column with the current vendor_name_length UDF.
joined = joined.withColumn(
    "vendor_name_length",
    vendor_name_length(col("vendor_name")),
)
# Show up to 10 rows for VendorID 6.
joined.filter("VendorID==6").show(n=10)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+---------+-----------+------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|vendor_id|vendor_name|vendor_name_length|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+---------+-----------+------------------+
|       6| 2022-01-01 00:01:45|  2022-01-01 00:01:48|           