# Built-In Functions
These built-in functions are designed to generate optimized code for execution at
runtime.
> They are designed to take one
or more columns of the same row as the input, and they return only a single column as
the output.

| Category       | Description                                                                                                                 |
|----------------|-----------------------------------------------------------------------------------------------------------------------------|
| Date time      | unix_timestamp, from_unixtime, to_date, current_date, current_timestamp, date_add, date_sub, add_months, datediff, months_between, dayofmonth, dayofyear, weekofyear, second, minute, hour, month, make_date, make_timestamp, make_interval |
| String         | concat, length, levenshtein, locate, lower, upper, ltrim, rtrim, trim, lpad, rpad, repeat, reverse, split, substring, base64 |
| Math           | cos, acos, sin, asin, tan, atan, ceil, floor, exp, factorial, log, pow, radians, degrees, sqrt, hex, unhex                  |
| Cryptography   | crc32, hash, md5, sha1, sha2                                                                                                 |
| Aggregation    | approx_count_distinct, countDistinct, sumDistinct, avg, corr, count, first, last, max, min, skewness, sum                    |
| Collection     | array_contains, explode, from_json, size, sort_array, to_json                                                                |
| Window         | dense_rank, lag, lead, ntile, rank, row_number                                                                               |
| Misc.          | coalesce, isnan, isnull, isNotNull, monotonically_increasing_id, lit, when                                                  |


In [1]:
from pyspark.sql import SparkSession
import os
import pyspark.sql.functions as F

# Get the SparkSession for this notebook via the builder, with the application name "SQL".
spark = SparkSession.builder.appName("SQL").getOrCreate()

24/05/24 14:12:56 WARN Utils: Your hostname, msi-MAG resolves to a loopback address: 127.0.1.1; using 192.168.0.129 instead (on interface wlp3s0)
24/05/24 14:12:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/24 14:12:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Single sample row. date_str and ts_str are written month-first, so they
# need an explicit pattern when they are parsed later on.
testDF = spark.createDataFrame(
    [(1, "2018-01-01", "2018-01-01 15:04:58:865", "01-01-2018", "12-05-2017 00:45:50")],
    schema=["id", "date", "timestamp", "date_str", "ts_str"],
)

testDF.show(truncate=False)



+---+----------+-----------------------+----------+-------------------+
|id |date      |timestamp              |date_str  |ts_str             |
+---+----------+-----------------------+----------+-------------------+
|1  |2018-01-01|2018-01-01 15:04:58:865|01-01-2018|12-05-2017 00:45:50|
+---+----------+-----------------------+----------+-------------------+



## TimeFunctions

Convert strings to dates and timestamps.

In [4]:
# Parse the string columns of testDF into typed date / timestamp columns.
testResultDF = testDF.select(
    # no pattern passed for the "date" column
    F.to_date(F.col("date")).alias("date1"),
    # the data puts ':' before the milliseconds, so the pattern is spelled out
    F.to_timestamp(F.col("timestamp"), "yyyy-MM-dd HH:mm:ss:SSS").alias("ts1"),
    # month-first date string
    F.to_date(F.col("date_str"), "MM-dd-yyyy").alias("date2"),
    # month-first timestamp string
    F.to_timestamp(F.col("ts_str"), "MM-dd-yyyy HH:mm:ss").alias("ts2"),
    # same pattern as ts1, but the result is a long instead of a timestamp
    F.unix_timestamp(F.col("timestamp"), "yyyy-MM-dd HH:mm:ss:SSS").alias("unix_ts"),
)

In [5]:
# NOTE: printSchema() writes the schema tree to stdout itself and returns None,
# so wrapping it in print() also emits the stray "None" line seen in the output.
print(testResultDF.printSchema())

testResultDF.show(truncate=False)

root
 |-- date1: date (nullable = true)
 |-- ts1: timestamp (nullable = true)
 |-- date2: date (nullable = true)
 |-- ts2: timestamp (nullable = true)
 |-- unix_ts: long (nullable = true)

None
+----------+-----------------------+----------+-------------------+----------+
|date1     |ts1                    |date2     |ts2                |unix_ts   |
+----------+-----------------------+----------+-------------------+----------+
|2018-01-01|2018-01-01 15:04:58.865|2018-01-01|2017-12-05 00:45:50|1514815498|
+----------+-----------------------+----------+-------------------+----------+



> Extract the individual fields of a date or timestamp
> * get day
> * get month
> * get year
> * get hour
> * get minute 
> * get second

In [13]:
# Pull the individual clock and calendar fields out of a timestamp string.
data = [
    ["1", "2020-02-01 11:01:19.06"],
    ["2", "2019-03-01 12:01:19.406"],
    ["3", "2021-03-01 12:01:19.406"],
]
df3 = spark.createDataFrame(data, ["id", "input"])

df3.select(
    "id",
    "input",
    # time-of-day parts
    F.hour("input").alias("hour"),
    F.minute("input").alias("minute"),
    F.second("input").alias("second"),
    # calendar parts
    F.year("input").alias("year"),
    F.month("input").alias("month"),
    F.dayofmonth("input").alias("day"),
    F.dayofyear("input").alias("doy"),
    F.weekofyear("input").alias("week_of_year"),
    F.dayofweek("input").alias("day_of_week"),
).show(truncate=False)


+---+-----------------------+----+------+------+----+-----+---+---+------------+-----------+
|id |input                  |hour|minute|second|year|month|day|doy|week_of_year|day_of_week|
+---+-----------------------+----+------+------+----+-----+---+---+------------+-----------+
|1  |2020-02-01 11:01:19.06 |11  |1     |19    |2020|2    |1  |32 |5           |7          |
|2  |2019-03-01 12:01:19.406|12  |1     |19    |2019|3    |1  |60 |9           |6          |
|3  |2021-03-01 12:01:19.406|12  |1     |19    |2021|3    |1  |60 |9           |2          |
+---+-----------------------+----+------+------+----+-----+---+---+------------+-----------+



> **Operation between 2 dates**
> * datediff
> * months_between
> * last_day of the month

In [10]:
# Two employees with their join and leave dates kept as date strings.
employeeData = spark.createDataFrame(
    [
        ("John", "2016-01-01", "2017-10-15"),
        ("May", "2017-02-06", "2017-12-25"),
    ],
    ("name", "join_date", "leave_date"),
)
employeeData.show()

+----+----------+----------+
|name| join_date|leave_date|
+----+----------+----------+
|John|2016-01-01|2017-10-15|
| May|2017-02-06|2017-12-25|
+----+----------+----------+



In [11]:
employeeData.select(
    "name",
    # number of days from join_date to leave_date
    F.datediff(F.col("leave_date"), F.col("join_date")).alias("n_days"),
    # the same span expressed in (fractional) months
    F.months_between(F.col("leave_date"), F.col("join_date")).alias("n_months"),
    # last calendar day of the month that leave_date falls in
    F.last_day(F.col("leave_date")).alias("last_day_of_mon"),
).show()

+----+------+-----------+---------------+
|name|n_days|   n_months|last_day_of_mon|
+----+------+-----------+---------------+
|John|   653| 21.4516129|     2017-10-31|
| May|   322|10.61290323|     2017-12-31|
+----+------+-----------+---------------+



> **Arithmetic on a Date**
* add days
* sub days
* add months
* sub months


In [14]:
# One-row frame holding a single date to add to and subtract from
oneDate = spark.createDataFrame(data=[("2018-01-01",)], schema=["new_year"])
oneDate.show()

+----------+
|  new_year|
+----------+
|2018-01-01|
+----------+



In [15]:
oneDate.select(
    "new_year",
    # shift by a number of days
    F.date_add(F.col("new_year"), 10).alias("date_plus_10_days"),
    F.date_sub(F.col("new_year"), 10).alias("date_minus_10_days"),
    # shift by a number of months; a negative count moves backwards
    F.add_months(F.col("new_year"), 2).alias("date_plus_2_months"),
    F.add_months(F.col("new_year"), -2).alias("date_minus_2_months"),
).show()

+----------+-----------------+------------------+------------------+-------------------+
|  new_year|date_plus_10_days|date_minus_10_days|date_plus_2_months|date_minus_2_months|
+----------+-----------------+------------------+------------------+-------------------+
|2018-01-01|       2018-01-11|        2017-12-22|        2018-03-01|         2017-11-01|
+----------+-----------------+------------------+------------------+-------------------+



### Compare to current date

In [39]:
# current_date() is evaluated when the query runs, so this output changes from day to day
# (current_timestamp() is the timestamp counterpart).
# new_year is passed as the first argument of datediff and today's date as the second,
# which is why a date in the past produces a negative number of days.
oneDate.select(F.col("new_year"),F.current_date().alias("current_date"),
               F.datediff(oneDate.new_year, F.current_date()).alias("datediff")).show(truncate=False)

+----------+------------+--------+
|new_year  |current_date|datediff|
+----------+------------+--------+
|2018-01-01|2024-05-20  |-2331   |
+----------+------------+--------+



### trunc(column, format)	
Truncate a date or timestamp column in a DataFrame to a specified level of granularity. For example, `trunc(df['date_column'], 'month')` truncates the dates in the `date_column` column of DataFrame `df` to the first day of the month, effectively discarding the day component and retaining only the month and year.
> Possible levels
> * Month
> * Year

In [48]:
# Truncate the same timestamp strings at month level and at year level.
df3.select(
    "input",
    F.trunc("input", "Month").alias("Keep_Month"),  # first day of that month
    F.trunc("input", "Year").alias("Keep_Year"),  # first day of that year
).show(truncate=False)

+-----------------------+----------+----------+
|input                  |Keep_Month|Keep_Year |
+-----------------------+----------+----------+
|2020-02-01 11:01:19.06 |2020-02-01|2020-01-01|
|2019-03-01 12:01:19.406|2019-03-01|2019-01-01|
|2021-03-01 12:01:19.406|2021-03-01|2021-01-01|
+-----------------------+----------+----------+



## Advanced: Bucketing
### Date and Timestamp Window Functions
| DATE & TIME WINDOW FUNCTION SYNTAX | DATE & TIME WINDOW FUNCTION DESCRIPTION |
|------------------------------------|-----------------------------------------|
| window(timeColumn, windowDuration, slideDuration, startTime) | Bucketize rows into one or more time windows given a timestamp specifying column. Window starts are inclusive but the window ends are exclusive, [12:05,12:10). Windows can support microsecond precision. Windows in the order of months are not supported. |
| window(timeColumn, windowDuration, slideDuration) | Bucketize rows into one or more time windows given a timestamp specifying column. e.g. [12:05,12:10). Windows can support microsecond precision. Windows in the order of months are not supported. The windows start beginning at 1970-01-01 00:00:00 UTC |
| window(timeColumn, windowDuration) | Generates tumbling time windows given a timestamp specifying column. Window starts are inclusive but the window ends are exclusive, e.g. [12:05,12:10) |


# Strings
----

In [21]:
# The value carries three leading spaces and one trailing space for the trim examples to remove.
sparkDF = spark.createDataFrame([("   Spark ",)], ["name"])
sparkDF.show()

+---------+
|     name|
+---------+
|   Spark |
+---------+



trimming

In [22]:
# trim  -> strips spaces from both ends of the string
# ltrim -> strips spaces from the left end only
# rtrim -> strips spaces from the right end only
sparkDF.select(
    F.trim(F.col("name")).alias("trim"),
    F.ltrim(F.col("name")).alias("ltrim"),
    F.rtrim(F.col("name")).alias("rtrim"),
).show()

+-----+------+--------+
| trim| ltrim|   rtrim|
+-----+------+--------+
|Spark|Spark |   Spark|
+-----+------+--------+



**Padding** <br>
Padding = add characters until a given length is reached <br>
first trim spaces around string "Spark" and then pad it so the final
length is 12 characters long

In [23]:
# lpad pads the left side of the trim column with - to the length of 12
# rpad pads the right side of the trim column with = to the length of 12
sparkDF.select(F.trim("name").alias("trim")).select(
    F.lpad("trim", 12, "-").alias("lpad"),
    F.rpad("trim", 12, "=").alias("rpad")).show()

+------------+------------+
|        lpad|        rpad|
+------------+------------+
|-------Spark|Spark=======|
+------------+------------+



In [25]:
# One row whose three columns form a short sentence when joined
sparkAwesomeDF = spark.createDataFrame(
    data=[("Spark", "is", "awesome")],
    schema=["subject", "verb", "adj"],
)
sparkAwesomeDF.show()

+-------+----+-------+
|subject|verb|    adj|
+-------+----+-------+
|  Spark|  is|awesome|
+-------+----+-------+



In [28]:
# concat_ws(separator, col1, col2, ...) joins the columns with the separator
# between them; the joined sentence is then shown in four variants.
sparkAwesomeDF.select(
    F.concat_ws(" ", "subject", "verb", "adj").alias("sentence")
).select(
    F.lower(F.col("sentence")).alias("lower"),
    F.upper(F.col("sentence")).alias("upper"),
    F.initcap(F.col("sentence")).alias("initcap"),
    F.reverse(F.col("sentence")).alias("reverse"),
).show()

+----------------+----------------+----------------+----------------+
|           lower|           upper|         initcap|         reverse|
+----------------+----------------+----------------+----------------+
|spark is awesome|SPARK IS AWESOME|Spark Is Awesome|emosewa si krapS|
+----------------+----------------+----------------+----------------+



Character-by-character replacement with `translate`

In [31]:
# translate maps characters positionally: "a" -> "o" and "r" -> "c",
# which turns "Spark" into "Spock"
sparkAwesomeDF.select("subject", F.translate(F.col("subject"), "ar","oc").alias("translate")).show()

+-------+---------+
|subject|translate|
+-------+---------+
|  Spark|    Spock|
+-------+---------+



Regular expressions

In [37]:
# Single-row frame with the sentence used by the regular-expression examples
rhymeDF = spark.createDataFrame(
    [('A fox saw a crow sitting on a tree singing "Caw! Caw! Caw!"',)],
    schema=["rhyme"],
)
rhymeDF.show(truncate=False)


+-----------------------------------------------------------+
|rhyme                                                      |
+-----------------------------------------------------------+
|A fox saw a crow sitting on a tree singing "Caw! Caw! Caw!"|
+-----------------------------------------------------------+



In [42]:
# Group index 0 asks for the whole match. Only the first match is returned:
# the first run of lowercase letters ending in "x" or "w", which is "fox" here.
rhymeDF.select(F.regexp_extract("rhyme","[a-z]*o*[xw]",0)
.alias("substring")).show()

+---------+
|substring|
+---------+
|      fox|
+---------+



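`regexp_extract_all` (returns every match instead of only the first) is available as a SQL function from Spark 3.1; `regexp_replace`, shown next, substitutes every match.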

In [49]:
# Every occurrence of "fox" or "crow" is replaced with "animal"
rhymeDF.select(F.regexp_replace("rhyme", "fox|crow", "animal").alias("new_rhyme")).show(truncate=False)

+----------------------------------------------------------------+
|new_rhyme                                                       |
+----------------------------------------------------------------+
|A animal saw a animal sitting on a tree singing "Caw! Caw! Caw!"|
+----------------------------------------------------------------+



# Working with Math Functions
---

In [5]:
# One row with two floating-point columns and one integer column
numberDF = spark.createDataFrame(
    data=[(3.14159, 3.5, 2018)],
    schema=["pie", "gpa", "year"],
)
numberDF.show()


+-------+---+----+
|    pie|gpa|year|
+-------+---+----+
|3.14159|3.5|2018|
+-------+---+----+



In [7]:
# round(col) keeps no decimals; round(col, n) keeps n decimals.
numberDF.select(
    F.round(F.col("pie")).alias("pie0"),
    F.round(F.col("pie"), 1).alias("pie1"),
    F.round(F.col("pie"), 2).alias("pie2"),
    F.round(F.col("gpa")).alias("gpa"),
    F.round(F.col("year")).alias("year"),
).show()

+----+----+----+---+----+
|pie0|pie1|pie2|gpa|year|
+----+----+----+---+----+
| 3.0| 3.1|3.14|4.0|2018|
+----+----+----+---+----+



# Working with Collection Functions
The collection functions are designed to work with complex data types such as arrays, maps, and structs.

In [13]:
# One row: a day name plus an array of task strings
tasksDF = spark.createDataFrame( [("Monday", ["Pick Up John",
"Buy Milk", "Pay Bill"]), ], ["day", "tasks"])

# schema of tasksDF
# NOTE: printSchema() prints the tree itself and returns None, so the
# surrounding print() is what adds the "None" line to the output.
print(tasksDF.printSchema())
tasksDF.show(truncate=False)

root
 |-- day: string (nullable = true)
 |-- tasks: array (nullable = true)
 |    |-- element: string (containsNull = true)

None
+------+----------------------------------+
|day   |tasks                             |
+------+----------------------------------+
|Monday|[Pick Up John, Buy Milk, Pay Bill]|
+------+----------------------------------+



In [15]:
tasksDF.select(
    "day",
    # number of elements in the array
    F.size("tasks").alias("size"),
    # the array with its elements in sorted order
    F.sort_array("tasks").alias("sorted_tasks"),
    # whether "Pay Bill" is one of the elements
    F.array_contains("tasks", "Pay Bill").alias("payBill"),
).show(truncate=False)

+------+----+----------------------------------+-------+
|day   |size|sorted_tasks                      |payBill|
+------+----+----------------------------------+-------+
|Monday|3   |[Buy Milk, Pay Bill, Pick Up John]|true   |
+------+----+----------------------------------+-------+



The `explode` function creates a new row for each element in the array.

In [16]:
# One output row per array element; left unaliased, the exploded column is named "col"
tasksDF.select("day", F.explode("tasks")).show()

+------+------------+
|   day|         col|
+------+------------+
|Monday|Pick Up John|
|Monday|    Buy Milk|
|Monday|    Pay Bill|
+------+------------+



> from JSON to Spark

In [2]:
import pyspark.sql.types as T

In [3]:
# A JSON document held as a plain Python string
todos = """{"day": "Monday","tasks": ["Pick Up John","Buy Milk","Pay Bill"]}"""

# Load it into a single string column; Spark does not parse it at this point
todoStrDF = spark.createDataFrame([(todos,)], ["todos_str"])
# NOTE: printSchema() prints the tree itself and returns None, so the
# surrounding print() is what adds the "None" line to the output.
print(todoStrDF.printSchema())
todoStrDF.show(truncate=False)

root
 |-- todos_str: string (nullable = true)

None
+-----------------------------------------------------------------+
|todos_str                                                        |
+-----------------------------------------------------------------+
|{"day": "Monday","tasks": ["Pick Up John","Buy Milk","Pay Bill"]}|
+-----------------------------------------------------------------+



Parsing the JSON string with `from_json` requires a schema that describes the resulting struct type.

In [4]:
# Struct layout of the JSON document: a string "day" and an array of strings "tasks"
schema = T.StructType(
    [
        T.StructField(name="day", dataType=T.StringType(), nullable=True),
        T.StructField(name="tasks", dataType=T.ArrayType(T.StringType()), nullable=True),
    ]
)

In [5]:
# Parse the JSON string into a single struct column named todos_json,
# using the schema defined above
parsedDF = todoStrDF.select(F.from_json(F.col("todos_str"), schema=schema).alias("todos_json"))

# NOTE: printSchema() prints the tree itself and returns None, so the
# surrounding print() is what adds the "None" line to the output.
print(parsedDF.printSchema())
parsedDF.show(truncate=False)

root
 |-- todos_json: struct (nullable = true)
 |    |-- day: string (nullable = true)
 |    |-- tasks: array (nullable = true)
 |    |    |-- element: string (containsNull = true)

None
+--------------------------------------------+
|todos_json                                  |
+--------------------------------------------+
|{Monday, [Pick Up John, Buy Milk, Pay Bill]}|
+--------------------------------------------+



Retrieve values out of a struct column with a dotted path (`todos_json.day`);
array elements are read with `getItem` / `[index]` of the Column class.

In [17]:
# A dotted path reaches into the struct; the resulting column is named after the field ("day")
parsedDF.select(F.col("todos_json.day")).show()

+------+
|   day|
+------+
|Monday|
+------+



In [16]:

# Read both struct fields, plus the first element of the tasks array.
parsedDF.select(
    "todos_json.day",
    "todos_json.tasks",
    F.col("todos_json.tasks").getItem(0).alias("first_task"),
).show(truncate=False)

+------+----------------------------------+------------+
|day   |tasks                             |first_task  |
+------+----------------------------------+------------+
|Monday|[Pick Up John, Buy Milk, Pay Bill]|Pick Up John|
+------+----------------------------------+------------+



Convert the struct back into a JSON string with `to_json`.

In [20]:

# Serialize the struct column back to a JSON string; unaliased, the column is named to_json(todos_json)
parsedDF.select(F.to_json("todos_json")).show(truncate=False)

+---------------------------------------------------------------+
|to_json(todos_json)                                            |
+---------------------------------------------------------------+
|{"day":"Monday","tasks":["Pick Up John","Buy Milk","Pay Bill"]}|
+---------------------------------------------------------------+



## Functions Worth Mentioning
This section covers the following functions: monotonically_increasing_id,
when, coalesce, and lit.

**WHEN**

In [23]:
# DataFrame with ids 1 through 7, one per day of the week
dayOfWeekDF = spark.range(1, 8, 1).toDF("id")

# Build the when(...).when(...) chain in a loop: id N maps to the N-th label.
_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
_dow = F.when(F.col("id") == 1, _labels[0])
for _n, _label in enumerate(_labels[1:], start=2):
    _dow = _dow.when(F.col("id") == _n, _label)

dayOfWeekDF.select("id", _dow.alias("dow")).show()

+---+---+
| id|dow|
+---+---+
|  1|Mon|
|  2|Tue|
|  3|Wed|
|  4|Thu|
|  5|Fri|
|  6|Sat|
|  7|Sun|
+---+---+



**COALESCE**

In [24]:
# create one movie row whose actor_name and movie_title are both null,
# next to one fully populated row
schema_ = "`actor_name` String, `movie_title` String, `produced_year` Long"
badMoviesDF = spark.createDataFrame( [
    (None, None, 2018), ("John Doe", "Awesome Movie", 2018)], 
    schema = schema_)
# the next cell uses the coalesce function to substitute a default for the null actor_name


In [26]:
# The null actor_name comes out as "no_name"; the non-null one is kept as is.
# NOTE(review): the alias is new_title although the coalesced column is actor_name.
badMoviesDF.select(F.coalesce("actor_name", F.lit("no_name")).alias("new_title")).show()

+---------+
|new_title|
+---------+
|  no_name|
| John Doe|
+---------+



# User Defined Functions (UDFs)
 UDFs must be
registered with Spark before they are used, so Spark knows to ship them to executors to
be used and executed.

In [27]:
from pyspark.sql.functions import udf

In [28]:
# Define the letterGrade function in Python
def letterGrade(score):
    """Map a numeric score to a letter grade.

    Args:
        score: Numeric score, or None for a missing value (a null column
            value reaches a Python UDF as None).

    Returns:
        "Cheating" when score > 100, "A" when score >= 90, "B" when
        score >= 80, "C" when score >= 70, "F" otherwise, and None when
        score is None.
    """
    # Comparing None with a number raises TypeError in Python 3, which would
    # fail the whole Spark task on the first null score; pass the null through.
    if score is None:
        return None
    if score > 100:
        return "Cheating"
    if score >= 90:
        return "A"
    if score >= 80:
        return "B"
    if score >= 70:
        return "C"
    return "F"

In [29]:
# Wrap letterGrade as a UDF with a declared return type of string
letterGradeUDF = udf(letterGrade, T.StringType())

In [30]:
# Sample scores, one per row
data = [(95,), (85,), (75,), (65,), (110,)]
df = spark.createDataFrame(data, schema=["score"])

# Append the grade that the UDF computes for each score
df = df.withColumn("letter_grade", letterGradeUDF(F.col("score")))

# Print the graded rows
df.show()

+-----+------------+
|score|letter_grade|
+-----+------------+
|   95|           A|
|   85|           B|
|   75|           C|
|   65|           F|
|  110|    Cheating|
+-----+------------+



In [31]:
# Stop the SparkSession now that the notebook is finished
spark.stop()