In [0]:
# Read the CRM sales-details CSV: the first row is used as the header and
# Spark is asked to infer column types from the data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/Volumes/dev_data/test/source_raw/source_crm/sales_details.csv")
)

# Print the first five rows as a quick sanity check.
df.show(n=5)

In [0]:
# Render `df` through display(), which is supplied by the notebook
# environment (it is not defined or imported in this file).
display(df)

In [0]:
# Pull every column of the online-orders table into a DataFrame.
df_orders = spark.sql(
    "select * from dev_data.test.orders_online"
)

In [0]:
# Preview the orders (show()'s defaults spelled out: 20 rows, long values truncated).
df_orders.show(n=20, truncate=True)

In [0]:
# Pull every column of the online-customers table into a DataFrame.
df_cust = spark.sql(
    "select * from dev_data.test.customers_online"
)

In [0]:
# Preview the customers (show()'s defaults spelled out: 20 rows, long values truncated).
df_cust.show(n=20, truncate=True)

In [0]:
# Load the products table and preview it.
df_prod = spark.sql(
    "select * from dev_data.test.products"
)
df_prod.show(n=20, truncate=True)

In [0]:
from pyspark.sql.functions import udf , col
from pyspark.sql.types import StringType

def price_category(price, threshold=300):
    """Label an amount as "High" or "Low".

    Args:
        price: Numeric amount to classify, or None for a missing value.
        threshold: Inclusive lower bound of the "High" bucket (default 300).

    Returns:
        "High" when ``price >= threshold``, "Low" when it is below the
        threshold, and None when ``price`` is None.
    """
    # Spark hands SQL NULLs to a Python UDF as None; comparing None with a
    # number raises TypeError and fails the whole job, so propagate the null.
    if price is None:
        return None
    return "High" if price >= threshold else "Low"

# Wrap the plain Python function as a Spark UDF that returns strings.
price_udf = udf(price_category, returnType=StringType())

# Add a "price_flag" column computed row by row from order_amount and print it.
df_orders.withColumn("price_flag", price_udf(df_orders["order_amount"])).show()

# Drawbacks -> UDFs break Spark's optimizations and can be slow
# They are slower than built-in functions
# Avoid UDFs when possible and prefer built-in Spark SQL functions for better performance

In [0]:
# we can use built in functions instead of UDF
from pyspark.sql.functions import when

# Same "High"/"Low" flag as the price_category UDF, expressed with built-in
# column functions. The cut-off is 300 to match the threshold that
# price_category uses, so the two approaches can be compared like for like.
# A null order_amount makes the condition null, so that row falls through to
# otherwise() and is labelled 'Low'.
df_orders.withColumn(
    "price_flag",
    when(col("order_amount") >= 300, 'High').otherwise('Low'),
).show()

# Using built-in functions is faster than using a UDF
# They are optimized and Catalyst-friendly

In [0]:
# pandas udf(Arrow based) - advanced one
# use only when logic can't be expressed otherwise

from pyspark.sql.functions import pandas_udf , col, round
from pyspark.sql.types import DoubleType
import pandas as pd

@pandas_udf(DoubleType())
def tax_udf(amount: pd.Series) -> pd.Series:
    """Return 18% of each value in ``amount``.

    Registered as a pandas UDF with a DoubleType result; per its type hints
    it takes a pandas Series and returns a pandas Series.
    """
    tax_rate = 0.18
    return amount * tax_rate

# Add a "tax" column: tax_udf applied to order_amount, rounded to 2 decimals.
# NOTE: `round` here is pyspark.sql.functions.round (imported in this cell),
# which shadows Python's builtin round() for the rest of the notebook.
df_orders.withColumn("tax", round(tax_udf(df_orders["order_amount"]), 2)).show()
# pandas udf is faster than python UDFs but still use it only when needed

In [0]:
# explode() - essential for nested data
# Used with -> JSON, arrays, Kafka payloads and API data

from pyspark.sql.functions import explode

# Two sample orders, each carrying a list of purchased items.
data = [(1, ["Laptop", "Mouse"]), (2, ["Mobile"])]

df = spark.createDataFrame(data, schema=["order_id", "Items"])

# explode() emits one output row per element of the Items array.
df.select(df["order_id"], explode(df["Items"]).alias("item")).show()

In [0]:
from pyspark.sql.functions import split, concat, substring, lit

# Sample orders whose items are stored as one comma-separated string.
data = [
    (1, "Laptop, Mouse"),
    (2, "Mobile")
]

df = spark.createDataFrame(data, ["order_id", "Items"])

# split() returns an array column; take element 0 so that "first_item" holds
# only the first item instead of the whole array.
df.withColumn("first_item", split(col("Items"), ",").getItem(0)).show()



In [0]:
from pyspark.sql.functions import concat_ws
# Reference the column by its declared name "Items": the lowercase "items"
# only resolves while Spark's analyzer is case-insensitive
# (spark.sql.caseSensitive=false, the default).
# NOTE(review): if `df` is the frame built in the previous cell, Items is
# already a plain string, so concat_ws returns it unchanged - confirm whether
# an array column (e.g. the result of split()) was intended here.
df2 = df.withColumn(
    "items_str", concat_ws(",", "Items")
)
df2.show()