In [0]:
# explode() converts the elements of an array/list (or nested values)
# into multiple rows, one per element.
# Typical places it is used: JSON API payloads, Kafka messages, nested events.
from pyspark.sql.functions import explode

# Each tuple is (order_id, list of items bought in that order).
data = [
    (1, ["Laptop", "Mouse"]),
    (2, ["Mobile"]),
    (3, ["Keyboard", "Monitor", "CPU"]),
]

order_columns = ["order_id", "items"]
df = spark.createDataFrame(data, order_columns)

# truncate=False prints the full list in the items column.
df.show(truncate=False)

In [0]:
# For each element of the `items` array a new row is created; order_id is
# repeated next to every element, which is exposed under the name `item`.
df_explo = df.select("order_id", explode("items").alias("item"))
display(df_explo)

# Important: explode only accepts array or map columns (including arrays that
# hold nested values); it cannot be applied to plain string or numeric columns.

In [0]:
# Real example / business problem:
# each order holds several items -> find how many times each item was sold.

from pyspark.sql.functions import count, col

# Flatten the items, then let groupBy().count() tally the rows per item.
(
    df.select(explode("items").alias("item"))
    .groupBy("item")
    .count()
    .show()
)

# explode is used to flatten nested array or map columns into multiple rows
# for processing.


In [0]:
# Each tuple is (Customer_id, list of products for that customer).
dataa = [
    (1, ["Phone", "Charger"]),
    (2, ["Laptop"]),
]

customer_columns = ["Customer_id", "Products"]
df1 = spark.createDataFrame(dataa, customer_columns)
df1.show(truncate=False)

In [0]:
from pyspark.sql.functions import count, explode

# One row per product, then count the rows of each product as `total`.
(
    df1.select(explode("Products").alias("Product"))
    .groupBy("Product")
    .agg(count("*").alias("total"))
    .show(truncate=False)
)

In [0]:
from pyspark.sql.functions import count, explode

# Same aggregation as before, split into a named (lazy) result and the
# action that prints it.
product_totals = (
    df1.select(explode("Products").alias("Product"))
    .groupBy("Product")
    .agg(count("*").alias("total"))
)
product_totals.show(truncate=False)
"""
count("*") is a Column expression

groupBy().agg() expects Column expressions

groupBy().count() is a shortcut that doesn't accept arguments

| Column Type   | What To Do                 |
| ------------- | -------------------------- |
| Array         | `explode` → then `groupBy` |
| String/Scalar | Just `groupBy`             |
| Map           | `explode` → then `groupBy` |
"""

In [0]:
# Quick-reference table of the explode-family functions, kept as a bare
# string literal (this cell contains no other statements).
"""
| Function      | Purpose               |
| ------------- | --------------------- |
| explode       | flatten array         |
| explode_outer | keeps null rows       |
| posexplode    | returns index + value |

"""

In [0]:
from pyspark.sql.functions import explode, count

# Step 1: flatten Products so that every product sits on its own row.
df_exploded = df1.select(explode("Products").alias("Product"))

# Step 2: per product, count the Product values and expose them as `total`.
df_sales = df_exploded.groupBy("Product").agg(count("Product").alias("total"))

df_sales.show()


In [0]:
# String functions are used for data cleaning, parsing text, fixing formats
# and transforming raw fields.

from pyspark.sql.functions import (
    length, trim, ltrim, rtrim, lower, upper, initcap, lpad, rpad, substring, regexp_replace,
)

# NOTE: `data` and `df` are rebound here; from this cell on they no longer
# refer to the orders data created at the top of the notebook.
# Each tuple is (id, comma-separated items string, first name, last name).
data = [
    (1, "   Laptop,Mouse   ", "Arjun", "Kumar"),
    (2, "Mobile,Charger", "Ravi", "raj"),
    (3, "Tablet", "Meena", "S"),
]

string_columns = ["id", "items_str", "first", "last"]
df = spark.createDataFrame(data, string_columns)

df.show(truncate=False)

In [0]:
from pyspark.sql.functions import col

# Add Cleaned_items: items_str with surrounding whitespace trimmed off.
df.withColumn("Cleaned_items", trim(col("items_str"))).show(truncate=False)

In [0]:
from pyspark.sql.functions import col, split, lit, concat

# withColumns adds several derived columns in one call:
#   Cleaned_items -> trimmed items_str
#   items_array   -> trimmed items_str split on ","
#   Full_Name     -> first + " " + last
df.withColumns(
    {
        "Cleaned_items": trim(col("items_str")),
        "items_array": split(trim(col("items_str")), ","),
        "Full_Name": concat(col("first"), lit(" "), col("last")),
    }
).show(truncate=False)

In [0]:
# Same three derived columns as the previous cell, rendered with display()
# instead of .show(truncate=False).
derived_columns = {
    "Cleaned_items": trim(col("items_str")),
    "items_array": split(trim(col("items_str")), ","),
    "Full_Name": concat(col("first"), lit(" "), col("last")),
}
display(df.withColumns(derived_columns))

In [0]:
from pyspark.sql.functions import col, concat_ws

# concat_ws is the cleaner way to join many columns with one separator:
# the separator comes first, followed by the columns to join.
full_name = concat_ws(" ", col("first"), col("last"))
df.withColumn("Full_Name", full_name).show(truncate=False)

In [0]:
from pyspark.sql.functions import lower, upper

# Add a lower-cased copy of `first` and an upper-cased copy of `last`.
(
    df.withColumn("First_Lower", lower(col("first")))
    .withColumn("Last_Upper", upper(col("last")))
    .show(truncate=False)
)

# Case normalization like this is used before joins and comparisons.

In [0]:
# substring - extract part of a string.
from pyspark.sql.functions import substring, col

# Arguments are (column, start position, length); here start=1, length=3.
short_name = substring(col("first"), 1, 3)
df.withColumn("Short_name", short_name).show()


In [0]:
# regexp_replace - remove unwanted characters.
# Used for stripping special characters, phone number cleaning, log parsing
# and messy data cleanup.

from pyspark.sql.functions import regexp_replace

# Here every "," in items_str is replaced with an empty string.
clean_text = regexp_replace(col("items_str"), ",", "")
df.withColumn("Clean_text", clean_text).show(truncate=False)


In [0]:
# trim -> split -> explode -> aggregate

In [0]:
# trim and col are used below, so they are imported here as well instead of
# relying on imports made in earlier cells.
from pyspark.sql.functions import col, count, explode, split, trim

# Full pipeline on items_str:
#   trim    -> drop the surrounding whitespace
#   split   -> turn the comma-separated string into an array
#   explode -> one row per array element (aliased as `item`)
#   groupBy/agg -> count the rows of each item as `total`
df.select(
    explode(split(trim(col("items_str")), ",")).alias("item")
).groupBy("item").agg(count("*").alias("total")).show(truncate=False)

# Cheat sheet. These notes must stay comments: as bare text inside a code
# cell they are a SyntaxError and prevent the whole cell from running.
# trim()            → remove spaces
# split()           → string → array
# concat_ws()       → combine columns
# lower/upper()     → standardize text
# substring()       → extract part
# regexp_replace()  → clean patterns


In [0]:
# Remove the surrounding spaces from items_str.

cleaned_items = trim(col("items_str"))
df.withColumn("cleaned_items", cleaned_items).show(truncate=False)

In [0]:
# Split the trimmed items_str on "," into an array column.

items_as_array = split(trim(col("items_str")), ",")
df.withColumn("splited_items", items_as_array).show(truncate=False)

In [0]:
# Explode the items: the original columns are kept and a new column
# `exploded_items` is added, built from the trimmed, comma-split items_str.
from pyspark.sql.functions import explode

one_item_per_row = explode(split(trim(col("items_str")), ","))
df.withColumn("exploded_items", one_item_per_row).show(truncate=False)

In [0]:
# Count each item: explode the split items, then count rows per item.
from pyspark.sql.functions import count

(
    df.withColumn("exploded_items", explode(split(trim(col("items_str")), ",")))
    .groupBy("exploded_items")
    .agg(count("*").alias("total"))
    .show(truncate=False)
)

In [0]:
# Create the full name in lowercase: lower-case both parts, then join them
# with a single space.
from pyspark.sql.functions import concat_ws

lower_first = lower(col("first"))
lower_last = lower(col("last"))
df.withColumn("Full_name", concat_ws(" ", lower_first, lower_last)).show(truncate=False)

In [0]:
# Select one row per item from the trimmed, comma-split items_str.
# NOTE(review): no action (.show() / display()) is called on the result, so
# this cell only builds the DataFrame — confirm whether rows were meant to be
# printed here.
df.select(explode(split(trim(col("items_str")), ",")).alias("item"))
