## 11. Databricks | PySpark: Explode Function

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType, StructField, ArrayType, MapType
from pyspark.sql.functions import explode, explode_outer, posexplode, posexplode_outer

In [0]:
# Sample rows: each person is paired with an array of integer scores.
# The data includes a null element inside an array (Charlie) and a row
# whose whole array is null (Peter).
data = [
    ("Alice", [85, 90, 95]),
    ("Bob", [70, 75]),
    ("Charlie", [100, 100, None]),
    ("Peter", None),
]

# Schema: a string "name" column and an array<int> "scores" column.
name_field = StructField("name", StringType())
scores_field = StructField("scores", ArrayType(IntegerType()))
schema = StructType([name_field, scores_field])

# Build the DataFrame, then print its schema and render its rows.
df_array = spark.createDataFrame(data, schema=schema)
df_array.printSchema()
display(df_array)


In [0]:
# Sample rows: each person is paired with a map of subject -> integer score.
# Peter's map is null.
data = [
    ("Alice", {"math": 85, "science": 90}),
    ("Bob", {"math": 78}),
    ("Charlie", {"science": 95, "english": 88}),
    ("Peter", None),
]

# Schema: nullable string "name" column and nullable map<string, int>
# "scores" column.
scores_type = MapType(StringType(), IntegerType())
schema = StructType(
    [
        StructField("name", StringType(), nullable=True),
        StructField("scores", scores_type, nullable=True),
    ]
)

# Build the DataFrame, print its schema, and show the rows untruncated.
df_map = spark.createDataFrame(data, schema)
df_map.printSchema()
df_map.show(truncate=False)


In [0]:
# explode() on an array column: one output row per array element,
# with the element exposed as "Scores".
exploded_df_array = df_array.select(
    df_array["name"],
    explode(df_array["scores"]).alias("Scores"),
)
display(exploded_df_array)

In [0]:
# explode() on a map column: one output row per map entry, with the
# key exposed as "Subject" and the value as "Scores".
exploded_df_map = df_map.select(
    df_map["name"],
    explode(df_map["scores"]).alias("Subject", "Scores"),
)
display(exploded_df_map)

In [0]:
# explode_outer() on an array column: like explode(), but a row whose
# array is null or empty still yields one output row with a null score.
exploded_df_array_outer = df_array.select(
    df_array["name"],
    explode_outer(df_array["scores"]).alias("Scores"),
)

display(exploded_df_array_outer)

In [0]:
# explode_outer() on a map column: like explode(), but a row whose map
# is null or empty still yields one output row with null key and value.
exploded_df_map_outer = df_map.select(
    df_map["name"],
    explode_outer(df_map["scores"]).alias("Subject", "Scores"),
)

display(exploded_df_map_outer)

In [0]:
# posexplode() on an array column: one output row per element, together
# with the element's position in the array ("Positions", "Scores").
exploded_df_array_pos = df_array.select(
    df_array["name"],
    posexplode(df_array["scores"]).alias("Positions", "Scores"),
)
display(exploded_df_array_pos)

In [0]:
# posexplode() on a map column: one output row per entry, producing three
# columns -- the entry's position, its key, and its value.
exploded_df_map_pos = df_map.select(
    df_map["name"],
    posexplode(df_map["scores"]).alias("Positions", "Subject", "Scores"),
)
display(exploded_df_map_pos)

In [0]:
# posexplode_outer() on an array column: positions plus elements, and a
# row whose array is null or empty still yields one row of nulls.
exploded_df_array_pos_outer = df_array.select(
    df_array["name"],
    posexplode_outer(df_array["scores"]).alias("Positions", "Scores"),
)
display(exploded_df_array_pos_outer)

In [0]:
# posexplode_outer() on a map column: position, key and value per entry,
# and a row whose map is null or empty still yields one row of nulls.
exploded_df_map_pos_outer = df_map.select(
    df_map["name"],
    posexplode_outer(df_map["scores"]).alias("Positions", "Subject", "Scores"),
)
display(exploded_df_map_pos_outer)