###Create DataFrame with array column

In [0]:
array_appliance = [
    ('Raja',['TV','Refrigerator','Oven','AC']),
    ('Rahgava',['AC','Washing Machinee',None]),
    ('Ram',['Grinder','TV']),
    ('Ramesh',['Refrigerator','TV',None]),
     ('Rajesh',None)
]
df_app = spark.createDataFrame(data=array_appliance, schema=['name','appliances'])
df_app.printSchema()
display(df_app)

root
 |-- name: string (nullable = true)
 |-- appliances: array (nullable = true)
 |    |-- element: string (containsNull = true)



name,appliances
Raja,"List(TV, Refrigerator, Oven, AC)"
Rahgava,"List(AC, Washing Machinee, null)"
Ram,"List(Grinder, TV)"
Ramesh,"List(Refrigerator, TV, null)"
Rajesh,


###Create dataframe with map column

In [0]:
map_brand = [
    ('Raja',{'TV':'LG','Refrigerator':'Samsung','Oven':'Phillips','AC':'Voltas'}),
    ('Rahgava',{'AC':'Samsung','Washing Machinee':'LG'}),
    ('Ram',{'Grinder':'Preethi','TV':''}),
    ('Ramesh',{'Refrigerator':'LG','TV':'Croma'}),
     ('Rajesh',None)
]
df_brand = spark.createDataFrame(data= map_brand, schema=['name','brand'])
df_brand.printSchema()
display(df_brand)

root
 |-- name: string (nullable = true)
 |-- brand: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



name,brand
Raja,"Map(Refrigerator -> Samsung, AC -> Voltas, TV -> LG, Oven -> Phillips)"
Rahgava,"Map(Washing Machinee -> LG, AC -> Samsung)"
Ram,"Map(TV -> , Grinder -> Preethi)"
Ramesh,"Map(Refrigerator -> LG, TV -> Croma)"
Rajesh,


###Explode array field

In [0]:
from pyspark.sql.functions import explode

df2 = df_app.select(df_app.name, explode(df_app.appliances))

df_app.printSchema()
display(df_app)

df2.printSchema()
display(df2)

root
 |-- name: string (nullable = true)
 |-- appliances: array (nullable = true)
 |    |-- element: string (containsNull = true)



name,appliances
Raja,"List(TV, Refrigerator, Oven, AC)"
Rahgava,"List(AC, Washing Machinee, null)"
Ram,"List(Grinder, TV)"
Ramesh,"List(Refrigerator, TV, null)"
Rajesh,


root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)



name,col
Raja,TV
Raja,Refrigerator
Raja,Oven
Raja,AC
Rahgava,AC
Rahgava,Washing Machinee
Rahgava,
Ram,Grinder
Ram,TV
Ramesh,Refrigerator


###Explode Map

In [0]:
from pyspark.sql.functions import explode

df3 = df_brand.select(df_brand.name, explode(df_brand.brand))

df_brand.printSchema()
display(df_app)

df3.printSchema()
display(df3)

root
 |-- name: string (nullable = true)
 |-- brand: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



name,appliances
Raja,"List(TV, Refrigerator, Oven, AC)"
Rahgava,"List(AC, Washing Machinee, null)"
Ram,"List(Grinder, TV)"
Ramesh,"List(Refrigerator, TV, null)"
Rajesh,


root
 |-- name: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)



name,key,value
Raja,Refrigerator,Samsung
Raja,AC,Voltas
Raja,TV,LG
Raja,Oven,Phillips
Rahgava,Washing Machinee,LG
Rahgava,AC,Samsung
Ram,TV,
Ram,Grinder,Preethi
Ramesh,Refrigerator,LG
Ramesh,TV,Croma


###Explode outer to consider NULL values

In [0]:
from pyspark.sql.functions import explode_outer

display(df_app.select(df_app.name, explode_outer(df_app.appliances)))

display(df_brand.select(df_brand.name, explode_outer(df_brand.brand)))


name,col
Raja,TV
Raja,Refrigerator
Raja,Oven
Raja,AC
Rahgava,AC
Rahgava,Washing Machinee
Rahgava,
Ram,Grinder
Ram,TV
Ramesh,Refrigerator


name,key,value
Raja,Refrigerator,Samsung
Raja,AC,Voltas
Raja,TV,LG
Raja,Oven,Phillips
Rahgava,Washing Machinee,LG
Rahgava,AC,Samsung
Ram,TV,
Ram,Grinder,Preethi
Ramesh,Refrigerator,LG
Ramesh,TV,Croma


###Positional Explode

In [0]:
from pyspark.sql.functions import posexplode

display(df_app.select(df_app.name, posexplode(df_app.appliances)))

display(df_brand.select(df_brand.name, posexplode(df_brand.brand)))


name,pos,col
Raja,0,TV
Raja,1,Refrigerator
Raja,2,Oven
Raja,3,AC
Rahgava,0,AC
Rahgava,1,Washing Machinee
Rahgava,2,
Ram,0,Grinder
Ram,1,TV
Ramesh,0,Refrigerator


name,pos,key,value
Raja,0,Refrigerator,Samsung
Raja,1,AC,Voltas
Raja,2,TV,LG
Raja,3,Oven,Phillips
Rahgava,0,Washing Machinee,LG
Rahgava,1,AC,Samsung
Ram,0,TV,
Ram,1,Grinder,Preethi
Ramesh,0,Refrigerator,LG
Ramesh,1,TV,Croma


###Positional Explode with outer

In [0]:
from pyspark.sql.functions import posexplode_outer

display(df_app.select(df_app.name, posexplode_outer(df_app.appliances)))

display(df_brand.select(df_brand.name, posexplode_outer(df_brand.brand)))


name,pos,col
Raja,0.0,TV
Raja,1.0,Refrigerator
Raja,2.0,Oven
Raja,3.0,AC
Rahgava,0.0,AC
Rahgava,1.0,Washing Machinee
Rahgava,2.0,
Ram,0.0,Grinder
Ram,1.0,TV
Ramesh,0.0,Refrigerator


name,pos,key,value
Raja,0.0,Refrigerator,Samsung
Raja,1.0,AC,Voltas
Raja,2.0,TV,LG
Raja,3.0,Oven,Phillips
Rahgava,0.0,Washing Machinee,LG
Rahgava,1.0,AC,Samsung
Ram,0.0,TV,
Ram,1.0,Grinder,Preethi
Ramesh,0.0,Refrigerator,LG
Ramesh,1.0,TV,Croma
