In [0]:
# Column layout for the sample rows, as a DDL-style schema string.
schema = "name string,item string,weight int"

# Sample rows in (name, item, weight) order. Some (name, item) pairs appear
# more than once, which gives the later groupBy/sum something to combine.
data = [
    ("john", "tomato", 2),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 2),
    ("john", "𝚋𝚊𝚗𝚊𝚗𝚊", 2),
    ("john", "tomato", 3),
    ("𝚋𝚒𝚕𝚕", "𝚝𝚊𝚌𝚘", 2),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 2),
]

df = spark.createDataFrame(data, schema=schema)
# truncate=False prints every cell at full width.
df.show(truncate=False)

+--------+------------+------+
|name    |item        |weight|
+--------+------------+------+
|john    |tomato      |2     |
|𝚋𝚒𝚕𝚕|𝚊𝚙𝚙𝚕𝚎  |2     |
|john    |𝚋𝚊𝚗𝚊𝚗𝚊|2     |
|john    |tomato      |3     |
|𝚋𝚒𝚕𝚕|𝚝𝚊𝚌𝚘    |2     |
|𝚋𝚒𝚕𝚕|𝚊𝚙𝚙𝚕𝚎  |2     |
+--------+------------+------+



In [0]:
from pyspark.sql.functions import *
# Total weight for each (name, item) pair.
# NOTE: `sum` here is the Spark aggregate pulled in by the star import above,
# which shadows Python's builtin `sum` for the rest of the notebook.
df_agg = (
    df.groupBy("name", "item")
      .agg(sum("weight").alias("weight"))
)
display(df_agg)

name,item,weight
john,tomato,5
𝚋𝚒𝚕𝚕,𝚊𝚙𝚙𝚕𝚎,4
john,𝚋𝚊𝚗𝚊𝚗𝚊,2
𝚋𝚒𝚕𝚕,𝚝𝚊𝚌𝚘,2


In [0]:
from pyspark.sql.functions import *
# One row per name: gather that name's (item, weight) pairs into a single
# array-of-structs column called "items".
df_agg1 = (
    df_agg
    .groupBy("name")
    .agg(collect_list(struct("item", "weight")).alias("items"))
)

display(df_agg1)

name,items
𝚋𝚒𝚕𝚕,"List(List(𝚊𝚙𝚙𝚕𝚎, 4), List(𝚝𝚊𝚌𝚘, 2))"
john,"List(List(tomato, 5), List(𝚋𝚊𝚗𝚊𝚗𝚊, 2))"


In [0]:
# Reverse the collect_list: emit one row per element of "items", with the
# element (an item/weight struct) exposed as the "props" column.
df_exploded = df_agg1.select(
    col("name"),
    explode(col("items")).alias("props"),
)
df_exploded.show(truncate=False)

+--------+-----------------+
|name    |props            |
+--------+-----------------+
|𝚋𝚒𝚕𝚕|{𝚊𝚙𝚙𝚕𝚎, 4}  |
|𝚋𝚒𝚕𝚕|{𝚝𝚊𝚌𝚘, 2}    |
|john    |{tomato, 5}      |
|john    |{𝚋𝚊𝚗𝚊𝚗𝚊, 2}|
+--------+-----------------+



In [0]:
# Promote the two struct fields to top-level columns. The original "props"
# struct column is kept alongside the new "item" and "weight" columns.
df_split = (
    df_exploded
    .withColumn("item", col("props")["item"])
    .withColumn("weight", col("props")["weight"])
)
display(df_split)

name,props,item,weight
𝚋𝚒𝚕𝚕,"List(𝚊𝚙𝚙𝚕𝚎, 4)",𝚊𝚙𝚙𝚕𝚎,4
𝚋𝚒𝚕𝚕,"List(𝚝𝚊𝚌𝚘, 2)",𝚝𝚊𝚌𝚘,2
john,"List(tomato, 5)",tomato,5
john,"List(𝚋𝚊𝚗𝚊𝚗𝚊, 2)",𝚋𝚊𝚗𝚊𝚗𝚊,2


In [0]:
# Serialize each item/weight struct to a JSON string, keeping the column name
# "props" so the column now holds text instead of a struct.
df_json = df_exploded.select(
    col("name"),
    to_json("props").alias("props"),
)
df_json.show(truncate=False)

+--------+----------------------------------+
|name    |props                             |
+--------+----------------------------------+
|𝚋𝚒𝚕𝚕|{"item":"𝚊𝚙𝚙𝚕𝚎","weight":4}  |
|𝚋𝚒𝚕𝚕|{"item":"𝚝𝚊𝚌𝚘","weight":2}    |
|john    |{"item":"tomato","weight":5}      |
|john    |{"item":"𝚋𝚊𝚗𝚊𝚗𝚊","weight":2}|
+--------+----------------------------------+



In [0]:
# Parse the JSON text back into separate columns by key name.
# NOTE(review): json_tuple is documented to return string columns, so "weight"
# is presumably a string here rather than the original int — confirm before
# doing arithmetic on it.
json_fields = json_tuple("props", "item", "weight").alias("item", "weight")
df_tuple = df_json.select(col("name"), json_fields)

# Plain-text rendering followed by the notebook's rich table rendering.
df_tuple.show(truncate=False)
display(df_tuple)

+--------+------------+------+
|name    |item        |weight|
+--------+------------+------+
|𝚋𝚒𝚕𝚕|𝚊𝚙𝚙𝚕𝚎  |4     |
|𝚋𝚒𝚕𝚕|𝚝𝚊𝚌𝚘    |2     |
|john    |tomato      |5     |
|john    |𝚋𝚊𝚗𝚊𝚗𝚊|2     |
+--------+------------+------+



name,item,weight
𝚋𝚒𝚕𝚕,𝚊𝚙𝚙𝚕𝚎,4
𝚋𝚒𝚕𝚕,𝚝𝚊𝚌𝚘,2
john,tomato,5
john,𝚋𝚊𝚗𝚊𝚗𝚊,2


In [0]:
# Wide layout: one row per name, one column per distinct item value, each cell
# holding the first weight found for that (name, item) combination.
df_pivot = (
    df_tuple
    .groupBy("name")
    .pivot("item")
    .agg(first(col("weight")).alias("weight"))
)
df_pivot.show(truncate=False)
display(df_pivot)


+--------+------+----------+------------+--------+
|name    |tomato|𝚊𝚙𝚙𝚕𝚎|𝚋𝚊𝚗𝚊𝚗𝚊|𝚝𝚊𝚌𝚘|
+--------+------+----------+------------+--------+
|john    |5     |null      |2           |null    |
|𝚋𝚒𝚕𝚕|null  |4         |null        |2       |
+--------+------+----------+------------+--------+



name,tomato,𝚊𝚙𝚙𝚕𝚎,𝚋𝚊𝚗𝚊𝚗𝚊,𝚝𝚊𝚌𝚘
john,5.0,,2.0,
𝚋𝚒𝚕𝚕,,4.0,,2.0


In [0]:
# Same name-by-item pivot, but each cell collects every matching weight into a
# list instead of taking just one value.
# NOTE: this rebinds `df_pivot`, replacing the result of the `first`-based pivot.
df_pivot = (
    df_tuple
    .groupBy("name")
    .pivot("item")
    .agg(collect_list(col("weight")).alias("weight"))
)
df_pivot.show(truncate=False)
display(df_pivot)

+--------+------+----------+------------+--------+
|name    |tomato|𝚊𝚙𝚙𝚕𝚎|𝚋𝚊𝚗𝚊𝚗𝚊|𝚝𝚊𝚌𝚘|
+--------+------+----------+------------+--------+
|𝚋𝚒𝚕𝚕|[]    |[4]       |[]          |[2]     |
|john    |[5]   |[]        |[2]         |[]      |
+--------+------+----------+------------+--------+



name,tomato,𝚊𝚙𝚙𝚕𝚎,𝚋𝚊𝚗𝚊𝚗𝚊,𝚝𝚊𝚌𝚘
𝚋𝚒𝚕𝚕,List(),List(4),List(),List(2)
john,List(5),List(),List(2),List()
