In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing session is reused,
# otherwise a new one is created) under the application name "test".
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/08 21:34:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/08 21:34:54 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [2]:
from pyspark.sql import Row

# One Row per student, holding the student's name and a continent.
students = [
    Row(name=student_name, continent=student_continent)
    for student_name, student_continent in (
        ("Jane", "America"),
        ("Pascal", "Europe"),
        ("Xi", "Asia"),
        ("Jack", "America"),
    )
]

# Build the DataFrame from the Row objects and display its contents.
df = spark.createDataFrame(students)
df.show()

# List the (column name, data type) pairs of the DataFrame.
print(df.dtypes)

                                                                                

+------+---------+
|  name|continent|
+------+---------+
|  Jane|  America|
|Pascal|   Europe|
|    Xi|     Asia|
|  Jack|  America|
+------+---------+

[('name', 'string'), ('continent', 'string')]


In [3]:
# Register the DataFrame as a temporary view named "Student" so that the
# spark.sql() queries in the following cells can refer to it by that name.
df.createOrReplaceTempView("Student")

In [7]:
# Step 1 of the pivot: number the students inside each continent, in
# alphabetical order of their names. The row_id is what later lets the
# per-continent names be lined up side by side.
#
# The outer ORDER BY is there only to make the displayed row order
# deterministic; without it Spark is free to return the rows in any order.
result = spark.sql(
    """
    -- Method 1: first attach row_id using WINDOW function
    select *, row_number() over(partition by continent order by name) as row_id
        from Student
        order by continent, row_id
    """
)
result.show()

+------+---------+------+
|  name|continent|row_id|
+------+---------+------+
|  Jack|  America|     1|
|  Jane|  America|     2|
|    Xi|     Asia|     1|
|Pascal|   Europe|     1|
+------+---------+------+



In [13]:
# Step 2 of the pivot: spread the single `name` column over one column per
# continent. Each row keeps its name only in the column of its own continent;
# the other two columns are NULL because the CASE has no ELSE branch.
#
# The outer ORDER BY makes the displayed row order deterministic; without it
# Spark is free to return the rows in any order.
result = spark.sql(
    """
    -- expanding each name (1 column) into 3 columns (2 nulls)
    select case when continent = 'America' then name end as America,
           case when continent = 'Asia' then name end as Asia,
           case when continent = 'Europe' then name end as Europe,
           row_id
    from (select *, row_number() over(partition by continent order by name) as row_id
            from Student) as ranked
    order by continent, row_id
    """
)
result.show()

+-------+----+------+------+
|America|Asia|Europe|row_id|
+-------+----+------+------+
|   Jack|NULL|  NULL|     1|
|   Jane|NULL|  NULL|     2|
|   NULL|  Xi|  NULL|     1|
|   NULL|NULL|Pascal|     1|
+-------+----+------+------+



In [16]:
# Final step of the pivot: collapse the rows that share a row_id into one row.
# Within a group each continent column holds at most one non-NULL name, and
# MAX ignores NULLs, so MAX simply picks that name (or NULL if there is none).
#
# GROUP BY does not define the order of its output rows, so the query sorts by
# row_id explicitly to keep the displayed result stable between runs.
result = spark.sql(
    """
    -- final step:
    -- using GROUP BY row_id, bring the non-null value into the same row with MAX(name)
    select max(case when continent = 'America' then name end) as America,
           max(case when continent = 'Asia' then name end) as Asia,
           max(case when continent = 'Europe' then name end) as Europe
    from (select *, row_number() over(partition by continent order by name) as row_id
            from Student) as ranked
    group by row_id
    order by row_id
    """
)
result.show()

+-------+----+------+
|America|Asia|Europe|
+-------+----+------+
|   Jack|  Xi|Pascal|
|   Jane|NULL|  NULL|
+-------+----+------+



In [19]:
# Same pivot as the previous query, reorganized: the per-continent CASE
# columns are computed in the inner query next to row_id, and the outer query
# only aggregates them with MAX (which ignores NULLs) per row_id.
#
# GROUP BY does not define the order of its output rows, so the query sorts by
# row_id explicitly to keep the displayed result stable between runs.
result = spark.sql(
    """
    -- Or reorganize the query a little bit:
    select max(America) as America,
           max(Asia) as Asia,
           max(Europe) as Europe
    from (select *,
            row_number() over(partition by continent order by name) as row_id,
            case when continent = 'America' then name end as America,
            case when continent = 'Asia'    then name end as Asia,
            case when continent = 'Europe'  then name end as Europe
            from Student) as ranked
    group by row_id
    order by row_id
    """
)
result.show()

+-------+----+------+
|America|Asia|Europe|
+-------+----+------+
|   Jack|  Xi|Pascal|
|   Jane|NULL|  NULL|
+-------+----+------+

