In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=7952f065367ed0bbc457287b6427a658ee82644ca223a7821bea4f028512e7c7
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [2]:
from pyspark.sql import SparkSession
# Reuse the already-running SparkSession if there is one; otherwise start a
# new session with default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook renders the SparkSession object as the cell output.
spark

In [4]:
# Sample rows of (name, list of locations). The rows deliberately cover the
# edge cases the explode-family functions treat differently.
arrayData = [
    ('Bob', ['New York', 'Boston']),
    ('Kim', ['Los Angeles', 'Chicago', None]),  # array containing a null element
    ('Lee', ['Phoenix', '']),                   # array containing an empty string
    ('Peter', None),                            # the whole array is null
    ('Sam', ['San Diego', 'Dallas']),
]

# Column names, in the same order as the fields of each tuple above.
col = ['name', 'location']

In [5]:
# Build a DataFrame from the sample rows. Only column names are supplied, so
# Spark infers the column types from the data.
df = spark.createDataFrame(arrayData, schema=col)

df.printSchema()  # inspect the inferred types
df.show()         # preview the rows

root
 |-- name: string (nullable = true)
 |-- location: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-----+--------------------+
| name|            location|
+-----+--------------------+
|  Bob|  [New York, Boston]|
|  Kim|[Los Angeles, Chi...|
|  Lee|         [Phoenix, ]|
|Peter|                NULL|
|  Sam| [San Diego, Dallas]|
+-----+--------------------+



In [6]:
from pyspark.sql.functions import explode
# explode(): one output row per array element; the generated column is named
# `col` by default. A row whose array is null produces no output rows.
df2 = df.select(
    df["name"],
    explode(df["location"]),
)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)

+----+-----------+
|name|        col|
+----+-----------+
| Bob|   New York|
| Bob|     Boston|
| Kim|Los Angeles|
| Kim|    Chicago|
| Kim|       NULL|
| Lee|    Phoenix|
| Lee|           |
| Sam|  San Diego|
| Sam|     Dallas|
+----+-----------+



`posexplode()` działa jak `explode()`, ale dodatkowo zwraca pozycję (indeks liczony od 0) elementu w tablicy lub mapie: dla każdej wyodrębnionej wartości powstaje wiersz zawierający kolumnę `pos` oraz samą wartość (kolumna `col`, a w przypadku map kolumny `key` i `value`).

In [7]:
from pyspark.sql.functions import posexplode
# posexplode(): like explode(), but each output row also carries `pos`, the
# zero-based index of the element inside its source array.
df2 = df.select(
    df["name"],
    posexplode(df["location"]),
)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- pos: integer (nullable = false)
 |-- col: string (nullable = true)

+----+---+-----------+
|name|pos|        col|
+----+---+-----------+
| Bob|  0|   New York|
| Bob|  1|     Boston|
| Kim|  0|Los Angeles|
| Kim|  1|    Chicago|
| Kim|  2|       NULL|
| Lee|  0|    Phoenix|
| Lee|  1|           |
| Sam|  0|  San Diego|
| Sam|  1|     Dallas|
+----+---+-----------+



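W przeciwieństwie do `explode()`, która pomija wiersze, w których cała tablica (lub mapa) jest null albo pusta, `explode_outer()` zachowuje takie wiersze, zwracając dla każdego z nich pojedynczy wiersz z wartością null. Wartości null będące elementami tablicy są zachowywane przez obie funkcje.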

In [8]:
from pyspark.sql.functions import explode_outer
# explode_outer(): same as explode(), except that a row whose array is null
# is kept and yields a single output row with a null value.
df2 = df.select(
    df["name"],
    explode_outer(df["location"]),
)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)

+-----+-----------+
| name|        col|
+-----+-----------+
|  Bob|   New York|
|  Bob|     Boston|
|  Kim|Los Angeles|
|  Kim|    Chicago|
|  Kim|       NULL|
|  Lee|    Phoenix|
|  Lee|           |
|Peter|       NULL|
|  Sam|  San Diego|
|  Sam|     Dallas|
+-----+-----------+



In [9]:
from pyspark.sql.functions import posexplode_outer
# posexplode_outer(): positional variant of explode_outer(). For a row whose
# array is null, both `pos` and the value come back as null.
df2 = df.select(
    df["name"],
    posexplode_outer(df["location"]),
)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- pos: integer (nullable = true)
 |-- col: string (nullable = true)

+-----+----+-----------+
| name| pos|        col|
+-----+----+-----------+
|  Bob|   0|   New York|
|  Bob|   1|     Boston|
|  Kim|   0|Los Angeles|
|  Kim|   1|    Chicago|
|  Kim|   2|       NULL|
|  Lee|   0|    Phoenix|
|  Lee|   1|           |
|Peter|NULL|       NULL|
|  Sam|   0|  San Diego|
|  Sam|   1|     Dallas|
+-----+----+-----------+



In [10]:
# Sample rows of (name, dict of favorites). The rows deliberately cover the
# edge cases the explode-family functions treat differently.
mapData = [
    ('Bob', {'cuisine': 'Chinese', 'color': 'blue'}),
    ('Kim', {'cuisine': 'Indian', 'color': None}),   # map entry with a null value
    ('Lee', {'cuisine': 'Japanese', 'color': ''}),   # map entry with an empty string
    ('Peter', None),                                 # the whole map is null
    ('Sam', {}),                                     # empty map
]

# Column names, in the same order as the fields of each tuple above.
col = ['name', 'favorites']

In [11]:
# Build a DataFrame from the map-valued sample rows. Only column names are
# supplied, so Spark infers the column types from the data.
df = spark.createDataFrame(mapData, schema=col)

df.printSchema()  # inspect the inferred types
df.show()         # preview the rows

root
 |-- name: string (nullable = true)
 |-- favorites: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+-----+--------------------+
| name|           favorites|
+-----+--------------------+
|  Bob|{color -> blue, c...|
|  Kim|{color -> NULL, c...|
|  Lee|{color -> , cuisi...|
|Peter|                NULL|
|  Sam|                  {}|
+-----+--------------------+



In [12]:
from pyspark.sql.functions import explode
# explode() on a map column: one output row per map entry, split into `key`
# and `value` columns. Rows with a null or empty map produce no output rows.
df2 = df.select(
    df["name"],
    explode(df["favorites"]),
)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)

+----+-------+--------+
|name|    key|   value|
+----+-------+--------+
| Bob|  color|    blue|
| Bob|cuisine| Chinese|
| Kim|  color|    NULL|
| Kim|cuisine|  Indian|
| Lee|  color|        |
| Lee|cuisine|Japanese|
+----+-------+--------+



In [13]:
from pyspark.sql.functions import posexplode
# posexplode() on a map column: like explode(), plus a `pos` column holding
# the zero-based position of each entry within its map.
df2 = df.select(
    df["name"],
    posexplode(df["favorites"]),
)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- pos: integer (nullable = false)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)

+----+---+-------+--------+
|name|pos|    key|   value|
+----+---+-------+--------+
| Bob|  0|  color|    blue|
| Bob|  1|cuisine| Chinese|
| Kim|  0|  color|    NULL|
| Kim|  1|cuisine|  Indian|
| Lee|  0|  color|        |
| Lee|  1|cuisine|Japanese|
+----+---+-------+--------+



In [14]:
from pyspark.sql.functions import explode_outer
# explode_outer() on a map column: rows with a null or empty map are kept and
# yield a single output row whose `key` and `value` are null.
df2 = df.select(
    df["name"],
    explode_outer(df["favorites"]),
)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)

+-----+-------+--------+
| name|    key|   value|
+-----+-------+--------+
|  Bob|  color|    blue|
|  Bob|cuisine| Chinese|
|  Kim|  color|    NULL|
|  Kim|cuisine|  Indian|
|  Lee|  color|        |
|  Lee|cuisine|Japanese|
|Peter|   NULL|    NULL|
|  Sam|   NULL|    NULL|
+-----+-------+--------+



In [15]:
from pyspark.sql.functions import posexplode_outer
# posexplode_outer() on a map column: positional variant of explode_outer().
# For a null or empty map, `pos`, `key` and `value` all come back as null.
df2 = df.select(
    df["name"],
    posexplode_outer(df["favorites"]),
)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- pos: integer (nullable = true)
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)

+-----+----+-------+--------+
| name| pos|    key|   value|
+-----+----+-------+--------+
|  Bob|   0|  color|    blue|
|  Bob|   1|cuisine| Chinese|
|  Kim|   0|  color|    NULL|
|  Kim|   1|cuisine|  Indian|
|  Lee|   0|  color|        |
|  Lee|   1|cuisine|Japanese|
|Peter|NULL|   NULL|    NULL|
|  Sam|NULL|   NULL|    NULL|
+-----+----+-------+--------+

