## Setting Environment Variables

In [1]:
import os
import sys
# Point both the Spark workers and the Spark driver at the interpreter that is
# running this kernel, so the two sides use the same Python.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

## Creating SparkSession

In [2]:
from pyspark.sql import SparkSession

# Obtain the session used by every cell below (application name "Explode").
spark = (
    SparkSession.builder
    .appName("Explode")
    .getOrCreate()
)

## explode()

### Create a Dataframe

In [3]:
# Each person is paired with a list of language lists (an array of arrays).
arrayArrayData = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]]),
]

# Only column names are given here; the column types come from the data itself.
df = spark.createDataFrame(arrayArrayData, ["name", "subjects"])
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- subjects: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)

+-------+-----------------------------------+
|name   |subjects                           |
+-------+-----------------------------------+
|James  |[[Java, Scala, C++], [Spark, Java]]|
|Michael|[[Spark, Java, C++], [Spark, Java]]|
|Robert |[[CSharp, VB], [Spark, Python]]    |
+-------+-----------------------------------+



### exploding data

In [6]:
from pyspark.sql.functions import explode

# One output row per inner list of "subjects"; the exploded column is left
# un-aliased, so it appears under the default name "col".
df.select("name", explode("subjects")).show(truncate=False)

+-------+------------------+
|name   |col               |
+-------+------------------+
|James  |[Java, Scala, C++]|
|James  |[Spark, Java]     |
|Michael|[Spark, Java, C++]|
|Michael|[Spark, Java]     |
|Robert |[CSharp, VB]      |
|Robert |[Spark, Python]   |
+-------+------------------+



### flatten

In [8]:
from pyspark.sql.functions import flatten

# Merge the nested lists of "subjects" into a single flat array per person.
(
    df.select("name", flatten("subjects").alias("subs"))
    .show(truncate=False)
)

+-------+-------------------------------+
|name   |subs                           |
+-------+-------------------------------+
|James  |[Java, Scala, C++, Spark, Java]|
|Michael|[Spark, Java, C++, Spark, Java]|
|Robert |[CSharp, VB, Spark, Python]    |
+-------+-------------------------------+



## array()

### Dataframe

In [11]:
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

# One row per person: a comma-separated name, two language arrays and two states.
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV"),
]

# Explicit schema: every column is nullable; the language columns are array<string>.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("languagesAtSchool", ArrayType(StringType()), True)
    .add("languagesAtWork", ArrayType(StringType()), True)
    .add("currentState", StringType(), True)
    .add("previousState", StringType(), True)
)

df = spark.createDataFrame(data, schema)
df.show(truncate=False)

+----------------+------------------+---------------+------------+-------------+
|name            |languagesAtSchool |languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|James,,Smith    |[Java, Scala, C++]|[Spark, Java]  |OH          |CA           |
|Michael,Rose,   |[Spark, Java, C++]|[Spark, Java]  |NY          |NJ           |
|Robert,,Williams|[CSharp, VB]      |[Spark, Python]|UT          |NV           |
+----------------+------------------+---------------+------------+-------------+



In [14]:
from pyspark.sql.functions import explode

# Explode a flat array column: one row per school language, shown under the
# default column name "col".
df.select("name", explode("languagesAtSchool")).show()

+----------------+------+
|            name|   col|
+----------------+------+
|    James,,Smith|  Java|
|    James,,Smith| Scala|
|    James,,Smith|   C++|
|   Michael,Rose,| Spark|
|   Michael,Rose,|  Java|
|   Michael,Rose,|   C++|
|Robert,,Williams|CSharp|
|Robert,,Williams|    VB|
+----------------+------+



### split

In [16]:
from pyspark.sql.functions import split

# Break the comma-separated name into an array column called "split_name".
df.select(
    split("name", ",").alias("split_name"),
    "languagesAtWork",
).show()

+--------------------+---------------+
|          split_name|languagesAtWork|
+--------------------+---------------+
|    [James, , Smith]|  [Spark, Java]|
|   [Michael, Rose, ]|  [Spark, Java]|
|[Robert, , Williams]|[Spark, Python]|
+--------------------+---------------+



### array

In [18]:
from pyspark.sql.functions import array

# Combine the two state columns into one array column named "State".
df.select(
    "name",
    array("currentState", "previousState").alias("State"),
).show()

+----------------+--------+
|            name|   State|
+----------------+--------+
|    James,,Smith|[OH, CA]|
|   Michael,Rose,|[NY, NJ]|
|Robert,,Williams|[UT, NV]|
+----------------+--------+



### array_contains

In [21]:
from pyspark.sql.functions import array_contains

# Ask which language to look for. Typed input easily picks up stray leading or
# trailing spaces (" Java" would not match the stored value "Java"), so trim it
# before handing it to array_contains.
row = input("Enter a lang to check if it exists : ").strip()

# For every person, report whether the requested language is in languagesAtSchool.
df.select(
    df.name,
    array_contains(df.languagesAtSchool, row).alias("isExists"),
).show()

Enter a lang to check if it exists :  Java


+----------------+--------+
|            name|isExists|
+----------------+--------+
|    James,,Smith|    true|
|   Michael,Rose,|    true|
|Robert,,Williams|   false|
+----------------+--------+



## Collect functions

### collect_list

In [27]:
from pyspark.sql.functions import collect_list, explode

# Replace the "languagesAtSchool" array with a scalar "languages" column holding
# one school language per row.
# This cell rebinds `df`, so a second run would otherwise fail because
# "languagesAtSchool" has already been dropped. Skip the transformation when
# "languages" is already present so the cell can be re-run safely.
if "languages" not in df.columns:
    df = df.withColumn("languages", explode("languagesAtSchool"))\
        .drop("languagesAtSchool")
df.show(truncate = False)

+----------------+------------------+---------------+------------+-------------+
|name            |languagesAtSchool |languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|James,,Smith    |[Java, Scala, C++]|[Spark, Java]  |OH          |CA           |
|Michael,Rose,   |[Spark, Java, C++]|[Spark, Java]  |NY          |NJ           |
|Robert,,Williams|[CSharp, VB]      |[Spark, Python]|UT          |NV           |
+----------------+------------------+---------------+------------+-------------+

+----------------+---------------+------------+-------------+---------+
|name            |languagesAtWork|currentState|previousState|languages|
+----------------+---------------+------------+-------------+---------+
|James,,Smith    |[Spark, Java]  |OH          |CA           |Java     |
|James,,Smith    |[Spark, Java]  |OH          |CA           |Scala    |
|James,,Smith    |[Spark, Java]  |OH          |CA           |C++      |


In [31]:
# Gather every value of "languages" (duplicates included) into a single array.
(
    df.select(collect_list("languages"))
    .show(truncate=False)
)

+------------------------------------------------+
|collect_list(languages)                         |
+------------------------------------------------+
|[Java, Scala, C++, Spark, Java, C++, CSharp, VB]|
+------------------------------------------------+



### collect_set() function

In [33]:
from pyspark.sql.functions import collect_set

# Gather the distinct values of "languages" into a single array.
(
    df.select(collect_set("languages"))
    .show(truncate=False)
)

+-------------------------------------+
|collect_set(languages)               |
+-------------------------------------+
|[CSharp, VB, Scala, Spark, Java, C++]|
+-------------------------------------+



### countDistinct() function

In [38]:
from pyspark.sql.functions import count, countDistinct

# Number of values in "languages" ...
df.select(count("languages")).show()

# ... versus the number of distinct values among them.
df.select(countDistinct("languages")).show()

+----------------+
|count(languages)|
+----------------+
|               8|
+----------------+

+-------------------------+
|count(DISTINCT languages)|
+-------------------------+
|                        6|
+-------------------------+



## Map functions

### create_map()

In [40]:
from pyspark.sql.types import IntegerType, StringType

# Employee records: (id, dept, salary, location).
data = [
    ("36636", "Finance", 3000, "USA"),
    ("40288", "Finance", 5000, "IND"),
    ("42114", "Sales", 3900, "USA"),
    ("39192", "Marketing", 2500, "CAN"),
    ("34534", "Sales", 6500, "USA"),
]

# Explicit schema: salary is an integer, every other column is a string;
# all columns are nullable.
schema = (
    StructType()
    .add("id", StringType(), True)
    .add("dept", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("location", StringType(), True)
)

df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- id: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- location: string (nullable = true)

+-----+---------+------+--------+
|id   |dept     |salary|location|
+-----+---------+------+--------+
|36636|Finance  |3000  |USA     |
|40288|Finance  |5000  |IND     |
|42114|Sales    |3900  |USA     |
|39192|Marketing|2500  |CAN     |
|34534|Sales    |6500  |USA     |
+-----+---------+------+--------+



In [44]:
# Convert the "salary" and "location" columns into a single map-typed column.
from pyspark.sql.functions import create_map, lit, col

# All values of a map share one type. "salary" is an integer and "location" is a
# string, so cast the salary to string explicitly instead of relying on Spark's
# implicit coercion, whose outcome depends on the session's type-coercion settings.
df.withColumn("properitiesMap", create_map(
    lit("salary"), col("salary").cast("string"),
    lit("location"), col("location")
))\
    .drop("salary", "location")\
    .show(truncate = False)

+-----+---------+---------------------------------+
|id   |dept     |properitiesMap                   |
+-----+---------+---------------------------------+
|36636|Finance  |{salary -> 3000, location -> USA}|
|40288|Finance  |{salary -> 5000, location -> IND}|
|42114|Sales    |{salary -> 3900, location -> USA}|
|39192|Marketing|{salary -> 2500, location -> CAN}|
|34534|Sales    |{salary -> 6500, location -> USA}|
+-----+---------+---------------------------------+



### Exploding Map

In [47]:
from pyspark.sql.types import MapType

# "properties" is a nullable map<string, string> column.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("properties", MapType(StringType(), StringType()), True)
)

# Sample rows; note that "eye" is None for Michael and an empty string for Jefferson.
dataDictionary = [
    ("James", {"hair": "black", "eye": "brown"}),
    ("Michael", {"hair": "brown", "eye": None}),
    ("Robert", {"hair": "red", "eye": "black"}),
    ("Washington", {"hair": "grey", "eye": "grey"}),
    ("Jefferson", {"hair": "brown", "eye": ""}),
]

df = spark.createDataFrame(dataDictionary, schema)

In [48]:
# Pull the "hair" and "eye" entries of the "properties" map out into ordinary
# columns. Column expressions are used instead of a Python lambda over df.rdd,
# so the lookup is expressed in the DataFrame API and the result keeps the
# declared string types rather than having them re-inferred by toDF().
df2 = df.select(
    df.name,
    df.properties["hair"].alias("hair"),
    df.properties["eye"].alias("eye"),
)
df2.show()

+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



In [50]:
# Exploding a map column yields one row per entry, split into "key" and "value".
df.select("name", explode("properties")).show()

+----------+----+-----+
|      name| key|value|
+----------+----+-----+
|     James| eye|brown|
|     James|hair|black|
|   Michael| eye| null|
|   Michael|hair|brown|
|    Robert| eye|black|
|    Robert|hair|  red|
|Washington| eye| grey|
|Washington|hair| grey|
| Jefferson| eye|     |
| Jefferson|hair|brown|
+----------+----+-----+



### map_keys() function

In [51]:
from pyspark.sql.functions import map_keys

# List the keys of each person's "properties" map as an array.
df.select("name", map_keys("properties")).show()

+----------+--------------------+
|      name|map_keys(properties)|
+----------+--------------------+
|     James|         [eye, hair]|
|   Michael|         [eye, hair]|
|    Robert|         [eye, hair]|
|Washington|         [eye, hair]|
| Jefferson|         [eye, hair]|
+----------+--------------------+



### map_values() function

In [52]:
from pyspark.sql.functions import map_values

# List the values of each person's "properties" map as an array.
df.select("name", map_values("properties")).show()

+----------+----------------------+
|      name|map_values(properties)|
+----------+----------------------+
|     James|        [brown, black]|
|   Michael|         [null, brown]|
|    Robert|          [black, red]|
|Washington|          [grey, grey]|
| Jefferson|             [, brown]|
+----------+----------------------+

