## Setting Environment Variables

In [1]:
import os
import sys
# Point both the PySpark worker and driver interpreters at the Python
# executable that is running this kernel, so the two sides cannot disagree.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

## Create a SparkSession

In [2]:
from pyspark.sql import SparkSession
# Obtain the session for this notebook (application name "MapType");
# getOrCreate() hands back an existing session when one is already active.
spark = (
    SparkSession.builder
    .appName("MapType")
    .getOrCreate()
)

## MapType

In [5]:
from pyspark.sql.types import StructField, StructType, StringType, MapType

# Schema: a nullable string column `name` and a nullable string -> string
# map column `properties`.
schema = (
    StructType()
    .add('name', StringType(), True)
    .add('properties', MapType(StringType(), StringType()), True)
)

# Sample rows as (name, properties) tuples. Michael's 'eye' is None and
# Jefferson's is an empty string, so both a null and an empty map value
# appear in the outputs below.
dataDictionary = [
    ('James', {'hair': 'black', 'eye': 'brown'}),
    ('Michael', {'hair': 'brown', 'eye': None}),
    ('Robert', {'hair': 'red', 'eye': 'black'}),
    ('Washington', {'hair': 'grey', 'eye': 'grey'}),
    ('Jefferson', {'hair': 'brown', 'eye': ''}),
]

# Build the DataFrame against the explicit schema and print its structure.
df = spark.createDataFrame(dataDictionary, schema=schema)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [7]:
# truncate=False prints the whole map column instead of cutting long cells short
df.show(truncate=False)

+----------+-----------------------------+
|name      |properties                   |
+----------+-----------------------------+
|James     |{eye -> brown, hair -> black}|
|Michael   |{eye -> null, hair -> brown} |
|Robert    |{eye -> black, hair -> red}  |
|Washington|{eye -> grey, hair -> grey}  |
|Jefferson |{eye -> , hair -> brown}     |
+----------+-----------------------------+



## Accessing Elements of MapType

In [8]:
# RDD-based access: pull the "hair" and "eye" entries out of each Row's map
# in plain Python, then rebuild a DataFrame with one column per entry.
#
# `properties` is declared nullable in the schema, so the lookup is written
# defensively: a null map or an absent key yields None for that column
# instead of raising TypeError / KeyError inside the Spark worker.
df3 = (
    df.rdd
    .map(lambda row: (
        row.name,
        (row.properties or {}).get("hair"),
        (row.properties or {}).get("eye"),
    ))
    .toDF(["name", "hair", "eye"])
)
df3.show()

+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



In [12]:
# Column-based access via getItem(): add one column per map key, then drop
# the original map column before displaying.
(
    df
    .withColumn("hair", df.properties.getItem("hair"))
    .withColumn("eye", df.properties.getItem("eye"))
    .drop("properties")
    .show()
)

+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



In [13]:
# Same extraction as the getItem() cell, using bracket indexing on the
# map column instead.
(
    df
    .withColumn("hair", df.properties["hair"])
    .withColumn("eye", df.properties["eye"])
    .drop("properties")
    .show()
)

+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



## Functions used for MapType()

### explode

In [14]:
from pyspark.sql.functions import explode
# explode() on a map column emits one output row per map entry, split into
# `key` and `value` columns.
df.select("name", explode("properties")).show()

+----------+----+-----+
|      name| key|value|
+----------+----+-----+
|     James| eye|brown|
|     James|hair|black|
|   Michael| eye| null|
|   Michael|hair|brown|
|    Robert| eye|black|
|    Robert|hair|  red|
|Washington| eye| grey|
|Washington|hair| grey|
| Jefferson| eye|     |
| Jefferson|hair|brown|
+----------+----+-----+



### map_keys()

In [15]:
from pyspark.sql.functions import map_keys
# map_keys() returns each row's map keys as an array column
df.select("name", map_keys("properties")).show()

+----------+--------------------+
|      name|map_keys(properties)|
+----------+--------------------+
|     James|         [eye, hair]|
|   Michael|         [eye, hair]|
|    Robert|         [eye, hair]|
|Washington|         [eye, hair]|
| Jefferson|         [eye, hair]|
+----------+--------------------+



### map_values()

In [16]:
from pyspark.sql.functions import map_values
# map_values() returns each row's map values as an array column
df.select("name", map_values("properties")).show()

+----------+----------------------+
|      name|map_values(properties)|
+----------+----------------------+
|     James|        [brown, black]|
|   Michael|         [null, brown]|
|    Robert|          [black, red]|
|Washington|          [grey, grey]|
| Jefferson|             [, brown]|
+----------+----------------------+



### Map Keys List

In [18]:
# Flatten every row's key array into one row per key, then de-duplicate to
# get the distinct keys used anywhere in the column.
keysDf = df.select(explode(map_keys("properties"))).distinct()

# Each collected Row holds a single field; unwrap it into a plain Python list.
keysList = [row[0] for row in keysDf.collect()]
print(keysList)

['eye', 'hair']


### Map Values List

In [19]:
# Flatten every row's value array into one row per value, then de-duplicate
# to get the distinct values used anywhere in the column.
# Stored under values* names so this cell no longer overwrites the
# keysDf / keysList results computed in the "Map Keys List" cell.
valuesDf = df.select(explode(map_values(df.properties))).distinct()

# Each collected Row holds a single field; unwrap it into a plain Python list.
# None and '' both appear because the sample data contains a null and an
# empty-string map value.
valuesList = valuesDf.rdd.map(lambda row: row[0]).collect()
print(valuesList)

['black', 'brown', None, 'red', 'grey', '']
