<a href="https://colab.research.google.com/github/sumaaithal/PySpark_30Days_Challenge/blob/main/pyspark21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup: Java 8, the Spark 3.5.0 binary distribution, and findspark.
# apt-get (not apt) is the script-stable CLI; -qq plus the redirect keeps the
# package-index refresh from flooding the cell output.
!sudo apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# -nc skips the download when the archive is already present, so re-running the
# cell does not leave duplicate copies (spark-...tgz.1, .2, ...) behind.
# https avoids fetching an executable distribution over an unencrypted channel.
!wget -q -nc https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q findspark

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Connecting to archive.ubuntu.com (91.189.91.82)] [Connecting to security.ub[0m[33m0% [Connecting to archive.ubuntu.com (91.189.91.82)] [Connecting to security.ub[0m                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [48.6 kB]
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease [18.1 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 h

In [2]:
import os

# Export the locations of the JDK and of the unpacked Spark distribution.
# NOTE(review): both paths are hard-coded; the Spark path assumes the archive
# was extracted under /content -- confirm when running outside Colab.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.5.0-bin-hadoop3",
)

In [3]:
import findspark

# Must run before importing pyspark so the package can be located.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session ("local[*]" master); an already running session is reused.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Eager evaluation: property used to format output tables better when a cell
# ends with a DataFrame expression.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Display the session object as the cell result.
spark

In [19]:
from pyspark.sql.types import StringType,MapType, StructType, StructField
from pyspark.sql.functions import *

In [6]:
# Two nullable columns: a string `name` and a string -> string map `properties`
# whose values are allowed to be null.
schema = (
    StructType()
    .add("name", StringType(), nullable=True)
    .add(
        "properties",
        MapType(StringType(), StringType(), valueContainsNull=True),
    )
)

In [7]:
# Sample rows as (name, properties) pairs. The 'eye' value deliberately covers
# three cases: a regular string, None (Michael) and an empty string (Jefferson).
dataDictionary = [
    ("James", dict(hair="black", eye="brown")),
    ("Michael", dict(hair="brown", eye=None)),
    ("Robert", dict(hair="red", eye="black")),
    ("Washington", dict(hair="grey", eye="grey")),
    ("Jefferson", dict(hair="brown", eye="")),
]

In [8]:
# Materialise the sample rows with the explicit schema (no type inference),
# then print the resulting column tree.
df = spark.createDataFrame(dataDictionary, schema)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [11]:
# Print the rows (up to the default of 20) without truncating the map column.
df.show(n=20, truncate=False)

+----------+-----------------------------+
|name      |properties                   |
+----------+-----------------------------+
|James     |{eye -> brown, hair -> black}|
|Michael   |{eye -> NULL, hair -> brown} |
|Robert    |{eye -> black, hair -> red}  |
|Washington|{eye -> grey, hair -> grey}  |
|Jefferson |{eye -> , hair -> brown}     |
+----------+-----------------------------+



In [12]:
def _flatten_properties(row):
    """Turn one (name, properties) row into a flat (name, hair, eye) tuple.

    A missing key or a null `properties` map yields None for that field
    instead of raising KeyError / TypeError, which matches what the
    DataFrame-based lookups (`getItem`, `[...]`) return for absent entries.
    """
    props = row.properties or {}
    return (row.name, props.get('hair'), props.get('eye'))


# RDD-based approach: map every row to a flat tuple, then rebuild a DataFrame
# with the given column names.
df2 = df.rdd.map(_flatten_properties).toDF(['name', 'hair', 'eyes'])
df2.show(truncate=False)

+----------+-----+-----+
|name      |hair |eyes |
+----------+-----+-----+
|James     |black|brown|
|Michael   |brown|NULL |
|Robert    |red  |black|
|Washington|grey |grey |
|Jefferson |brown|     |
+----------+-----+-----+



In [17]:
# Column-based approach, variant 1: getItem(key) pulls a single map entry out
# into its own column while keeping the original `properties` column.
(
    df
    .withColumn("hair", df["properties"].getItem("hair"))
    .withColumn("eyes", df["properties"].getItem("eye"))
    .show(truncate=False)
)

+----------+-----------------------------+-----+-----+
|name      |properties                   |hair |eyes |
+----------+-----------------------------+-----+-----+
|James     |{eye -> brown, hair -> black}|black|brown|
|Michael   |{eye -> NULL, hair -> brown} |brown|NULL |
|Robert    |{eye -> black, hair -> red}  |red  |black|
|Washington|{eye -> grey, hair -> grey}  |grey |grey |
|Jefferson |{eye -> , hair -> brown}     |brown|     |
+----------+-----------------------------+-----+-----+



In [18]:
# Column-based approach, variant 2: subscript syntax on the map column, which
# produces the same table as the getItem(...) variant.
(
    df
    .withColumn("hair", df["properties"]["hair"])
    .withColumn("eyes", df["properties"]["eye"])
    .show(truncate=False)
)

+----------+-----------------------------+-----+-----+
|name      |properties                   |hair |eyes |
+----------+-----------------------------+-----+-----+
|James     |{eye -> brown, hair -> black}|black|brown|
|Michael   |{eye -> NULL, hair -> brown} |brown|NULL |
|Robert    |{eye -> black, hair -> red}  |red  |black|
|Washington|{eye -> grey, hair -> grey}  |grey |grey |
|Jefferson |{eye -> , hair -> brown}     |brown|     |
+----------+-----------------------------+-----+-----+



In [20]:
# explode() on a map column emits one output row per entry, split into the
# generated `key` and `value` columns.
exploded_entries = explode("properties")
df.select(df.name, exploded_entries).show(truncate=False)

+----------+----+-----+
|name      |key |value|
+----------+----+-----+
|James     |eye |brown|
|James     |hair|black|
|Michael   |eye |NULL |
|Michael   |hair|brown|
|Robert    |eye |black|
|Robert    |hair|red  |
|Washington|eye |grey |
|Washington|hair|grey |
|Jefferson |eye |     |
|Jefferson |hair|brown|
+----------+----+-----+



In [21]:
# map_keys() returns the keys of each row's map as an array column.
df.select("name", map_keys("properties")).show(truncate=False)

+----------+--------------------+
|name      |map_keys(properties)|
+----------+--------------------+
|James     |[eye, hair]         |
|Michael   |[eye, hair]         |
|Robert    |[eye, hair]         |
|Washington|[eye, hair]         |
|Jefferson |[eye, hair]         |
+----------+--------------------+



In [22]:
# map_values() returns the values of each row's map as an array column
# (null and empty-string values are kept).
df.select("name", map_values("properties")).show(truncate=False)

+----------+----------------------+
|name      |map_values(properties)|
+----------+----------------------+
|James     |[brown, black]        |
|Michael   |[NULL, brown]         |
|Robert    |[black, red]          |
|Washington|[grey, grey]          |
|Jefferson |[, brown]             |
+----------+----------------------+



In [24]:
# Flatten every row's key array into one key per row, then de-duplicate to get
# the set of keys used anywhere in the `properties` column.
keys_df = (
    df
    .select(explode(map_keys("properties")))
    .distinct()
)
keys_df

col
eye
hair


In [27]:
# Bring the distinct keys back to the driver as a plain Python list by taking
# the first (only) field of each collected row.
keys_list = [row[0] for row in keys_df.collect()]
keys_list

['eye', 'hair']