<a href="https://colab.research.google.com/github/sumaaithal/PySpark_30Days_Challenge/blob/main/pyspark39.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip install -q findspark

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease [18.1 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,063 kB]
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,677 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 https://ppa.launchpadcontent.net/c

In [2]:
import os

# Environment variables naming the JDK and Spark install directories.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.5.0-bin-hadoop3",
})

In [3]:
import findspark

# Keep this call ahead of the pyspark import below: the original cell initialises
# findspark first and only then imports from pyspark.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session using every available core ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Property used to format output tables better
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Bare expression so the notebook renders the session summary.
spark

In [13]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,MapType

In [6]:
# Sample employee rows in the order (id, dept, salary, location).
data = [
    ("36636", "Finance", 3000, "USA"),
    ("40288", "Finance", 5000, "IND"),
    ("42114", "Sales", 3900, "USA"),
    ("39192", "Marketing", 2500, "CAN"),
    ("34534", "Sales", 6500, "USA"),
]

# Four nullable fields; salary is the only non-string column.
schema = (
    StructType()
    .add("id", StringType(), True)
    .add("dept", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("location", StringType(), True)
)

df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- id: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- location: string (nullable = true)

+-----+---------+------+--------+
|id   |dept     |salary|location|
+-----+---------+------+--------+
|36636|Finance  |3000  |USA     |
|40288|Finance  |5000  |IND     |
|42114|Sales    |3900  |USA     |
|39192|Marketing|2500  |CAN     |
|34534|Sales    |6500  |USA     |
+-----+---------+------+--------+



In [18]:
from pyspark.sql.functions import col,lit,create_map,map_keys,map_values

In [12]:
# Fold the salary and location columns into a single map column keyed by the
# literal strings "salary" and "location".
# All values of a map share one type. salary is an integer column and location a
# string column, so salary is cast to string explicitly instead of relying on
# implicit type coercion; the displayed result is the same map of strings.
df.withColumn(
    "properties_map",
    create_map(
        lit("salary"), col("salary").cast("string"),
        lit("location"), col("location"),
    ),
).show(truncate=False)

+-----+---------+------+--------+---------------------------------+
|id   |dept     |salary|location|properties_map                   |
+-----+---------+------+--------+---------------------------------+
|36636|Finance  |3000  |USA     |{salary -> 3000, location -> USA}|
|40288|Finance  |5000  |IND     |{salary -> 5000, location -> IND}|
|42114|Sales    |3900  |USA     |{salary -> 3900, location -> USA}|
|39192|Marketing|2500  |CAN     |{salary -> 2500, location -> CAN}|
|34534|Sales    |6500  |USA     |{salary -> 6500, location -> USA}|
+-----+---------+------+--------+---------------------------------+



In [14]:
# Schema with a plain string column and a string -> string map column.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("properties", MapType(StringType(), StringType()), True)
)

# Each row pairs a name with a dict of attributes; note one None value and one
# empty-string value among the "eye" entries.
dataDictionary = [
    ("James", {"hair": "black", "eye": "brown"}),
    ("Michael", {"hair": "brown", "eye": None}),
    ("Robert", {"hair": "red", "eye": "black"}),
    ("Washington", {"hair": "grey", "eye": "grey"}),
    ("Jefferson", {"hair": "brown", "eye": ""}),
]

df = spark.createDataFrame(dataDictionary, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+-----------------------------+
|name      |properties                   |
+----------+-----------------------------+
|James     |{eye -> brown, hair -> black}|
|Michael   |{eye -> NULL, hair -> brown} |
|Robert    |{eye -> black, hair -> red}  |
|Washington|{eye -> grey, hair -> grey}  |
|Jefferson |{eye -> , hair -> brown}     |
+----------+-----------------------------+



In [15]:
# RDD route: turn every Row into a (name, hair, eye) tuple, then rebuild a
# DataFrame with the column names name / hair / eyes.
df3 = (
    df.rdd
    .map(lambda row: (row["name"], row["properties"]["hair"], row["properties"]["eye"]))
    .toDF(["name", "hair", "eyes"])
)
df3.show()

+----------+-----+-----+
|      name| hair| eyes|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| NULL|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



In [17]:
# DataFrame route: pull the "hair" and "eye" entries out of the map into their
# own columns, then drop the original map column.
(
    df
    .withColumn("hair", col("properties").getItem("hair"))
    .withColumn("eyes", col("properties").getItem("eye"))
    .drop("properties")
    .show()
)

+----------+-----+-----+
|      name| hair| eyes|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| NULL|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



In [19]:
# Show each name next to the array of keys in its properties map.
df.select(
    col("name"),
    map_keys("properties").alias("keys"),
).show()

+----------+-----------+
|      name|       keys|
+----------+-----------+
|     James|[eye, hair]|
|   Michael|[eye, hair]|
|    Robert|[eye, hair]|
|Washington|[eye, hair]|
| Jefferson|[eye, hair]|
+----------+-----------+



In [20]:
# Show each name next to the array of values in its properties map.
df.select(
    col("name"),
    map_values("properties").alias("values"),
).show()

+----------+--------------+
|      name|        values|
+----------+--------------+
|     James|[brown, black]|
|   Michael| [NULL, brown]|
|    Robert|  [black, red]|
|Washington|  [grey, grey]|
| Jefferson|     [, brown]|
+----------+--------------+

