In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder.appName("test").getOrCreate()

### Q1: Explode Array

In [3]:
data = [
    ["Nadal", ["Badminton", "Tennis"]],
    ["Federer", ["Tennis", "Cricket"]],
    ["Novak", ["Baseball"]],
]
df = spark.createDataFrame(data, ["name", "hobbies"])
df.show()

+-------+-------------------+
|   name|            hobbies|
+-------+-------------------+
|  Nadal|[Badminton, Tennis]|
|Federer|  [Tennis, Cricket]|
|  Novak|         [Baseball]|
+-------+-------------------+



In [4]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [5]:
(df.withColumn("hobby", explode(col("hobbies"))).select("name", "hobby").show())

+-------+---------+
|   name|    hobby|
+-------+---------+
|  Nadal|Badminton|
|  Nadal|   Tennis|
|Federer|   Tennis|
|Federer|  Cricket|
|  Novak| Baseball|
+-------+---------+



### Q2: Coalesce

In [6]:
data = [
    ["New York", "", "Texas"],
    ["", "Georgia", None],
    [None, "", "New Jersey"],
]
df = spark.createDataFrame(data, ["city1", "city2", "city3"])
df.show()

+--------+-------+----------+
|   city1|  city2|     city3|
+--------+-------+----------+
|New York|       |     Texas|
|        |Georgia|      null|
|    null|       |New Jersey|
+--------+-------+----------+



In [7]:
df.select(
    coalesce(
        when(col("city1") == "", None).otherwise(col("city1")),
        when(col("city2") == "", None).otherwise(col("city2")),
        when(col("city3") == "", None).otherwise(col("city3")),
    ).alias("city")
).show()

+----------+
|      city|
+----------+
|  New York|
|   Georgia|
|New Jersey|
+----------+



### Q3 Date Functions and Join

In [9]:
data = [
    (100, "Raj", None, 1, "01-04-23", 50000),
    (200, "Joanne", 100, 1, "01-04-23", 4000),
    (200, "Joanne", 100, 1, "13-04-23", 4500),
    (200, "Joanne", 100, 1, "14-04-23", 4020),
]
salary_df = spark.createDataFrame(
    data, ["emp_id", "emp_name", "mgr_id", "dept_id", "salary_dt", "salary"]
)
salary_df.show()

data = [(1, "IT"), (2, "HR")]
dept_df = spark.createDataFrame(data, ["dept_id", "dept_name"])
dept_df.show()

+------+--------+------+-------+---------+------+
|emp_id|emp_name|mgr_id|dept_id|salary_dt|salary|
+------+--------+------+-------+---------+------+
|   100|     Raj|  null|      1| 01-04-23| 50000|
|   200|  Joanne|   100|      1| 01-04-23|  4000|
|   200|  Joanne|   100|      1| 13-04-23|  4500|
|   200|  Joanne|   100|      1| 14-04-23|  4020|
+------+--------+------+-------+---------+------+

+-------+---------+
|dept_id|dept_name|
+-------+---------+
|      1|       IT|
|      2|       HR|
+-------+---------+



In [29]:
df = (
    salary_df.alias("e")
    .join(dept_df.alias("d"), col("e.dept_id") == col("d.dept_id"))
    .join(salary_df.alias("m"), col("e.mgr_id") == col("m.emp_id"), "left")
)

+------+--------+------+-------+---------+------+-------+---------+------+--------+------+-------+---------+------+
|emp_id|emp_name|mgr_id|dept_id|salary_dt|salary|dept_id|dept_name|emp_id|emp_name|mgr_id|dept_id|salary_dt|salary|
+------+--------+------+-------+---------+------+-------+---------+------+--------+------+-------+---------+------+
|   100|     Raj|  null|      1| 01-04-23| 50000|      1|       IT|  null|    null|  null|   null|     null|  null|
|   200|  Joanne|   100|      1| 01-04-23|  4000|      1|       IT|   100|     Raj|  null|      1| 01-04-23| 50000|
|   200|  Joanne|   100|      1| 13-04-23|  4500|      1|       IT|   100|     Raj|  null|      1| 01-04-23| 50000|
|   200|  Joanne|   100|      1| 14-04-23|  4020|      1|       IT|   100|     Raj|  null|      1| 01-04-23| 50000|
+------+--------+------+-------+---------+------+-------+---------+------+--------+------+-------+---------+------+



In [31]:
emp_df = df.select(
    col("d.dept_name"),
    col("m.emp_name").alias("mgr_name"),
    col("e.emp_name"),
    year(to_date("e.salary_dt", "dd-MM-yy")).alias("salary_year"),
    month(to_date("e.salary_dt", "dd-MM-yy")).alias("salary_month"),
    col("e.salary"),
)
emp_df.show()

+---------+--------+--------+-----------+------------+------+
|dept_name|mgr_name|emp_name|salary_year|salary_month|salary|
+---------+--------+--------+-----------+------------+------+
|       IT|    null|     Raj|       2023|           4| 50000|
|       IT|     Raj|  Joanne|       2023|           4|  4000|
|       IT|     Raj|  Joanne|       2023|           4|  4500|
|       IT|     Raj|  Joanne|       2023|           4|  4020|
+---------+--------+--------+-----------+------------+------+



In [33]:
(
    emp_df
    .groupBy("d.dept_name", "mgr_name", "e.emp_name", "salary_year", "salary_month")
    .agg(sum("e.salary").alias("monthly_salary"))
    .show()
)

+---------+--------+--------+-----------+------------+--------------+
|dept_name|mgr_name|emp_name|salary_year|salary_month|monthly_salary|
+---------+--------+--------+-----------+------------+--------------+
|       IT|     Raj|  Joanne|       2023|           4|         12520|
|       IT|    null|     Raj|       2023|           4|         50000|
+---------+--------+--------+-----------+------------+--------------+

