In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Obtain the notebook's Spark session under the application name "test";
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

### Q1: Group Concat

In [3]:
# Q1 sample data: one (emp_id, emp_name, skill) row per skill an employee has.
# The rows are expanded from a per-employee listing so each employee's skill
# set is visible at a glance; row order matches the listing order.
data = [
    (emp_id, emp_name, skill)
    for emp_id, emp_name, skills in [
        (1, "John", ["ADF", "ADB", "PowerBI"]),
        (2, "Joanne", ["ADF", "SQL", "Crystal Report"]),
        (3, "Vikas", ["ADF", "SQL", "SSIS"]),
        (4, "Monu", ["SQL", "SSIS", "SSAS", "ADF"]),
    ]
    for skill in skills
]
# Column names only; Spark infers the column types from the tuples.
schema = ["emp_id", "emp_name", "skill"]
df = spark.createDataFrame(data, schema=schema)

In [4]:
from pyspark.sql.functions import collect_list, concat_ws

In [5]:
# Collapse the one-row-per-skill frame into one row per employee name, with
# that employee's skills gathered into an array column named "skills".
# NOTE(review): collect_list does not promise element order after a shuffle —
# confirm against the PySpark docs before relying on the order shown below.
df1 = (
    df.groupBy(df["emp_name"])
    .agg(collect_list(df["skill"]).alias("skills"))
)
df1.show(truncate=False)

+--------+--------------------------+
|emp_name|skills                    |
+--------+--------------------------+
|John    |[ADF, ADB, PowerBI]       |
|Joanne  |[ADF, SQL, Crystal Report]|
|Vikas   |[ADF, SQL, SSIS]          |
|Monu    |[SQL, SSIS, SSAS, ADF]    |
+--------+--------------------------+



In [6]:
# Flatten each employee's skills array into one comma-separated string.
# Both columns are resolved through df1 — the frame being selected from —
# instead of reaching back to the upstream `df` for emp_name.
df1.select(
    df1["emp_name"],
    concat_ws(",", df1["skills"]).alias("skills"),
).show(truncate=False)

+--------+----------------------+
|emp_name|skills                |
+--------+----------------------+
|John    |ADF,ADB,PowerBI       |
|Joanne  |ADF,SQL,Crystal Report|
|Vikas   |ADF,SQL,SSIS          |
|Monu    |SQL,SSIS,SSAS,ADF     |
+--------+----------------------+



### Q2: Grouping

In [8]:
# Q2 sample data: one (dept_name, gender) row per employee.
# The rows are expanded from a per-department listing of genders; row order
# matches the listing order.
data = [
    (dept_name, gender)
    for dept_name, genders in [
        ("IT", ["M", "F", "M", "M"]),
        ("HR", ["M", "F", "F", "F", "M"]),
        ("Sales", ["M", "F", "M", "F"]),
    ]
    for gender in genders
]
# Column names only; Spark infers the column types from the tuples.
schema = ["dept_name", "gender"]
df = spark.createDataFrame(data, schema=schema)

In [19]:
# Per-department headcount with a male/female breakdown.
# - count("*") counts every row in the group. Counting the dept_name column
#   instead would skip NULLs, so a NULL-department group would report 0.
# - when(cond, 1) has no otherwise(), so rows that fail cond become NULL and
#   are skipped by count(); each gender column is therefore a conditional count.
df1 = df.groupBy("dept_name").agg(
    count("*").alias("total_emp_count"),
    count(when(col("gender") == "M", 1)).alias("male_count"),
    count(when(col("gender") == "F", 1)).alias("female_count"),
)
df1.show()

+---------+---------------+----------+------------+
|dept_name|total_emp_count|male_count|female_count|
+---------+---------------+----------+------------+
|       IT|              4|         3|           1|
|       HR|              5|         2|           3|
|    Sales|              4|         2|           2|
+---------+---------------+----------+------------+

