In [0]:
# Application  - sparksubmit command to cluster  to run job is call application
# job - each action is called job
# stage - no. of transformations is called stage
# task - actual execution is called task

In [0]:
df = spark.read.csv("dbfs:/FileStore/demo_folder/employee_dataset.csv",header=True,inferSchema=True)
df.show()

+----------+-----------------+---+----------+-----------+------+------------+
|EmployeeID|             Name|Age|Department|JoiningDate|Salary|        City|
+----------+-----------------+---+----------+-----------+------+------------+
|         1|       Jon Rivera| 56|     Sales| 2024-04-29|121250|     Houston|
|         2|   Nicole Daniels| 46|        HR| 2024-04-01|138633|    New York|
|         3| Monique Sullivan| 32|   Finance| 2020-05-02| 83619|Philadelphia|
|         4|     James Wright| 60| Marketing| 2023-02-21|129751|    New York|
|         5|  Nicole Williams| 25|     Sales| 2018-04-29|123193|     Chicago|
|         6|      David Bates| 38|     Sales| 2019-03-12| 98719|      Dallas|
|         7|    Matthew Riggs| 56| Marketing| 2019-11-06| 71156| Los Angeles|
|         8|     Wendy Powers| 36|Operations| 2023-07-30| 73901|      Dallas|
|         9|   Thomas Collins| 40|   Support| 2018-10-12| 30418|Philadelphia|
|        10|      Joshua Wong| 28|        IT| 2022-05-17| 89252|

In [0]:
# Partitioning and Parallelism Optimization in Spark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import *
spark = SparkSession.builder.appName("PartitioningOptimization").getOrCreate()
empDf = spark.read.csv("dbfs:/FileStore/demo_folder/employee_dataset.csv",header=True,inferSchema=True)

In [0]:
empDf.write.mode("overwrite").partitionBy("Department").parquet("dbfs:/FileStore/demo_folder13/")

In [0]:
spark.read.parquet("dbfs:/FileStore/demo_folder13/Department=Finance/part-00001-tid-4893172150368010465-0d2058e1-d01a-41f7-b6af-bcc0bef853a5-223-1.c000.snappy.parquet").show()

+----------+-----------------+---+-----------+------+------------+
|EmployeeID|             Name|Age|JoiningDate|Salary|        City|
+----------+-----------------+---+-----------+------+------------+
|   1275135|  Natalie Bradley| 58| 2020-10-10|115570|    New York|
|   1275152|     Kyle Gregory| 53| 2018-06-13|149212|     Houston|
|   1275159|   Teresa Compton| 21| 2023-11-12|104425|     Phoenix|
|   1275177|     John Fuentes| 24| 2022-08-18| 34800|Philadelphia|
|   1275182|    Hannah Rivera| 62| 2019-04-07| 97046|     Houston|
|   1275185|   Jennifer Braun| 46| 2015-05-12|127498|      Dallas|
|   1275191| Christian Carter| 24| 2022-03-19| 39445| Los Angeles|
|   1275205|     Anthony Luna| 51| 2016-04-23|109995|      Dallas|
|   1275208| Jennifer Mcclain| 49| 2023-06-06| 86345| Los Angeles|
|   1275214|Christopher Hodge| 48| 2018-08-31| 83242|     Houston|
|   1275217|   Alejandro Chan| 49| 2015-11-01|112688|      Dallas|
|   1275218|Alexander Vaughan| 40| 2018-01-18| 55965|    New Y

In [0]:
print(f"Default number of partitions: {empDf.rdd.getNumPartitions()}")

Default number of partitions: 8


In [0]:
# Increase parallelism During Shuffles

spark.conf.set("spark.sql.shuffle.partitions",200)

agg_df = empDf.groupBy("Department").count()
agg_df.show()


+----------+-------+
|Department|  count|
+----------+-------+
|     Sales|1428770|
|        HR|1430234|
|   Finance|1427169|
| Marketing|1428760|
|        IT|1429511|
|   Support|1428311|
|Operations|1427245|
+----------+-------+



In [0]:
agg_df.explain(True)

== Parsed Logical Plan ==
'Aggregate ['Department], ['Department, count(1) AS count#488L]
+- Relation [EmployeeID#459,Name#460,Age#461,Department#462,JoiningDate#463,Salary#464,City#465] csv

== Analyzed Logical Plan ==
Department: string, count: bigint
Aggregate [Department#462], [Department#462, count(1) AS count#488L]
+- Relation [EmployeeID#459,Name#460,Age#461,Department#462,JoiningDate#463,Salary#464,City#465] csv

== Optimized Logical Plan ==
Aggregate [Department#462], [Department#462, count(1) AS count#488L]
+- Project [Department#462]
   +- Relation [EmployeeID#459,Name#460,Age#461,Department#462,JoiningDate#463,Salary#464,City#465] csv

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[Department#462], functions=[finalmerge_count(merge count#499L) AS count(1)#487L], output=[Department#462, count#488L])
   +- Exchange hashpartitioning(Department#462, 200), ENSURE_REQUIREMENTS, [plan_id=508]
      +- HashAggregate(keys=[Department#462], functions=[

In [0]:
repartition_df = empDf.repartition(100,col("Department"))

result_df = repartition_df.groupBy(col("Department")).agg(avg("Salary").alias("avg_Salary"))

In [0]:
repartition_df.write.mode("overwrite").parquet("dbfs:/FileStore/demo_folder10/")

In [0]:
spark.read.parquet('dbfs:/FileStore/demo_folder10/part-00021-tid-851087917284295486-16a99efb-7717-4fd9-b024-3299536d357a-77-1-c000.snappy.parquet',header=True,inferSchema=True).show()

+----------+--------------------+---+----------+-----------+------+------------+
|EmployeeID|                Name|Age|Department|JoiningDate|Salary|        City|
+----------+--------------------+---+----------+-----------+------+------------+
|         4|        James Wright| 60| Marketing| 2023-02-21|129751|    New York|
|         7|       Matthew Riggs| 56| Marketing| 2019-11-06| 71156| Los Angeles|
|        13|      Allison Miller| 53| Marketing| 2019-09-19|108950|      Dallas|
|        14|        Ryan Morales| 57| Marketing| 2021-05-13|130007|      Dallas|
|        26|      Jason Marshall| 29| Marketing| 2018-11-09| 85932|      Dallas|
|        40|      Elizabeth Love| 38| Marketing| 2016-01-17| 61988|      Dallas|
|        41|Christopher Cruz DDS| 26| Marketing| 2016-11-22|144849|      Dallas|
|        80|   Anthony Hernandez| 57| Marketing| 2016-01-04| 47130| Los Angeles|
|        83|       Michelle Neal| 62| Marketing| 2020-11-28| 40823|     Phoenix|
|        92|         Alexis 

In [0]:
final_df = result_df.coalesce(3)

final_df.write.mode("overwrite").parquet("dbfs:/FileStore/demo_folder12/")

In [0]:
spark.read.parquet("dbfs:/FileStore/demo_folder12/part-00000-tid-1762136447601742868-a192849b-c39b-4ed3-8782-5e42f2a8666a-216-1-c000.snappy.parquet").show()

+----------+-----------------+
|Department|       avg_Salary|
+----------+-----------------+
|     Sales|90004.43771495762|
| Marketing|89978.62071866513|
|        IT|89968.35851910198|
+----------+-----------------+



In [0]:
newDf = empDf.groupBy("Department")
newDf.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-2789666227908671>:2[0m
[1;32m      1[0m newDf [38;5;241m=[39m empDf[38;5;241m.[39mgroupBy([38;5;124m"[39m[38;5;124mDepartment[39m[38;5;124m"[39m)
[0;32m----> 2[0m newDf[38;5;241m.[39mshow()

[0;31mAttributeError[0m: 'GroupedData' object has no attribute 'show'