1. Load the CSV data

In [0]:
# Copy the employee CSV from the workspace file system into DBFS,
# where the load step below reads it from.
dbutils.fs.cp(
    "file:/Workspace/Shared/employee_data.csv",
    "dbfs:/FileStore/employee_data.csv",
)

True

In [0]:
# Read the employee CSV from DBFS, taking column names from the header row.
# No schema is supplied, so every column is loaded as a string
# (see the printSchema() output further down).
df_csv = (
    spark.read
    .option("header", "true")
    .csv("/FileStore/employee_data.csv")
)
df_csv.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+



In [0]:
# Preview at most 10 rows, then list each column's name and type.
df_csv.show(n=10)
df_csv.printSchema()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoiningDate: string (nullable = true)
 |-- Salary: string (nullable = true)



2. Data Cleaning:

In [0]:

# Keep employees whose salary is at least 55000.
# Salary was loaded as a string, so this comparison relies on Spark
# coercing the values to numbers.
df_salary_filtered = df_csv.where(df_csv["Salary"] >= 55000)

# Of those, keep employees who joined after the year 2020.
# JoiningDate is a yyyy-MM-dd string, so plain string comparison orders
# the values chronologically.
df_cleaned = df_salary_filtered.where(
    df_salary_filtered["JoiningDate"] > "2020-12-31"
)
df_cleaned.show()


+----------+------------+----------+-----------+------+
|EmployeeID|        Name|Department|JoiningDate|Salary|
+----------+------------+----------+-----------+------+
|      1001|    John Doe|        HR| 2021-01-15| 55000|
|      1005|David Wilson|        IT| 2021-06-25| 58000|
+----------+------------+----------+-----------+------+



3. Data Aggregation:

In [0]:
# Mean salary per department, computed over the cleaned (filtered) rows only.
(
    df_cleaned
    .groupBy("Department")
    .agg({"Salary": "avg"})
    .show()
)




+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|        HR|    55000.0|
|        IT|    58000.0|
+----------+-----------+



In [0]:
# Number of employees per department in the cleaned (filtered) rows.
(
    df_cleaned
    .groupBy("Department")
    .count()
    .show()
)

+----------+-----+
|Department|count|
+----------+-----+
|        HR|    1|
|        IT|    1|
+----------+-----+



4. Write the Data to CSV

In [0]:
# Persist the cleaned rows as CSV, including a header row.
# mode("overwrite") replaces output left behind by a previous run; without it
# the default save mode raises an error when the target path already exists,
# so the notebook could not be re-run from the top.
(
    df_cleaned.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save("/FileStore/cleaned_employee_data.csv")
)

In [0]:
# Read the exported CSV back to confirm the write succeeded.
# NOTE: this rebinds df_csv, so from here on it refers to the cleaned subset
# rather than the full dataset loaded at the top of the notebook.
df_csv = (
    spark.read
    .option("header", "true")
    .csv("/FileStore/cleaned_employee_data.csv")
)
df_csv.show()

+----------+------------+----------+-----------+------+
|EmployeeID|        Name|Department|JoiningDate|Salary|
+----------+------------+----------+-----------+------+
|      1001|    John Doe|        HR| 2021-01-15| 55000|
|      1005|David Wilson|        IT| 2021-06-25| 58000|
+----------+------------+----------+-----------+------+

