In [0]:
dbutils.fs.cp("file:/Workspace/Shared/exer.csv","dbfs:/FileStore/exer.csv")

df=spark.read.format("csv").option("header","true").load("/FileStore/exer.csv")
df.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+



In [0]:
import pyspark.sql.functions as F
# Assuming the CSV file is in your Databricks workspace
employee_df = spark.read.csv("/FileStore/exer.csv", header=True, inferSchema=True)

# Display the first 10 rows
employee_df.show(10)

# Inspect the schema
employee_df.printSchema()

# Remove rows with Salary less than 55,000
filtered_df = employee_df.filter(F.col("Salary") >= 55000)

# Filter employees who joined after 2020
filtered_df = filtered_df.filter(F.year(F.col("JoiningDate")) > 2020)

# Average salary by Department
avg_salary_by_department = filtered_df.groupBy("Department").agg(F.avg("Salary").alias("AverageSalary"))

# Count of employees in each Department
employee_count_by_department = filtered_df.groupBy("Department").count()

# Save the cleaned data to a new CSV file
filtered_df.write.csv("cleaned_employee_data.csv", header=True)



+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

root
 |-- EmployeeID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoiningDate: date (nullable = true)
 |-- Salary: integer (nullable = true)



Assignment 2


In [0]:
dbutils.fs.cp("file:/Workspace/Shared/product_data (1).json", "dbfs:/FileStore/product_data(1).json")

df = spark.read.option("multiline", "true").json("/FileStore/product_data(1).json")
df.show(10)
df.printSchema()

# Remove rows where Stock is less than 30.
# Filter the products that belong to the "Electronics" category.
df_cleaned_product = df.filter((df['Stock'] >= 30) & (df['Category'] == 'Electronics'))
df_cleaned_product.show()

# Calculate the total stock for products in the "Furniture" category.
df_total_furniture_stock = df.filter(df['Category'] == 'Furniture').groupBy('Category').agg({'Stock': 'sum'}).withColumnRenamed('sum(Stock)', 'TotalStock')
df_total_furniture_stock.show()

# Find the average price of all products in the dataset.
df_avg_price = df.groupBy('Category').agg({'Price': 'avg'}).withColumnRenamed('avg(Price)', 'AvgPrice')
df_avg_price.show()

# Save the cleaned and aggregated data into a new JSON file.
df_cleaned_product.coalesce(1).write.json('/FileStore/cleaned_product_data.json')

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
|  Furniture|  350|      105|       Desk|   25|
+-----------+-----+---------+-----------+-----+

root
 |-- Category: string (nullable = true)
 |-- Price: long (nullable = true)
 |-- ProductID: long (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Stock: long (nullable = true)

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|Electronics|  300|      104|    Monitor|   45|
+-----------+-----+---------+-----------+-----+

+---------+----------+
| Categor

Assignment 3

In [0]:
# Load employee.csv file data
df_employee = spark.read.csv('/FileStore/exer.csv', header=True, inferSchema=True).cache()
df_employee.show()
df_employee.printSchema()

# Load product_data.json file
df = spark.read.option("multiline", "true").json("/FileStore/product_data(1).json")
df.show(10)
df.printSchema()


df_employee.write.format("delta").mode("overwrite").save("/dbfs/FileStore/delta/exer")
df.write.format("delta").mode("overwrite").save("/dbfs/FileStore/delta/product_data(1)")


spark.sql("CREATE TABLE IF NOT EXISTS employee_delta USING DELTA LOCATION '/dbfs/FileStore/delta/exer'")
spark.sql("CREATE TABLE IF NOT EXISTS product_delta USING DELTA LOCATION '/dbfs/FileStore/delta/product_data(1)'")


# Increase salary by 5% for IT department employees
spark.sql("UPDATE employee_delta SET Salary = Salary * 1.05 WHERE Department = 'IT'")
# Delete products where stock is less than 40
spark.sql("DELETE FROM product_delta WHERE Stock < 40")


# Query the product Delta table to show its state before the delete
# operation (use time travel).
df_product_version_before_delete = spark.sql("SELECT * FROM product_delta VERSION AS OF 0")
df_product_version_before_delete.show()
# Retrieve the version of the employee Delta table before the salary update.
df_employee_version_before_update = spark.sql("SELECT * FROM employee_delta VERSION AS OF 0")
df_employee_version_before_update.show()


# Query the employee Delta table to find the employees in the Finance department.
df_finance_employees = spark.sql("SELECT * FROM employee_delta WHERE Department = 'Finance'")
df_finance_employees.show()
# Query the product Delta table to find all products in the Electronics category with a price greater than 500.
df_expensive_electronics = spark.sql("SELECT * FROM product_delta WHERE Category = 'Electronics' AND Price > 500")
df_expensive_electronics.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

root
 |-- EmployeeID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoiningDate: date (nullable = true)
 |-- Salary: integer (nullable = true)

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------