Task 1

In [0]:
# Copy Employees.csv from the workspace into DBFS so Spark can read it
# (the workspace original is left in place).
dbutils.fs.cp(
    "file:/Workspace/Shared/Employees.csv",
    "dbfs:/FileStore/Employees.csv",
)
# Read the staged CSV, taking column names from the header row. No schema is
# supplied or inferred, so every column is read as a string (visible in the
# DESCRIBE output further down the notebook).
employees_df = (
    spark.read
    .option("header", "true")
    .csv("dbfs:/FileStore/Employees.csv")
)
# Persist the employees as Delta data, replacing whatever is already at the path.
(
    employees_df.write
    .mode("overwrite")
    .format("delta")
    .save("/delta/Employees")
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for Products.json: three nullable string columns followed by
# a nullable integer Price, in this column order.
schema = (
    StructType()
    .add("ProductID", StringType(), True)
    .add("ProductName", StringType(), True)
    .add("Category", StringType(), True)
    .add("Price", IntegerType(), True)
)


In [0]:
# Copy Products.json from the workspace into DBFS so Spark can read it.
dbutils.fs.cp(
    "file:/Workspace/Shared/Products.json",
    "dbfs:/FileStore/Products.json",
)

True

In [0]:

# Parse the staged JSON using the explicit `schema` instead of inferring one,
# then print the rows as a quick sanity check.
products_df = spark.read.schema(schema).json("dbfs:/FileStore/Products.json")
products_df.show()


+---------+-----------+-----------+-----+
|ProductID|ProductName|   Category|Price|
+---------+-----------+-----------+-----+
|     P101|     Laptop|Electronics| 1200|
|     P102|      Phone|Electronics|  800|
|     P103|     Tablet|Electronics|  600|
|     P104|    Monitor|Electronics|  300|
|     P105|      Mouse|Accessories|   25|
+---------+-----------+-----------+-----+



In [0]:
# Persist the products as Delta data, replacing whatever is already at the path.
(
    products_df.write
    .mode("overwrite")
    .format("delta")
    .save("/delta/Products")
)

Task 2 

In [0]:
# Copy the new-employee CSV from the workspace into DBFS so Spark can read it.
dbutils.fs.cp(
    "file:/Workspace/Shared/Nwe_Employee.csv",
    "dbfs:/FileStore/Nwe_Employee.csv",
)

True

In [0]:
# Read the staged new-employee CSV, taking column names from the header row.
new_employee_df = (
    spark.read
    .option("header", "true")
    .csv("/FileStore/Nwe_Employee.csv")
)
# Land it as its own Delta data set, replacing whatever is already at the path.
(
    new_employee_df.write
    .mode("overwrite")
    .format("delta")
    .save("/delta/Nwe_Employee")
)

In [0]:
# Reload both data sets from their Delta locations so the following steps
# operate on the stored Delta data rather than on the raw-file DataFrames.
employee_df = spark.read.load("/delta/Employees", format="delta")
new_employee_df = spark.read.load("/delta/Nwe_Employee", format="delta")

In [0]:
# Expose both DataFrames to Spark SQL under the view names that the MERGE
# statement refers to.
# Source side: the incoming employee rows.
new_employee_df.createOrReplaceTempView("Nwe_Employee")
# Target side: the existing employees Delta data.
employee_df.createOrReplaceTempView("delta_Employees")

In [0]:
# Upsert the incoming rows (view Nwe_Employee) into the employees view
# (delta_Employees), matching rows on EmployeeID:
#   - matched rows have every non-key column overwritten from the source;
#   - source rows with no match are inserted as new employees.
# The returned DataFrame reports affected/updated/deleted/inserted row counts.
spark.sql("""
    MERGE INTO delta_Employees AS target
    USING Nwe_Employee AS source
    ON target.EmployeeID = source.EmployeeID
    WHEN MATCHED THEN UPDATE SET
        target.EmployeeName = source.EmployeeName,
        target.Department = source.Department,
        target.JoiningDate = source.JoiningDate,
        target.Salary = source.Salary
    WHEN NOT MATCHED THEN INSERT 
    (
        EmployeeID, EmployeeName, Department, JoiningDate, Salary
    ) VALUES (
        source.EmployeeID, source.EmployeeName, source.Department, source.JoiningDate, source.Salary
    )
""")

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Display the employees view after the merge to confirm the upsert took effect.
merged_employees = spark.sql("SELECT * FROM delta_Employees")
merged_employees.show()

+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
|       102|       Alice|    Finance| 2023-02-15|75000 |
|       106|      Olivia|         HR| 2023-06-10| 65000|
+----------+------------+-----------+-----------+------+



In [0]:
# Register the merged employees Delta data as a named table.
#
# The Delta files at /delta/Employees already hold the merged result, so they
# are NOT rewritten here: overwriting them with `employees_df` (the original CSV
# read) would make the pre-merge rows the current version and discard the upsert.
#
# LOCATION must be the directory the data actually lives in (/delta/Employees);
# any other path registers a table that does not point at the employees data,
# and later statements against delta_Employees_table would act on the wrong place.
#
# NOTE: IF NOT EXISTS leaves an already-registered table untouched. If
# delta_Employees_table was created earlier against a different location, run
# DROP TABLE IF EXISTS delta_Employees_table before re-running this cell.
spark.sql("CREATE TABLE IF NOT EXISTS delta_Employees_table USING DELTA LOCATION '/delta/Employees'")

DataFrame[]

Task 3

In [0]:
# List the column names and data types of the employees view, without
# truncating long values in the printed table.
employees_layout = spark.sql("DESCRIBE TABLE delta_Employees")
employees_layout.show(truncate=False)

+------------+---------+-------+
|col_name    |data_type|comment|
+------------+---------+-------+
|EmployeeID  |string   |NULL   |
|EmployeeName|string   |NULL   |
|Department  |string   |NULL   |
|JoiningDate |string   |NULL   |
|Salary      |string   |NULL   |
+------------+---------+-------+



In [0]:
# Time travel: read the employees Delta data as of version 1 and display it.
historical_df = spark.read.load("/delta/Employees", format="delta", versionAsOf=1)
historical_df.show()


+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
|       102|       Alice|    Finance| 2023-02-15|75000 |
|       106|      Olivia|         HR| 2023-06-10| 65000|
+----------+------------+-----------+-----------+------+



Task 4: Optimize Delta Table

In [0]:
# Run OPTIMIZE with Z-ordering on Department against the registered employees
# table; the resulting DataFrame (path + metrics) is displayed as the cell output.
optimize_result = spark.sql("""
    OPTIMIZE delta_Employees_table ZORDER BY Department
""")
optimize_result


DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

Task 5

In [0]:
# Time travel to version 0: the employees data as first written, before the merge.
historical_df = spark.read.load("/delta/Employees", format="delta", versionAsOf=0)
historical_df.show()


+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       102|       Alice|    Finance| 2023-02-15| 70000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
+----------+------------+-----------+-----------+------+



In [0]:

# Time travel to version 1: the employees data as it stood after the merge.
historical_df = spark.read.load("/delta/Employees", format="delta", versionAsOf=1)
historical_df.show()

+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
|       102|       Alice|    Finance| 2023-02-15|75000 |
|       106|      Olivia|         HR| 2023-06-10| 65000|
+----------+------------+-----------+-----------+------+



Task 6

In [0]:
# Delete data files that the employees Delta table no longer references and
# that are older than the retention window (168 hours = 7 days).
#
# VACUUM has to target the Delta table itself. `delta_Employees` is only a
# temporary view over the data, so the table is addressed by its storage path.
# Versions whose files are removed can no longer be read with versionAsOf.
spark.sql("""
    VACUUM delta.`/delta/Employees` RETAIN 168 HOURS
""")