In [0]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from delta.tables import *
spark=SparkSession.builder.appName("sep 17").getOrCreate()
dbutils.fs.cp("file:/Workspace/Shared/Employee.csv","dbfs:/Filestore/Employee.csv")
dbutils.fs.cp("file:/Workspace/Shared/products.json","dbfs:/Filestore/products.json")
employee_df=spark.read.format("csv").option("header","true").load("dbfs:/Filestore/Employee.csv")
employee_df.show()
products_df=spark.read.format("json").option("multiline","true").load("/content/products.json")
employee_df.write.format("delta").mode("overwrite").save("/delta/employee_delta")
products_df.write.format("delta").mode("overwrite").save("/delta/products_delta")
employee_delta = spark.read.format("delta").load("/delta/employee_delta")
products_delta = spark.read.format("delta").load("/delta/products_delta")

+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 10-01-2023| 50000|
|       102|       Alice|    Finance| 15-02-2023| 70000|
|       103|        Mark|Engineering| 20-03-2023| 85000|
|       104|        Emma|      Sales| 01-04-2023| 55000|
|       105|        Liam|  Marketing| 12-05-2023| 60000|
+----------+------------+-----------+-----------+------+



In [0]:

dbutils.fs.cp("File:/Workspace/Shared/new_employee.csv","dbfs:/Filestore/new_employee.csv")
employee_new_df=spark.read.format("csv").option("header","true").load("dbfs:/Filestore/new_employee.csv")
employee_new_df.write.format("delta").mode("overwrite").save("/delta/employee_new_delta")
employee_new_delta=spark.read.format("delta").load("/delta/employee_new_delta")
employee_delta.createOrReplaceTempView("employee_delta")
employee_new_delta.createOrReplaceTempView("new_employee_delta")

In [0]:

spark.sql("""
          merge into employee_delta as target
          using new_employee_delta as source
          on target.employeeID=source.employeeID
          when matched then update set target.Salary=source.Salary
          when not matched then 
           insert (EmployeeID, EmployeeName, JoiningDate, Salary)
           values (source.EmployeeID, source.EmployeeName, source.JoiningDate, source.Salary)
          """)
spark.sql("select * from employee_delta").show()

+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 10-01-2023| 50000|
|       103|        Mark|Engineering| 20-03-2023| 85000|
|       104|        Emma|      Sales| 01-04-2023| 55000|
|       105|        Liam|  Marketing| 12-05-2023| 60000|
|       102|       Alice|    Finance| 15-02-2023| 75000|
|       106|      Olivia|       NULL| 10-06-2023| 65000|
+----------+------------+-----------+-----------+------+



In [0]:
spark.sql("""
          create table if not exists employee  as select * from employee_delta
          """)
#Optmizing the table using zordering and optimize
spark.sql("optimize employee zorder by(Salary)")

#describing the history of the delta table
spark.sql("DESCRIBE HISTORY employee").show()

#vacuuming the table abd storing data of previous 7 days only
spark.sql("Vacuum employee retain 168 hours")

#using versioning of delta lake to find data with certain version
spark.sql("SELECT * FROM employee VERSION AS OF 3")


+-------+-------------------+----------------+--------------------+--------------------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|          userId|            userName|           operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+----------------+--------------------+--------------------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      2|2024-09-17 05:32:13|3885359369260198|azuser2114_mml.lo...|          VACUUM END|{status -> COMPLE...|NULL|{2164644895540858}|0913-035629-qk1t69gv|          1|SnapshotIsolation|         true|{numDeletedFiles ...|        NULL|Databr

DataFrame[EmployeeID: string, EmployeeName: string, Department: string, JoiningDate: string, Salary: string]