In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import col,lit,explode,posexplode
from delta.tables import *

In [0]:
%sql
-- Make sure the emp schema exists, then (re)create the employee Delta table
-- at a fixed storage location.
CREATE SCHEMA IF NOT EXISTS emp;
CREATE OR REPLACE TABLE emp.employee1 (
  id   INT,
  name STRING,
  dob  INT
)
USING DELTA
LOCATION '/FileStore/tables/employee1'

In [0]:
%sql
-- Seed the employee table with two starter rows; values are positional (id, name, dob).
INSERT INTO emp.employee1
VALUES
  (1, 'Vinoth', 1988),
  (2, 'Sathya', 1994);

In [0]:
%sql
-- Show the current contents of the employee table.
SELECT *
FROM emp.employee1;
-- To look at the table's commit log instead: describe history emp.employee1

In [0]:
# Bind a DeltaTable handle to the employee table; later cells reuse `dt`
# for the merge and for reading the commit history.
dt = DeltaTable.forName(spark, "emp.employee1")

# Render the current rows first, then the table's commit history.
display(dt.toDF())
display(dt.history())

In [0]:
# Schema of the incoming change set; same columns and types as emp.employee1,
# every field nullable.
emp_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("dob", IntegerType(), True)
)

# Source rows for the merge that follows: id 1 is already in the table
# (with a shorter name), ids 3 and 4 are new.
emp_data = [
    (1, "Vinoth Sakkaraivel", 1988),
    (3, "Thanu", 2016),
    (4, "Sirpiga", 2021),
]

df = spark.createDataFrame(data=emp_data, schema=emp_schema)
df.display()

In [0]:
# Upsert the change set (alias s) into the employee table (alias t), joined on id:
# matched rows take every column from the source, unmatched source rows are inserted.
# A column-by-column alternative to whenMatchedUpdateAll() would be
#   whenMatchedUpdate(set={"t.id": "s.id", "t.name": "s.name", "t.dob": "s.dob"})
(
    dt.alias("t")
    .merge(df.alias("s"), "t.id = s.id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Show the table contents after the merge.
display(dt.toDF())

In [0]:
%sql
-- Same check through SQL: table contents after the merge.
SELECT *
FROM emp.employee1;

In [0]:
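# Manual storage helpers, kept commented out so they never run with the notebook.
# NOTE(review): the rm line targets ".../audit_log", while emp.audit_log1 is stored at
# ".../audit_log1" — confirm which path is meant before uncommenting.
#dbutils.fs.mkdirs("/FileStore/tables/audit_log1")
#dbutils.fs.rm("/FileStore/tables/audit_log",True)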

In [0]:
%sql
-- Audit table filled by the MERGE cell further down (matched on operation).
-- Note: numTargetRowDeleted is spelled without the "s" of the Delta metric name;
-- the MERGE statement writes to the column under exactly this name.
CREATE TABLE IF NOT EXISTS emp.audit_log1 (
  operation             STRING,
  update_time           TIMESTAMP,
  user_name             STRING,
  notebook_name         STRING,
  numTargetRowsUpdated  INT,
  numTargetRowsInserted INT,
  numTargetRowDeleted   INT
)
USING DELTA
LOCATION '/FileStore/tables/audit_log1'

In [0]:
# DeltaTable handle for the audit log table, followed by a look at its current rows.
dt_AL = DeltaTable.forName(spark, "emp.audit_log1")
display(dt_AL.toDF())

In [0]:
# Latest commit of the employee table only (history(1) limits the result to one entry).
df1 = dt.history(1)
display(df1)

# Keep that commit only if it is a MERGE or a WRITE.
df2 = df1.filter(col("operation").isin("MERGE", "WRITE"))

# Who/when details of the commit. `operation` is exposed as `opn` here; the later
# join and the audit MERGE refer to it under that name.
df3 = df2.select(col("operation").alias("opn"), "timestamp", "userName")
display(df3)

# One row per operationMetrics entry; the exploded columns are read as `key`/`value`
# by the next cell.
df4 = df2.select("operation", explode("operationMetrics"))
display(df4)

In [0]:
from pyspark.sql.functions import coalesce

# Cast each metric value to int so it can be summed after the pivot.
df5 = df4.select(df4.operation, df4.key, df4.value.cast("int"))
display(df5)

# Turn the metric names into columns: one row per operation.
df6 = df5.groupBy("operation").pivot("key").sum("value")
display(df6)

# The audit table stores exactly these three row counts. The pivot only yields a
# column for metrics the commit actually reported, so a commit without them
# (e.g. a WRITE, which the history filter lets through — its metrics are expected
# to be numFiles/numOutputRows/numOutputBytes; confirm against the Delta docs) or an
# empty history slice used to fail on the missing column. Absent or null metrics
# are now recorded as 0, consistently for all three.
audit_metrics = ["numTargetRowsInserted", "numTargetRowsUpdated", "numTargetRowsDeleted"]
df7 = df6.select(
    df6.operation,
    *[
        (coalesce(col(metric), lit(0)) if metric in df6.columns else lit(0)).alias(metric)
        for metric in audit_metrics
    ],
)
display(df7)

# Attach the commit's timestamp and user (df3 exposes the operation as `opn`).
df8 = df7.join(df3, df7.operation == df3.opn, "inner")

# Last path segment of the running notebook, taken from the notebook context.
notebookname = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get().split("/")[-1]
df9 = df8.withColumn("NotebookName", lit(notebookname))
df9.display()

# Register the audit row for SQL right away: the %sql cells that MERGE from / SELECT
# tmp_auditlog come next in the notebook, so the view has to exist before they run
# on a fresh top-to-bottom execution.
df9.createOrReplaceTempView('tmp_auditlog')

In [0]:
%sql
-- Upsert the audit row(s) from the tmp_auditlog temp view into emp.audit_log1,
-- matching on the operation name. The view must already be registered in this
-- Spark session when the cell runs.
MERGE INTO emp.audit_log1 AS t
USING tmp_auditlog AS s
ON t.operation = s.opn
WHEN MATCHED THEN UPDATE SET
  t.operation             = s.opn,
  t.update_time           = s.timestamp,
  t.user_name             = s.userName,
  t.notebook_name         = s.NotebookName,
  t.numTargetRowsUpdated  = s.numTargetRowsUpdated,
  t.numTargetRowsInserted = s.numTargetRowsInserted,
  t.numTargetRowDeleted   = s.numTargetRowsDeleted
WHEN NOT MATCHED THEN INSERT (
  t.operation,
  t.update_time,
  t.user_name,
  t.notebook_name,
  t.numTargetRowsUpdated,
  t.numTargetRowsInserted,
  t.numTargetRowDeleted
) VALUES (
  s.opn,
  s.timestamp,
  s.userName,
  s.NotebookName,
  s.numTargetRowsUpdated,
  s.numTargetRowsInserted,
  s.numTargetRowsDeleted
)



In [0]:
%sql
-- Inspect the staged audit row(s) exposed through the tmp_auditlog temp view.
SELECT *
FROM tmp_auditlog;

In [0]:
# Expose df9 to SQL as the temp view tmp_auditlog. The MERGE and SELECT cells that
# read this view sit above this cell, so it must have been executed before them.
df9.createOrReplaceTempView("tmp_auditlog")

In [0]:
%sql
-- Final check: contents of the audit log table.
SELECT *
FROM emp.audit_log1