In [1]:

import os
# Spark submit arguments, set before the SparkSession is created further down:
# request the delta-core 0.7.0 package and register Delta's SQL extension and catalog.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages io.delta:delta-core_2.12:0.7.0 '
    '--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension '
    '--conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog '
    'pyspark-shell'
)

In [2]:
import findspark
# NOTE(review): presumably makes the local Spark installation importable so the
# `pyspark` imports in later cells resolve — confirm against the findspark docs.
findspark.init()

In [3]:
from pyspark.sql import SparkSession
# Obtain the session used by every later cell, with the application name 'lineage'.
spark = (
    SparkSession.builder
    .appName('lineage')
    .getOrCreate()
)

In [5]:
from pyspark.sql.functions import *
# Baseline rows: ids 1 through 999 (the upper bound is exclusive), each tagged "history".
history = spark.range(1, 1000).withColumn("type", lit("history"))
history.show()

+---+-------+
| id|   type|
+---+-------+
|  1|history|
|  2|history|
|  3|history|
|  4|history|
|  5|history|
|  6|history|
|  7|history|
|  8|history|
|  9|history|
| 10|history|
| 11|history|
| 12|history|
| 13|history|
| 14|history|
| 15|history|
| 16|history|
| 17|history|
| 18|history|
| 19|history|
| 20|history|
+---+-------+
only showing top 20 rows



In [6]:
# Persist the baseline rows as a Delta table in the relative directory "delta_sample_data".
# Overwrite mode keeps this cell re-runnable: with the default save mode the write
# raises an error once the target path already exists, which breaks Restart & Run All.
history.write.format("delta").mode("overwrite").save("delta_sample_data")

In [8]:
# Incoming change set: ids 100 through 199 (upper bound exclusive), each tagged "updates".
updates = spark.range(100, 200).withColumn("type", lit("updates"))
updates.show()

+---+-------+
| id|   type|
+---+-------+
|100|updates|
|101|updates|
|102|updates|
|103|updates|
|104|updates|
|105|updates|
|106|updates|
|107|updates|
|108|updates|
|109|updates|
|110|updates|
|111|updates|
|112|updates|
|113|updates|
|114|updates|
|115|updates|
|116|updates|
|117|updates|
|118|updates|
|119|updates|
+---+-------+
only showing top 20 rows



In [9]:
from delta.tables import *
# Handle on the Delta table written earlier; later cells reuse `deltaTable`.
deltaTable = DeltaTable.forPath(spark, "delta_sample_data")

# Upsert keyed on id: matching rows take all values from the update row,
# and update rows without a match are inserted.
(
    deltaTable.alias("history")
    .merge(updates.alias("updates"), "history.id = updates.id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [10]:
# Print the table's commit history without truncating wide column values.
commit_log = deltaTable.history()
commit_log.show(truncate=False)

+-------+-----------------------+------+--------+---------+--------------------------------------------+----+--------+---------+-----------+--------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+
|version|timestamp              |userId|userName|operation|operationParameters                         |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                                                                                   |userMetadata|
+-------+-----------------------+------+--------+---------+--------------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------

In [12]:
# Re-read the table from disk and count rows per "type" to check the merge result.
merged_rows = spark.read.format("delta").load("delta_sample_data")
merged_rows.groupBy("type").count().show()

+-------+-----+
|   type|count|
+-------+-----+
|updates|  100|
|history|  899|
+-------+-----+



In [16]:
# Compact the table by rewriting its rows into 5 partitions.
# dataChange=false marks the commit as a pure rearrangement of existing data, so the
# rewrite has to target the very table it was read from. The previous version wrote to a
# hard-coded, user-specific absolute Windows path; this uses the same relative path as
# every other cell so the notebook works from any checkout location.
(
    spark.read
    .format("delta")
    .load("delta_sample_data")
    .repartition(5)
    .write
    .option("dataChange", "false")
    .format("delta")
    .mode("overwrite")
    .save("delta_sample_data")
)

In [17]:
# Re-read the table and count rows per "type" again; the counts should match the
# ones shown before the rewrite in the previous cell.
compacted_rows = spark.read.format("delta").load("delta_sample_data")
compacted_rows.groupBy("type").count().show()

+-------+-----+
|   type|count|
+-------+-----+
|updates|  100|
|history|  899|
+-------+-----+



In [21]:
# Switch off Delta's retention-duration check for this session.
# NOTE(review): presumably needed so the 0-hour vacuum in the next cell is accepted — confirm.
spark.conf.set(
    "spark.databricks.delta.retentionDurationCheck.enabled",
    "false",
)

In [22]:
# Vacuum the table with a retention threshold of 0 hours.
# NOTE(review): a 0-hour retention presumably removes every file the latest table
# version no longer needs, which would break reads of older versions — confirm this
# is only ever run against demo data.
deltaTable.vacuum(retentionHours=0)

DataFrame[]