### Import Libraries

In [1]:
import pyspark
from delta import *
from pyspark.sql.functions import initcap

### Create Spark Session with Delta

In [2]:
# Describe a Spark session that has the Delta SQL extension and the Delta catalog enabled.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the Delta helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Only report errors so Spark's log chatter does not drown the cell outputs.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/Library/Python/3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sahilnagpal/.ivy2/cache
The jars for the packages stored in: /Users/sahilnagpal/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ac226889-da8c-4b0d-9d1d-16196aa857b9;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 222ms :: artifacts dl 8ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     | 

### Loading the data

In [7]:
# Create the demo database if it is not there yet (safe to re-run).
# Plain string: the statement has no placeholders, so no f-string is needed.
spark.sql("CREATE DATABASE IF NOT EXISTS demo_db")

DataFrame[]

In [8]:
# Location of the source JSON file. Kept in one named place so it can be
# pointed at another copy of people.json without editing the SQL below.
PEOPLE_JSON_PATH = "/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people.json"

# (Re)create the Delta table that the time-travel examples operate on.
spark.sql('''CREATE OR REPLACE TABLE demo_db.people_time_travel(
id INT,
firstName STRING,
lastName STRING,
birthDate STRING)
USING DELTA''')

# Overwrite the table contents with the JSON records, renaming the source
# columns (fname, lname, dob) to the table's column names.
# The path is a fixed constant defined above, not user input.
spark.sql(f'''
INSERT OVERWRITE TABLE demo_db.people_time_travel
SELECT id, fname as firstName, lname as lastName, dob as birthDate
FROM JSON.`{PEOPLE_JSON_PATH}`
''')


                                                                                

DataFrame[]

In [31]:
# Query every row of the table and print the result.
current_rows = spark.sql("select * from demo_db.people_time_travel")
current_rows.show()

                                                                                

+---+---------+--------+---------+
| id|firstName|lastName|birthDate|
+---+---------+--------+---------+
+---+---------+--------+---------+



### Describe History of Table

In [39]:
# List the table's history; truncate=False prints the wide columns in full.
history_df = spark.sql("DESCRIBE HISTORY demo_db.people_time_travel")
history_df.show(truncate=False)

+-------+-----------------------+------+--------+-----------------------+----------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation              |operationParameters                                                                           |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                                     

### Operation on Delta Table

#### Delete the data

In [13]:
# Delete the rows whose firstName is "M David"; show() prints the affected-row count.
delete_result = spark.sql('delete from demo_db.people_time_travel where firstName ="M David"')
delete_result.show()

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+



### Select and Show Delta Table Based on Version

#### Version As Of

In [23]:
# Time travel by version number: query the table as of version 4.
(
    spark
    .sql("select * from demo_db.people_time_travel VERSION AS OF 4")
    .show(truncate=False)
)

+---+---------+--------+----------+
|id |firstName|lastName|birthDate |
+---+---------+--------+----------+
|101|prashant |pandey  |1975-05-26|
|102|abdul    |hamid   |1986-12-28|
|103|M David  |turner  |1979-08-23|
|104|Kailash  |Patil   |1972-09-02|
+---+---------+--------+----------+



#### Timestamp As Of

In [24]:
# Time travel by timestamp: query the table as of the given point in time.
(
    spark
    .sql('select * from demo_db.people_time_travel TIMESTAMP AS OF "2024-10-14 10:43:45.65" ')
    .show(truncate=False)
)

+---+---------+--------+----------+
|id |firstName|lastName|birthDate |
+---+---------+--------+----------+
|101|prashant |pandey  |1975-05-26|
|102|abdul    |hamid   |1986-12-28|
|104|Kailash  |Patil   |1972-09-02|
+---+---------+--------+----------+



### Accidentally Delete the Data of the Table

In [29]:
# DELETE without a WHERE clause: removes every row (the "accident" this section demonstrates).
wipe_result = spark.sql("delete from demo_db.people_time_travel")
wipe_result.show()

+-----------------+
|num_affected_rows|
+-----------------+
|                4|
+-----------------+



### Restore the Table Using the `RESTORE` Command

#### Spark SQL Code

In [26]:
# Roll the table back to version 4 and print the restore metrics in full.
restore_sql = '''
RESTORE TABLE demo_db.people_time_travel VERSION AS OF 4
'''
spark.sql(restore_sql).show(truncate=False)

                                                                                

+------------------------+--------------------------+-----------------+------------------+------------------+-------------------+
|table_size_after_restore|num_of_files_after_restore|num_removed_files|num_restored_files|removed_files_size|restored_files_size|
+------------------------+--------------------------+-----------------+------------------+------------------+-------------------+
|1330                    |1                         |0                |1                 |0                 |1330               |
+------------------------+--------------------------+-----------------+------------------+------------------+-------------------+



In [38]:
# Query every row of the table and print the result.
table_rows = spark.sql("select * from demo_db.people_time_travel")
table_rows.show()

                                                                                

+---+---------+--------+----------+
| id|firstName|lastName| birthDate|
+---+---------+--------+----------+
|101| prashant|  pandey|1975-05-26|
|102|    abdul|   hamid|1986-12-28|
|103|  M David|  turner|1979-08-23|
|104|  Kailash|   Patil|1972-09-02|
+---+---------+--------+----------+



#### Read a Table Version Using the DataFrame API

In [35]:
# DataFrame-API time travel: a Delta reader pinned to version 4 via the versionAsOf option.
versioned_reader = spark.read.format("delta").option("versionAsOf", "4")

# Load the table through that reader and print it without truncating columns.
versioned_reader.table("demo_db.people_time_travel").show(truncate=False)

                                                                                

+---+---------+--------+----------+
|id |firstName|lastName|birthDate |
+---+---------+--------+----------+
|101|prashant |pandey  |1975-05-26|
|102|abdul    |hamid   |1986-12-28|
|103|M David  |turner  |1979-08-23|
|104|Kailash  |Patil   |1972-09-02|
+---+---------+--------+----------+



#### Restore using DataFrame API

In [37]:
# Version to roll the table back to.
restore_version = 4

# Look the table up by name, then restore it to that version.
# The call is the cell's last expression, so the notebook echoes the returned DataFrame.
people_dt = DeltaTable.forName(spark, "demo_db.people_time_travel")
people_dt.restoreToVersion(restore_version)

                                                                                

DataFrame[table_size_after_restore: bigint, num_of_files_after_restore: bigint, num_removed_files: bigint, num_restored_files: bigint, removed_files_size: bigint, restored_files_size: bigint]