### Import Libraries

In [13]:
import pyspark
from delta import *
from pyspark.sql.functions import initcap

### Create Spark Session with Delta

In [2]:
# Describe a SparkSession wired up for Delta Lake: register the Delta SQL
# extension and make the Delta catalog the session's default catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta-spark pip helper decorate the builder, then create (or reuse)
# the session. This is the SparkSession, not just a SparkContext.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Keep the notebook output readable: only ERROR-level log lines are printed.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/Library/Python/3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sahilnagpal/.ivy2/cache
The jars for the packages stored in: /Users/sahilnagpal/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-820ee371-26f0-4bae-b10b-4476f175c4ba;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 147ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     | 

### Loading the data

In [7]:
# Make sure the demo database exists. IF NOT EXISTS keeps the cell re-runnable.
# Plain string literal: the statement has no placeholders, so no f-prefix is needed.
spark.sql("CREATE DATABASE IF NOT EXISTS demo_db")

DataFrame[]

In [8]:
# DDL: (re)create the Delta table that the rest of the notebook mutates.
create_people_ddl = '''CREATE OR REPLACE TABLE demo_db.people(
id INT,
firstName STRING,
lastName STRING,
birthDate STRING)
USING DELTA'''
spark.sql(create_people_ddl)

# Load: replace the table contents with the JSON file, renaming the source
# fields (fname, lname, dob) to the table's column names.
# NOTE(review): the path is an absolute location on one machine; consider a
# configurable data directory so the notebook runs elsewhere.
load_people_sql = '''
INSERT OVERWRITE TABLE demo_db.people
SELECT id, fname as firstName, lname as lastName, dob as birthDate
FROM JSON.`/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people.json`
'''
spark.sql(load_people_sql)


                                                                                

DataFrame[]

In [9]:
# Inspect the freshly loaded table before any delete/update/merge is applied.
people_initial = spark.sql('''select * from demo_db.people''')
people_initial.show()

                                                                                

+---+---------+--------+----------+
| id|firstName|lastName| birthDate|
+---+---------+--------+----------+
|101| prashant|  pandey|1975-05-25|
|102|    abdul|   hamid|1986-12-28|
|103|  M David|  turner|1979-08-23|
|104|  Kailash|   Patil|1972-09-02|
+---+---------+--------+----------+



### Operations on the Delta Table

#### Delete data using Spark SQL

In [10]:
# Row-level delete expressed in SQL. The returned frame carries a single
# num_affected_rows column, which is what gets printed.
sql_delete_result = spark.sql('''delete from demo_db.people where firstName ="M David"''')
sql_delete_result.show()

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+



#### Delete data using the Delta API

In [11]:
from delta import DeltaTable

# Handle onto the catalog table. Later cells reuse `people_dt` for the
# update and merge operations.
people_dt = DeltaTable.forName(spark, "demo_db.people")

# The predicate is a SQL expression string; rows matching it are removed.
abdul_predicate = "firstName = 'abdul'"
people_dt.delete(abdul_predicate)

                                                                                

#### Update a record using the Delta API

In [14]:
# Title-case both name columns for the row(s) whose birthDate matches.
# Each value is a Column expression built with initcap().
title_cased_names = {
    "firstName": initcap("firstName"),
    "lastName": initcap("lastName"),
}
people_dt.update(condition="birthDate = '1975-05-25'", set=title_cased_names)

In [15]:
# Check the table after the deletes and the update.
people_after_update = spark.sql('''select * from demo_db.people''')
people_after_update.show()

+---+---------+--------+----------+
| id|firstName|lastName| birthDate|
+---+---------+--------+----------+
|101| Prashant|  Pandey|1975-05-25|
|104|  Kailash|   Patil|1972-09-02|
+---+---------+--------+----------+



#### Merge records using the Delta API

##### Read the source dataset to merge into the table

In [18]:
# Read the raw JSON again as a DataFrame; it is the source side of the merge.
# Its columns keep the file's own names (dob, fname, id, lname).
# NOTE(review): the path is an absolute location on one machine; consider a
# configurable data directory.
json_reader = spark.read.format("json")
source_df = json_reader.load("/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people.json")
source_df.show()

+----------+--------+---+------+
|       dob|   fname| id| lname|
+----------+--------+---+------+
|1975-05-26|prashant|101|pandey|
|1986-12-28|   abdul|102| hamid|
|1979-08-23| M David|103|turner|
|1972-09-02| Kailash|104| Patil|
+----------+--------+---+------+



##### Merging the data

In [19]:
# Full-row mapping from source fields to target columns. The catch-all update
# and the insert use exactly the same assignments, so they share one dict.
full_row_from_src = {
    "tgt.id": "src.id",
    "tgt.firstName": "src.fname",
    "tgt.lastName": "src.lname",
    "tgt.birthDate": "src.dob",
}

# Upsert keyed on id. The matched clauses are declared most-specific first:
#   1. matched row is Kailash Patil -> delete it
#   2. matched row has id 101       -> refresh birthDate only
#   3. any other matched row        -> overwrite every column from the source
# Source rows with no match in the target are inserted.
merge_builder = people_dt.alias("tgt").merge(source_df.alias("src"), "src.id=tgt.id")
merge_builder = merge_builder.whenMatchedDelete(
    condition="tgt.firstName='Kailash' and tgt.lastName='Patil'"
)
merge_builder = merge_builder.whenMatchedUpdate(
    condition="tgt.id = 101",
    set={"tgt.birthDate": "src.dob"},
)
merge_builder = merge_builder.whenMatchedUpdate(set=full_row_from_src)
merge_builder = merge_builder.whenNotMatchedInsert(values=full_row_from_src)
merge_builder.execute()

In [20]:
# Final state of the table once the merge has been applied.
people_after_merge = spark.sql('''select * from demo_db.people''')
people_after_merge.show()

                                                                                

+---+---------+--------+----------+
| id|firstName|lastName| birthDate|
+---+---------+--------+----------+
|101| Prashant|  Pandey|1975-05-26|
|102|    abdul|   hamid|1986-12-28|
|103|  M David|  turner|1979-08-23|
+---+---------+--------+----------+

