### Import Libraries

In [2]:
import pyspark
from delta import configure_spark_with_delta_pip

### Create Spark Session with Delta

In [3]:
# Build a Delta-enabled Spark session: register the Delta SQL extension and
# make DeltaCatalog the implementation behind the default `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Create (or reuse) the Spark session. The builder is first passed through
# configure_spark_with_delta_pip; the cell output shows the delta-spark jars
# being resolved at this point.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Keep the notebook output readable: only log errors.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/Users/sahilnagpal/Library/Python/3.9/lib/python/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/sahilnagpal/.ivy2.5.2/cache
The jars for the packages stored in: /Users/sahilnagpal/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d50019f0-4e18-405b-92b8-55907e3bedbf;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-spark_2.13/4.0.0/delta-spark_2.13-4.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-spark_2.13;4.0.0!delta-spark_2.13.jar (208ms)
downloading https://repo1.maven.org/maven2/io/delta/delta-storage/4.0.0/delta-storage-4.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-storage;4.0.0!delta-storage.jar (32ms)
downloading https://rep

### Create the Existing Data

In [4]:
# Column names for the user table (reused when building the updates DataFrame).
columns = ["user_id", "name", "status"]

# Seed rows, one (user_id, name, status) tuple per user.
data = [(1, "Alice", "active"), (2, "Bob", "inactive"), (3, "Charlie", "active")]

df = spark.createDataFrame(data, columns)

# Write the seed rows in Delta format, replacing whatever is already stored
# at the target path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/delta_data/")
)

                                                                                

In [5]:
# Register the Delta files at LOCATION as the catalog table `user_profiles`
# so it can be queried by name.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE fails with a
# "table already exists" error when the cell is executed a second time in the
# same Spark session.
spark.sql("""
CREATE TABLE IF NOT EXISTS user_profiles
USING DELTA
LOCATION '/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/delta_data/';
""")

DataFrame[]

In [6]:
# Show the Delta table's metadata (format, location, numFiles, protocol
# versions, ...) without truncating wide columns.
(
    spark.sql("describe detail user_profiles")
    .show(truncate=False)
)

+------+------------------------------------+-----------------------------------+-----------+--------------------------------------------------------------------------+-----------------------+-----------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+------------------------+
|format|id                                  |name                               |description|location                                                                  |createdAt              |lastModified           |partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|tableFeatures           |
+------+------------------------------------+-----------------------------------+-----------+--------------------------------------------------------------------------+-----------------------+-----------------------+----------------+-----------------+--------+-----------+----------+----------------+--------------

In [7]:
# Print the table contents before the merge, without truncating values.
(
    spark.sql("select * from user_profiles")
    .show(truncate=False)
)

+-------+-------+--------+
|user_id|name   |status  |
+-------+-------+--------+
|3      |Charlie|active  |
|2      |Bob    |inactive|
|1      |Alice  |active  |
+-------+-------+--------+



### New Dataset with Some Changes

In [13]:
# Incoming rows to merge into user_profiles, keyed by user_id.
updates = [
    # user_id 2 already exists in the table, so MERGE takes the MATCHED branch.
    # NOTE(review): these values are identical to the stored row, so the
    # update does not visibly change anything.
    (2, "Bob", "inactive"),
    # user_id 4 is not in the table yet, so MERGE takes the NOT MATCHED branch.
    (4, "Diana", "active"),
]

df_updates = spark.createDataFrame(updates, columns)

# Expose the DataFrame to Spark SQL under the name `updates_view`.
df_updates.createOrReplaceTempView("updates_view")

In [14]:
# Preview the incoming rows before they are merged.
df_updates.show()

+-------+-----+--------+
|user_id| name|  status|
+-------+-----+--------+
|      2|  Bob|inactive|
|      4|Diana|  active|
+-------+-----+--------+



### Perform `MERGE INTO`

In [15]:
# Upsert `updates_view` into `user_profiles`, matching rows on user_id:
#   - matched rows are overwritten with the source row's columns,
#   - source rows with no match are inserted.
# .show() prints the merge metrics (affected / updated / deleted / inserted).
# NOTE(review): the saved output reports 2 updated and 0 inserted rows, which
# suggests this cell was re-run after user_id 4 had already been merged —
# confirm with Restart Kernel -> Run All.
spark.sql(
    """
MERGE INTO user_profiles AS target
USING updates_view as source
ON target.user_id = source.user_id

WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
"""
).show(truncate=False)

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|2                |2               |0               |0                |
+-----------------+----------------+----------------+-----------------+



In [16]:
# Print the table contents after the merge to verify the upsert.
(
    spark.sql("select * from user_profiles")
    .show()
)

+-------+-------+--------+
|user_id|   name|  status|
+-------+-------+--------+
|      3|Charlie|  active|
|      1|  Alice|  active|
|      2|    Bob|inactive|
|      4|  Diana|  active|
+-------+-------+--------+

