
[Delta GitHub](https://github.com/delta-io/delta)

[Quickstart](https://docs.delta.io/latest/quick-start.html)

### Set up Apache Spark with Delta

```
pip install --upgrade pyspark

pyspark --packages io.delta:delta-core_2.12:0.8.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"

```

#### [quick start examples](https://github.com/delta-io/delta/blob/master/examples/python/quickstart.py)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
from delta.tables import DeltaTable
import shutil

# Clear any previous runs. ignore_errors=True keeps this best-effort (a missing
# directory is not an error) without a bare `except:` that would also swallow
# KeyboardInterrupt and SystemExit.
shutil.rmtree("/tmp/delta-table", ignore_errors=True)

In [2]:
# Obtain the Spark session used by the rest of this walkthrough.
# NOTE: no Delta configs are set on the builder here; the SQL extension and
# catalog settings are expected to come from the `pyspark --packages ... --conf ...`
# launch command shown in the setup section.
spark = (
    SparkSession.builder
    .appName("quickstart")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Create a table: write ids 0-4 to /tmp/delta-table in Delta format
print("############# Creating a table ###############")
data = spark.range(5)
(
    data.write
    .format("delta")
    .save("/tmp/delta-table")
)

############# Creating a table ###############


In [4]:
# Display the five rows that were just written (ids 0-4)
data.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [5]:
# Load the Delta table back from its path and display it
# (row order is not sorted here)
print("############ Reading the table ###############")
df = spark.read.load("/tmp/delta-table", format="delta")
df.show()

############ Reading the table ###############
+---+
| id|
+---+
|  3|
|  4|
|  0|
|  2|
|  1|
+---+



In [6]:
# Stage the source rows for the upsert (merge): ids 0 through 19
print("########### Upsert new data #############")
newData = spark.range(20)

########### Upsert new data #############


In [7]:
# Get a DeltaTable handle for the table stored at /tmp/delta-table;
# the merge/update/delete calls that follow go through this handle.
deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")

In [8]:
# Upsert newData into the table, matching rows on id:
# matched rows get their id rewritten from the source, unmatched rows are inserted.
merge_builder = deltaTable.alias("oldData").merge(
    newData.alias("newData"),
    "oldData.id = newData.id",
)
merge_builder = merge_builder.whenMatchedUpdate(set={"id": col("newData.id")})
merge_builder = merge_builder.whenNotMatchedInsert(values={"id": col("newData.id")})
merge_builder.execute()


In [9]:
# Inspect the class of the handle (the notebook echoes delta.tables.DeltaTable)
type(deltaTable)

delta.tables.DeltaTable

In [11]:
# Show the table after the merge, sorted by id (ids 0-19 in the captured output)
deltaTable.toDF().orderBy("id").show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [16]:
# Build the replacement rows (ids 15-24) that will overwrite the table contents
print("########## Overwrite the table ###########")
data = spark.range(start=15, end=25)

########## Overwrite the table ###########


In [17]:
# Preview the replacement rows (ids 15-24)
data.show()

+---+
| id|
+---+
| 15|
| 16|
| 17|
| 18|
| 19|
| 20|
| 21|
| 22|
| 23|
| 24|
+---+



In [19]:
# Overwrite the table at the same path, then view the result through the
# existing deltaTable handle, sorted by id.
(
    data.write
    .mode("overwrite")
    .format("delta")
    .save("/tmp/delta-table")
)
deltaTable.toDF().sort("id").show()

+---+
| id|
+---+
| 15|
| 16|
| 17|
| 18|
| 19|
| 20|
| 21|
| 22|
| 23|
| 24|
+---+



In [None]:
# Add 100 to the id of every row whose id is even
print("########### Update to the table(add 100 to every even value) ##############")
deltaTable.update(expr("id % 2 == 0"), {"id": expr("id + 100")})

In [21]:
# Show the table after the update: in the captured output the even ids 16-24
# now read 116-124 and the odd ids are unchanged.
deltaTable.toDF().orderBy("id").show()

+---+
| id|
+---+
| 15|
| 17|
| 19|
| 21|
| 23|
|116|
|118|
|120|
|122|
|124|
+---+



In [22]:
# Delete every row whose id is divisible by 5, then show what remains
print("######### Delete every 5-divisible value ##############")
deltaTable.delete(expr("id % 5 == 0"))
deltaTable.toDF().sort("id").show()

######### Delete every 5-divisible value ##############
+---+
| id|
+---+
| 17|
| 19|
| 21|
| 23|
|116|
|118|
|122|
|124|
+---+



In [23]:
# Time travel: load the table as of version 0 (the captured output shows the
# original ids 0-4) and display it sorted by id.
print("######## Read old data using time travel ############")
v0_df = spark.read.format("delta").load("/tmp/delta-table", versionAsOf=0)
v0_df.sort("id").show()

######## Read old data using time travel ############
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [24]:
# Time travel to version 1: the captured output shows the post-merge state (ids 0-19)
v1_df = spark.read.format("delta").load("/tmp/delta-table", versionAsOf=1)
v1_df.sort("id").show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [25]:
# Cleanup: remove the table directory created by this walkthrough.
# Unlike the best-effort removal in the first cell, this call is unguarded and
# raises if the directory is missing.
shutil.rmtree("/tmp/delta-table")

In [26]:
# IPython shell escape: confirm the table directory is gone (ls is expected to fail)
!ls /tmp/delta-table

ls: cannot access '/tmp/delta-table': No such file or directory


In [27]:
# Stop the Spark session now that the walkthrough is finished
spark.stop()