
[Delta GitHub](https://github.com/delta-io/delta)

[Quickstart](https://docs.delta.io/latest/quick-start.html)

### Setup Apache Spark with Delta

```
pip install --upgrade pyspark
pyspark --packages io.delta:delta-core_2.11:0.4.0
```

#### [quick start examples](https://github.com/delta-io/delta/blob/master/examples/python/quickstart.py)

In [1]:
from pyspark import SparkContext
from pyspark.sql import Column, DataFrame, SparkSession, SQLContext, functions
from pyspark.sql.functions import *
from py4j.java_collections import MapConverter
from delta.tables import *
import shutil
import threading

In [2]:
# Clear any previous runs
try:
    shutil.rmtree("/tmp/delta-table")
except:
    pass

In [4]:
# Create SparkContext
sc = SparkContext()
sqlContext = SQLContext(sc)

spark = SparkSession \
    .builder \
    .appName("quickstart") \
    .master("local[*]") \
    .getOrCreate()

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=PySparkShell, master=local[*]) created by <module> at /home/gong/anaconda3/lib/python3.7/site-packages/IPython/utils/py3compat.py:188 

https://stackoverflow.com/questions/46257917/killing-a-sparkcontext-so-i-can-create-a-new-one

In [5]:
# use below to work around above error
from pyspark import  SparkContext
sc = SparkContext.getOrCreate()

In [6]:
sc

In [7]:
# Create a table
print("############# Creating a table ###############")
data = spark.range(0, 5)
data.write.format("delta").save("/tmp/delta-table")

############# Creating a table ###############


folder `/tmp/delta-table` is created with sub-folder `_delta_log` storing transactions

In [8]:
# Read the table
print("############ Reading the table ###############")
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()

############ Reading the table ###############
+---+
| id|
+---+
|  3|
|  4|
|  1|
|  2|
|  0|
+---+



In [9]:
# Upsert (merge) new data
print("########### Upsert new data #############")
newData = spark.range(0, 20)

########### Upsert new data #############


In [10]:
deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")

In [11]:
type(deltaTable)

delta.tables.DeltaTable

In [12]:
deltaTable.alias("oldData")\
    .merge(
    newData.alias("newData"),
    "oldData.id = newData.id"\
    ).whenMatchedUpdate(set={"id": col("newData.id")})\
    .whenNotMatchedInsert(values={"id": col("newData.id")})\
    .execute()

In [13]:
# convert deltaTable to DataFrame
deltaTable.toDF().show()

+---+
| id|
+---+
|  2|
| 16|
| 18|
| 12|
|  8|
| 19|
|  4|
| 11|
|  5|
| 10|
| 13|
|  7|
|  0|
|  6|
|  1|
| 14|
|  9|
| 17|
|  3|
| 15|
+---+



In [14]:
deltaTable.toDF().count()

20

In [15]:
# Update table data
print("########## Overwrite the table ###########")
data = spark.range(5, 10)
data.write.format("delta").mode("overwrite").save("/tmp/delta-table")
deltaTable.toDF().show()

########## Overwrite the table ###########
+---+
| id|
+---+
|  8|
|  9|
|  5|
|  7|
|  6|
+---+



In [16]:
# read data back
df2 = spark.read.format("delta").load("/tmp/delta-table")
df2.show()

+---+
| id|
+---+
|  8|
|  9|
|  5|
|  7|
|  6|
+---+



In [17]:
# Update every even value by adding 100 to it
print("########### Update to the table(add 100 to every even value) ##############")

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")

deltaTable.update(
    condition=expr("id % 2 == 0"),
    set={"id": expr("id + 100")})

deltaTable.toDF().show()

########### Update to the table(add 100 to every even value) ##############
+---+
| id|
+---+
|108|
|  9|
|106|
|  5|
|  7|
+---+



In [18]:
# Delete every odd value
print("######### Delete every odd value ##############")
deltaTable.delete(condition=expr("id % 2 > 0"))
deltaTable.toDF().show()

######### Delete every odd value ##############
+---+
| id|
+---+
|106|
|108|
+---+



In [19]:
# Read old version of data using time travel
print("######## Read old data using time travel ############")
df = spark.read.format("delta")\
    .option("versionAsOf", 0)\
    .load("/tmp/delta-table")
df.show()

######## Read old data using time travel ############
+---+
| id|
+---+
|  3|
|  4|
|  1|
|  2|
|  0|
+---+



In [20]:
df = spark.read.format("delta")\
    .option("versionAsOf", 1)\
    .load("/tmp/delta-table")
df.show()

+---+
| id|
+---+
|  2|
| 16|
| 18|
| 12|
|  8|
| 19|
|  4|
| 11|
|  5|
| 10|
| 13|
|  7|
|  0|
|  6|
|  1|
| 14|
|  9|
| 17|
|  3|
| 15|
+---+



In [21]:
# cleanup
shutil.rmtree("/tmp/delta-table")

In [22]:
!ls /tmp/delta-table

ls: cannot access '/tmp/delta-table': No such file or directory
