### Import Libraries

In [7]:
import pyspark
from delta import *

### Create Spark Session with Delta

In [8]:
# Describe a SparkSession named "DeltaTutorial" with the Delta SQL extension
# and the Delta catalog registered as the session catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Pass the builder through the delta helper, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Limit Spark's log output to errors.
spark.sparkContext.setLogLevel("ERROR")

### Create Schema If Not Exists

In [9]:
# Create the demo_db database unless it already exists, so the cell is safe to re-run.
# (Plain string literal: the statement has no placeholders, so no f-string is needed.)
spark.sql("CREATE DATABASE IF NOT EXISTS demo_db")

DataFrame[]

In [10]:
# List the tables currently registered in demo_db.
(
    spark
    .sql("show tables in demo_db")
    .show()
)

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



### Define the Dataframe to Load

In [11]:
# Explicit schema (DDL string) applied when reading the flight-time JSON file.
flight_schema_ddl = """FL_DATE DATE, OP_CARRIER STRING, OP_CARRIER_FL_NUM INT, ORIGIN STRING, 
          ORIGIN_CITY_NAME STRING, DEST STRING, DEST_CITY_NAME STRING, CRS_DEP_TIME INT, DEP_TIME INT, 
          WHEELS_ON INT, TAXI_IN INT, CRS_ARR_TIME INT, ARR_TIME INT, CANCELLED STRING, DISTANCE INT"""

# Location of the source JSON file.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook elsewhere.
FLIGHT_TIME_JSON_PATH = "/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/flight-time.json"

# Read the JSON file with the schema above; the "dateFormat" option tells the
# reader how DATE values (FL_DATE) are written in the file.
# The path is a plain string: it has no placeholders, so no f-string is needed.
flight_time_df = (
    spark.read.format("json")
    .schema(flight_schema_ddl)
    .option("dateFormat", "M/d/y")
    .load(FLIGHT_TIME_JSON_PATH)
)

### Ways To Store Data in Delta Format
#### We can store data as a Delta table using either of the following methods
1. Using the Delta Format
2. Using the Delta Table Builder API

#### 1. Using the Delta Format

In [12]:
# DDL for the Delta-backed flight table; IF NOT EXISTS keeps the statement
# safe to run when the table is already present.
create_flight_table_sql = '''CREATE TABLE IF NOT EXISTS demo_db.flight_time_tbl_delta (
     FL_DATE DATE, 
     OP_CARRIER STRING, 
     OP_CARRIER_FL_NUM INT, 
     ORIGIN STRING, 
     ORIGIN_CITY_NAME STRING, 
     DEST STRING, 
     DEST_CITY_NAME STRING, 
     CRS_DEP_TIME INT, 
     DEP_TIME INT, 
     WHEELS_ON INT, 
     TAXI_IN INT, 
     CRS_ARR_TIME INT, 
     ARR_TIME INT, 
     CANCELLED STRING, 
     DISTANCE INT
 ) USING DELTA'''

# Run the DDL; left as the final expression so the result is echoed by the cell.
spark.sql(create_flight_table_sql)


DataFrame[]

In [13]:
# Show the table's detail row (format, location, file count, ...) without
# truncating wide column values.
(
    spark
    .sql("describe detail demo_db.flight_time_tbl_delta")
    .show(truncate=False)
)



+------+------------------------------------+-------------------------------------------+-----------+--------------------------------------------------------------------------------------------------------+-----------------------+-----------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+------------------------+
|format|id                                  |name                                       |description|location                                                                                                |createdAt              |lastModified           |partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|tableFeatures           |
+------+------------------------------------+-------------------------------------------+-----------+--------------------------------------------------------------------------------------------------------+-----------------------+--------

                                                                                

In [14]:
# Preview the table; in the recorded run it has no rows at this point.
(
    spark
    .sql("select * from demo_db.flight_time_tbl_delta")
    .show()
)

                                                                                

+-------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
|FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|ORIGIN_CITY_NAME|DEST|DEST_CITY_NAME|CRS_DEP_TIME|DEP_TIME|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|CANCELLED|DISTANCE|
+-------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
+-------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+



#### Load the dataframe into the table

In [15]:
# Append the rows of flight_time_df to the Delta table created earlier.
# Because the write mode is "append", each execution of this cell adds the rows again.
(
    flight_time_df.write
    .format("delta")
    .mode("append")
    .saveAsTable("demo_db.flight_time_tbl_delta")
)

                                                                                

In [16]:
# Preview the table again; after the write above it now contains rows.
(
    spark
    .sql("select * from demo_db.flight_time_tbl_delta")
    .show()
)

                                                                                

+----------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|ORIGIN_CITY_NAME|DEST|DEST_CITY_NAME|CRS_DEP_TIME|DEP_TIME|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|CANCELLED|DISTANCE|
+----------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
|2000-01-01|        DL|             1451|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1115|    1113|     1343|      5|        1400|    1348|        0|     946|
|2000-01-01|        DL|             1479|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1315|    1311|     1536|      7|        1559|    1543|        0|     946|
|2000-01-01|        DL|             1857|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1415|    1414|     1642|      9|        1721|    1651|        0|     946

#### 2. Using the Delta Table Builder API

In [18]:
# (column name, SQL type) pairs for the table, in declaration order.
api_table_columns = [
    ("id", "INT"),
    ("FL_DATE", "DATE"),
    ("OP_CARRIER", "STRING"),
    ("OP_CARRIER_FL_NUM", "INT"),
    ("ORIGIN", "STRING"),
    ("ORIGIN_CITY_NAME", "STRING"),
    ("DEST", "STRING"),
    ("DEST_CITY_NAME", "STRING"),
    ("CRS_DEP_TIME", "INT"),
    ("DEP_TIME", "INT"),
    ("WHEELS_ON", "INT"),
    ("TAXI_IN", "INT"),
    ("CRS_ARR_TIME", "INT"),
    ("ARR_TIME", "INT"),
    ("CANCELLED", "STRING"),
    ("DISTANCE", "INT"),
]

# Start a create-or-replace builder for the target table, then register each column.
api_table_builder = DeltaTable.createOrReplace(spark).tableName("demo_db.flight_time_tbl_delta_api")
for column_name, column_type in api_table_columns:
    api_table_builder = api_table_builder.addColumn(column_name, column_type)

# Execute the builder; left as the final expression so the resulting
# DeltaTable object is echoed as the cell output.
api_table_builder.execute()

<delta.tables.DeltaTable at 0x10f291ee0>

### Ways to Read Data in Delta Format
#### We can read a Delta table using either of the following methods
1. Using Spark SQL
2. Using the Spark DataFrame API

#### Using Spark SQL

In [20]:
# Query the builder-created table through Spark SQL and print the result.
(
    spark
    .sql("select * from demo_db.flight_time_tbl_delta_api")
    .show()
)

                                                                                

+---+-------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
| id|FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|ORIGIN_CITY_NAME|DEST|DEST_CITY_NAME|CRS_DEP_TIME|DEP_TIME|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|CANCELLED|DISTANCE|
+---+-------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
+---+-------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+



#### Using the Spark DataFrame API

In [21]:
# Directory holding the files of demo_db.flight_time_tbl_delta.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
data_location = "/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/spark-warehouse/demo_db.db/flight_time_tbl_delta/"

# Load the Delta table by its path instead of by its catalog name.
data = spark.read.format("delta").load(data_location)

In [22]:
# Print only the first two rows of the DataFrame loaded from the Delta path.
data.show(n=2)

+----------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|ORIGIN_CITY_NAME|DEST|DEST_CITY_NAME|CRS_DEP_TIME|DEP_TIME|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|CANCELLED|DISTANCE|
+----------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
|2000-01-01|        DL|             1451|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1115|    1113|     1343|      5|        1400|    1348|        0|     946|
|2000-01-01|        DL|             1479|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1315|    1311|     1536|      7|        1559|    1543|        0|     946|
+----------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------