### Import Libraries

In [1]:
import pyspark
from delta import *
from pyspark.sql.functions import initcap

### Create Spark Session with Delta

In [2]:
#  Create a spark session with Delta
builder = pyspark.sql.SparkSession.builder.appName("DeltaTutorial") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Create spark context
spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/Library/Python/3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sahilnagpal/.ivy2/cache
The jars for the packages stored in: /Users/sahilnagpal/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9d4ff5f1-53b5-4db7-bc0f-5226865c7db6;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 223ms :: artifacts dl 6ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     | 

### Loading the data

In [7]:
spark.sql(f"CREATE DATABASE IF NOT EXISTS demo_db")

DataFrame[]

In [9]:
spark.sql('''drop table demo_db.people_schema_validation''')

DataFrame[]

In [10]:
spark.sql('''CREATE OR REPLACE TABLE demo_db.people_schema_validation(
id INT,
firstName STRING,
lastName STRING)
USING DELTA''')

# # inserting the data
# spark.sql('''
# INSERT OVERWRITE TABLE demo_db.people_schema_validation
# SELECT id, fname as firstName, lname as lastName, dob as birthDate
# FROM JSON.`/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people.json`
# ''')


DataFrame[]

In [18]:
spark.sql('''select * from demo_db.people_schema_validation''').show()

                                                                                

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|101| prashant|  pandey|
|102|    abdul|   hamid|
|103|  M David|  turner|
|104|  Kailash|   Patil|
|105|    jolly|  sharma|
|106|     menu|   verma|
+---+---------+--------+



#### Schema Validations Summary
1. INSERT &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&ensp;&nbsp;- Column matching by position, New columns not allowed
2. OVERWRITE &emsp;&emsp;&emsp;&emsp;&ensp;- Column matching by position, New columns not allowed
3. MERGE INSERT &emsp;&emsp;&emsp;&nbsp;- Column matching by name, New columns ignored
4. DataFrame Append &emsp;&nbsp;- Column matching by name, New columns not allowed
5. Data Type Mismatch &emsp;- Not allowed in any case
#### Schema evolution approaches
1. Manual&emsp;&nbsp; - New columns
2. Automatic - New columns

#### INSERT - Column matching by position (matching names not mandatory)

In [11]:
spark.sql('''
INSERT INTO demo_db.people_schema_validation
SELECT id, fname, lname
FROM JSON.`/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people.json`
''')

                                                                                

DataFrame[]

#### INSERT - New columns not allowed

In [14]:
spark.sql('''
INSERT INTO demo_db.people_schema_validation
SELECT id, fname, lname, dob
FROM JSON.`/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people.json`
''')

AnalysisException: [_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: 73742a51-40c9-4533-b67a-ea60135531f6).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- id: integer (nullable = true)
-- firstName: string (nullable = true)
-- lastName: string (nullable = true)


Data schema:
root
-- id: integer (nullable = true)
-- firstName: string (nullable = true)
-- lastName: string (nullable = true)
-- dob: string (nullable = true)

         

#### OVERWRITE - New columns not allowed

In [15]:
spark.sql('''
INSERT OVERWRITE demo_db.people_schema_validation
SELECT id, fname, lname, dob
FROM JSON.`/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people.json`
''')

AnalysisException: [_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: 73742a51-40c9-4533-b67a-ea60135531f6).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- id: integer (nullable = true)
-- firstName: string (nullable = true)
-- lastName: string (nullable = true)


Data schema:
root
-- id: integer (nullable = true)
-- firstName: string (nullable = true)
-- lastName: string (nullable = true)
-- dob: string (nullable = true)

         
To overwrite your schema or change partitioning, please set:
'.option("overwriteSchema", "true")'.

Note that the schema can't be overwritten when using
'replaceWhere'.
         

#### MERGE - Column matching by name (matching by position not allowed)

In [16]:
spark.sql('''
MERGE INTO demo_db.people_schema_validation tgt
USING (SELECT id, fname, lname FROM json.`/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people_2.json`) src
ON tgt.id = src.id
WHEN NOT MATCHED THEN
    INSERT * 
''')

AnalysisException: [DELTA_MERGE_UNRESOLVED_EXPRESSION] Cannot resolve firstName in INSERT clause given columns src.id, src.fname, src.lname; line 2 pos 0

#### MERGE - New columns silently ignored

In [17]:
spark.sql('''
MERGE INTO demo_db.people_schema_validation tgt
USING (SELECT id, fname firstName, lname lastName, dob FROM json.`/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people_2.json`) src
ON tgt.id = src.id
WHEN NOT MATCHED THEN
    INSERT *
''')

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

### Using DataFrame API

#### Dataframe append - Column matching by name (matching by position not allowed)

In [19]:
people_schema = "id INT, fname STRING, lname STRING"
people_df =  spark.read.format("json").schema(people_schema).load("/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people_2.json")
people_df.write.format("delta").mode("append").saveAsTable("demo_db.people_schema_validation")

AnalysisException: [_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: 73742a51-40c9-4533-b67a-ea60135531f6).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- id: integer (nullable = true)
-- firstName: string (nullable = true)
-- lastName: string (nullable = true)


Data schema:
root
-- id: integer (nullable = true)
-- fname: string (nullable = true)
-- lname: string (nullable = true)

         

#### Dataframe append - New columns not allowed

In [20]:
people_schema = "id INT, firstName STRING, lastName STRING, dob STRING"
people_df =  spark.read.format("json").schema(people_schema).load("/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/people_2.json")
people_df.write.format("delta").mode("append").saveAsTable("demo_db.people_schema_validation")
people_df.show(truncate=False)

AnalysisException: [_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: 73742a51-40c9-4533-b67a-ea60135531f6).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- id: integer (nullable = true)
-- firstName: string (nullable = true)
-- lastName: string (nullable = true)


Data schema:
root
-- id: integer (nullable = true)
-- firstName: string (nullable = true)
-- lastName: string (nullable = true)
-- dob: string (nullable = true)

         