In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

### Data Reading

In [0]:
# Load the raw customer records (Parquet) from the bronze container.
df = spark.read.parquet(
    'abfss://bronze@databricksstorageete.dfs.core.windows.net/customers'
)

In [0]:
# Preview the bronze customers frame.
df.display()

In [0]:
# Derive two columns on top of the bronze customers frame:
#   email_domain - the last piece of `email` after splitting on '@'
#   full_name    - `first_name` and `last_name` joined by a single space
transformed_df = (
    df
    .withColumn('email_domain', F.element_at(F.split('email', '@'), -1))
    # concat_ws skips NULL inputs, whereas concat would turn the whole name
    # into NULL as soon as either part is NULL (both NULL yields '').
    .withColumn('full_name', F.concat_ws(' ', 'first_name', 'last_name'))
)
transformed_df.display()

In [0]:
# Number of customers per email domain, largest group first.
display(
    transformed_df
    .groupBy('email_domain')
    .agg(F.count('customer_id').alias('count_of_customers'))
    .orderBy(F.col('count_of_customers').desc())
)

In [0]:
def _customers_with_domain(domain):
    """Return the rows of `transformed_df` whose `email_domain` equals `domain`."""
    return transformed_df.where(F.col('email_domain') == domain)


# One frame per provider; append .display() to any of them to inspect the rows.
transformed_df_gmail = _customers_with_domain('gmail.com')
transformed_df_hotmail = _customers_with_domain('hotmail.com')
transformed_df_yahoo = _customers_with_domain('yahoo.com')

In [0]:
# Write the transformed customers to the silver container in Delta format,
# using overwrite mode.
transformed_df.write.save(
    'abfss://silver@databricksstorageete.dfs.core.windows.net/customers',
    format='delta',
    mode='overwrite',
)

### Create External Table

In [0]:
%sql
-- Register the Delta files at the silver location as a catalog table
-- (no-op when the table already exists).
CREATE TABLE IF NOT EXISTS databricks_cata.silver.customers
  USING DELTA
  LOCATION 'abfss://silver@databricksstorageete.dfs.core.windows.net/customers';

In [0]:
%sql
-- Read the table through its catalog name.
SELECT *
FROM databricks_cata.silver.customers;

In [0]:
%sql
-- Show the table's history, addressed by catalog name.
DESCRIBE HISTORY
  databricks_cata.silver.customers;

In [0]:
from delta import DeltaTable

In [0]:
# Bind a DeltaTable handle to the storage path rather than to a catalog name.
customers_delta = DeltaTable.forPath(
    spark,
    "abfss://silver@databricksstorageete.dfs.core.windows.net/customers",
)
customers_delta

In [0]:
# Show the DeltaTable's contents as a DataFrame.
display(customers_delta.toDF())

In [0]:
%sql
-- Drop the external table; the data still remains in the external location.
DROP TABLE IF EXISTS
  databricks_cata.silver.customers;

In [0]:
%sql
-- Try reading the external table by name after it was dropped.
-- NOTE(review): the preceding cell drops this table, so this query is presumably
-- expected to fail here - confirm that the error is the intended demonstration.
SELECT *
FROM databricks_cata.silver.customers;

In [0]:
%sql
-- Query the Delta files directly at the external location (no catalog table involved).
SELECT *
FROM delta.`abfss://silver@databricksstorageete.dfs.core.windows.net/customers`;

In [0]:
%sql
-- Time travel by version number, reading the Delta files at the external location.
-- Version 18 is specific to this table's history.
SELECT *
FROM delta.`abfss://silver@databricksstorageete.dfs.core.windows.net/customers`
  VERSION AS OF 18;

In [0]:
%sql
-- Time travel by timestamp, reading the Delta files at the external location.
SELECT *
FROM delta.`abfss://silver@databricksstorageete.dfs.core.windows.net/customers`
  TIMESTAMP AS OF "2025-11-23T01:40:12.000+00:00";

In [0]:
%sql
-- History cannot be described by name after the table has been dropped,
-- but it can be after a DELETE FROM statement.
-- The name is deliberately left unquoted: wrapping the whole three-part name
-- in a single pair of backticks makes it one identifier containing dots
-- instead of catalog.schema.table, so it would never resolve to this table.
DESCRIBE HISTORY databricks_cata.silver.customers;

In [0]:
%sql
-- After the table is dropped its history cannot be described by name,
-- but it can still be described by pointing at the Delta files directly.
DESCRIBE HISTORY
  delta.`abfss://silver@databricksstorageete.dfs.core.windows.net/customers`;

In [0]:
%sql
-- Roll the catalog table back to version 18.
-- NOTE(review): the table is dropped in an earlier cell and only re-created
-- further down, so when run in order this presumably fails - confirm intent.
RESTORE TABLE databricks_cata.silver.customers
  TO VERSION AS OF 18;
    

### Manually restoring a dropped Delta table by querying the Delta Lake files and time travelling to an older version

In [0]:
# Snapshot of the Delta files at the external location as of the given
# timestamp, read by path so no catalog table is required.
restored_df = spark.sql(
"""
SELECT * FROM delta.`abfss://silver@databricksstorageete.dfs.core.windows.net/customers` TIMESTAMP AS OF "2025-11-23T01:40:12.000+00:00";
"""
)

In [0]:
# Write the time-travelled snapshot back to the same silver path in Delta
# format, using overwrite mode.
restored_df.write.save(
    'abfss://silver@databricksstorageete.dfs.core.windows.net/customers',
    format='delta',
    mode='overwrite',
)

In [0]:
%sql
-- Register the catalog table again on top of the files at the silver location.
CREATE TABLE IF NOT EXISTS databricks_cata.silver.customers
  USING DELTA
  LOCATION 'abfss://silver@databricksstorageete.dfs.core.windows.net/customers';

In [0]:
%sql
-- Inspect the history of the re-created table.
DESCRIBE HISTORY
  databricks_cata.silver.customers;

In [0]:
%sql
-- Roll the catalog table back to version 20 (the number is specific to this table's history).
RESTORE TABLE databricks_cata.silver.customers
  TO VERSION AS OF 20;

In [0]:
# Time travel through the DataFrame reader: load version 17 straight from the path.
display(
    spark.read
    .format('delta')
    .option("versionAsOf", "17")
    .load('abfss://silver@databricksstorageete.dfs.core.windows.net/customers')
)

In [0]:
# Time travel through the DataFrame reader, this time by timestamp and by table name.
display(
    spark.read
    .format("delta")
    .option("timestampAsOf", "2025-11-23T01:33:12.000+00:00")
    .table("databricks_cata.silver.customers")
)

### Restoring delta table using DeltaTable API

In [0]:
# The DataFrame API offers no way to restore a Delta table; the DeltaTable API does.
dt_customers = DeltaTable.forName(
    spark,
    "databricks_cata.silver.customers",
)
dt_customers.restoreToTimestamp("2025-11-23T01:50:12.000+00:00")

In [0]:
%sql
-- Read the catalog table again, following the restoreToTimestamp call in the previous cell.
SELECT *
FROM databricks_cata.silver.customers;

In [0]:
# Load the catalog table into a DataFrame for the column-renaming experiment.
# (Chaining .toDF(<new names>) here would be another way to rename every column.)
test_df = spark.read.format("delta").table("databricks_cata.silver.customers")
display(test_df)

In [0]:
# Show every column of test_df under a "premerge_"-prefixed alias.
display(
    test_df.select(
        *(F.col(name).alias(f"premerge_{name}") for name in test_df.columns)
    )
)

In [0]:
%sql
-- Show the extended description of the table.
DESCRIBE EXTENDED
  databricks_cata.silver.customers;

In [0]:
%sql
-- Delete every row from the table.
DELETE FROM databricks_cata.silver.customers;

-- Dry-run VACUUM with no explicit RETAIN clause.
VACUUM databricks_cata.silver.customers
  DRY RUN;

In [0]:
# Best practice to delete complete data from Delta Table 
# 

# It's recommended to use the overwrite option. Overwrite the table data and run a VACUUM command. 

#     To Delete the data from a Managed Delta table, the DROP TABLE command can be used. 
#     If it's an external table, then run a DELETE query on the table and then execute VACUUM with RETAIN 0 HOURS
#     CREATE or REPLACE table can also be used
#     We do not recommend deleting the files from the underlying storage directly. That can cause issues with the transaction logs


#  https://community.databricks.com/t5/data-engineering/what-is-the-best-practice-of-deleting-the-complete-data-from/td-p/19166

In [0]:
# Switch off the Delta retention-duration check for this Spark session.
# This does not work on serverless compute: the RDD API is unsupported there and
# Spark configs have only limited support.
# https://community.databricks.com/t5/data-engineering/change-spark-configs-in-serverless-compute-clusters/td-p/105512
spark.conf.set(
    "spark.databricks.delta.retentionDurationCheck.enabled",
    "false",
)

In [0]:
%sql
-- Delete every row from the table.
DELETE FROM databricks_cata.silver.customers;

-- Dry-run VACUUM with a zero-hour retention window.
VACUUM databricks_cata.silver.customers
  RETAIN 0 HOURS
  DRY RUN;

In [0]:
%sql
-- Delete every row from the table.
DELETE FROM databricks_cata.silver.customers;

-- Run VACUUM for real with a zero-hour retention window.
VACUUM databricks_cata.silver.customers
  RETAIN 0 HOURS;

In [0]:
%sql
-- Remove the table from the catalog if it is present.
DROP TABLE IF EXISTS
  databricks_cata.silver.customers;

In [0]:
%sql
-- Query the table by name after the DROP TABLE in the previous cell.
-- NOTE(review): presumably expected to fail because the table no longer exists - confirm.
SELECT *
FROM databricks_cata.silver.customers;

In [0]:
# List what is left at the silver customers location.
display(dbutils.fs.ls("abfss://silver@databricksstorageete.dfs.core.windows.net/customers"))

In [0]:
# After deleting the underlying data files in the data lake, the Delta log still exists; to delete it use:
# dbutils.fs.rm("abfss://silver@databricksstorageete.dfs.core.windows.net/customers/_delta_log/", recurse=True)

In [0]:
# Manual way of deleting the underlying Delta log and data files
# dbutils.fs.rm("abfss://silver@databricksstorageete.dfs.core.windows.net/customers", recurse=True)