### Important Note!

This notebook was run using Databricks Community Edition with a 12.2 LTS Runtime version!

The goal is to show the **consequences and differences** between cloning Delta tables with **Shallow** and **Deep** Clones when the tables are not governed by **Unity Catalog**, proving that a Shallow Clone's zero-copy approach will **fail** once the data files of the source table are deleted (with VACUUM).

Add `spark.databricks.delta.retentionDurationCheck.enabled false` to the cluster's Spark config when creating the cluster, so that VACUUM can be run with a retention of 0 hours in this notebook.

This notebook only shows the Python version of the SQL commands; **always use the SQL version as a reference**, as it was the one used during the Databricks SQL course.

In [0]:
# List the files of the COVID sample dataset (Python form of the `%fs ls` magic).
display(dbutils.fs.ls("/databricks-datasets/COVID/coronavirusdataset"))

path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/coronavirusdataset/.DS_Store,.DS_Store,6148,1594102716000
dbfs:/databricks-datasets/COVID/coronavirusdataset/Case.csv,Case.csv,11711,1595191979000
dbfs:/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv,PatientInfo.csv,488859,1595191979000
dbfs:/databricks-datasets/COVID/coronavirusdataset/PatientRoute.csv,PatientRoute.csv,718510,1594102718000
dbfs:/databricks-datasets/COVID/coronavirusdataset/Policy.csv,Policy.csv,5713,1595191981000
dbfs:/databricks-datasets/COVID/coronavirusdataset/Region.csv,Region.csv,19082,1595191981000
dbfs:/databricks-datasets/COVID/coronavirusdataset/SearchTrend.csv,SearchTrend.csv,71722,1595191981000
dbfs:/databricks-datasets/COVID/coronavirusdataset/SeoulFloating.csv,SeoulFloating.csv,49682281,1595191981000
dbfs:/databricks-datasets/COVID/coronavirusdataset/Time.csv,Time.csv,6604,1595191981000
dbfs:/databricks-datasets/COVID/coronavirusdataset/TimeAge.csv,TimeAge.csv,27114,1595191981000


In [0]:
# Load the patient-level CSV, using its header row and letting Spark infer column types.
patient_info_csv = "dbfs:/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv"
df_covid = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option('delimiter', ',')
    .csv(patient_info_csv)
)

# Persist the data as the Delta table that serves as the source for both clones.
(
    df_covid.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("COVID_Clone")
)

In [0]:
# Read the source Delta table back as a DataFrame and preview its first five rows.
df_covid_clone = spark.table("COVID_Clone")
source_preview = df_covid_clone.limit(5)
display(source_preview)

patient_id,sex,age,country,province,city,infection_case,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,75,2020-01-22,2020-01-23,2020-02-05,,released
1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,31,,2020-01-30,2020-03-02,,released
1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001.0,17,,2020-01-30,2020-02-19,,released
1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,9,2020-01-26,2020-01-30,2020-02-15,,released
1000000005,female,20s,Korea,Seoul,Seongbuk-gu,contact with patient,1000000002.0,2,,2020-01-31,2020-02-24,,released


In [0]:
# `clone` is a DeltaTable method, not a DataFrame method, so the source table is
# addressed through the DeltaTable API instead of through `df_covid_clone`.
from delta.tables import DeltaTable

# NOTE(review): DBFS URIs are normally written `dbfs:/...`; confirm that this
# `dbfs://` form resolves to the intended location.
deep_clone_path = 'dbfs://hive_metastore_test/covid_deep_clone'

# Deep clone: the source's data files are copied to the target location.
DeltaTable.forName(spark, "COVID_Clone").clone(deep_clone_path, isShallow = False)

# Expose the clone as a DataFrame so the later `.count()` / `.filter()` cells work on it.
df_clone_covid_deep = spark.read.format("delta").load(deep_clone_path)

source_table_size,source_num_of_files,num_removed_files,num_copied_files,removed_files_size,copied_files_size
52110,1,0,1,0,52110


In [0]:
# `history` is a DeltaTable method (a DataFrame has none), so look the table up
# through the DeltaTable API before asking for its commit history.
from delta.tables import DeltaTable

display(DeltaTable.forName(spark, "COVID_Clone").history())

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2023-09-14T22:17:33.000+0000,6470955272461360,jvrinacio@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(isManaged -> true, description -> null, partitionBy -> [], properties -> {})",,List(4026694417721742),0914-220300-t7nog3bn,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 5165, numOutputBytes -> 52110)",,Databricks-Runtime/12.2.x-scala2.12


In [0]:
# `clone` is a DeltaTable method, not a DataFrame method, so the source table is
# addressed through the DeltaTable API instead of through `df_covid_clone`.
from delta.tables import DeltaTable

# NOTE(review): DBFS URIs are normally written `dbfs:/...`; confirm that this
# `dbfs://` form resolves to the intended location.
shallow_clone_path = 'dbfs://hive_metastore_test/covid_shallow_clone'

# Shallow clone: no data files are copied; the clone keeps pointing at the source's files.
DeltaTable.forName(spark, "COVID_Clone").clone(shallow_clone_path, isShallow = True)

# Expose the clone as a DataFrame so the later `.count()` / `.filter()` cells work on it.
df_clone_covid_shallow = spark.read.format("delta").load(shallow_clone_path)

source_table_size,source_num_of_files,num_removed_files,num_copied_files,removed_files_size,copied_files_size
52110,1,0,0,0,0


In [0]:
# Baseline row count of the source table, taken before any rows are deleted.
source_row_count = df_covid_clone.count()
display(source_row_count)

count(1)
5165


In [0]:
# Row count of the deep clone, for comparison with the source table.
deep_clone_row_count = df_clone_covid_deep.count()
display(deep_clone_row_count)

count(1)
5165


In [0]:
# Row count of the shallow clone, for comparison with the source table.
shallow_clone_row_count = df_clone_covid_shallow.count()
display(shallow_clone_row_count)

count(1)
5165


In [0]:
# `delete` is a DeltaTable method (a DataFrame has none), so the rows are removed
# from the source table through the DeltaTable API.
from delta.tables import DeltaTable

DeltaTable.forName(spark, "COVID_Clone").delete("sex = 'male'")

num_affected_rows
1825


In [0]:
# Row count of the source table after the `sex = 'male'` rows were deleted.
source_rows_after_delete = df_covid_clone.count()
display(source_rows_after_delete)

count(1)
3340


In [0]:
# `history` is a DeltaTable method (a DataFrame has none); the history should now
# list the DELETE commit on top of the initial table creation.
from delta.tables import DeltaTable

display(DeltaTable.forName(spark, "COVID_Clone").history())

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2023-09-14T22:19:44.000+0000,6470955272461360,jvrinacio@gmail.com,DELETE,"Map(predicate -> [""(sex#2961 = male)""])",,List(4026694417721742),0914-220300-t7nog3bn,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 52110, numCopiedRows -> 3340, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 4029, numDeletedRows -> 1825, scanTimeMs -> 1889, numAddedFiles -> 1, numAddedBytes -> 36377, rewriteTimeMs -> 2120)",,Databricks-Runtime/12.2.x-scala2.12
0,2023-09-14T22:17:33.000+0000,6470955272461360,jvrinacio@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(isManaged -> true, description -> null, partitionBy -> [], properties -> {})",,List(4026694417721742),0914-220300-t7nog3bn,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 5165, numOutputBytes -> 52110)",,Databricks-Runtime/12.2.x-scala2.12


In [0]:
# Row count of the deep clone after rows were deleted from the source table.
deep_rows_after_delete = df_clone_covid_deep.count()
display(deep_rows_after_delete)

count(1)
5165


In [0]:
from pyspark.sql.functions import col

# Column comparisons need `==` and a quoted column name; `col(sex) = 'male'` is a
# syntax error. The shallow clone copied no files, so it still reads the source's
# original data files and the rows deleted from the source remain visible here.
display(df_clone_covid_shallow.filter(col("sex") == 'male'))

patient_id,sex,age,country,province,city,infection_case,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,75,2020-01-22,2020-01-23,2020-02-05,,released
1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,31,,2020-01-30,2020-03-02,,released
1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001.0,17,,2020-01-30,2020-02-19,,released
1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,9,2020-01-26,2020-01-30,2020-02-15,,released
1000000007,male,20s,Korea,Seoul,Jongno-gu,contact with patient,1000000003.0,0,,2020-01-31,2020-02-10,,released
1000000008,male,20s,Korea,Seoul,etc,overseas inflow,,0,,2020-02-02,2020-02-24,,released
1000000009,male,30s,Korea,Seoul,Songpa-gu,overseas inflow,,68,,2020-02-05,2020-02-21,,released
1000000012,male,20s,Korea,Seoul,etc,overseas inflow,,0,,2020-02-07,2020-02-27,,released
1000000013,male,80s,Korea,Seoul,Jongno-gu,contact with patient,1000000017.0,117,,2020-02-16,,,deceased
1000000015,male,70s,Korea,Seoul,Seongdong-gu,Seongdong-gu APT,,8,2020-02-11,2020-02-19,,,released


In [0]:
# `vacuum` is a DeltaTable method (a DataFrame has none). A retention of 0 hours
# removes every data file the source table no longer references, which requires
# the retention duration check to be disabled in the cluster's Spark config.
from delta.tables import DeltaTable

DeltaTable.forName(spark, "COVID_Clone").vacuum(0)

path
dbfs:/user/hive/warehouse/covid_clone


In [0]:
from pyspark.sql.functions import col

# Column comparisons need `==` and a quoted column name; `col(sex) = 'male'` is a
# syntax error. Once VACUUM has removed the source's old data files, the shallow
# clone has nothing left to read. That failure is the point of this demo, so it is
# caught and shown instead of aborting a "Run All".
try:
    display(df_clone_covid_shallow.filter(col("sex") == 'male'))
except Exception as read_error:  # the exact exception class depends on the runtime
    print(f"Reading the shallow clone failed: {read_error}")

In [0]:
from pyspark.sql.functions import col

# Column comparisons need `==` and a quoted column name; `col(sex) = 'male'` is a
# syntax error. The deep clone owns its own copy of the data files, so it can still
# return the rows that were deleted and vacuumed from the source table.
display(df_clone_covid_deep.filter(col("sex") == 'male'))

patient_id,sex,age,country,province,city,infection_case,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,75,2020-01-22,2020-01-23,2020-02-05,,released
1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,31,,2020-01-30,2020-03-02,,released
1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001.0,17,,2020-01-30,2020-02-19,,released
1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,9,2020-01-26,2020-01-30,2020-02-15,,released
1000000007,male,20s,Korea,Seoul,Jongno-gu,contact with patient,1000000003.0,0,,2020-01-31,2020-02-10,,released
1000000008,male,20s,Korea,Seoul,etc,overseas inflow,,0,,2020-02-02,2020-02-24,,released
1000000009,male,30s,Korea,Seoul,Songpa-gu,overseas inflow,,68,,2020-02-05,2020-02-21,,released
1000000012,male,20s,Korea,Seoul,etc,overseas inflow,,0,,2020-02-07,2020-02-27,,released
1000000013,male,80s,Korea,Seoul,Jongno-gu,contact with patient,1000000017.0,117,,2020-02-16,,,deceased
1000000015,male,70s,Korea,Seoul,Seongdong-gu,Seongdong-gu APT,,8,2020-02-11,2020-02-19,,,released
