In [0]:
from pyspark.sql import SparkSession
# Build (or reuse) a SparkSession with Delta Lake enabled.
# Delta needs BOTH the SQL extension and the Delta-aware session catalog:
# without the catalog setting, SQL DDL against Delta tables
# (e.g. CREATE TABLE ... USING DELTA LOCATION ...) is not routed through Delta
# on open-source Spark.
# NOTE(review): if a session already exists, getOrCreate() returns that session
# and may not apply these settings -- confirm for the target cluster.
spark = (
    SparkSession.builder
    .appName("Spark-DeltaLake")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.2.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [0]:
# DBFS locations of the two source CSV files.
attritionFileLocation = "/FileStore/tables/attrition.csv"
attritionUpdatesFileLocation = "/FileStore/tables/attrition_updates.csv"

# Base data set: default (comma) delimiter, header row, inferred column types.
attrition_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(attritionFileLocation)
)

# Updates data set: same reader settings, but the file is tab-delimited.
attrition_updates_df = (
    spark.read
    .option("delimiter", "\t")
    .option("header", True)
    .option("inferSchema", True)
    .csv(attritionUpdatesFileLocation)
)

In [0]:
# Persist both DataFrames as Delta tables, replacing whatever is already
# stored at each target path.
for source_frame, delta_path in (
    (attrition_df, "/FileStore/tables/attrition_delta"),
    (attrition_updates_df, "/FileStore/tables/attrition_delta_updates"),
):
    source_frame.write.format("delta").mode("Overwrite").save(delta_path)


In [0]:
# Show what was written for the base Delta table (data files plus _delta_log).
attrition_delta_listing = dbutils.fs.ls("/FileStore/tables/attrition_delta")
display(attrition_delta_listing)

path,name,size,modificationTime
dbfs:/FileStore/tables/attrition_delta/_delta_log/,_delta_log/,0,1658239468000
dbfs:/FileStore/tables/attrition_delta/part-00000-618fe806-3518-4a81-b9c6-87bd7f1c55b3-c000.snappy.parquet,part-00000-618fe806-3518-4a81-b9c6-87bd7f1c55b3-c000.snappy.parquet,52381,1658239466000


In [0]:
from delta.tables import *
from pyspark.sql.functions import *

# Handle on the Delta table written earlier; later cells reuse this variable.
attritionDeltaTable = DeltaTable.forPath(spark, '/FileStore/tables/attrition_delta')

# Variant 1: predicate and new value are both given as SQL strings
# (note the nested quotes: the value is the SQL literal 'F').
female_assignment = { "gender": "'F'" }
attritionDeltaTable.update(condition="gender = 'Female'", set=female_assignment)

# Variant 2: the same kind of update expressed with Column objects.
is_male = col('gender') == 'Male'
male_assignment = { 'gender': lit('M') }
attritionDeltaTable.update(condition=is_male, set=male_assignment)

# Print the table after both updates.
attritionDeltaTable.toDF().show()

In [0]:
from delta.tables import *
from pyspark.sql.functions import *

# Delete with a SQL-string predicate: rows with less than one working year.
attritionDeltaTable.delete(condition="TotalWorkingYears < 1")

# Delete with a Column predicate: rows with less than two working years.
under_two_working_years = col("TotalWorkingYears") < 2
attritionDeltaTable.delete(under_two_working_years)

# Print the table after both deletes.
attritionDeltaTable.toDF().show()

In [0]:
from delta.tables import *

# Upsert the updates table into the attrition table, keyed on EmployeeNumber:
# matched rows are overwritten column by column, unmatched rows are inserted.
attritionUpdatesDeltaTable = DeltaTable.forPath(spark, '/FileStore/tables/attrition_delta_updates')
dfUpdates = attritionUpdatesDeltaTable.toDF()

# Every column copied from the updates source. The list is declared once so
# the UPDATE and INSERT clauses below can never drift apart.
merge_columns = [
    "Age", "Attrition", "BusinessTravel", "DailyRate", "Department",
    "DistanceFromHome", "Education", "EducationField", "EmployeeCount",
    "EmployeeNumber", "EnvironmentSatisfaction", "Gender", "HourlyRate",
    "JobInvolvement", "JobLevel", "JobRole", "JobSatisfaction",
    "MaritalStatus", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
    "Over18", "OverTime", "PercentSalaryHike", "PerformanceRating",
    "RelationshipSatisfaction", "StandardHours", "StockOptionLevel",
    "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance",
    "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]

# Target column -> SQL expression reading the same column from the
# 'updates' alias, e.g. {"Age": "updates.Age", ...}.
column_assignments = {name: "updates." + name for name in merge_columns}

(
    attritionDeltaTable.alias('attrition')
    .merge(
        dfUpdates.alias('updates'),
        'attrition.EmployeeNumber = updates.EmployeeNumber'
    )
    .whenMatchedUpdate(set=column_assignments)
    .whenNotMatchedInsert(values=column_assignments)
    .execute()
)

In [0]:
# Distinct Department values in the source data; Department is used as the
# partition column further down.
distinct_departments = attrition_df.select("Department").distinct()
display(distinct_departments)

Department
Sales
Research & Development
Human Resources


In [0]:
# First write to the transaction-log demo table: plain, unpartitioned overwrite.
(
    attrition_df.write
    .format("delta")
    .mode("Overwrite")
    .save("/FileStore/tables/attrition_transactionlog")
)

In [0]:
# Second write to the same path: overwrite again, this time partitioned by
# Department, with the schema-overwrite option set because the table layout changes.
(
    attrition_df.write
    .format("delta")
    .mode("Overwrite")
    .partitionBy("Department")
    .option("overwriteschema", "true")
    .save("/FileStore/tables/attrition_transactionlog")
)

In [0]:
# Inspect the table directory after the partitioned overwrite.
transactionlog_listing = dbutils.fs.ls("/FileStore/tables/attrition_transactionlog")
display(transactionlog_listing)

path,name,size,modificationTime
dbfs:/FileStore/tables/attrition_transactionlog/Department=Human Resources/,Department=Human Resources/,0,1658239514000
dbfs:/FileStore/tables/attrition_transactionlog/Department=Research & Development/,Department=Research & Development/,0,1658239515000
dbfs:/FileStore/tables/attrition_transactionlog/Department=Sales/,Department=Sales/,0,1658239515000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/,_delta_log/,0,1658239516000
dbfs:/FileStore/tables/attrition_transactionlog/part-00000-2736b574-aa6e-4a8e-b0c8-730dfb779a97-c000.snappy.parquet,part-00000-2736b574-aa6e-4a8e-b0c8-730dfb779a97-c000.snappy.parquet,52381,1658239504000


In [0]:
# List the commit files recorded so far in the table's _delta_log directory.
delta_log_dir = "/FileStore/tables/attrition_transactionlog/" + "_delta_log"
display(dbutils.fs.ls(delta_log_dir))

path,name,size,modificationTime
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000000.crc,00000000000000000000.crc,4685,1658239505000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000000.json,00000000000000000000.json,6245,1658239504000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000001.crc,00000000000000000001.crc,4705,1658239516000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000001.json,00000000000000000001.json,11629,1658239515000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/__tmp_path_dir/,__tmp_path_dir/,0,1658239516000


In [0]:
# Read commit 0 of the transaction log as JSON and display its action records.
commit_0_path = "/FileStore/tables/attrition_transactionlog/" + "_delta_log/00000000000000000000.json"
commit_0_actions = spark.read.json(commit_0_path)
display(commit_0_actions)

add,commitInfo,metaData,protocol
,,,"List(1, 2)"
,,"List(1658239503527, List(parquet), 8a07e542-9201-49ea-ab65-6f3b57f01754, List(), {""type"":""struct"",""fields"":[{""name"":""Age"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Attrition"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""BusinessTravel"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""DailyRate"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Department"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""DistanceFromHome"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Education"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""EducationField"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""EmployeeCount"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""EmployeeNumber"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""EnvironmentSatisfaction"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Gender"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""HourlyRate"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""JobInvolvement"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""JobLevel"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""JobRole"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""JobSatisfaction"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""MaritalStatus"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""MonthlyIncome"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""MonthlyRate"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""NumCompaniesWorked"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Over18"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""OverTime"",""type"":""string"",""nullable"":true,""metadata"":
{}},{""name"":""PercentSalaryHike"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""PerformanceRating"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""RelationshipSatisfaction"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""StandardHours"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""StockOptionLevel"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""TotalWorkingYears"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""TrainingTimesLastYear"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""WorkLifeBalance"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""YearsAtCompany"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""YearsInCurrentRole"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""YearsSinceLastPromotion"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""YearsWithCurrManager"",""type"":""integer"",""nullable"":true,""metadata"":{}}]})",
"List(true, 1658239504000, part-00000-2736b574-aa6e-4a8e-b0c8-730dfb779a97-c000.snappy.parquet, 52381, {""numRecords"":1470,""minValues"":{""Age"":18,""Attrition"":""No"",""BusinessTravel"":""Non-Travel"",""DailyRate"":102,""Department"":""Human Resources"",""DistanceFromHome"":1,""Education"":1,""EducationField"":""Human Resources"",""EmployeeCount"":1,""EmployeeNumber"":1,""EnvironmentSatisfaction"":1,""Gender"":""Female"",""HourlyRate"":30,""JobInvolvement"":1,""JobLevel"":1,""JobRole"":""Healthcare Representative"",""JobSatisfaction"":1,""MaritalStatus"":""Divorced"",""MonthlyIncome"":1009,""MonthlyRate"":2094,""NumCompaniesWorked"":0,""Over18"":""Y"",""OverTime"":""No"",""PercentSalaryHike"":11,""PerformanceRating"":3,""RelationshipSatisfaction"":1,""StandardHours"":80,""StockOptionLevel"":0,""TotalWorkingYears"":0,""TrainingTimesLastYear"":0,""WorkLifeBalance"":1,""YearsAtCompany"":0},""maxValues"":{""Age"":60,""Attrition"":""Yes"",""BusinessTravel"":""Travel_Rarely"",""DailyRate"":1499,""Department"":""Sales"",""DistanceFromHome"":29,""Education"":5,""EducationField"":""Technical Degree"",""EmployeeCount"":1,""EmployeeNumber"":2068,""EnvironmentSatisfaction"":4,""Gender"":""Male"",""HourlyRate"":100,""JobInvolvement"":4,""JobLevel"":5,""JobRole"":""Sales 
Representative"",""JobSatisfaction"":4,""MaritalStatus"":""Single"",""MonthlyIncome"":19999,""MonthlyRate"":26999,""NumCompaniesWorked"":9,""Over18"":""Y"",""OverTime"":""Yes"",""PercentSalaryHike"":25,""PerformanceRating"":4,""RelationshipSatisfaction"":4,""StandardHours"":80,""StockOptionLevel"":3,""TotalWorkingYears"":40,""TrainingTimesLastYear"":6,""WorkLifeBalance"":4,""YearsAtCompany"":40},""nullCount"":{""Age"":0,""Attrition"":0,""BusinessTravel"":0,""DailyRate"":0,""Department"":0,""DistanceFromHome"":0,""Education"":0,""EducationField"":0,""EmployeeCount"":0,""EmployeeNumber"":0,""EnvironmentSatisfaction"":0,""Gender"":0,""HourlyRate"":0,""JobInvolvement"":0,""JobLevel"":0,""JobRole"":0,""JobSatisfaction"":0,""MaritalStatus"":0,""MonthlyIncome"":0,""MonthlyRate"":0,""NumCompaniesWorked"":0,""Over18"":0,""OverTime"":0,""PercentSalaryHike"":0,""PerformanceRating"":0,""RelationshipSatisfaction"":0,""StandardHours"":0,""StockOptionLevel"":0,""TotalWorkingYears"":0,""TrainingTimesLastYear"":0,""WorkLifeBalance"":0,""YearsAtCompany"":0}}, List(1658239504000000, 268435456))",,,
,"List(0709-112236-vj7n6quv, Databricks-Runtime/10.4.x-scala2.12, false, WriteSerializable, List(489996382143165), WRITE, List(1, 52381, 1470), List(Overwrite, []), 1658239504202, a72f3355-0eeb-4399-a3cc-c21f1cf4a37f, 6178894471545234, surya.patchipala@cgi.com)",,


In [0]:
# Read commit 1 (the partitioned overwrite) and display its action records.
commit_1_path = "/FileStore/tables/attrition_transactionlog/" + "_delta_log/00000000000000000001.json"
commit_1_actions = spark.read.json(commit_1_path)
display(commit_1_actions)

add,commitInfo,metaData,remove
,,"List(1658239503527, List(parquet), 8a07e542-9201-49ea-ab65-6f3b57f01754, List(Department), {""type"":""struct"",""fields"":[{""name"":""Age"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Attrition"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""BusinessTravel"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""DailyRate"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Department"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""DistanceFromHome"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Education"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""EducationField"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""EmployeeCount"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""EmployeeNumber"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""EnvironmentSatisfaction"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Gender"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""HourlyRate"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""JobInvolvement"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""JobLevel"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""JobRole"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""JobSatisfaction"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""MaritalStatus"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""MonthlyIncome"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""MonthlyRate"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""NumCompaniesWorked"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""Over18"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""OverTime"",""type"":""string"",""nullable"":true,""m
etadata"":{}},{""name"":""PercentSalaryHike"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""PerformanceRating"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""RelationshipSatisfaction"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""StandardHours"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""StockOptionLevel"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""TotalWorkingYears"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""TrainingTimesLastYear"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""WorkLifeBalance"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""YearsAtCompany"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""YearsInCurrentRole"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""YearsSinceLastPromotion"",""type"":""integer"",""nullable"":true,""metadata"":{}},{""name"":""YearsWithCurrManager"",""type"":""integer"",""nullable"":true,""metadata"":{}}]})",
"List(true, 1658239514000, List(Human Resources), Department=Human%20Resources/part-00000-d54bbffe-2a74-4ac6-b84d-b99aa5e444f0.c000.snappy.parquet, 12774, {""numRecords"":63,""minValues"":{""Age"":19,""Attrition"":""No"",""BusinessTravel"":""Non-Travel"",""DailyRate"":106,""DistanceFromHome"":1,""Education"":1,""EducationField"":""Human Resources"",""EmployeeCount"":1,""EmployeeNumber"":103,""EnvironmentSatisfaction"":1,""Gender"":""Female"",""HourlyRate"":30,""JobInvolvement"":1,""JobLevel"":1,""JobRole"":""Human Resources"",""JobSatisfaction"":1,""MaritalStatus"":""Divorced"",""MonthlyIncome"":1555,""MonthlyRate"":2097,""NumCompaniesWorked"":0,""Over18"":""Y"",""OverTime"":""No"",""PercentSalaryHike"":11,""PerformanceRating"":3,""RelationshipSatisfaction"":1,""StandardHours"":80,""StockOptionLevel"":0,""TotalWorkingYears"":1,""TrainingTimesLastYear"":0,""WorkLifeBalance"":1,""YearsAtCompany"":1,""YearsInCurrentRole"":0},""maxValues"":{""Age"":59,""Attrition"":""Yes"",""BusinessTravel"":""Travel_Rarely"",""DailyRate"":1444,""DistanceFromHome"":26,""Education"":5,""EducationField"":""Technical 
Degree"",""EmployeeCount"":1,""EmployeeNumber"":2040,""EnvironmentSatisfaction"":4,""Gender"":""Male"",""HourlyRate"":100,""JobInvolvement"":4,""JobLevel"":5,""JobRole"":""Manager"",""JobSatisfaction"":4,""MaritalStatus"":""Single"",""MonthlyIncome"":19717,""MonthlyRate"":26894,""NumCompaniesWorked"":9,""Over18"":""Y"",""OverTime"":""Yes"",""PercentSalaryHike"":23,""PerformanceRating"":4,""RelationshipSatisfaction"":4,""StandardHours"":80,""StockOptionLevel"":3,""TotalWorkingYears"":36,""TrainingTimesLastYear"":6,""WorkLifeBalance"":4,""YearsAtCompany"":33,""YearsInCurrentRole"":10},""nullCount"":{""Age"":0,""Attrition"":0,""BusinessTravel"":0,""DailyRate"":0,""DistanceFromHome"":0,""Education"":0,""EducationField"":0,""EmployeeCount"":0,""EmployeeNumber"":0,""EnvironmentSatisfaction"":0,""Gender"":0,""HourlyRate"":0,""JobInvolvement"":0,""JobLevel"":0,""JobRole"":0,""JobSatisfaction"":0,""MaritalStatus"":0,""MonthlyIncome"":0,""MonthlyRate"":0,""NumCompaniesWorked"":0,""Over18"":0,""OverTime"":0,""PercentSalaryHike"":0,""PerformanceRating"":0,""RelationshipSatisfaction"":0,""StandardHours"":0,""StockOptionLevel"":0,""TotalWorkingYears"":0,""TrainingTimesLastYear"":0,""WorkLifeBalance"":0,""YearsAtCompany"":0,""YearsInCurrentRole"":0}}, List(1658239514000000, 268435456))",,,
"List(true, 1658239515000, List(Research & Development), Department=Research%20&%20Development/part-00000-c890c8dc-b2b5-4f58-81b3-d081cccffb39.c000.snappy.parquet, 37498, {""numRecords"":961,""minValues"":{""Age"":18,""Attrition"":""No"",""BusinessTravel"":""Non-Travel"",""DailyRate"":102,""DistanceFromHome"":1,""Education"":1,""EducationField"":""Life Sciences"",""EmployeeCount"":1,""EmployeeNumber"":2,""EnvironmentSatisfaction"":1,""Gender"":""Female"",""HourlyRate"":30,""JobInvolvement"":1,""JobLevel"":1,""JobRole"":""Healthcare Representative"",""JobSatisfaction"":1,""MaritalStatus"":""Divorced"",""MonthlyIncome"":1009,""MonthlyRate"":2094,""NumCompaniesWorked"":0,""Over18"":""Y"",""OverTime"":""No"",""PercentSalaryHike"":11,""PerformanceRating"":3,""RelationshipSatisfaction"":1,""StandardHours"":80,""StockOptionLevel"":0,""TotalWorkingYears"":0,""TrainingTimesLastYear"":0,""WorkLifeBalance"":1,""YearsAtCompany"":0,""YearsInCurrentRole"":0},""maxValues"":{""Age"":60,""Attrition"":""Yes"",""BusinessTravel"":""Travel_Rarely"",""DailyRate"":1496,""DistanceFromHome"":29,""Education"":5,""EducationField"":""Technical Degree"",""EmployeeCount"":1,""EmployeeNumber"":2068,""EnvironmentSatisfaction"":4,""Gender"":""Male"",""HourlyRate"":100,""JobInvolvement"":4,""JobLevel"":5,""JobRole"":""Research 
Scientist"",""JobSatisfaction"":4,""MaritalStatus"":""Single"",""MonthlyIncome"":19999,""MonthlyRate"":26999,""NumCompaniesWorked"":9,""Over18"":""Y"",""OverTime"":""Yes"",""PercentSalaryHike"":25,""PerformanceRating"":4,""RelationshipSatisfaction"":4,""StandardHours"":80,""StockOptionLevel"":3,""TotalWorkingYears"":40,""TrainingTimesLastYear"":6,""WorkLifeBalance"":4,""YearsAtCompany"":40,""YearsInCurrentRole"":18},""nullCount"":{""Age"":0,""Attrition"":0,""BusinessTravel"":0,""DailyRate"":0,""DistanceFromHome"":0,""Education"":0,""EducationField"":0,""EmployeeCount"":0,""EmployeeNumber"":0,""EnvironmentSatisfaction"":0,""Gender"":0,""HourlyRate"":0,""JobInvolvement"":0,""JobLevel"":0,""JobRole"":0,""JobSatisfaction"":0,""MaritalStatus"":0,""MonthlyIncome"":0,""MonthlyRate"":0,""NumCompaniesWorked"":0,""Over18"":0,""OverTime"":0,""PercentSalaryHike"":0,""PerformanceRating"":0,""RelationshipSatisfaction"":0,""StandardHours"":0,""StockOptionLevel"":0,""TotalWorkingYears"":0,""TrainingTimesLastYear"":0,""WorkLifeBalance"":0,""YearsAtCompany"":0,""YearsInCurrentRole"":0}}, List(1658239514000001, 268435456))",,,
"List(true, 1658239515000, List(Sales), Department=Sales/part-00000-dceb6e32-5cf4-4976-b151-0fa786a7dc44.c000.snappy.parquet, 23853, {""numRecords"":446,""minValues"":{""Age"":18,""Attrition"":""No"",""BusinessTravel"":""Non-Travel"",""DailyRate"":107,""DistanceFromHome"":1,""Education"":1,""EducationField"":""Life Sciences"",""EmployeeCount"":1,""EmployeeNumber"":1,""EnvironmentSatisfaction"":1,""Gender"":""Female"",""HourlyRate"":30,""JobInvolvement"":1,""JobLevel"":1,""JobRole"":""Manager"",""JobSatisfaction"":1,""MaritalStatus"":""Divorced"",""MonthlyIncome"":1052,""MonthlyRate"":2137,""NumCompaniesWorked"":0,""Over18"":""Y"",""OverTime"":""No"",""PercentSalaryHike"":11,""PerformanceRating"":3,""RelationshipSatisfaction"":1,""StandardHours"":80,""StockOptionLevel"":0,""TotalWorkingYears"":0,""TrainingTimesLastYear"":0,""WorkLifeBalance"":1,""YearsAtCompany"":0,""YearsInCurrentRole"":0},""maxValues"":{""Age"":60,""Attrition"":""Yes"",""BusinessTravel"":""Travel_Rarely"",""DailyRate"":1499,""DistanceFromHome"":29,""Education"":5,""EducationField"":""Technical Degree"",""EmployeeCount"":1,""EmployeeNumber"":2065,""EnvironmentSatisfaction"":4,""Gender"":""Male"",""HourlyRate"":100,""JobInvolvement"":4,""JobLevel"":5,""JobRole"":""Sales 
Representative"",""JobSatisfaction"":4,""MaritalStatus"":""Single"",""MonthlyIncome"":19847,""MonthlyRate"":26997,""NumCompaniesWorked"":9,""Over18"":""Y"",""OverTime"":""Yes"",""PercentSalaryHike"":25,""PerformanceRating"":4,""RelationshipSatisfaction"":4,""StandardHours"":80,""StockOptionLevel"":3,""TotalWorkingYears"":38,""TrainingTimesLastYear"":6,""WorkLifeBalance"":4,""YearsAtCompany"":37,""YearsInCurrentRole"":17},""nullCount"":{""Age"":0,""Attrition"":0,""BusinessTravel"":0,""DailyRate"":0,""DistanceFromHome"":0,""Education"":0,""EducationField"":0,""EmployeeCount"":0,""EmployeeNumber"":0,""EnvironmentSatisfaction"":0,""Gender"":0,""HourlyRate"":0,""JobInvolvement"":0,""JobLevel"":0,""JobRole"":0,""JobSatisfaction"":0,""MaritalStatus"":0,""MonthlyIncome"":0,""MonthlyRate"":0,""NumCompaniesWorked"":0,""Over18"":0,""OverTime"":0,""PercentSalaryHike"":0,""PerformanceRating"":0,""RelationshipSatisfaction"":0,""StandardHours"":0,""StockOptionLevel"":0,""TotalWorkingYears"":0,""TrainingTimesLastYear"":0,""WorkLifeBalance"":0,""YearsAtCompany"":0,""YearsInCurrentRole"":0}}, List(1658239514000002, 268435456))",,,
,,,"List(true, 1658239515623, true, part-00000-2736b574-aa6e-4a8e-b0c8-730dfb779a97-c000.snappy.parquet, 52381, List(1658239504000000, 268435456))"
,"List(0709-112236-vj7n6quv, Databricks-Runtime/10.4.x-scala2.12, false, WriteSerializable, List(489996382143165), WRITE, List(3, 74125, 1470), List(Overwrite, [""Department""]), 0, 1658239515624, 2a4b2acc-0784-4d0d-9dc7-3f417846a6dd, 6178894471545234, surya.patchipala@cgi.com)",,


In [0]:
# Top-level column names found in commit 1's JSON file.
commit_1_log = spark.read.json(
    "/FileStore/tables/attrition_transactionlog/" + "_delta_log/00000000000000000001.json"
)
commit_1_log.columns

In [0]:
# Keep only employees with more than 10 years at the company.
long_tenure = attrition_df.YearsAtCompany > 10
attrition_filter = attrition_df.where(long_tenure)
display(attrition_filter)

Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
53,No,Travel_Rarely,1219,Sales,2,4,Life Sciences,1,23,1,Female,78,2,4,Manager,4,Married,15427,22021,2,Y,No,16,3,3,80,0,31,3,3,25,8,3,7
34,No,Travel_Rarely,419,Research & Development,7,4,Life Sciences,1,28,1,Female,53,3,3,Research Director,2,Single,11994,21293,0,Y,No,11,3,3,80,0,13,4,3,12,6,2,11
53,No,Travel_Rarely,1282,Research & Development,5,3,Other,1,32,3,Female,58,3,5,Manager,3,Divorced,19094,10735,4,Y,No,11,3,4,80,1,26,3,2,14,13,4,8
44,No,Travel_Rarely,477,Research & Development,7,4,Medical,1,36,1,Female,42,2,3,Healthcare Representative,4,Married,10248,2094,3,Y,No,14,3,4,80,1,24,4,3,22,6,5,17
30,No,Travel_Frequently,721,Research & Development,1,2,Medical,1,57,3,Female,58,3,2,Laboratory Technician,4,Single,4011,10781,1,Y,No,23,4,4,80,0,12,2,3,12,8,3,7
41,Yes,Travel_Rarely,1360,Research & Development,12,3,Technical Degree,1,58,2,Female,49,3,5,Research Director,3,Married,19545,16280,1,Y,No,12,3,4,80,0,23,0,3,22,15,15,8
33,No,Travel_Frequently,515,Research & Development,1,2,Life Sciences,1,73,1,Female,98,3,3,Research Director,4,Single,13458,15146,1,Y,Yes,12,3,3,80,0,15,1,3,15,14,8,12
50,No,Travel_Rarely,989,Research & Development,7,2,Medical,1,80,2,Female,43,2,5,Research Director,3,Divorced,18740,16701,5,Y,Yes,12,3,4,80,1,29,2,2,27,3,13,8
59,No,Travel_Rarely,1435,Sales,25,3,Life Sciences,1,81,1,Female,99,3,3,Sales Executive,1,Single,7637,2354,7,Y,No,11,3,4,80,0,28,3,2,21,16,7,9
36,No,Travel_Rarely,1223,Research & Development,8,3,Technical Degree,1,83,3,Female,59,3,3,Healthcare Representative,3,Divorced,10096,8202,1,Y,No,13,3,2,80,3,17,2,3,17,14,12,8


In [0]:
# Third write to the demo table: replace its contents with the filtered rows,
# still partitioned by Department.
(
    attrition_filter.write
    .format("delta")
    .mode("Overwrite")
    .partitionBy("Department")
    .option("overwriteschema", "true")
    .save("/FileStore/tables/attrition_transactionlog")
)

In [0]:
# List _delta_log again after the filtered overwrite.
delta_log_dir = "/FileStore/tables/attrition_transactionlog/" + "_delta_log"
display(dbutils.fs.ls(delta_log_dir))

path,name,size,modificationTime
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000000.crc,00000000000000000000.crc,4685,1658239505000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000000.json,00000000000000000000.json,6245,1658239504000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000001.crc,00000000000000000001.crc,4705,1658239516000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000001.json,00000000000000000001.json,11629,1658239515000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000002.crc,00000000000000000002.crc,4697,1658239576000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/00000000000000000002.json,00000000000000000002.json,9171,1658239576000
dbfs:/FileStore/tables/attrition_transactionlog/_delta_log/__tmp_path_dir/,__tmp_path_dir/,0,1658239576000


In [0]:
# List the data files physically present in the Sales partition directory.
sales_partition_dir = "/FileStore/tables/attrition_transactionlog/" + "Department=Sales/"
display(dbutils.fs.ls(sales_partition_dir))

path,name,size,modificationTime
dbfs:/FileStore/tables/attrition_transactionlog/Department=Sales/part-00000-4fe7c55a-5c9d-4a93-a756-4714d684dea1.c000.snappy.parquet,part-00000-4fe7c55a-5c9d-4a93-a756-4714d684dea1.c000.snappy.parquet,13608,1658239575000
dbfs:/FileStore/tables/attrition_transactionlog/Department=Sales/part-00000-dceb6e32-5cf4-4976-b151-0fa786a7dc44.c000.snappy.parquet,part-00000-dceb6e32-5cf4-4976-b151-0fa786a7dc44.c000.snappy.parquet,23853,1658239515000


In [0]:
# Register the Delta directory as a named table so it can be queried from SQL.
# The table is dropped first so this cell can be re-run.
drop_statement = "DROP TABLE IF EXISTS delta_versions"
create_statement = "CREATE TABLE  delta_versions USING DELTA LOCATION '/FileStore/tables/attrition_transactionlog/'"

spark.sql(drop_statement)
spark.sql(create_statement)

In [0]:
%sql
-- Show the commit history of the delta_versions table, one row per table
-- version (version number, timestamp, operation, operation metrics, ...).
DESCRIBE HISTORY delta_versions

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2022-07-19T14:06:16.000+0000,6178894471545234,surya.patchipala@cgi.com,WRITE,"Map(mode -> Overwrite, partitionBy -> [""Department""])",,List(489996382143165),0709-112236-vj7n6quv,1.0,WriteSerializable,False,"Map(numFiles -> 3, numOutputRows -> 246, numOutputBytes -> 39983)",,Databricks-Runtime/10.4.x-scala2.12
1,2022-07-19T14:05:15.000+0000,6178894471545234,surya.patchipala@cgi.com,WRITE,"Map(mode -> Overwrite, partitionBy -> [""Department""])",,List(489996382143165),0709-112236-vj7n6quv,0.0,WriteSerializable,False,"Map(numFiles -> 3, numOutputRows -> 1470, numOutputBytes -> 74125)",,Databricks-Runtime/10.4.x-scala2.12
0,2022-07-19T14:05:04.000+0000,6178894471545234,surya.patchipala@cgi.com,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(489996382143165),0709-112236-vj7n6quv,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 1470, numOutputBytes -> 52381)",,Databricks-Runtime/10.4.x-scala2.12


In [0]:
# Time travel by version number.
# NOTE(review): the variable is named attrition_v0 but the option below reads
# version 2 -- confirm which version was intended, then align name and option.
attrition_v0 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 2)
    .load("/FileStore/tables/attrition_transactionlog/")
)
display(attrition_v0)

Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
34,No,Travel_Rarely,419,Research & Development,7,4,Life Sciences,1,28,1,Female,53,3,3,Research Director,2,Single,11994,21293,0,Y,No,11,3,3,80,0,13,4,3,12,6,2,11
53,No,Travel_Rarely,1282,Research & Development,5,3,Other,1,32,3,Female,58,3,5,Manager,3,Divorced,19094,10735,4,Y,No,11,3,4,80,1,26,3,2,14,13,4,8
44,No,Travel_Rarely,477,Research & Development,7,4,Medical,1,36,1,Female,42,2,3,Healthcare Representative,4,Married,10248,2094,3,Y,No,14,3,4,80,1,24,4,3,22,6,5,17
30,No,Travel_Frequently,721,Research & Development,1,2,Medical,1,57,3,Female,58,3,2,Laboratory Technician,4,Single,4011,10781,1,Y,No,23,4,4,80,0,12,2,3,12,8,3,7
41,Yes,Travel_Rarely,1360,Research & Development,12,3,Technical Degree,1,58,2,Female,49,3,5,Research Director,3,Married,19545,16280,1,Y,No,12,3,4,80,0,23,0,3,22,15,15,8
33,No,Travel_Frequently,515,Research & Development,1,2,Life Sciences,1,73,1,Female,98,3,3,Research Director,4,Single,13458,15146,1,Y,Yes,12,3,3,80,0,15,1,3,15,14,8,12
50,No,Travel_Rarely,989,Research & Development,7,2,Medical,1,80,2,Female,43,2,5,Research Director,3,Divorced,18740,16701,5,Y,Yes,12,3,4,80,1,29,2,2,27,3,13,8
36,No,Travel_Rarely,1223,Research & Development,8,3,Technical Degree,1,83,3,Female,59,3,3,Healthcare Representative,3,Divorced,10096,8202,1,Y,No,13,3,2,80,3,17,2,3,17,14,12,8
31,No,Travel_Rarely,746,Research & Development,8,4,Life Sciences,1,98,3,Female,61,3,2,Manufacturing Director,4,Single,4424,20682,1,Y,No,23,4,4,80,0,11,2,3,11,7,1,8
38,No,Non-Travel,573,Research & Development,6,3,Medical,1,107,2,Female,79,1,2,Research Scientist,4,Divorced,5329,15717,7,Y,Yes,12,3,4,80,3,17,3,3,13,11,1,9


In [0]:
# Time travel by timestamp. The literal equals the timestamp that
# DESCRIBE HISTORY reported for version 1 in the recorded run.
# NOTE(review): this value is tied to that particular run; after re-creating
# the table it must be replaced with a timestamp from the new history.
version_1_timestamp = "2022-07-19T14:05:15.000+0000"
attrition_v1 = (
    spark.read
    .format("delta")
    .option("timestampAsOf", version_1_timestamp)
    .load("/FileStore/tables/attrition_transactionlog/")
)
display(attrition_v1)

Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2
32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,4,Male,79,3,1,Laboratory Technician,4,Single,3068,11864,0,Y,No,13,3,3,80,0,8,2,2,7,7,3,6
59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,3,Female,81,4,1,Laboratory Technician,1,Married,2670,9964,4,Y,Yes,20,4,1,80,3,12,3,2,1,0,0,0
30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,1,11,4,Male,67,3,1,Laboratory Technician,3,Divorced,2693,13335,1,Y,No,22,4,2,80,1,1,2,3,1,0,0,0
38,No,Travel_Frequently,216,Research & Development,23,3,Life Sciences,1,12,4,Male,44,2,3,Manufacturing Director,3,Single,9526,8787,0,Y,No,21,4,2,80,0,10,2,3,9,7,1,8
36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,1,13,3,Male,94,3,2,Healthcare Representative,3,Married,5237,16577,6,Y,No,13,3,2,80,2,17,3,2,7,7,7,7
35,No,Travel_Rarely,809,Research & Development,16,3,Medical,1,14,1,Male,84,4,1,Laboratory Technician,2,Married,2426,16479,0,Y,No,13,3,3,80,1,6,5,3,5,4,0,3


In [0]:
from delta.tables import *
from pyspark.sql.utils import IllegalArgumentException

# Handle to the Delta table stored at the transaction-log demo path.
delta_table = DeltaTable.forPath(spark, "/FileStore/tables/attrition_transactionlog/")

# Demonstration: with Delta's retention-duration check left at its default,
# a 0-hour VACUUM is refused. Catch and show that specific error so the
# notebook still runs top to bottom; any other exception propagates as before.
try:
    delta_table.vacuum(0)
except IllegalArgumentException as retention_error:
    print(f"VACUUM with 0 hours retention was rejected: {retention_error}")

In [0]:
from delta.tables import *
# Switch off Delta's retention-duration safety check for this session so that
# a retention period of 0 hours is accepted by VACUUM.
spark.conf.set(
    "spark.databricks.delta.retentionDurationCheck.enabled",
    "false",
)

# Re-open the table and vacuum with zero retention.
# NOTE(review): zero retention can remove files that older table versions
# still need for time travel - confirm this is intended for the demo.
delta_table = DeltaTable.forPath(spark, "/FileStore/tables/attrition_transactionlog/")
delta_table.vacuum(retentionHours=0)

In [0]:
# Time travel: load the table as of a specific version from the Delta log.
# NOTE(review): the variable is named "v0" but version 2 is what gets loaded,
# and this read happens after a zero-retention vacuum - confirm both are intended.
attrition_v0 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 2)
    .load("/FileStore/tables/attrition_transactionlog/")
)
display(attrition_v0)

Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
34,No,Travel_Rarely,419,Research & Development,7,4,Life Sciences,1,28,1,Female,53,3,3,Research Director,2,Single,11994,21293,0,Y,No,11,3,3,80,0,13,4,3,12,6,2,11
53,No,Travel_Rarely,1282,Research & Development,5,3,Other,1,32,3,Female,58,3,5,Manager,3,Divorced,19094,10735,4,Y,No,11,3,4,80,1,26,3,2,14,13,4,8
44,No,Travel_Rarely,477,Research & Development,7,4,Medical,1,36,1,Female,42,2,3,Healthcare Representative,4,Married,10248,2094,3,Y,No,14,3,4,80,1,24,4,3,22,6,5,17
30,No,Travel_Frequently,721,Research & Development,1,2,Medical,1,57,3,Female,58,3,2,Laboratory Technician,4,Single,4011,10781,1,Y,No,23,4,4,80,0,12,2,3,12,8,3,7
41,Yes,Travel_Rarely,1360,Research & Development,12,3,Technical Degree,1,58,2,Female,49,3,5,Research Director,3,Married,19545,16280,1,Y,No,12,3,4,80,0,23,0,3,22,15,15,8
33,No,Travel_Frequently,515,Research & Development,1,2,Life Sciences,1,73,1,Female,98,3,3,Research Director,4,Single,13458,15146,1,Y,Yes,12,3,3,80,0,15,1,3,15,14,8,12
50,No,Travel_Rarely,989,Research & Development,7,2,Medical,1,80,2,Female,43,2,5,Research Director,3,Divorced,18740,16701,5,Y,Yes,12,3,4,80,1,29,2,2,27,3,13,8
36,No,Travel_Rarely,1223,Research & Development,8,3,Technical Degree,1,83,3,Female,59,3,3,Healthcare Representative,3,Divorced,10096,8202,1,Y,No,13,3,2,80,3,17,2,3,17,14,12,8
31,No,Travel_Rarely,746,Research & Development,8,4,Life Sciences,1,98,3,Female,61,3,2,Manufacturing Director,4,Single,4424,20682,1,Y,No,23,4,4,80,0,11,2,3,11,7,1,8
38,No,Non-Travel,573,Research & Development,6,3,Medical,1,107,2,Female,79,1,2,Research Scientist,4,Divorced,5329,15717,7,Y,Yes,12,3,4,80,3,17,3,3,13,11,1,9


In [0]:
# List the files under the "Department=Human Resources" sub-directory of the
# table path (presumably a partition directory - verify against the write cell).
hr_partition_path = "/FileStore/tables/attrition_transactionlog/" + "Department=Human Resources"
display(dbutils.fs.ls(hr_partition_path))

path,name,size,modificationTime
dbfs:/FileStore/tables/attrition_transactionlog/Department=Human Resources/part-00000-98abe259-7dcd-4505-9c8f-57a1e0d2cd98.c000.snappy.parquet,part-00000-98abe259-7dcd-4505-9c8f-57a1e0d2cd98.c000.snappy.parquet,10572,1658107333000


In [0]:
from delta.tables import *
# Handle to the Delta table stored at the transaction-log demo path.
delta_table = DeltaTable.forPath(spark, "/FileStore/tables/attrition_transactionlog/")

# `optimize` is a method that returns an optimize builder; it must be called
# before `executeCompaction()`. Accessing it without parentheses yields the
# bound method itself, which has no `executeCompaction` attribute.
delta_table.optimize().executeCompaction()


In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *


# Schema of the customer address-history table: one row per address a customer
# has had, with `current` flagging the row that is active now.
columns = StructType([
    StructField('customerId', IntegerType(), False),
    StructField('address', StringType(), False),
    StructField('current', BooleanType(), False),
    StructField('effectiveDate', StringType(), True),
    StructField('endDate', StringType(), True),
])

# Seed data: customer 1 has one closed-out address and one current address;
# customers 2 and 3 each have a single current address.
rows = [
    [1, "old address for 1", False, None, "2018-02-01"],
    [1, "current address for 1", True, "2018-02-01", None],
    [2, "current address for 2", True, "2018-02-01", None],
    [3, "current address for 3", True, "2018-02-01", None],
]

# Keep the DataFrame itself in `customers`; `saveAsTable` returns None, so the
# write must not be chained into the assignment.
customers = spark.createDataFrame(data=rows, schema=columns)
customers.write.format("delta").mode("overwrite").saveAsTable("customers")

# Read the saved Delta table back and show its rows ordered by customer.
display(spark.table("customers").orderBy("customerId"))



customerId,address,current,effectiveDate,endDate
1,old address for 1,False,,2018-02-01
1,current address for 1,True,2018-02-01,
2,current address for 2,True,2018-02-01,
3,current address for 3,True,2018-02-01,


In [0]:
# Schema of the incoming address changes: customer id, new address and the
# date from which that address applies.
columns1 = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in (
        ('customerId', IntegerType(), False),
        ('address', StringType(), False),
        ('effectiveDate', StringType(), True),
    )
])

# Three incoming rows, for customers 1, 3 and 4.
rows1 = [
    [1, "new address for 1", "2018-03-03"],
    [3, "current address for 3", "2018-04-04"],
    [4, "new address for 4", "2018-04-04"],
]

updates = spark.createDataFrame(rows1, columns1)

# Register the changes as a temp view so they can be referenced by name from SQL.
updates.createOrReplaceTempView("updates")

# Show the rows of the `updates` view.
display(table("updates"))

customerId,address,effectiveDate
1,new address for 1,2018-03-03
3,current address for 3,2018-04-04
4,new address for 4,2018-04-04


In [0]:
%sql
-- Merge the `updates` view into `customers`, keeping address history:
-- a changed address closes the current row and inserts a new current row.
MERGE INTO customers
USING (
  -- Branch 1: every update keyed by its own customerId. These rows either
  -- match an existing customer (closing its current row when the address
  -- differs) or match nothing and are inserted as a new customer.
  SELECT updates.customerId AS merge_key, updates.*
  FROM updates

  UNION ALL

  -- Branch 2: updates whose address differs from the customer's current one,
  -- emitted again with a NULL key. A NULL key can never satisfy the ON
  -- condition, so these rows always take the NOT MATCHED (insert) path.
  SELECT NULL AS merge_key, updates.*
  FROM updates
  INNER JOIN customers
    ON customers.customerId = updates.customerId
  WHERE customers.current = true
    AND customers.address <> updates.address
) staged
ON customers.customerId = staged.merge_key
-- Close out the current row: mark it non-current and end it on the update's effective date.
WHEN MATCHED
  AND customers.current = true
  AND customers.address <> staged.address
THEN UPDATE SET
  current = false,
  endDate = staged.effectiveDate
-- Insert the new address as the current row with an open end date.
WHEN NOT MATCHED THEN
  INSERT (customerId, address, current, effectiveDate, endDate)
  VALUES (staged.customerId, staged.address, true, staged.effectiveDate, null)
 

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
3,1,0,2


In [0]:
# Show the `customers` table ordered by customer, then by the current flag and
# end date, so each customer's rows are listed together.
customers_after_merge = table("customers").orderBy("customerId", "current", "endDate")
display(customers_after_merge)

customerId,address,current,effectiveDate,endDate
1,old address for 1,False,,2018-02-01
1,current address for 1,False,2018-02-01,2018-03-03
1,new address for 1,True,2018-03-03,
2,current address for 2,True,2018-02-01,
3,current address for 3,True,2018-02-01,
4,new address for 4,True,2018-04-04,


In [0]:
# Clean-up: recursively delete the directory at this DBFS path.
dbutils.fs.rm('/FileStore/tables/attrition_delta_updates', recurse=True)
