In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_date

# Obtain (or reuse) the Spark session used throughout this walkthrough.
spark = (
    SparkSession.builder
    .appName("SCD-Example")
    .getOrCreate()
)

# Column layout shared by the dimension table and the incoming batch.
cols = ["cust_id", "name", "address"]

# Starting state of the customer dimension.
data_initial = [
    (1, "Alice", "100 Main St"),
    (2, "Bob", "200 Oak Ave"),
    (3, "Charlie", "300 Pine Rd"),
]
df_dim = spark.createDataFrame(data_initial, cols)

# Incoming batch of customer records:
#   * cust_id 1 (Alice) has a new address
#   * cust_id 2 (Bob) is identical to the dimension row
#   * cust_id 4 (Dana) does not exist in the dimension yet
data_updates = [
    (1, "Alice", "123 Elm St"),
    (2, "Bob", "200 Oak Ave"),
    (4, "Dana", "400 Birch Blvd"),
]
df_updates = spark.createDataFrame(data_updates, cols)

In [0]:
from pyspark.sql.functions import coalesce

# SCD Type 1: overwrite attributes in place, keeping no history.

# 1. Overwrite existing records.
#    The left join keeps EVERY dimension row. For rows that have a matching
#    update, coalesce picks the incoming value; for rows without one (and for
#    NULL attributes in an update) it falls back to the existing value.
#    No post-filter on the update side: filtering out rows where the update
#    key is NULL would drop customers that simply received no update.
df_scd1_updated = (
    df_dim.alias("d")
    .join(df_updates.alias("u"), "cust_id", "left")
    .select(
        "cust_id",
        coalesce("u.name", "d.name").alias("name"),
        coalesce("u.address", "d.address").alias("address"),
    )
)
df_scd1_updated.display()

# 2. Append truly new customers: update rows whose cust_id has no match in
#    the dimension (left_anti keeps only the unmatched left-side rows).
new_customers = df_updates.join(df_dim, "cust_id", "left_anti")
new_customers.display()

# Full SCD1 dimension = (overwritten + untouched existing rows) + new rows.
df_scd1 = df_scd1_updated.unionByName(new_customers)

df_scd1.display()

In [0]:
from pyspark.sql.functions import to_date

# Extend the existing dimension with the SCD2 bookkeeping columns:
#   effective_date - fixed start date 2023-01-01
#   end_date       - NULL date
#   is_current     - True for every row
df_dim2 = df_dim.select(
    "*",
    to_date(lit("2023-01-01")).alias("effective_date"),
    lit(None).cast("date").alias("end_date"),
    lit(True).alias("is_current"),
)

df_dim2.display()

In [0]:
# Join to spot updates vs unchanged vs new
from pyspark.sql.functions import expr, col

# Right join keeps every incoming row. The join uses an explicit equality
# condition (not a USING column name) so that both d.cust_id and u.cust_id
# remain ordinary, addressable columns in the result; d.cust_id is NULL for
# update rows that have no match in the dimension.
df_join = (
    df_dim2.alias("d")
    .join(df_updates.alias("u"), col("d.cust_id") == col("u.cust_id"), "right")
)
df_join.display()

# Rows needing new version: changed values OR brand new cust_id.
# `<=>` is Spark SQL's null-safe equality, so a value changing to or from
# NULL counts as a change (plain `<>` would evaluate to NULL and the row
# would be silently filtered out).
df_changed = (
    df_join
    .filter(
        expr("NOT (d.name <=> u.name)")
        | expr("NOT (d.address <=> u.address)")
        | col("d.cust_id").isNull()
    )
    .select("u.cust_id", "u.name", "u.address")
)

df_changed.display()

In [0]:
from pyspark.sql.functions import date_add

# Build the SCD Type 2 dimension from three disjoint pieces.

# 1. Expire old versions for changed rows.
#    Inner join keeps dimension rows whose cust_id appears in df_changed;
#    brand-new customers have no dimension row and therefore drop out here.
#    The expired version ends yesterday (current_date - 1) and is no longer current.
#    NOTE(review): every df_dim2 row for a changed cust_id is re-stamped; this
#    presumes df_dim2 holds only current rows (true for the data built above).
df_expired = (
    df_dim2.alias("d")
    .join(df_changed.alias("c"), "cust_id")
    .select(
        "d.cust_id", "d.name", "d.address", "d.effective_date",
        date_add(current_date(), -1).alias("end_date"),
        lit(False).alias("is_current"),
    )
)
df_expired.display()

# 2. New current versions: changed and brand-new rows start today, open-ended.
df_new_versions = (
    df_changed
    .withColumn("effective_date", current_date())
    .withColumn("end_date", lit(None).cast("date"))
    .withColumn("is_current", lit(True))
)
df_new_versions.display()

# 3. Keep unaffected current rows.
#    A left_anti join keeps dimension rows with no matching cust_id in
#    df_changed. This stays inside Spark instead of collecting every changed
#    key to the driver to build an isin() list.
df_unchanged = df_dim2.join(df_changed, "cust_id", "left_anti")
df_unchanged.display()

# 4. Union all to get the full SCD2 dimension
df_scd2 = df_expired.unionByName(df_new_versions).unionByName(df_unchanged)

df_scd2.orderBy("cust_id", "effective_date").display()