In [0]:
# Read the three silver-layer Delta tables and preview each one right after it is loaded.
users_data_df = spark.read.load("/mnt/adls21s/silver/users_data/", format="delta")
display(users_data_df)

cards_data_df = spark.read.load("/mnt/adls21s/silver/cards_data/", format="delta")
display(cards_data_df)

transactions_data_df = spark.read.load("/mnt/adls21s/silver/transactions_data/", format="delta")
display(transactions_data_df)

In [0]:
from pyspark.sql.functions import year, month, day, hour, minute, second, col, date_diff, current_date

# Date dimension: one row per transaction, carrying the calendar/time parts
# extracted from the transaction's `date` column.
dim_date = transactions_data_df.select(
    "transaction_id",
    year("date").alias("year"),
    month("date").alias("month"),
    day("date").alias("day"),
    hour("date").alias("hour"),
    minute("date").alias("minute"),
)

# User dimension: demographic and financial attributes keyed by user_id.
dim_users = users_data_df.select(
    "user_id",
    "current_age",
    "retirement_age",
    "birth_year",
    "birth_month",
    "gender",
    "credit_score",
    "num_credit_cards",
    "per_capita_income_in_$",
    "yearly_income_in_$",
    "total_debt_in_$",
)

# Card dimension: card attributes keyed by card_id. `client_id` is exposed as
# `user_id`, the expiry date is split into `year`/`month`, and the account age
# is derived as days since `acct_open_date` divided by 365.
dim_cards = cards_data_df.select(
    "card_id",
    col("client_id").alias("user_id"),
    "card_brand",
    "card_type",
    year("expires").alias("year"),
    month("expires").alias("month"),
    "has_chip",
    "num_cards_issued",
    "acct_open_date",
    "year_pin_last_changed",
    "card_on_dark_web",
    "credit_limit_in_$",
    (date_diff(current_date(), col("acct_open_date")) / 365).alias("acct_age_yrs"),
)

# Merchant dimension: merchant attributes as they appear on each transaction row.
dim_merchants = transactions_data_df.select(
    "merchant_id",
    "merchant_city",
    "merchant_state",
    "zip",
    "mcc",
)

# Transaction fact: one row per transaction with its foreign keys and measures.
fact_transactions = transactions_data_df.select(
    "transaction_id",
    col("client_id").alias("user_id"),
    "card_id",
    "use_chip",
    "merchant_id",
    "amount_in_$",
    "errors",
)

display(dim_merchants)


In [0]:
# monotonically_increasing_id stays imported in case later cells rely on it;
# it is no longer used to build the surrogate key (see below).
from pyspark.sql.functions import col, monotonically_increasing_id, xxhash64

# Diagnostic: how many merchant_id values appear on more than one row.
# dim_merchants is projected straight from the transactions table, so each
# merchant is repeated once per transaction it took part in.
print(dim_merchants.groupBy("merchant_id").count().filter("count > 1").count())

# merchant_id alone does not identify a dimension row, so a surrogate key is needed.
# The key is a deterministic 64-bit hash of the merchant attributes rather than
# monotonically_increasing_id(): that function's values depend on partition
# layout and restart on every run, so ids generated independently on the
# dimension and on the fact are not guaranteed to line up, and ids from
# different runs collide when the gold tables are merged on merchant_sk.
# NOTE(review): xxhash64 collisions are improbable but not impossible; add a
# uniqueness check on merchant_sk if that risk matters for this dataset.
merchant_attribute_cols = ["merchant_id", "merchant_city", "merchant_state", "zip", "mcc"]
merchant_sk_expr = xxhash64(*[col(c) for c in merchant_attribute_cols])

# One dimension row per distinct merchant attribute combination. Re-selecting
# the attribute columns first keeps this cell idempotent when it is re-run.
dim_merchants = (
    dim_merchants
    .select(*merchant_attribute_cols)
    .dropDuplicates()
    .withColumn("merchant_sk", merchant_sk_expr)
)
display(dim_merchants)

# Compute the same key for every transaction from the source rows and attach
# it to the fact table, replacing the natural merchant_id.
# Assumes transaction_id is unique in transactions_data_df (the downstream
# merge on transaction_id already relies on this).
transaction_merchant_keys = transactions_data_df.select(
    "transaction_id",
    merchant_sk_expr.alias("merchant_sk"),
)
fact_transactions = (
    fact_transactions
    .drop("merchant_sk", "merchant_id")
    .join(transaction_merchant_keys, on="transaction_id", how="left")
)
display(fact_transactions)

In [0]:
from delta.tables import DeltaTable
# Let MERGE evolve the target schema when the source gains new columns.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

# Path to Delta table
dim_date_delta_table_path = "/mnt/adls21s/gold/dim_date/"
dim_cards_delta_table_path = "/mnt/adls21s/gold/dim_cards/"
dim_users_delta_table_path = "/mnt/adls21s/gold/dim_users/"
dim_merchants_delta_table_path = "/mnt/adls21s/gold/dim_merchants/"
fact_transactions_delta_table_path = "/mnt/adls21s/gold/fact_transactions/"

# dim_merchants is read more than once below (write/merge), so keep it cached.
dim_merchants.cache()


def _upsert_to_delta(source_df, table_path, merge_condition, partition_cols=()):
    """Create the Delta table at `table_path` from `source_df`, or upsert into it.

    If no Delta table exists at `table_path`, `source_df` is written there
    (partitioned by `partition_cols` when given). Otherwise `source_df` is
    merged into the existing table: rows matching `merge_condition` are
    updated, the rest are inserted. The existing table is never appended to
    before merging, so re-running the notebook does not duplicate rows.

    Args:
        source_df: DataFrame holding the rows to write.
        table_path: Location of the target Delta table.
        merge_condition: SQL join condition; the target is aliased `t1` and
            the source `t2`.
        partition_cols: Column names used to partition the table on creation.

    Returns:
        A DeltaTable handle for `table_path`.
    """
    if not DeltaTable.isDeltaTable(spark, table_path):
        writer = source_df.write.format("delta").mode("overwrite")
        if partition_cols:
            writer = writer.partitionBy(*partition_cols)
        writer.save(table_path)
    else:
        (
            DeltaTable.forPath(spark, table_path).alias("t1")
            .merge(source_df.alias("t2"), merge_condition)
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
        )
    return DeltaTable.forPath(spark, table_path)


# Define the condition for the merge: match based on a unique key
merge_condition_dim_users = "t1.user_id = t2.user_id"
merge_condition_dim_cards = "t1.card_id = t2.card_id"
merge_condition_fact_transactions = "t1.transaction_id = t2.transaction_id"
merge_condition_dim_date = "t1.transaction_id = t2.transaction_id"
merge_condition_dim_merchants = "t1.merchant_sk = t2.merchant_sk"

# Create each gold table on the first run, upsert on later runs.
dim_date_delta_table = _upsert_to_delta(
    dim_date, dim_date_delta_table_path, merge_condition_dim_date
)
dim_cards_delta_table = _upsert_to_delta(
    dim_cards,
    dim_cards_delta_table_path,
    merge_condition_dim_cards,
    partition_cols=("card_brand", "year", "month"),
)
dim_users_delta_table = _upsert_to_delta(
    dim_users,
    dim_users_delta_table_path,
    merge_condition_dim_users,
    partition_cols=("gender",),
)
dim_merchants_delta_table = _upsert_to_delta(
    dim_merchants,
    dim_merchants_delta_table_path,
    merge_condition_dim_merchants,
    partition_cols=("merchant_state",),
)
fact_transactions_delta_table = _upsert_to_delta(
    fact_transactions,
    fact_transactions_delta_table_path,
    merge_condition_fact_transactions,
)

# Compact and Z-order the merchant dimension after the upsert so the files
# written by this run are included.
spark.sql(f"""
    OPTIMIZE delta.`{dim_merchants_delta_table_path}`
    ZORDER BY (merchant_id)
""")