In [0]:
# List the available secret scopes and the keys inside "adls-key".
# Both results are discarded; the calls are kept as they were.
dbutils.secrets.listScopes()
dbutils.secrets.list("adls-key")

# Service-principal identity used for OAuth against the adls21s account.
# The client secret is read from the secret scope; the two IDs are literals.
service_credential = dbutils.secrets.get(scope="adls-key", key="migrationAppAuth")
application_id = "5403bb6e-a9d0-44a6-a025-de1ebc2a7881"
directory_id = "4249dcf4-f4a1-44f9-940d-14b50a777dd8"

# Spark session settings for the account, applied in the listed order.
adls_oauth_settings = [
    ("fs.azure.account.auth.type.adls21s.dfs.core.windows.net", "OAuth"),
    (
        "fs.azure.account.oauth.provider.type.adls21s.dfs.core.windows.net",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    ),
    ("fs.azure.account.oauth2.client.id.adls21s.dfs.core.windows.net", application_id),
    ("fs.azure.account.oauth2.client.secret.adls21s.dfs.core.windows.net", service_credential),
    (
        "fs.azure.account.oauth2.client.endpoint.adls21s.dfs.core.windows.net",
        "https://login.microsoftonline.com/{}/oauth2/token".format(directory_id),
    ),
]
for setting_name, setting_value in adls_oauth_settings:
    spark.conf.set(setting_name, setting_value)

# Smoke test: list the root of the "raw" container with the settings above.
display(dbutils.fs.ls("abfss://raw@adls21s.dfs.core.windows.net"))

In [0]:
# OAuth options handed to dbutils.fs.mount as extra_configs.
# application_id, service_credential and directory_id come from the previous cell.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": application_id,
    "fs.azure.account.oauth2.client.secret": service_credential,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{directory_id}/oauth2/token",
}

# DBFS mount point -> abfss:// container URI.
# Note that the schema container is mounted at /mnt/schema/, not under /mnt/adls21s/.
mountsDict = {
    "/mnt/adls21s/raw/": "abfss://raw@adls21s.dfs.core.windows.net/",
    "/mnt/schema/": "abfss://schema@adls21s.dfs.core.windows.net/",
    "/mnt/adls21s/bronze/": "abfss://bronze@adls21s.dfs.core.windows.net/",
    "/mnt/adls21s/silver/": "abfss://silver@adls21s.dfs.core.windows.net/",
    "/mnt/adls21s/gold/": "abfss://gold@adls21s.dfs.core.windows.net/",
}

def checkIfMounted(mounts, mount_point="/mnt/adls21s/raw/", source="abfss://raw@adls21s.dfs.core.windows.net/"):
    """(Re)mount ``source`` at ``mount_point``.

    If ``mount_point`` is already present in ``mounts`` it is unmounted first
    and the refreshed mount table is displayed afterwards; otherwise the
    source is simply mounted.

    Args:
        mounts: iterable of objects exposing a ``mountPoint`` attribute,
            e.g. the result of ``dbutils.fs.mounts()``.
        mount_point: DBFS path to mount at. Defaults to the raw container
            mount, which was the only target of the original implementation.
        source: ``abfss://`` URI to mount. Defaults to the raw container.
    """
    # Compare without trailing slashes: the mount table may report
    # "/mnt/adls21s/raw" while callers pass "/mnt/adls21s/raw/", and an exact
    # string comparison would then never detect the existing mount.
    wanted = mount_point.rstrip("/")
    already_mounted = any(mount.mountPoint.rstrip("/") == wanted for mount in mounts)

    if already_mounted:
        print("Mount point exists, Unmounting...")
        dbutils.fs.unmount(mount_point)
        print("Unmounted Successfully")
    else:
        print("Mount point does not exist")

    dbutils.fs.mount(source=source, mount_point=mount_point, extra_configs=configs)
    print("Mounted Successfully")

    # As before, the mount table is only displayed on the remount path.
    if already_mounted:
        display(dbutils.fs.mounts())

# Mount every container in mountsDict that is not mounted yet.
# The mount table is checked up front rather than treating every exception
# from dbutils.fs.mount as "already mounted": a failed mount (bad credentials,
# unreachable storage, ...) must not be reported as an existing mount point.
# Trailing slashes are ignored when comparing paths.
existing_mount_points = {mount.mountPoint.rstrip("/") for mount in dbutils.fs.mounts()}

for mountPoint, mountSource in mountsDict.items():
    if mountPoint.rstrip("/") in existing_mount_points:
        print("Mount point {} exists, skipping...".format(mountPoint))
        continue
    print('No existing mount point found, Mounting {}'.format(mountPoint))
    # Any error raised here is a genuine mount failure and is left to propagate.
    dbutils.fs.mount(source=mountSource, mount_point=mountPoint, extra_configs=configs)

display(dbutils.fs.mounts())

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DateType, DoubleType, ShortType,TimestampNTZType

# Column name -> Spark type for cards_data.csv, in file order.
# Every field is declared nullable when the schema is built below.
# NOTE(review): cvv is read as ShortType, so a value written with leading
# zeros in the file would lose them -- confirm this is acceptable.
cards_columns = [
    ("id", IntegerType()),
    ("client_id", IntegerType()),
    ("card_brand", StringType()),
    ("card_type", StringType()),
    ("card_number", LongType()),
    ("expires", StringType()),
    ("cvv", ShortType()),
    ("has_chip", StringType()),
    ("num_cards_issued", ShortType()),
    ("credit_limit", StringType()),
    ("acct_open_date", StringType()),
    ("year_pin_last_changed", IntegerType()),
    ("card_on_dark_web", StringType()),
]
cardsDataSchema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in cards_columns]
)

# Read the cards file from the raw mount with the explicit schema (header row present).
raw_cards_data_df = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(cardsDataSchema)
    .load("/mnt/adls21s/raw/cards_data.csv")
)
display(raw_cards_data_df)

# Column name -> Spark type for transactions_data.csv, in file order.
# Every field is declared nullable; "amount" stays a string at this stage.
transactions_columns = [
    ("id", IntegerType()),
    ("date", TimestampNTZType()),
    ("client_id", IntegerType()),
    ("card_id", IntegerType()),
    ("amount", StringType()),
    ("use_chip", StringType()),
    ("merchant_id", IntegerType()),
    ("merchant_city", StringType()),
    ("merchant_state", StringType()),
    ("zip", StringType()),
    ("mcc", IntegerType()),
    ("errors", StringType()),
]
tranactionsDataSchema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in transactions_columns]
)

# Read the transactions file from the raw mount with the explicit schema.
raw_transactions_data_df = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(tranactionsDataSchema)
    .load("/mnt/adls21s/raw/transactions_data.csv")
)
raw_transactions_data_df.show()

# Column name -> Spark type for users_data.csv, in file order.
# Every field is declared nullable; the income and debt columns stay strings here.
users_columns = [
    ("id", IntegerType()),
    ("current_age", IntegerType()),
    ("retirement_age", IntegerType()),
    ("birth_year", IntegerType()),
    ("birth_month", IntegerType()),
    ("gender", StringType()),
    ("address", StringType()),
    ("latitude", DoubleType()),
    ("longitude", DoubleType()),
    ("per_capita_income", StringType()),
    ("yearly_income", StringType()),
    ("total_debt", StringType()),
    ("credit_score", IntegerType()),
    ("num_credit_cards", IntegerType()),
]
usersDataSchema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in users_columns]
)

# Read the users file from the raw mount with the explicit schema.
raw_users_data_df = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(usersDataSchema)
    .load("/mnt/adls21s/raw/users_data.csv")
)
# head(5) collects the first five rows to the driver before they are displayed.
display(raw_users_data_df.head(5))

In [0]:
from pyspark.sql.functions import current_timestamp, regexp_replace, col, count, when, length, year

# Add an ingestion timestamp column to every raw dataframe.
raw_cards_data_df = raw_cards_data_df.withColumn("ingested_at", current_timestamp())
raw_transactions_data_df = raw_transactions_data_df.withColumn("ingested_at", current_timestamp())
raw_users_data_df = raw_users_data_df.withColumn("ingested_at", current_timestamp())

# Check users for out-of-range values.
# The predicates are combined inside ONE SQL expression. Chaining Python
# strings with `or` evaluates to the first non-empty string, so only
# "current_age < 0" would be applied.
invalid_users_condition = " OR ".join([
    "current_age < 0",
    "retirement_age < 0",
    "birth_year < 0",
    "birth_month < 0",
    "credit_score < 0",
    "current_age>100",
    "credit_score > 900",
])
raw_users_data_df.filter(invalid_users_condition).show()

# Cards whose number is not 16 digits long. card_number is a LongType column,
# so it is cast to string explicitly before measuring its length.
raw_cards_data_df.filter(length(col("card_number").cast("string")) != 16).show()

# Users whose implied age exceeds 115 years. birth_year must be referenced as
# a column: a bare "birth_year" on the right of `-` is a string literal.
raw_users_data_df.filter((year(col("ingested_at")) - col("birth_year")) > 115).show()




In [0]:
# Remove the "$" sign from currency columns. Each cleaned value is written to
# a new "<name>_in_$" column and the original column is dropped. Values remain
# strings. The pattern is a raw string so that "\$" is not an invalid Python
# escape sequence.
DOLLAR_SIGN_PATTERN = r"\$"

cards_data_df = (
    raw_cards_data_df
    .withColumn("credit_limit_in_$", regexp_replace("credit_limit", DOLLAR_SIGN_PATTERN, ""))
    .drop("credit_limit")
)

transactions_data_df = (
    raw_transactions_data_df
    .withColumn("amount_in_$", regexp_replace("amount", DOLLAR_SIGN_PATTERN, ""))
    .drop("amount")
)

users_data_df = raw_users_data_df
for currency_column in ("per_capita_income", "yearly_income", "total_debt"):
    users_data_df = (
        users_data_df
        .withColumn(currency_column + "_in_$", regexp_replace(currency_column, DOLLAR_SIGN_PATTERN, ""))
        .drop(currency_column)
    )

# Replace NULL in transactions "errors" with "No Errors".
transactions_data_df = transactions_data_df.fillna({"errors": "No Errors"})


def _null_counts(df):
    """Return a one-row dataframe holding the number of NULLs in each column of ``df``."""
    return df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])


# NULL counts per dataframe. The counts frame is stored first and then
# displayed, so the null_counts_* names refer to the frames themselves.
null_counts_cards_data_df = _null_counts(cards_data_df)
display(null_counts_cards_data_df)
null_counts_transactions_data_df = _null_counts(transactions_data_df)
display(null_counts_transactions_data_df)
null_counts_users_data_df = _null_counts(users_data_df)
display(null_counts_users_data_df)

# Where merchant_city is not ONLINE and zip is NULL, set zip to "ABROAD".
transactions_data_df = transactions_data_df.withColumn(
    "zip",
    when((col("merchant_city") != "ONLINE") & col("zip").isNull(), "ABROAD").otherwise(col("zip")),
)

# Remaining NULLs in zip and merchant_state are set to "ONLINE".
# NOTE(review): fillna is unconditional, so a NULL merchant_state on a
# non-ONLINE row is also filled with "ONLINE" -- confirm that is intended.
transactions_data_df = transactions_data_df.fillna({"zip": "ONLINE", "merchant_state": "ONLINE"})

In [0]:

from pyspark.sql.functions import udf

'''Masking card number and cvv columns'''
def mask_card_number(card):
    """Mask a card number, keeping only its first and last character.

    Args:
        card: card number (int or str), or None.

    Returns:
        The string form of ``card`` with every character between the first
        and the last replaced by "X". Values of one or two characters are
        returned unchanged. None is returned as None so that a missing value
        stays missing instead of being rendered from the text "None".
    """
    if card is None:
        return None
    text = str(card)
    if len(text) <= 2:
        return text
    return text[0] + "X" * (len(text) - 2) + text[-1]

def mask_cvv(cvv):
    """Mask a CVV completely.

    Args:
        cvv: CVV value (int or str), or None.

    Returns:
        A string of "X" with the same length as ``str(cvv)``, or None when
        ``cvv`` is None (a missing value is not turned into a fake mask).
    """
    if cvv is None:
        return None
    return "X" * len(str(cvv))

# Wrap the masking helpers as Spark UDFs that return strings.
mask_card_udf = udf(mask_card_number, StringType())
mask_cvv_udf = udf(mask_cvv, StringType())

# Build the masked cards frame: add the masked columns, then drop the clear
# card_number and cvv so the "masked" frame (and the .show() output below)
# no longer exposes the raw values.
cards_data_df_masked = (
    cards_data_df
    .withColumn("masked_card_number", mask_card_udf(cards_data_df["card_number"]))
    .withColumn("masked_cvv", mask_cvv_udf(cards_data_df["cvv"]))
    .drop("card_number", "cvv")
)

cards_data_df_masked.show()

'''Masking Address, latitude and longitude columns'''
def mask_address(address):
    """Mask an address, keeping only its first and last character.

    Args:
        address: address string, or None.

    Returns:
        ``address`` with every character between the first and the last
        replaced by "X". Strings of up to two characters are returned
        unchanged. None is returned as None; previously a None address
        raised TypeError.
    """
    if address is None:
        return None
    if len(address) <= 2:
        return address
    return address[0] + "X" * (len(address) - 2) + address[-1]

def mask_latitude(latitude):
    """Mask a latitude completely.

    Args:
        latitude: latitude value (float or str), or None.

    Returns:
        A string of "X" with the same length as ``str(latitude)``, or None
        when ``latitude`` is None.
    """
    if latitude is None:
        return None
    return "X" * len(str(latitude))

def mask_longitude(longitude):
    """Mask a longitude completely.

    Args:
        longitude: longitude value (float or str), or None.

    Returns:
        A string of "X" with the same length as ``str(longitude)``, or None
        when ``longitude`` is None.
    """
    if longitude is None:
        return None
    return "X" * len(str(longitude))

# Wrap the masking helpers as Spark UDFs that return strings.
mask_address_udf = udf(mask_address, StringType())
mask_latitude_udf = udf(mask_latitude, StringType())
mask_longitude_udf = udf(mask_longitude, StringType())

# Build the masked users frame: add the masked columns, then drop the clear
# address, latitude and longitude so the "masked" frame no longer carries the
# raw values.
users_data_df_masked = (
    users_data_df
    .withColumn("masked_address", mask_address_udf(users_data_df["address"]))
    .withColumn("masked_latitude", mask_latitude_udf(users_data_df["latitude"]))
    .withColumn("masked_longitude", mask_longitude_udf(users_data_df["longitude"]))
    .drop("address", "latitude", "longitude")
)


In [0]:
from delta.tables import DeltaTable

# Locations of the bronze Delta tables.
users_data_delta_table_path = "/mnt/adls21s/bronze/users_data/"
cards_data_delta_table_path = "/mnt/adls21s/bronze/cards_data/"
transactions_data_delta_table_path = "/mnt/adls21s/bronze/transactions_data/"

# Create each Delta table that does not exist yet. Every table is checked
# independently: with nested checks, cards and transactions were only created
# when the users table was missing, so DeltaTable.forPath below could fail
# for a table that had never been written.
for delta_table_path, initial_df in (
    (users_data_delta_table_path, raw_users_data_df),
    (cards_data_delta_table_path, raw_cards_data_df),
    (transactions_data_delta_table_path, raw_transactions_data_df),
):
    if not DeltaTable.isDeltaTable(spark, delta_table_path):
        initial_df.write.format("delta").mode("overwrite").save(delta_table_path)

# Load the (now guaranteed to exist) Delta tables.
users_data_delta_table = DeltaTable.forPath(spark, users_data_delta_table_path)
cards_data_delta_table = DeltaTable.forPath(spark, cards_data_delta_table_path)
transactions_data_delta_table = DeltaTable.forPath(spark, transactions_data_delta_table_path)

# Rows are matched between target (t1) and source (t2) on the "id" column.
merge_condition = "t1.id = t2.id"

# Upsert each raw dataframe into its bronze Delta table, in the order
# users -> cards -> transactions: matched rows are updated with all source
# columns, unmatched source rows are inserted.
for target_delta_table, source_df in (
    (users_data_delta_table, raw_users_data_df),
    (cards_data_delta_table, raw_cards_data_df),
    (transactions_data_delta_table, raw_transactions_data_df),
):
    (
        target_delta_table.alias("t1")
        .merge(source_df.alias("t2"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )