In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import urllib
from pyspark.sql.functions import coalesce, expr, lit, col, regexp_replace, trim

In [0]:
# Obtain the Spark session for this notebook under the application
# name "Gold Layer" (getOrCreate reuses an existing session if there is one).
spark = (
    SparkSession.builder
    .appName("Gold Layer")
    .getOrCreate()
)

In [0]:
# ACCESS_KEY=dbutils.secrets.get("aws", "aws_access_key")
# SECRET_KEY= dbutils.secrets.get("aws", "aws_secret_access_key")

In [0]:
# ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [0]:
# AWS_S3_BUCKET = "frauddetection-etl"

# # Mount name for the bucket
# MOUNT_NAME = "/mnt/frauddetection-etl"
# # Source url
# SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# # Mount the drive
# dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [0]:
# Root path of the Gold layer; the Gold transactions data is saved under
# f"{GOLD_PATH}/transactions" later in this notebook.
GOLD_PATH = "/mnt/frauddetection-etl/gold"

In [0]:
# Load the three Silver-layer tables that feed the Gold transactions table.

# Transactions
transactions_df = spark.table("silver.transactions")

# Banking customers
banking_customers_df = spark.table("silver.banking_customers")

# Country coordinates (joined below on alpha3_code)
country_coordinates_df = spark.table("silver.country_coordinates")

In [0]:
# Build the Gold transactions DataFrame: every transaction column, plus the
# customer's name and the origin/destination country name and coordinates.

# The coordinates table is joined twice (once per country code), so each
# use gets its own alias to keep the column references unambiguous.
origin_coords = country_coordinates_df.alias("orig")
destination_coords = country_coordinates_df.alias("dest")

gold_df = (
    transactions_df
    # Inner join: only transactions whose customer_id matches a customer id are kept.
    .join(
        banking_customers_df,
        transactions_df.customer_id == banking_customers_df.id,
        "inner",
    )
    # Left outer joins: transactions are kept even when a country code has
    # no matching row in the coordinates table.
    .join(
        origin_coords,
        col("countryOrig") == col("orig.alpha3_code"),
        "left_outer",
    )
    .join(
        destination_coords,
        col("countryDest") == col("dest.alpha3_code"),
        "left_outer",
    )
    .select(
        transactions_df["*"],
        # Customer name
        banking_customers_df["firstname"].alias("customer_firstname"),
        banking_customers_df["lastname"].alias("customer_lastname"),
        # Origin country; the name falls back to "Unknown" when there is no match
        coalesce(col("orig.country"), lit("Unknown")).alias("origin_country"),
        col("orig.lat_avg").alias("origin_lat"),
        col("orig.long_avg").alias("origin_long"),
        # Destination country; same "Unknown" fallback for the name
        coalesce(col("dest.country"), lit("Unknown")).alias("destination_country"),
        col("dest.lat_avg").alias("destination_lat"),
        col("dest.long_avg").alias("destination_long"),
    )
)

In [0]:
# Save the joined data to the Gold layer and register it as an external table.

# Overwrite rather than append: gold_df is rebuilt from the complete Silver
# tables on every run, so appending would duplicate every row each time the
# notebook is re-run. Overwriting keeps this cell idempotent.
gold_df.write.format("delta").mode("overwrite").save(f"{GOLD_PATH}/transactions")

# Make sure the target schema exists, then make it the current schema so the
# unqualified table name below resolves to gold.transactions.
spark.sql("CREATE SCHEMA IF NOT EXISTS gold")
spark.sql("USE gold")

# Register the external Delta table (no-op if it is already registered).
# NOTE(review): the data is written through the mount path in GOLD_PATH while
# LOCATION uses the s3:// URI; presumably both point at the same bucket
# prefix -- confirm.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS transactions
USING delta
LOCATION 's3://frauddetection-etl/gold/transactions'
""")

DataFrame[]

In [0]:
# Read the registered Gold table back and print its first rows as a check
# that the write and the table registration succeeded.
transactions_df_output = spark.table("gold.transactions")
transactions_df_output.show()

+--------------------+---------+-----------+-----------+--------------------+-----------------------+------------+-----------+--------------+--------------+--------------+--------------+----+--------+--------+------------------+-----------------+------------------+----------+-----------+-------------------+---------------+----------------+
|                  id|   amount|countryDest|countryOrig|         customer_id|isUnauthorizedOverdraft|    nameDest|   nameOrig|newBalanceDest|newBalanceOrig|oldBalanceDest|oldBalanceOrig|step|    type|is_fraud|customer_firstname|customer_lastname|    origin_country|origin_lat|origin_long|destination_country|destination_lat|destination_long|
+--------------------+---------+-----------+-----------+--------------------+-----------------------+------------+-----------+--------------+--------------+--------------+--------------+----+--------+--------+------------------+-----------------+------------------+----------+-----------+-------------------+--------

In [0]:
# spark.sql("DROP SCHEMA IF EXISTS gold CASCADE")

DataFrame[]

In [0]:
# Unmount the bucket, but only if it is currently mounted:
# dbutils.fs.unmount raises when the directory is not mounted, which would
# make this cell fail whenever the notebook is re-run without re-mounting.
mount_point = "/mnt/frauddetection-etl"
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)
else:
    print(f"{mount_point} is not mounted; nothing to unmount.")

/mnt/frauddetection-etl has been unmounted.


True