In [0]:
# Importing libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, LongType
import urllib

In [0]:
# Initialize Spark Session 

spark = SparkSession.builder \
    .appName("Bronze Layer") \
    .getOrCreate()

In [0]:
ACCESS_KEY=dbutils.secrets.get("aws", "aws_access_key")
SECRET_KEY= dbutils.secrets.get("aws", "aws_secret_access_key")

In [0]:
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [0]:
%run ./_resources/01-load-data

In [0]:
AWS_S3_BUCKET = "frauddetection-etl"

# Mount name for the bucket
MOUNT_NAME = "/mnt/frauddetection-etl"
# Source url
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the drive
dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

True

In [0]:
dbutils.fs.ls("/mnt/frauddetection-etl/")

[FileInfo(path='dbfs:/mnt/frauddetection-etl/bronze/', name='bronze/', size=0, modificationTime=1715796237491),
 FileInfo(path='dbfs:/mnt/frauddetection-etl/gold/', name='gold/', size=0, modificationTime=1715796237491),
 FileInfo(path='dbfs:/mnt/frauddetection-etl/silver/', name='silver/', size=0, modificationTime=1715796237491)]

In [0]:
BRONZE_PATH = "/mnt/frauddetection-etl/bronze"

data already existing. Run with reset_all_data=true to force a data cleanup for your local demo.


In [0]:
# Define Schema for Data

# Schema for transactions data
transaction_schema = StructType([
    StructField("amount", StringType(), True),
    StructField("countryDest", StringType(), True),
    StructField("countryOrig", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("id", StringType(), True),
    StructField("isUnauthorizedOverdraft", IntegerType(), True),
    StructField("nameDest", StringType(), True),
    StructField("nameOrig", StringType(), True),
    StructField("newBalanceDest", StringType(), True),
    StructField("newBalanceOrig", StringType(), True),
    StructField("oldBalanceDest", StringType(), True),
    StructField("oldBalanceOrig", StringType(), True),
    StructField("step", StringType(), True),
    StructField("type", StringType(), True)
])

# Schema for fraud reports
fraud_schema = StructType([
    StructField("is_fraud", IntegerType(), True),
    StructField("id", StringType(), True)
])


In [0]:
# Load transaction data
transactions_df = spark.read.format("json").schema(transaction_schema).load("/dbdemos/fsi/fraud-detection/transactions")

# Load fraud report data
fraud_reports_df = spark.read.format("csv").schema(fraud_schema).option("header", "true").load("/dbdemos/fsi/fraud-detection/customers")

# Ensure immutability by writing in append mode
transactions_df.write.format("delta").mode("append").save("{}/transactions".format(BRONZE_PATH))
fraud_reports_df.write.format("delta").mode("append").save("{}/fraud_reports".format(BRONZE_PATH))

In [0]:
# Create the schema if it doesn't exist
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze")

# Set the current schema
spark.sql("USE bronze")

# Register the external tables
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS transactions
USING delta
LOCATION 's3://frauddetection-etl/bronze/transactions/'
""")

spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS fraud_reports
USING delta
LOCATION 's3://frauddetection-etl/bronze/fraud_reports/'
""")

DataFrame[]

In [0]:
# Showing written data
transactions_df_output = spark.sql("SELECT * FROM bronze.transactions")
transactions_df_output.show()

+---------+-----------+-----------+--------------------+--------------------+-----------------------+------------+-----------+--------------+--------------+--------------+--------------+----+--------+
|   amount|countryDest|countryOrig|         customer_id|                  id|isUnauthorizedOverdraft|    nameDest|   nameOrig|newBalanceDest|newBalanceOrig|oldBalanceDest|oldBalanceOrig|step|    type|
+---------+-----------+-----------+--------------------+--------------------+-----------------------+------------+-----------+--------------+--------------+--------------+--------------+----+--------+
| 10452.98|        REU|        REU|a0e67d95-b19d-487...|9db8db71-126b-4fe...|                      0| M3479766470|C9695372314|     363539.07|      67009.83|     353086.09|      77462.82| 146| PAYMENT|
| 11230.51|        BRA|        TUR|0796aa96-5c08-4d3...|30f56e89-0e4e-4eb...|                      0| M9181931898|C2700694787|    1314521.73|    1973961.84|    1303291.22|    1985192.36| 420| PAYM

In [0]:
# Showing written data
fraud_reports_df_output = spark.sql("SELECT * FROM bronze.fraud_reports")
fraud_reports_df_output.show()

+--------+----------+
|is_fraud|        id|
+--------+----------+
|    NULL|     Scott|
|    NULL| AS 11135"|
|    NULL|    Harris|
|    NULL| AS 71707"|
|    NULL|   Simmons|
|    NULL| CA 58536"|
|    NULL|     Cowan|
|    NULL| LA 08082"|
|    NULL|   Holland|
|    NULL| TX 78165"|
|    NULL|      Neal|
|    NULL| NE 01419"|
|    NULL|   Merritt|
|    NULL|       SVK|
|    NULL|   Rollins|
|    NULL|       DMA|
|    NULL|     Blair|
|    NULL| MO 59027"|
|    NULL|      Ball|
|    NULL| IN 30277"|
+--------+----------+
only showing top 20 rows



In [0]:
# spark.sql("DROP SCHEMA IF EXISTS bronze CASCADE")

DataFrame[]

In [0]:
# dbutils.fs.unmount("/mnt/frauddetection-etl")