In [0]:
# ------------------------------------------------------------
# STEP 1: Read CSV from Databricks Volume
# ------------------------------------------------------------
# header=true    -> the first line of the file supplies the column names.
# inferSchema=true -> column types are inferred from the data instead of
#                   defaulting to string; this costs an extra pass over the file.
df = (
    spark.read.format("csv")
        .option("inferSchema", "true")
        .option("header", "true")
        .load("/Volumes/raw-data/banking/csv/Banking_Database.csv")
)

# Show the inferred column names and types.
df.printSchema()

# Preview the first 5 rows. The cap is applied with limit(): display() has no
# documented row-count argument, so passing a positional 5 to it is not a
# reliable way to restrict the preview.
df.limit(5).display()

In [0]:
# Split the wide source table into one DataFrame per entity (customers, accounts, transactions, loans, cards)
from pyspark.sql import functions as F

# Customer frame: identifying and demographic columns, keyed by customer_id.
# dropDuplicates on the key leaves a single row per customer_id; which of the
# duplicate rows is kept is not guaranteed.
customers = df.select(
    df["Customer ID"].alias("customer_id"),
    df["First Name"].alias("first_name"),
    df["Last Name"].alias("last_name"),
    F.col("Age"),
    F.col("Gender"),
    F.col("City"),
    F.col("Email"),
).dropDuplicates(["customer_id"])

# Account frame: account attributes together with the owning customer_id.
# dropDuplicates() with no arguments removes rows that are identical across
# every selected column.
accounts = (
    df.select(
        df["Customer ID"].alias("customer_id"),
        F.col("Account Type"),
        F.col("Account Balance"),
        F.col("Date Of Account Opening"),
        F.col("Branch ID"),
    )
    .dropDuplicates()
)

# Transaction frame: transaction columns plus the customer they belong to.
# Rows that carry no transaction_id are discarded.
transactions = (
    df.select(
        df["TransactionID"].alias("transaction_id"),
        df["Customer ID"].alias("customer_id"),
        F.col("Transaction Date"),
        F.col("Transaction Type"),
        F.col("Transaction Amount"),
        F.col("Account Balance After Transaction"),
    )
    .na.drop(subset=["transaction_id"])
)

# Loan frame: loan columns plus the borrowing customer_id.
# Rows that carry no loan_id are discarded.
loans = (
    df.select(
        df["Loan ID"].alias("loan_id"),
        df["Customer ID"].alias("customer_id"),
        F.col("Loan Amount"),
        F.col("Loan Type"),
        F.col("Interest Rate"),
        F.col("Loan Term"),
        F.col("Loan Status"),
    )
    .na.drop(subset=["loan_id"])
)

# Card frame: card columns plus the card holder's customer_id.
# Rows that carry no card_id are discarded.
cards = (
    df.select(
        df["CardID"].alias("card_id"),
        df["Customer ID"].alias("customer_id"),
        F.col("Card Type"),
        F.col("Credit Limit"),
        F.col("Credit Card Balance"),
        F.col("Rewards Points"),
    )
    .na.drop(subset=["card_id"])
)

In [0]:

# INNER JOIN
# Keeps only customer/account pairs whose customer_id appears on both sides.
#
# The join key is given by column *name* rather than as the expression
# `customers.customer_id == accounts.customer_id`: the expression form keeps
# both copies of `customer_id` in the result, which makes any later reference
# to `customer_id` by name ambiguous. Joining on the name yields a single
# `customer_id` column.
cust_accounts = customers.join(accounts, on="customer_id", how="inner")

cust_accounts.display()

In [0]:
# LEFT JOIN
# Keeps every customer; account columns are null for customers that have no
# matching account row.
#
# Joining on the column name (instead of an equality expression between the
# two frames) produces one `customer_id` column rather than two identically
# named ones, so the result can be referenced by name without ambiguity.
cust_accounts_left = customers.join(accounts, on="customer_id", how="left")

cust_accounts_left.display()



In [0]:
# RIGHT JOIN
# Keeps every transaction; customer columns are null for transactions whose
# customer_id has no matching customer row.
#
# Joining on the column name (instead of an equality expression between the
# two frames) produces one `customer_id` column rather than two identically
# named ones, so the result can be referenced by name without ambiguity.
cust_txn_right = customers.join(transactions, on="customer_id", how="right")

cust_txn_right.display()

In [0]:
# LEFT ANTI JOIN
# Customers for which no row in `transactions` has the same customer_id.
# A left-anti join returns only the left (customer) columns.
customers_no_txn = customers.join(
    other=transactions,
    on=(customers["customer_id"] == transactions["customer_id"]),
    how="left_anti",
)

customers_no_txn.display()

In [0]:
# MULTI JOIN
# customers -> accounts -> loans, both as left joins, so every customer is
# kept even without an account or a loan (those columns are then null).
#
# Each join is keyed by column *name*; with equality expressions the result
# would carry three separate `customer_id` columns (one per input frame) and
# could not be referenced by name without ambiguity.
#
# NOTE: a customer with several accounts and several loans produces one row
# per (account, loan) combination.
cust_account_loan = (
    customers
    .join(accounts, on="customer_id", how="left")
    .join(loans, on="customer_id", how="left")
)

cust_account_loan.display()

In [0]:
# INNER JOIN + FILTER
# Customer/loan pairs whose "Loan Amount" is strictly greater than the
# threshold. Customers without any loan are dropped by the inner join.
HIGH_LOAN_THRESHOLD = 500  # cut-off for "Loan Amount", in the units of the source data

# Joining on the column name keeps a single `customer_id` column instead of
# one copy from each side of the join.
high_loan_customers = (
    customers
    .join(loans, on="customer_id", how="inner")
    .filter(F.col("Loan Amount") > HIGH_LOAN_THRESHOLD)
)

# Same display call style as the other cells in this notebook.
high_loan_customers.display()