In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Load the flat banking export. The header row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read.format("csv")
        .option("inferSchema", "true")
        .option("header", "true")
        .load("/Volumes/raw-data/banking/csv/Banking_Database.csv")
)

# Show the inferred schema so type-inference surprises are visible up front.
df.printSchema()

# Preview five rows. limit() bounds the row count explicitly instead of
# relying on an argument to display().
display(df.limit(5))

In [0]:
# Split the flat table into one DataFrame per entity.
from pyspark.sql import functions as F

# Customer attributes, keeping a single row for each customer_id.
customers = df.select(
    df["Customer ID"].alias("customer_id"),
    df["First Name"].alias("first_name"),
    df["Last Name"].alias("last_name"),
    df["Age"],
    df["Gender"],
    df["City"],
    df["Email"],
).dropDuplicates(["customer_id"])

# Account attributes per customer; rows identical in every selected column
# are collapsed into one.
accounts = (
    df.select(
        df["Customer ID"].alias("customer_id"),
        df["Account Type"],
        df["Account Balance"],
        df["Date Of Account Opening"],
        df["Branch ID"],
    )
    .distinct()
)

# Transaction records. Rows without a transaction_id are discarded.
transactions = (
    df.select(
        F.col("TransactionID").alias("transaction_id"),
        F.col("Customer ID").alias("customer_id"),
        "Transaction Date",
        "Transaction Type",
        "Transaction Amount",
        "Account Balance After Transaction",
    )
    .dropna(subset=["transaction_id"])
    # Collapse exact repeats, as is already done for customers and accounts,
    # so downstream aggregates do not count the same transaction twice.
    .dropDuplicates()
)

# Loan records. Rows without a loan_id are discarded.
loans = (
    df.select(
        F.col("Loan ID").alias("loan_id"),
        F.col("Customer ID").alias("customer_id"),
        "Loan Amount",
        "Loan Type",
        "Interest Rate",
        "Loan Term",
        "Loan Status",
    )
    .dropna(subset=["loan_id"])
    # Collapse exact repeats, as is already done for customers and accounts,
    # so a loan that appears on several source rows is kept once.
    .dropDuplicates()
)

# Card records. Rows without a card_id are discarded.
cards = (
    df.select(
        F.col("CardID").alias("card_id"),
        F.col("Customer ID").alias("customer_id"),
        "Card Type",
        "Credit Limit",
        "Credit Card Balance",
        "Rewards Points",
    )
    .dropna(subset=["card_id"])
    # Collapse exact repeats, as is already done for customers and accounts,
    # so a card that appears on several source rows is kept once.
    .dropDuplicates()
)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Running total of "Transaction Amount" per customer, in date order.
# rowsBetween builds a row-based frame, so rows that share a
# "Transaction Date" need a tie-breaker: without one their relative order,
# and therefore each row's running total, is not deterministic.
# transaction_id is non-null here because rows lacking it are dropped when
# `transactions` is built.
# NOTE(review): the ordering is only chronological if "Transaction Date" was
# inferred as a date/timestamp rather than a string — confirm via printSchema.
win_cumsum = (
    Window
    .partitionBy("customer_id")
    .orderBy("Transaction Date", "transaction_id")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Add the per-customer running sum as a new column alongside each transaction.
txn_cumsum = transactions.withColumn(
    "cumulative_amount",
    F.sum("Transaction Amount").over(win_cumsum)
)

display(txn_cumsum)