In [0]:
from pyspark.sql import functions as F
# Load the banking CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read.format("csv")
        .option("inferSchema", "true")
        .option("header", "true")
        .load("/Volumes/raw-data/banking/csv/Banking_Database.csv")
)

df.printSchema()
# Preview a 5-row sample; limit() caps the rows before they are rendered.
df.limit(5).display()

In [0]:
from pyspark.sql.functions import col

# Projection: keep only "First Name", exposed under the shorter label "Name".
df.select(col("First Name").alias("Name")).display()

# Row filter: keep rows whose Age is strictly greater than 30.
df.where(col("Age") > 30).display()

In [0]:
# Derive "Balance_K": the account balance divided by 1000.
balance_in_thousands = F.col("Account Balance") / 1000
df2 = df.withColumn("Balance_K", balance_in_thousands)
display(df2)

In [0]:
# Bucket each row into a risk tier by balance. Branches are evaluated top-down,
# so "Medium" covers balances from 10000 up to (but not including) 50000.
# NOTE(review): a NULL balance satisfies neither comparison and therefore ends
# up in the "Low" tier -- confirm that is intended.
balance = F.col("Account Balance")
risk_flag = (
    F.when(balance < 10000, "High")
     .when(balance < 50000, "Medium")
     .otherwise("Low")
)
display(df.withColumn("Risk_Flag", risk_flag))

In [0]:
# Two alternative treatments of missing values; neither one modifies df itself.
# 1) Discard rows whose "Account Balance" is null.
display(df.dropna(subset=["Account Balance"]))
# 2) Keep every row, substituting 0 for null "Account Balance" / "Age".
display(df.fillna({"Account Balance": 0, "Age": 0}))

In [0]:
# Per-"Account Type" summary: row count, mean balance and total balance.
account_type_metrics = [
    F.count("*").alias("Total_Count"),
    F.avg("Account Balance").alias("Avg_Balance"),
    F.sum("Account Balance").alias("Total_Balance"),
]
display(df.groupBy("Account Type").agg(*account_type_metrics))

In [0]:
from pyspark.sql.window import Window

# Window over each City, ordered with the highest "Account Balance" first.
# NOTE(review): the output column is called "Rank_in_Branch" but the partition
# key is "City" -- confirm which grouping is intended.
win = (
    Window
    .partitionBy("City")
    .orderBy(F.col("Account Balance").desc())
)

# rank() leaves gaps after ties (1, 1, 3, ...).
df_with_rank = df.withColumn("Rank_in_Branch", F.rank().over(win))

df_with_rank.display()

In [0]:
# String helpers on "Address": an upper-cased copy and its first 5 characters
# (substring positions are 1-based).
address = F.col("Address")
df.select(
    address,
    F.upper(address).alias("Addr_Upper"),
    F.substring(address, 1, 5).alias("Short_Addr"),
).display()

In [0]:
# Summary statistics for the age and balance columns, printed as a text table.
# The balance column is named "Account Balance" in every other cell of this
# notebook; no plain "Balance" column is referenced anywhere else.
df.describe("Age", "Account Balance").show()

In [0]:
# Persist df as Parquet, replacing whatever already exists at the target path.
(
    df.write
      .format("parquet")
      .mode("overwrite")
      .save("/tmp/banking_output")
)

In [0]:
from pyspark.sql import functions as F
# Load the banking CSV: the header row provides column names and the column
# types are inferred from the data.
df = (
    spark.read.format("csv")
        .option("inferSchema", "true")
        .option("header", "true")
        .load("/Volumes/raw-data/banking/csv/Banking_Database.csv")
)

df.printSchema()
# Show a 5-row preview; limit() bounds the row count before rendering.
df.limit(5).display()

In [0]:
from pyspark.sql.functions import col

# Single-column projection: "First Name" relabelled as "Name".
df.select(col("First Name").alias("Name")).display()

# Keep only the rows where Age is strictly above 30.
df.where(col("Age") > 30).display()

2️⃣ Create New Columns

In [0]:
# New column "Balance_K": "Account Balance" divided by 1000.
balance_in_thousands = F.col("Account Balance") / 1000
df2 = df.withColumn("Balance_K", balance_in_thousands)
display(df2)

In [0]:
# Assign a risk tier from the balance. The conditions are tested in order, so
# "Medium" means 10000 <= balance < 50000 and everything else is "Low".
# NOTE(review): NULL balances match neither condition and are labelled "Low" --
# confirm that is the desired outcome.
balance = F.col("Account Balance")
risk_flag = (
    F.when(balance < 10000, "High")
     .when(balance < 50000, "Medium")
     .otherwise("Low")
)
display(df.withColumn("Risk_Flag", risk_flag))

In [0]:
# Missing-value handling, shown two ways; df itself is left untouched.
# Option A: drop the rows that have no "Account Balance".
display(df.dropna(subset=["Account Balance"]))
# Option B: retain all rows and replace null "Account Balance" / "Age" with 0.
display(df.fillna({"Account Balance": 0, "Age": 0}))

In [0]:
# Aggregate per "Account Type": number of rows, average balance, summed balance.
account_type_metrics = [
    F.count("*").alias("Total_Count"),
    F.avg("Account Balance").alias("Avg_Balance"),
    F.sum("Account Balance").alias("Total_Balance"),
]
display(df.groupBy("Account Type").agg(*account_type_metrics))

In [0]:
from pyspark.sql.window import Window

# One window per City, sorted by "Account Balance" from largest to smallest.
# NOTE(review): the result column is named "Rank_in_Branch" although the
# partition key is "City" -- confirm the intended grouping.
win = (
    Window
    .partitionBy("City")
    .orderBy(F.col("Account Balance").desc())
)

# rank() gives tied rows the same rank and skips the following positions.
df_with_rank = df.withColumn("Rank_in_Branch", F.rank().over(win))

df_with_rank.display()

In [0]:
# Show "Address" next to an upper-cased version and a 5-character prefix
# (substring is 1-based: start at position 1, take 5 characters).
address = F.col("Address")
df.select(
    address,
    F.upper(address).alias("Addr_Upper"),
    F.substring(address, 1, 5).alias("Short_Addr"),
).display()

In [0]:
# Parse "Transaction Date" with the yyyy-MM-dd pattern into "Txn_Date", then
# break out its year and month.
# Note: this rebinds df2, which an earlier cell used for the Balance_K frame.
txn_date = F.to_date(F.col("Transaction Date"), "yyyy-MM-dd")
df2 = df.withColumn("Txn_Date", txn_date)
df2.select(
    "Txn_Date",
    F.year("Txn_Date").alias("Year"),
    F.month("Txn_Date").alias("Month"),
).display()

In [0]:
# Summary statistics for the age and balance columns, printed as a text table.
stat_columns = ["Age", "Account Balance"]
df.describe(stat_columns).show()

In [0]:
# Write df out in Parquet format; "overwrite" replaces any existing output at the path.
(
    df.write
      .format("parquet")
      .mode("overwrite")
      .save("/tmp/banking_output")
)