# Data Cleaning

In this section, we clean the credit scoring dataset from the bronze schema and save the results to the silver schema. The steps include trimming stray whitespace, standardizing the case and labels of categorical columns, and converting amount columns to a numeric type. Data cleaning is a crucial step in the data preprocessing pipeline because it ensures that the data is accurate and ready for analysis and modeling.

*Note: Adjust the catalog and schema parameters below as needed to match your Databricks environment setup.*

In [0]:
# Unity Catalog location used throughout this notebook.
CATALOG = "workspace"

# Schema holding the raw input tables that are read below.
BRONZE_SCHEMA = "bronze"

# Schema that receives the cleaned output tables.
SILVER_SCHEMA = "silver"

In [0]:
# Make sure the target (silver) schema exists before any table is written to it.
create_schema_sql = f"""
    CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SILVER_SCHEMA}
    """
display(spark.sql(create_schema_sql))


# Clean the data in Bronze Schema


## Clean Applicants Data

In [0]:
# Clean applicants data: tidy the name, gender, and monthly-income columns of
# the raw applicants table and expose the result as `silver_applicants_df`.
from pyspark.sql.functions import col, lower, regexp_replace, trim, when

applicants_df = spark.table(f"{CATALOG}.{BRONZE_SCHEMA}.raw_applicants")
applicants_df.limit(5).display()

# Strip leading/trailing whitespace from the full-name column.
applicants_df = applicants_df.withColumn("nama_lengkap", trim("nama_lengkap"))

# Trim and lower-case jenis_kelamin so that padded or mixed-case variants
# (e.g. " L", "Pria ") match the label mapping below instead of passing
# through unnormalized.
applicants_df = applicants_df.withColumn(
    "jenis_kelamin", lower(trim("jenis_kelamin"))
)

# Collapse the known gender spellings to "m" / "f"; any other value is kept
# unchanged.
# NOTE(review): "p" is mapped to "m" here, but "p" is also a common
# abbreviation of "perempuan" (female) — confirm against the raw data.
applicants_df = applicants_df.withColumn(
    "jenis_kelamin",
    when(col("jenis_kelamin").isin("male", "p", "pria", "laki-laki", "l"), "m")
    .when(col("jenis_kelamin").isin("perempuan", "female", "wanita"), "f")
    .otherwise(col("jenis_kelamin")),
)

# Keep only the digits 0-9 of pendapatan_bulanan and cast the result to int.
# NOTE(review): every non-digit is removed, including "." and ",", so a value
# such as "1.500,50" becomes 150050, and a sign is dropped as well. Values with
# no digits become an empty string, which casts to NULL (or raises when Spark
# ANSI mode is enabled); the same applies to values that exceed the 32-bit int
# range. Confirm the raw format makes this safe.
applicants_df = applicants_df.withColumn(
    "pendapatan_bulanan",
    regexp_replace(col("pendapatan_bulanan"), "[^0-9]", "").cast("int"),
)

silver_applicants_df = applicants_df
silver_applicants_df.limit(5).display()



## Clean Loans Data

In [0]:
# Clean loans data: tidy the loan-purpose, approval-status, and loan-amount
# columns of the raw loans table and expose the result as `silver_loans_df`.
from pyspark.sql.functions import col, lower, regexp_replace, trim, when

loans_df = spark.table(f"{CATALOG}.{BRONZE_SCHEMA}.raw_loans")
loans_df.limit(5).display()

# Strip leading/trailing whitespace from the loan-purpose column.
loans_df = loans_df.withColumn("tujuan_pinjaman", trim("tujuan_pinjaman"))

# Trim and lower-case status_persetujuan so that padded or mixed-case variants
# (e.g. "Disetujui ") match the label mapping below instead of passing through
# unnormalized.
loans_df = loans_df.withColumn(
    "status_persetujuan", lower(trim("status_persetujuan"))
)

# Translate the status labels: "disetujui" -> "approved", "ditolak" ->
# "rejected", "dalam proses" -> "pending"; any other value is kept unchanged.
loans_df = loans_df.withColumn(
    "status_persetujuan",
    when(col("status_persetujuan").isin("disetujui"), "approved")
    .when(col("status_persetujuan").isin("ditolak"), "rejected")
    .when(col("status_persetujuan").isin("dalam proses"), "pending")
    .otherwise(col("status_persetujuan")),
)

# Keep only the digits 0-9 of jumlah_pinjaman and cast the result to int.
# NOTE(review): every non-digit is removed, including "." and ",", so a value
# such as "1.500,50" becomes 150050. Values with no digits become an empty
# string, which casts to NULL (or raises when Spark ANSI mode is enabled); the
# same applies to amounts above 2,147,483,647, the 32-bit int limit. Consider
# a wider type if loan amounts can exceed that — confirm with downstream users.
loans_df = loans_df.withColumn(
    "jumlah_pinjaman",
    regexp_replace(col("jumlah_pinjaman"), "[^0-9]", "").cast("int"),
)

silver_loans_df = loans_df
silver_loans_df.limit(5).display()



## Clean Repayment Data

In [0]:
# Clean repayment data: tidy the payment-status column and the two amount
# columns of the raw repayments table and expose the result as
# `silver_repayments_df`.
from pyspark.sql.functions import col, lower, regexp_replace, trim, when

repayments_df = spark.table(f"{CATALOG}.{BRONZE_SCHEMA}.raw_repayments")
repayments_df.limit(5).display()

# Trim and lower-case status_pembayaran so that padded or mixed-case variants
# (e.g. "Tepat Waktu ") match the label mapping below instead of passing
# through unnormalized.
repayments_df = repayments_df.withColumn(
    "status_pembayaran", lower(trim("status_pembayaran"))
)

# Translate the status labels: "tepat waktu" -> "on time", "gagal bayar" ->
# "default", "terlambat" -> "late"; any other value is kept unchanged.
repayments_df = repayments_df.withColumn(
    "status_pembayaran",
    when(col("status_pembayaran").isin("tepat waktu"), "on time")
    .when(col("status_pembayaran").isin("gagal bayar"), "default")
    .when(col("status_pembayaran").isin("terlambat"), "late")
    .otherwise(col("status_pembayaran")),
)

# Keep only the digits 0-9 of each amount column and cast the result to int.
# NOTE(review): every non-digit is removed, including "." and ",", so a value
# such as "1.500,50" becomes 150050. Values with no digits become an empty
# string, which casts to NULL (or raises when Spark ANSI mode is enabled); the
# same applies to values that exceed the 32-bit int range.
for amount_column in ("jumlah_angsuran", "jumlah_dibayar"):
    repayments_df = repayments_df.withColumn(
        amount_column,
        regexp_replace(col(amount_column), "[^0-9]", "").cast("int"),
    )

silver_repayments_df = repayments_df
silver_repayments_df.limit(5).display()


In [0]:
# Persist each cleaned DataFrame as a table in the silver schema, replacing
# any existing table of the same name. Tables are written in the order listed.
silver_outputs = (
    ("applicants", silver_applicants_df),
    ("loans", silver_loans_df),
    ("repayments", silver_repayments_df),
)
for table_name, cleaned_df in silver_outputs:
    cleaned_df.write.mode("overwrite").saveAsTable(
        f"{CATALOG}.{SILVER_SCHEMA}.{table_name}"
    )