# ------------------------------------------
# Silver Layer Notebook - Clean & Standardize
# Dataset: Library Borrowing System
# Author: TheDataLead AI
# ------------------------------------------

In [0]:
from pyspark.sql.functions import col, to_date, datediff, lit, coalesce, current_date
from pyspark.sql.types import *


In [0]:
# Declared shape of a books record: isbn is the only non-nullable field;
# every other field may be null.
# NOTE(review): not referenced by any other cell visible in this notebook.
books_schema = (
    StructType()
    .add("isbn", StringType(), nullable=False)
    .add("title", StringType())
    .add("author", StringType())
    .add("genre", StringType())
    .add("publish_date", DateType())
    .add("pages", IntegerType())
)

# Declared shape of a borrowing record: user_id is the only non-nullable
# field; the dates and the derived delay may be null.
# NOTE(review): not referenced by any other cell visible in this notebook.
borrowers_schema = (
    StructType()
    .add("user_id", StringType(), nullable=False)
    .add("name", StringType())
    .add("book_isbn", StringType())
    .add("borrow_date", DateType())
    .add("return_date", DateType())
    .add("return_delay_days", IntegerType())
)

# Declared shape of a staff record: staff_id is the only non-nullable field.
# NOTE(review): not referenced by any other cell visible in this notebook.
staff_schema = (
    StructType()
    .add("staff_id", StringType(), nullable=False)
    .add("name", StringType())
    .add("role", StringType())
    .add("hire_date", DateType())
)

In [0]:
# Load Bronze tables: one DataFrame per source table, looked up by name
# through the notebook's `spark` session.
books_bronze, borrowers_bronze, staff_bronze = (
    spark.table(bronze_name)
    for bronze_name in ("books_bronze", "borrowers_bronze", "staff_bronze")
)

In [0]:
# Clean books data: cast isbn to string and pages to int, parse publish_date
# with to_date, and pass title/author/genre through unchanged.
books_silver = books_bronze.select(
    col("isbn").cast("string"),
    "title",
    "author",
    "genre",
    to_date(col("publish_date")).alias("publish_date"),
    col("pages").cast("int"),
)

# Overwrite the books_silver Delta table; overwriteSchema lets this write
# replace the table's existing schema as well as its data.
(
    books_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .saveAsTable("books_silver")
)

In [0]:
%sql
-- Preview up to 10 rows of books_silver (no ordering is specified).
SELECT *
FROM books_silver
LIMIT 10

In [0]:
%sql
-- Build borrowers_silver from borrowers_bronze with defaults for missing values:
--   user_id / book_isbn -> 'unknown', name -> 'anonymous',
--   borrow_date / return_date -> 2000-01-01.
-- return_delay_days is the day difference between the two normalized dates.
-- NOTE(review): because a missing return_date becomes 2000-01-01, a row with a
-- later borrow_date gets a negative return_delay_days. The next Python cell
-- also writes borrowers_silver (with different defaults) and overwrites this
-- result when the notebook is run top to bottom.
CREATE OR REPLACE TABLE borrowers_silver
USING DELTA
AS
WITH normalized AS (
  SELECT
    COALESCE(user_id, 'unknown') AS user_id,
    COALESCE(name, 'anonymous') AS name,
    COALESCE(book_isbn, 'unknown') AS book_isbn,
    TO_DATE(COALESCE(borrow_date, '2000-01-01')) AS borrow_date,
    TO_DATE(COALESCE(return_date, '2000-01-01')) AS return_date
  FROM borrowers_bronze
)
SELECT
  user_id,
  name,
  book_isbn,
  borrow_date,
  return_date,
  DATEDIFF(return_date, borrow_date) AS return_delay_days
FROM normalized;

In [0]:
# Clean borrowers data
#
# Both loan dates are normalized once and those same expressions feed the
# stored columns AND the derived delay, so that
# return_delay_days == datediff(return_date, borrow_date) holds for every row.
# Previously borrow_date was written raw (not parsed with to_date and not
# defaulted) while the delay was computed from a defaulted borrow_date, so the
# stored dates could not reproduce the stored delay.
#
# Defaults for missing values:
#   borrow_date -> 2000-01-01 (same sentinel the SQL version of this table uses)
#   return_date -> current_date(), i.e. a loan with no return date is measured
#                  up to the day the notebook runs.
# NOTE(review): the 2000-01-01 sentinel inflates return_delay_days for rows
# with no borrow_date; confirm downstream consumers expect that rather than NULL.
borrow_date_clean = to_date(coalesce(col("borrow_date"), lit("2000-01-01")))
return_date_clean = to_date(coalesce(col("return_date"), current_date()))

borrowers_silver = borrowers_bronze.select(
    col("user_id"),
    col("name"),
    col("book_isbn"),
    borrow_date_clean.alias("borrow_date"),
    return_date_clean.alias("return_date"),
    datediff(return_date_clean, borrow_date_clean).alias("return_delay_days"),
)

# Overwrite the borrowers_silver Delta table; overwriteSchema lets this write
# replace the table's existing schema as well as its data.
borrowers_silver.write.mode("overwrite").format("delta").option("overwriteSchema", True).saveAsTable("borrowers_silver")

In [0]:
%sql
-- Preview up to 10 rows of borrowers_silver (no ordering is specified).
SELECT *
FROM borrowers_silver
LIMIT 10

In [0]:
# Clean staff data: cast staff_id to string, parse hire_date with to_date,
# and pass name/role through unchanged.
staff_silver = staff_bronze.select(
    col("staff_id").cast("string"),
    "name",
    "role",
    to_date(col("hire_date")).alias("hire_date"),
)

# Overwrite the staff_silver Delta table; overwriteSchema lets this write
# replace the table's existing schema as well as its data.
(
    staff_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .saveAsTable("staff_silver")
)

In [0]:
%sql
-- Preview up to 10 rows of staff_silver (no ordering is specified).
SELECT *
FROM staff_silver
LIMIT 10