**BRONZE LAYER**

DATA INGESTION

In [0]:
# Bronze Layer: Raw Data Ingestion
# Purpose:
# - Load raw CSV files into Spark
# - Apply explicit schemas
# - Validate identifiers and basic data sanity
# - No cleaning or transforming of data at this stage

from pyspark.sql.types import (
    StructType, StructField,
    StringType, IntegerType, DoubleType
)
from pyspark.sql.functions import col

In [0]:
# These paths point to the raw datasets stored in Databricks Volumes.
# All three files share one directory, so it is defined once: change
# RAW_DATA_DIR to point the notebook at a different location.
RAW_DATA_DIR = "/Volumes/workspace/default/capstone"

SCHOOL_MASTER_PATH = RAW_DATA_DIR + "/school_master.csv"
ENROLLMENT_PATH    = RAW_DATA_DIR + "/student_enrollment.csv"
PERFORMANCE_PATH   = RAW_DATA_DIR + "/student_performance.csv"

In [0]:
# Explicit schema for the school master reference file: five string
# attributes followed by an integer capacity, all declared non-nullable.
# NOTE(review): nullable=False may not be enforced when Spark reads CSV
# files -- confirm before relying on it as a data-quality guarantee.
school_master_schema = (
    StructType()
    .add("school_id", StringType(), False)
    .add("school_name", StringType(), False)
    .add("region", StringType(), False)
    .add("district", StringType(), False)
    .add("school_type", StringType(), False)
    .add("capacity", IntegerType(), False)
)

# Explicit schema for the enrollment fact file. Every column is nullable.
# NOTE(review): grade_level is a double here but an integer in
# performance_schema, and student_count is also a double -- presumably to
# match how the raw file encodes these values; confirm against the data.
enrollment_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("school_id", StringType()),
        ("academic_year", IntegerType()),
        ("grade_level", DoubleType()),
        ("gender", StringType()),
        ("student_count", DoubleType()),
    )
])

# Explicit schema for the student performance fact file. The two key
# columns (school_id, academic_year) are declared non-nullable; the
# measure columns are nullable.
performance_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("school_id", StringType(), False),
        ("academic_year", IntegerType(), False),
        ("grade_level", IntegerType(), True),
        ("average_score", DoubleType(), True),
        ("attendance_percentage", DoubleType(), True),
        ("dropout_risk_flag", IntegerType(), True),
    )
])

In [0]:
def _read_bronze_csv(path, schema):
    """Read the CSV file at `path` into a DataFrame.

    The first line of the file is treated as a header, and `schema` is
    applied explicitly instead of letting Spark infer column types.
    """
    reader = spark.read.format("csv").option("header", "true")
    return reader.schema(schema).load(path)


# One DataFrame per raw source, each read with its own explicit schema.
df_school_master = _read_bronze_csv(SCHOOL_MASTER_PATH, school_master_schema)
df_enrollment = _read_bronze_csv(ENROLLMENT_PATH, enrollment_schema)
df_performance = _read_bronze_csv(PERFORMANCE_PATH, performance_schema)

In [0]:
# Persist each raw DataFrame as a Delta table, replacing any existing
# contents of the table with the same name.
for bronze_df, bronze_table in (
    (df_school_master, "bronze_school_master"),
    (df_enrollment, "bronze_student_enrollment"),
    (df_performance, "bronze_student_performance"),
):
    delta_writer = bronze_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(bronze_table)


In [0]:
# Row count checks help catch empty or corrupted files.
# Each source's row count is printed, then the run is stopped if any source
# produced zero rows, so an empty input cannot pass unnoticed.
empty_sources = []
for source_name, bronze_df in (
    ("School Master", df_school_master),
    ("Enrollment", df_enrollment),
    ("Performance", df_performance),
):
    row_count = bronze_df.count()
    print(source_name + " Records:", row_count)
    if row_count == 0:
        empty_sources.append(source_name)

# Raise only after all counts are printed so the full picture is visible.
if empty_sources:
    raise ValueError(
        "No rows were loaded for: " + ", ".join(empty_sources)
    )

School Master Records: 120
Enrollment Records: 7000
Performance Records: 7000
