In [0]:
# The Medallion Architecture is a data architecture pattern that organizes data into three layers: Bronze, Silver, and Gold.
# Each layer serves a specific purpose and helps in managing and processing data efficiently.

# Bronze Layer:
# - Raw data ingestion layer.
# - Stores data in its raw, unprocessed form.
# - Data is ingested from various sources like databases, APIs, and files.
# - Schema-on-read is often used, meaning the schema is applied when the data is read.

# Silver Layer:
# - Cleansed and enriched data layer.
# - Data is transformed, cleaned, and enriched.
# - Schema-on-write is often used, meaning the schema is applied when the data is written.
# - Data quality checks and transformations are applied to make the data more usable.

# Gold Layer:
# - Business-level aggregates and analytics layer.
# - Data is aggregated, summarized, and optimized for analytics and reporting.
# - Data is ready for consumption by business users and applications.
# - Often used for creating dashboards, reports, and machine learning models.

# Example code to demonstrate the Medallion Architecture using Spark DataFrames:

# Bronze Layer: Ingest raw data
# Spark APIs address DBFS with the `dbfs:/` scheme. The `/dbfs/...` form is the
# local FUSE mount intended for non-Spark file APIs; passing it to spark.read
# makes Spark look under dbfs:/dbfs/... instead of the mounted location.
# No schema or inferSchema option is set, so every CSV column is loaded as a string.
bronze_df = (
    spark.read.format("csv")
    .option("header", "true")
    .load("dbfs:/mnt/data/raw/data.csv")
)
bronze_df.write.format("delta").mode("overwrite").save("/path/to/bronze/table")

# Silver Layer: Cleanse and enrich data
# Re-read from the Delta table so this stage depends on what was persisted,
# not on the in-memory CSV frame.
bronze_df = spark.read.format("delta").load("/path/to/bronze/table")
silver_df = (
    bronze_df
    .filter("some_column IS NOT NULL")
    # existing_column comes from CSV as a string; the multiplication relies on
    # Spark's implicit numeric cast.
    .withColumn("new_column", bronze_df["existing_column"] * 2)
)
silver_df.write.format("delta").mode("overwrite").save("/path/to/silver/table")

# Gold Layer: Aggregate and summarize data
silver_df = spark.read.format("delta").load("/path/to/silver/table")
gold_df = (
    silver_df
    .groupBy("group_column")
    .agg({"value_column": "sum"})
    # The dict form of agg names its output "sum(value_column)"; give it a stable name.
    .withColumnRenamed("sum(value_column)", "total_value")
)
gold_df.write.format("delta").mode("overwrite").save("/path/to/gold/table")

# Display the Gold Layer DataFrame
display(gold_df)

In [0]:
# Medallion pipeline over a small in-memory sample, persisted at each layer.
# NOTE(review): path_to_save_bronze_df, path_to_save_silver_df and
# path_to_save_gold_df are not defined in the visible part of this notebook;
# they must be assigned before this cell runs.

# Bronze Layer: sample rows as (group_column, value_column, existing_column),
# including nulls so the cleansing step has something to remove.
data = [
    ("A", 10, 1),
    ("B", 20, 2),
    ("A", 15, None),
    ("C", None, 3),
    ("B", 25, 4)
]
columns = ["group_column", "value_column", "existing_column"]
bronze_df = spark.createDataFrame(data, columns)
bronze_df.write.format("delta").mode("overwrite").save(path_to_save_bronze_df)

# Silver Layer: drop rows with a null existing_column and derive new_column.
# Columns are referenced through the DataFrame instead of `col(...)`, so this
# cell does not depend on an import that only happens in a later cell.
bronze_df = spark.read.format("delta").load(path_to_save_bronze_df)
silver_df = (
    bronze_df
    .filter(bronze_df["existing_column"].isNotNull())
    .withColumn("new_column", bronze_df["existing_column"] * 2)
)
silver_df.write.format("delta").mode("overwrite").save(path_to_save_silver_df)

# Gold Layer: total value_column per group.
# Read back from the same variable the silver table was written to (the
# original used `Path_to_save_silver_df`, a differently-cased, undefined name).
silver_df = spark.read.format("delta").load(path_to_save_silver_df)
gold_df = (
    silver_df
    .groupBy("group_column")
    .agg({"value_column": "sum"})
    .withColumnRenamed("sum(value_column)", "total_value")
)
gold_df.write.format("delta").mode("overwrite").save(path_to_save_gold_df)



In [0]:
from pyspark.sql.functions import col

# One constant per layer so each table is written to and read from the same
# location. Previously bronze was written under /Workspace/... but read back
# from /tmp/bronze_table, so the silver step never saw the data just written.
BRONZE_PATH = "/Workspace/Users/vikky.vinodh.25@gmail.com/bronze_table"
SILVER_PATH = "/Workspace/Users/vikky.vinodh.25@gmail.com/silver_table"
GOLD_PATH = "/Workspace/Users/vikky.vinodh.25@gmail.com/gold_table"

# Bronze Layer: Ingest raw data (example data)
# Rows are (group_column, value_column, existing_column); the nulls are
# deliberate so the silver cleansing step has rows to remove.
data = [
    ("A", 10, 1),
    ("B", 20, 2),
    ("A", 15, None),
    ("C", None, 3),
    ("B", 25, 4)
]
columns = ["group_column", "value_column", "existing_column"]
bronze_df = spark.createDataFrame(data, columns)
bronze_df.write.format("delta").mode("overwrite").save(BRONZE_PATH)

# Silver Layer: Cleanse and enrich data
bronze_df = spark.read.format("delta").load(BRONZE_PATH)
silver_df = (
    bronze_df
    # Only existing_column is checked, so the row with a null value_column
    # (group "C") is kept.
    .filter(col("existing_column").isNotNull())
    .withColumn("new_column", col("existing_column") * 2)
)
silver_df.write.format("delta").mode("overwrite").save(SILVER_PATH)

# Gold Layer: Aggregate and summarize data
silver_df = spark.read.format("delta").load(SILVER_PATH)
gold_df = (
    silver_df
    .groupBy("group_column")
    .agg({"value_column": "sum"})
    # The dict form of agg names its output "sum(value_column)"; give it a stable name.
    .withColumnRenamed("sum(value_column)", "total_value")
)
gold_df.write.format("delta").mode("overwrite").save(GOLD_PATH)

display(gold_df)