#### <u>**1. Gold Layer**</u>



The gold layer is intended to provide data that is ready for business intelligence (BI) analysis.

First, the necessary libraries/functions are imported.

In [None]:
from pyspark.sql.functions import col, year, quarter, month, dayofmonth, dayofweek, date_format, to_date, lit
from pyspark.sql.types import IntegerType, StringType

Then it is checked whether the `time_pipeline` variable has been provided by the pipeline canvas.

In [None]:
# Fail fast when the pipeline did not hand over a usable time_pipeline value;
# otherwise normalise it to a string for the comparisons made further down.
if time_pipeline:
    time_pipeline = str(time_pipeline)
else:
    raise ValueError("Error: time_pipeline variable is empty or not provided by the pipeline.")

The silver schema is checked to make sure that it contains at least one table.

In [None]:
# List the names of all tables in the silver schema.
# Only the catalog call sits inside the try block, so a real access failure is
# reported as such (with the original exception chained as the cause), while an
# empty schema surfaces as its own ValueError instead of being re-wrapped as an
# "Error accessing silver schema" message.
try:
    all_tables = [t.name for t in spark.catalog.listTables("silver")]
except Exception as e:
    raise Exception(f"Error accessing silver schema: {e}") from e

if not all_tables:
    raise ValueError("Error: No tables found in the silver schema.")


#### 2. Preparation of tables

In order to prepare the time dimension, you need to retrieve the dates from the silver tables.

In [None]:
# Gather the distinct calendar dates of every silver table that has a
# created_at column and stack them into a single DataFrame named `dates`.
dates = None
for table_name in all_tables:
    df = spark.read.table(f"silver.{table_name}")
    if "created_at" not in df.columns:
        # Tables without a creation timestamp contribute no dates.
        continue
    dates_df = df.select(to_date(col("created_at")).alias("date")).distinct()
    dates = dates_df if dates is None else dates.union(dates_df)

# No silver table exposed a created_at column.
if dates is None:
    raise ValueError("Error: No dates found in the silver tables.")

Then the appropriate DataFrame is prepared and saved to the corresponding table - the time dimension.

In [None]:

# Build the time dimension: one row per calendar date found in the silver tables.
# Every attribute is derived from the date itself and processed_at is a constant
# for this run, so de-duplicating on date_id alone is sufficient (a separate
# distinct() pass over all columns would only add a redundant shuffle).
dim_time = dates.select(
    col("date").alias("date_id"),
    year(col("date")).alias("year"),
    quarter(col("date")).alias("quarter"),
    month(col("date")).alias("month"),
    dayofmonth(col("date")).alias("day"),
    dayofweek(col("date")).alias("day_of_week"),
    date_format(col("date"), "EEEE").alias("day_name"),
    lit(time_pipeline).alias("processed_at"),
).dropDuplicates(["date_id"])

# `dates` is collected from the complete silver tables on every run (it is not
# filtered by processed_at), so appending it as-is would re-insert every date
# that an earlier run already loaded. Keep only the dates that are not yet
# stored in gold.dim_time; on the very first run the table does not exist and
# everything is written.
dim_time_exists = any(
    t.name == "dim_time" and not t.isTemporary
    for t in spark.catalog.listTables("gold")
)
if dim_time_exists:
    existing_dates = spark.read.table("gold.dim_time").select(
        col("date_id").alias("existing_date_id")
    )
    # Null-safe comparison: a NULL date_id that is already stored also counts
    # as loaded, so it is not appended again either.
    dim_time = dim_time.join(
        existing_dates,
        col("date_id").eqNullSafe(col("existing_date_id")),
        "left_anti",
    )

dim_time.write.mode("append").saveAsTable("gold.dim_time")

Then a dimension describing customers is created.

In [None]:
# Customers whose processed_at matches the current pipeline run.
df_customers = spark.read.table("silver.customers").filter(
    col("processed_at") == time_pipeline
)

# Rename the key columns and reduce created_at to a date.
customer_id = col("CustomerID").alias("customer_id")
country_id = col("CountryID").alias("country_id")
created_at_date = to_date(col("created_at")).alias("created_at_date")

dim_customers = df_customers.select(
    customer_id,
    country_id,
    created_at_date,
    col("processed_at"),
)
dim_customers.write.mode("append").saveAsTable("gold.dim_customers")

The dimension of diamonds is created.

In [None]:
# Diamonds whose processed_at matches the current pipeline run.
df_diamonds = spark.read.table("silver.diamonds").filter(
    col("processed_at") == time_pipeline
)

# Attributes copied into the dimension under their silver column names.
diamond_attributes = [
    "carat", "cut", "color", "clarity", "depth",
    "table", "price", "x", "y", "z",
]

dim_diamonds = df_diamonds.select(
    col("DiamondID").alias("diamond_id"),
    *[col(attribute) for attribute in diamond_attributes],
    to_date(col("created_at")).alias("created_at_date"),
    col("processed_at"),
)
dim_diamonds.write.mode("append").saveAsTable("gold.dim_diamonds")

A country dimension is created.

In [None]:
# Countries whose processed_at matches the current pipeline run.
df_countries = spark.read.table("silver.countries").filter(
    col("processed_at") == time_pipeline
)

# Columns of the country dimension, in output order.
country_columns = [
    col("CountryID").alias("country_id"),
    col("country"),
    col("region"),
    col("GNI_per_capita"),
    to_date(col("created_at")).alias("created_at_date"),
    col("processed_at"),
]

dim_countries = df_countries.select(*country_columns)
dim_countries.write.mode("append").saveAsTable("gold.dim_countries")

The transactions fact table is created.

In [None]:
# Transactions of the current pipeline run; diamonds are read without the
# processed_at filter because they are only used to look up the price.
df_transactions = spark.read.table("silver.transactions").filter(
    col("processed_at") == time_pipeline
)
df_diamonds = spark.read.table("silver.diamonds")

# Aliases keep the two DiamondID columns apart in the join and the projection.
transactions = df_transactions.alias("t")
diamond_prices = df_diamonds.select("DiamondID", "price").alias("d")

# Inner join: a transaction without a matching diamond is not carried over.
priced_transactions = transactions.join(
    diamond_prices,
    col("t.DiamondID") == col("d.DiamondID"),
    "inner",
)

fact_transactions = priced_transactions.select(
    col("t.TransactionID").alias("transaction_id"),
    col("t.CustomerID").alias("customer_id"),
    col("t.DiamondID").alias("diamond_id"),
    col("t.Quantity").alias("quantity"),
    # Line total = quantity bought times the diamond's price.
    (col("t.Quantity") * col("d.price")).alias("total_price"),
    to_date(col("t.created_at")).alias("date_id"),
    col("t.processed_at"),
)

fact_transactions.write.mode("append").saveAsTable("gold.fact_transactions")

The results are checked.

In [None]:
# Print the row count of each gold table as a quick sanity check.
gold_tables = [
    "dim_time",
    "dim_customers",
    "dim_diamonds",
    "dim_countries",
    "fact_transactions",
]
for table in gold_tables:
    gold_df = spark.read.table(f"gold.{table}")
    count = gold_df.count()
    print(f"Row count for gold.{table}: {count}")