In [0]:
from pyspark.sql.functions import col, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, FloatType, DateType

In [0]:
# Reset the bronze layer: drop every table that the cells below rebuild.
# The names are kept in one ordered list so adding or removing a table is a
# one-line change; the SQL issued is the same statement per table as before.
bronze_tables_to_drop = (
    "cpg_consumer",
    "cpg_consumer_order",
    "cpg_order_items",
    "cpg_consumer_invoice",
    "cpg_distributor",
    "cpg_distributor_purchases",
    "cpg_distributor_purchase_items",
    "cpg_distributor_invoice",
    "cpg_product",
    "cpg_inventory",
)

for bronze_table in bronze_tables_to_drop:
    # IF EXISTS keeps the statement from failing when the table is absent.
    spark.sql(f"DROP TABLE IF EXISTS cpg_industry.bronze.{bronze_table}")


In [0]:

# Explicit schema applied when reading the cpg_consumer JSON feed.
cpg_consumer_schema = (
    StructType()
    .add("consumer_id", StringType())
    .add("name", StringType())
    .add("email", StringType())
    .add("phone", StringType())
    .add("gender", StringType())
    .add("age", IntegerType())
    .add("registration_date", DateType())
    .add("is_active", BooleanType())
    .add("address", StringType())
    .add("city", StringType())
    .add("state", StringType())
    .add("country", StringType())
)

# Source file on the volume for this load.
consumer_json_path = "dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_consumer/cpg_consumer.json"

# Configure the reader with the schema above.
# NOTE(review): "mergeSchema" is presumably a no-op for a JSON read with an
# explicit schema — confirm and drop it if so.
consumer_reader = spark.read.schema(cpg_consumer_schema).option("mergeSchema", "true")
consumer_json = consumer_reader.json(consumer_json_path)

# Add lineage columns: the originating file (taken from _metadata.file_path
# rather than input_file_name()) and the value of current_timestamp().
df_raw = (
    consumer_json
    .withColumn("source_file", col("_metadata.file_path"))
    .withColumn("ingestion_time", current_timestamp())
)

# Overwrite the bronze Delta table with this load.
consumer_writer = df_raw.write.format("delta").mode("overwrite")
consumer_writer.saveAsTable("cpg_industry.bronze.cpg_consumer")


In [0]:


# Explicit schema applied when reading the cpg_consumer_invoice JSON feed.
# NOTE(review): the amount columns are FloatType (single precision) — confirm
# that is precise enough for monetary values.
cpg_consumer_invoice_schema = (
    StructType()
    .add("invoice_id", StringType())
    .add("order_id", StringType())
    .add("consumer_id", StringType())
    .add("invoice_date", DateType())
    .add("total_items", IntegerType())
    .add("gross_amount", FloatType())
    .add("discount_amount", FloatType())
    .add("tax_amount", FloatType())
    .add("net_amount", FloatType())
    .add("payment_method", StringType())
    .add("invoice_status", StringType())
)

# Source file on the volume for this load.
invoice_json_path = "dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_consumer/cpg_consumer_invoice.json"

# Read with the schema above, then add an ingestion_time column from
# current_timestamp().
invoice_reader = spark.read.schema(cpg_consumer_invoice_schema)
invoice_json = invoice_reader.json(invoice_json_path)
df_raw = invoice_json.withColumn("ingestion_time", current_timestamp())

# Overwrite the bronze Delta table with this load.
invoice_writer = df_raw.write.format("delta").mode("overwrite")
invoice_writer.saveAsTable("cpg_industry.bronze.cpg_consumer_invoice")


In [0]:


# Explicit schema applied when reading the cpg_consumer_order JSON feed.
cpg_consumer_order_schema = (
    StructType()
    .add("order_id", StringType())
    .add("consumer_id", StringType())
    .add("order_date", DateType())
    .add("shipping_address", StringType())
    .add("billing_address", StringType())
    .add("order_status", StringType())
    .add("total_amount", FloatType())
    .add("payment_method", StringType())
    .add("currency", StringType())
    .add("channel", StringType())
)

# Source file on the volume for this load.
order_json_path = "dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_consumer/cpg_consumer_order.json"

# Read with the schema above, then add an ingestion_time column from
# current_timestamp().
order_reader = spark.read.schema(cpg_consumer_order_schema)
order_json = order_reader.json(order_json_path)
df_raw = order_json.withColumn("ingestion_time", current_timestamp())

# Overwrite the bronze Delta table with this load.
order_writer = df_raw.write.format("delta").mode("overwrite")
order_writer.saveAsTable("cpg_industry.bronze.cpg_consumer_order")


In [0]:

# Explicit schema applied when reading the cpg_distributor JSON feed.
# NOTE(review): total_open_deal_value is read as a string — confirm that is
# intended rather than a numeric type.
cpg_distributor_schema = (
    StructType()
    .add("distributor_id", StringType())
    .add("distributor_name", StringType())
    .add("company_name", StringType())
    .add("last_activity_date", DateType())
    .add("no_of_associated_deals", IntegerType())
    .add("total_open_deal_value", StringType())
    .add("rating", IntegerType())
    .add("email", StringType())
    .add("phone", StringType())
    .add("city", StringType())
    .add("state", StringType())
    .add("country", StringType())
    .add("address", StringType())
)

# Source file on the volume for this load.
distributor_json_path = "dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_distributor/cpg_distributor.json"

# Read the file with the schema above; no extra columns are added here.
distributor_reader = spark.read.schema(cpg_distributor_schema)
df_raw = distributor_reader.json(distributor_json_path)

# Overwrite the bronze Delta table with this load.
distributor_writer = df_raw.write.format("delta").mode("overwrite")
distributor_writer.saveAsTable("cpg_industry.bronze.cpg_distributor")


In [0]:

# Explicit schema applied when reading the cpg_distributor_invoice JSON feed.
# NOTE(review): the amount columns are FloatType (single precision) — confirm
# that is precise enough for monetary values.
cpg_distributor_invoice_schema = (
    StructType()
    .add("invoice_id", StringType())
    .add("purchase_id", StringType())
    .add("invoice_date", DateType())
    .add("amount_due", FloatType())
    .add("tax_amount", FloatType())
    .add("discount", FloatType())
    .add("total_payable", FloatType())
    .add("payment_status", StringType())
)

# Source file on the volume for this load.
distributor_invoice_json_path = "dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_distributor/cpg_distributor_invoice.json"

# Read the file with the schema above; no extra columns are added here.
distributor_invoice_reader = spark.read.schema(cpg_distributor_invoice_schema)
df_raw = distributor_invoice_reader.json(distributor_invoice_json_path)

# Overwrite the bronze Delta table with this load.
distributor_invoice_writer = df_raw.write.format("delta").mode("overwrite")
distributor_invoice_writer.saveAsTable("cpg_industry.bronze.cpg_distributor_invoice")


In [0]:


# Explicit schema applied when reading the cpg_distributor_purchase_items JSON feed.
cpg_distributor_purchase_items_schema = (
    StructType()
    .add("purchase_item_id", StringType())
    .add("purchase_id", StringType())
    .add("product_id", StringType())
    .add("upc", StringType())
    .add("quantity_ordered", IntegerType())
    .add("unit_cost", FloatType())
    .add("total_price", FloatType())
)

# Source file on the volume for this load.
purchase_items_json_path = "dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_distributor/cpg_distributor_purchase_items.json"

# Read the file with the schema above; no extra columns are added here.
purchase_items_reader = spark.read.schema(cpg_distributor_purchase_items_schema)
df_raw = purchase_items_reader.json(purchase_items_json_path)

# Overwrite the bronze Delta table with this load.
purchase_items_writer = df_raw.write.format("delta").mode("overwrite")
purchase_items_writer.saveAsTable("cpg_industry.bronze.cpg_distributor_purchase_items")


In [0]:


# Explicit schema applied when reading the cpg_distributor_purchases JSON feed.
cpg_distributor_purchases_schema = (
    StructType()
    .add("purchase_id", StringType())
    .add("order_date", DateType())
    .add("expected_delivery_date", DateType())
    .add("order_status", StringType())
    .add("total_amount", FloatType())
    .add("currency", StringType())
)

# Source file on the volume for this load.
purchases_json_path = "dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_distributor/cpg_distributor_purchases.json"

# Read the file with the schema above; no extra columns are added here.
purchases_reader = spark.read.schema(cpg_distributor_purchases_schema)
df_raw = purchases_reader.json(purchases_json_path)

# Overwrite the bronze Delta table with this load.
purchases_writer = df_raw.write.format("delta").mode("overwrite")
purchases_writer.saveAsTable("cpg_industry.bronze.cpg_distributor_purchases")


In [0]:


# Explicit schema applied when reading the consumer order-items JSON feed.
# The file is named cpg_consumer_order_items.json but lands in the
# cpg_order_items bronze table.
cpg_consumer_order_items_schema = (
    StructType()
    .add("order_item_id", StringType())
    .add("order_id", StringType())
    .add("product_id", StringType())
    .add("quantity", IntegerType())
    .add("unit_price", FloatType())
    .add("total_price", FloatType())
)

# Source file on the volume for this load.
order_items_json_path = "dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_consumer/cpg_consumer_order_items.json"

# Read the file with the schema above; no extra columns are added here.
order_items_reader = spark.read.schema(cpg_consumer_order_items_schema)
df_raw = order_items_reader.json(order_items_json_path)

# Overwrite the bronze Delta table with this load.
order_items_writer = df_raw.write.format("delta").mode("overwrite")
order_items_writer.saveAsTable("cpg_industry.bronze.cpg_order_items")


In [0]:


# Define schema for cpg_product
# NOTE(review): the price columns are FloatType (single precision) — confirm
# that is precise enough for monetary values.
cpg_product_schema = StructType([
    StructField("product_id", StringType()),
    StructField("sku_id", StringType()),
    StructField("upc", StringType()),
    StructField("gtin", StringType()),
    StructField("product_name", StringType()),
    StructField("description", StringType()),
    StructField("department", StringType()),
    StructField("category", StringType()),
    StructField("brand", StringType()),
    StructField("unit_of_measurement", StringType()),
    StructField("retail_price", FloatType()),
    StructField("unit_price", FloatType()),
    StructField("release_date", DateType()),
    StructField("expiration_days", IntegerType()),
    StructField("product_status", StringType()),
])


# Load JSON from volume, applying the schema defined above.
# Previously the schema was declared but never used and the read relied on
# inference, so column types depended on whatever the file happened to contain.
# Passing the schema makes this load agree with the other bronze loads in this
# notebook.
# NOTE(review): column types in the bronze table now follow the schema above
# rather than the inferred ones — confirm downstream consumers expect these types.
df_raw = (
    spark.read
    .schema(cpg_product_schema)
    .json("dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_inventory/cpg_product.json")
)

# Write to Delta Table, replacing any existing contents.
df_raw.write.format("delta").mode("overwrite").saveAsTable("cpg_industry.bronze.cpg_product")


In [0]:

# Explicit schema applied when reading the cpg_inventory JSON feed.
cpg_inventory_schema = (
    StructType()
    .add("inventory_id", StringType())
    .add("product_id", StringType())
    .add("quantity_on_hand", IntegerType())
    .add("reorder_level", IntegerType())
    .add("reorder_quantity", IntegerType())
    .add("last_restock_date", DateType())
    .add("safety_stock_level", IntegerType())
    .add("inventory_status", StringType())
    .add("last_updated", DateType())
    .add("location_code", StringType())
    .add("location_name", StringType())
    .add("location_type", StringType())
    .add("location_is_active", BooleanType())
    .add("address", StringType())
    .add("city", StringType())
    .add("state", StringType())
    .add("country", StringType())
    .add("email", StringType())
    .add("phone", StringType())
)

# Source file on the volume for this load.
inventory_json_path = "dbfs:/Volumes/cpg_industry/cpg_data/data/cpg_inventory/cpg_inventory.json"

# Read the file with the schema above; no extra columns are added here.
inventory_reader = spark.read.schema(cpg_inventory_schema)
df_raw = inventory_reader.json(inventory_json_path)

# Overwrite the bronze Delta table with this load.
inventory_writer = df_raw.write.format("delta").mode("overwrite")
inventory_writer.saveAsTable("cpg_industry.bronze.cpg_inventory")
