# Project Overview
Dimensional data mart that represents a simple business involving customers, the products, and the vendors.

###  Import Required Libraries

In [1]:
import findspark
findspark.init()
print(findspark.find())

import os
import sys
import json
import time
import pymongo
import certifi
import shutil
import pandas as pd

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window as W
from sqlalchemy import create_engine, text

/opt/homebrew/Cellar/apache-spark/3.5.5/libexec


### Instantiate Global Variables and Paths for Data Sources

In [2]:
# --------------------------------------------------------------------------------
# Specify MySQL Server Connection Information
# --------------------------------------------------------------------------------
mysql_args = {
    "host_name" : "localhost",
    "port" : "3306",
    "db_name" : "adventureworks",
    "conn_props" : {
        "user" : "root",
        "password" : "Ashwaniis#1!",
        "driver" : "com.mysql.cj.jdbc.Driver"
    }
}

# --------------------------------------------------------------------------------
# Specify MongoDB Cluster Connection Information
# --------------------------------------------------------------------------------
mongodb_args = {
    "cluster_location" : "local", # "atlas"
    "user_name" : "vaneeshagupta10",
    "password" : "Fdztq26kWFlyBXiE",
    "cluster_name" : "cluster0",
    "cluster_subnet" : "koqso",
    "db_name" : "northwind",
    "collection" : "",
    "null_column_threshold" : 0.5
}


# --------------------------------------------------------------------------------
# Specify Directory Structure for Source Data
# --------------------------------------------------------------------------------
base_dir = "/Users/vaneeshagupta/Desktop/capstone"
data_dir = base_dir
batch_dir = os.path.join(data_dir, 'batch')
stream_dir = os.path.join(data_dir, 'streaming')

# --------------------------------------------------------------------------------
# Streaming Data Source Directories
# --------------------------------------------------------------------------------
orders_stream_dir = os.path.join(stream_dir, 'orders')
purchase_orders_stream_dir = os.path.join(stream_dir, 'purchase_orders')
inventory_trans_stream_dir = os.path.join(stream_dir, 'inventory_transactions')

# --------------------------------------------------------------------------------
# Databricks/SQL Warehouse Paths (Bronze, Silver, Gold Layers)
# --------------------------------------------------------------------------------
dest_database = "data_mart_dlh"
sql_warehouse_dir = os.path.abspath('spark-warehouse')
dest_database_dir = f"{dest_database}.db"
database_dir = os.path.join(sql_warehouse_dir, dest_database_dir)

orders_output_bronze = os.path.join(database_dir, 'fact_orders', 'bronze')
orders_output_silver = os.path.join(database_dir, 'fact_orders', 'silver')
orders_output_gold = os.path.join(database_dir, 'fact_orders', 'gold')

purchase_orders_output_bronze = os.path.join(database_dir, 'fact_purchase_orders', 'bronze')
purchase_orders_output_silver = os.path.join(database_dir, 'fact_purchase_orders', 'silver')
purchase_orders_output_gold = os.path.join(database_dir, 'fact_purchase_orders', 'gold')

inventory_trans_output_bronze = os.path.join(database_dir, 'fact_inventory_transactions', 'bronze')
inventory_trans_output_silver = os.path.join(database_dir, 'fact_inventory_transactions', 'silver')
inventory_trans_output_gold = os.path.join(database_dir, 'fact_inventory_transactions', 'gold')




### Define Global Variables for MySQL, MongoDB, and File Handling

In [3]:

def get_file_info(path: str):
    file_sizes = []
    modification_times = []
    items = os.listdir(path)
    files = sorted([item for item in items if os.path.isfile(os.path.join(path, item))])
    for file in files:
        file_sizes.append(os.path.getsize(os.path.join(path, file)))
        modification_times.append(pd.to_datetime(os.path.getmtime(os.path.join(path, file)), unit='s'))
    data = list(zip(files, file_sizes, modification_times))
    column_names = ['name','size','modification_time']
    return pd.DataFrame(data=data, columns=column_names)

def wait_until_stream_is_ready(query, min_batches=1):
    while len(query.recentProgress) < min_batches:
        time.sleep(5)
    print(f"The stream has processed {len(query.recentProgress)} batches")

def remove_directory_tree(path: str):
    try:
        if os.path.exists(path):
            shutil.rmtree(path)
            return f"Directory '{path}' has been removed successfully."
        else:
            return f"Directory '{path}' does not exist."
    except Exception as e:
        return f"An error occurred: {e}"

def drop_null_columns(df, threshold):
    columns_with_nulls = [col for col in df.columns if df.filter(df[col].isNull()).count() / df.count() > threshold] 
    df_dropped = df.drop(*columns_with_nulls) 
    return df_dropped

def get_mysql_dataframe(spark_session, sql_query : str, **args):
    jdbc_url = f"jdbc:mysql://{args['host_name']}:{args['port']}/{args['db_name']}"
    dframe = spark_session.read.format("jdbc") \
        .option("url", jdbc_url) \
        .option("driver", args['conn_props']['driver']) \
        .option("user", args['conn_props']['user']) \
        .option("password", args['conn_props']['password']) \
        .option("query", sql_query) \
        .load()
    return dframe

def get_mongo_uri(**args):
    if args["cluster_location"] not in ['atlas', 'local']:
        raise Exception("You must specify either 'atlas' or 'local' for the 'cluster_location' parameter.")
        
    if args['cluster_location'] == "atlas":
        uri = f"mongodb+srv://{args['user_name']}:{args['password']}@"
        uri += f"{args['cluster_name']}.{args['cluster_subnet']}.mongodb.net/"
    else:
        uri = "mongodb://localhost:27017/"
    return uri

def get_spark_conf_args(spark_jars : list, **args):
    jars = ""
    for jar in spark_jars:
        jars += f"{jar}, "
    
    sparkConf_args = {
        "app_name" : "PySpark Northwind Data Lakehouse (Medallion Architecture)",
        "worker_threads" : f"local[{int(os.cpu_count()/2)}]",
        "shuffle_partitions" : int(os.cpu_count()),
        "mongo_uri" : get_mongo_uri(**args),
        "spark_jars" : jars[0:-2],
        "database_dir" : sql_warehouse_dir
    }
    
    return sparkConf_args

def get_spark_conf(**args):
    sparkConf = SparkConf().setAppName(args['app_name'])\
    .setMaster(args['worker_threads']) \
    .set('spark.driver.memory', '4g') \
    .set('spark.executor.memory', '2g') \
    .set('spark.jars', args['spark_jars']) \
    .set('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1') \
    .set('spark.mongodb.input.uri', args['mongo_uri']) \
    .set('spark.mongodb.output.uri', args['mongo_uri']) \
    .set('spark.sql.adaptive.enabled', 'false') \
    .set('spark.sql.debug.maxToStringFields', 35) \
    .set('spark.sql.shuffle.partitions', args['shuffle_partitions']) \
    .set('spark.sql.streaming.forceDeleteTempCheckpointLocation', 'true') \
    .set('spark.sql.streaming.schemaInference', 'true') \
    .set('spark.sql.warehouse.dir', args['database_dir']) \
    .set('spark.streaming.stopGracefullyOnShutdown', 'true')
    
    return sparkConf

def get_mongo_client(**args):
    mongo_uri = get_mongo_uri(**args)
    if args['cluster_location'] == "atlas":
        client = pymongo.MongoClient(mongo_uri, tlsCAFile=certifi.where())
    elif args['cluster_location'] == "local":
        client = pymongo.MongoClient(mongo_uri)
    else:
        raise Exception("A MongoDB Client could not be created.")
    return client
    
def set_mongo_collections_with_pyspark(spark_session, data_directory: str, json_files: dict, **mongo_args):
    db_name = mongo_args["db_name"]
    mongo_uri = get_mongo_uri(**mongo_args)

    for collection_name, filename in json_files.items():
        json_file_path = os.path.join(data_directory, filename)

        df = spark_session.read \
            .option("multiline", "true") \
            .json(json_file_path)

        df.write \
            .format("com.mongodb.spark.sql.DefaultSource") \
            .mode("overwrite") \
            .option("uri", mongo_uri) \
            .option("database", db_name) \
            .option("collection", collection_name) \
            .save()

        print(f"✔ Loaded {filename} into MongoDB collection '{collection_name}'")

def get_mongodb_dataframe(spark_session, **args):
    dframe = spark_session.read.format("com.mongodb.spark.sql.DefaultSource") \
        .option("database", args['db_name']) \
        .option("collection", args['collection']).load()
    dframe = dframe.drop('_id')
    dframe = drop_null_columns(dframe, args['null_column_threshold'])
    return dframe


def get_sql_dataframe(sql_query, **args):
    '''Create a connection to the MySQL database and return a DataFrame'''
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    
    # Using context manager (with statement) to automatically manage connection
    with sqlEngine.connect() as connection:
        # Invoke the pd.read_sql() function to query the database, and fill a Pandas DataFrame
        dframe = pd.read_sql(text(sql_query), connection)
    
    return dframe
    

def set_dataframe(df, table_name, pk_column, db_operation, **args):
    '''Create a connection to the MySQL database and insert/update data'''
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    
    # Using context manager (with statement) to automatically manage connection
    with sqlEngine.connect() as connection:
        '''Invoke the Pandas DataFrame .to_sql() function to either create, or append to, a table'''
        if db_operation == "insert":
            df.to_sql(table_name, con=connection, index=False, if_exists='replace')
            connection.execute(text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});"))
        elif db_operation == "update":
            df.to_sql(table_name, con=connection, index=False, if_exists='append')
    

### Initialize Data Lakehouse Directory Structure
Remove the Data Lakehouse Database Directory Structure to Ensure Idempotency

In [4]:
remove_directory_tree(database_dir)

"Directory '/Users/vaneeshagupta/Desktop/capstone/spark-warehouse/data_mart_dlh.db' has been removed successfully."

### Create a New Spark Session

In [5]:
worker_threads = f"local[{int(os.cpu_count()/2)}]"

jars = []
#mysql_spark_jar = os.path.join(os.getcwd(), "mysql-connector-j-9.1.0", "mysql-connector-j-9.1.0.jar")
mysql_spark_jar = os.path.join(os.getcwd(), "mysql-connector-j-9.3.0.jar") 
mssql_spark_jar = os.path.join(os.getcwd(), "sqljdbc_12.8", "enu", "jars", "mssql-jdbc-12.8.1.jre11.jar")

jars.append(mysql_spark_jar)
#jars.append(mssql_spark_jar)

sparkConf_args = get_spark_conf_args(jars, **mongodb_args)

sparkConf = get_spark_conf(**sparkConf_args)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
spark.sparkContext.setLogLevel("OFF")
spark

:: loading settings :: url = jar:file:/opt/homebrew/Cellar/apache-spark/3.5.5/libexec/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/vaneeshagupta/.ivy2/cache
The jars for the packages stored in: /Users/vaneeshagupta/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6553545a-6bc3-46e7-af51-c98532a81438;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 122ms :: artifacts dl 4ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules           

### Create a New Metadata Database

In [6]:
spark.sql(f"DROP DATABASE IF EXISTS {dest_database} CASCADE;")

sql_create_db = f"""
    CREATE DATABASE IF NOT EXISTS {dest_database}
    COMMENT 'DS-2002 Lab 06 Database'
    WITH DBPROPERTIES (contains_pii = true, purpose = 'DS-2002 Lab 6.0');
"""
spark.sql(sql_create_db)

DataFrame[]

### Fetch Reference Data from MongoDB, MySQL, and CSV Files

#### MongoDB (Note: Customer Data)

In [7]:
# JSON files for MongoDB collections
json_files = {
    "customers": "customer.json"
}

# Set MongoDB collections with the specified JSON files
set_mongo_collections_with_pyspark(spark, data_dir, json_files, **mongodb_args)


mongodb_args["collection"] = "customers"

df_dim_customers = get_mongodb_dataframe(spark, **mongodb_args)
df_dim_customers.toPandas().head(2)


                                                                                

✔ Loaded customer.json into MongoDB collection 'customers'


Unnamed: 0,AccountNumber,CustomerID,CustomerType,ModifiedDate,TerritoryID,rowguid
0,AW00000001,1,S,2004-10-13 11:15:07,1,
1,AW00000002,2,S,2004-10-13 11:15:07,1,


In [8]:
df_dim_customers.printSchema()


# Rename columns to standardize them as per dimension schema
df_dim_customers = df_dim_customers.withColumnRenamed("CustomerID", "customer_id") \
                                    .withColumnRenamed("AccountNumber", "account_number") \
                                    .withColumnRenamed("CustomerType", "customer_type") \
                                    .withColumnRenamed("ModifiedDate", "modified_date") \
                                    .withColumnRenamed("TerritoryID", "territory_id")



root
 |-- AccountNumber: string (nullable = true)
 |-- CustomerID: long (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)
 |-- TerritoryID: long (nullable = true)
 |-- rowguid: string (nullable = true)



In [9]:
# Create or replace a temporary view for 'dim_customers'
df_dim_customers.createOrReplaceTempView("customers")

# SQL query to add ROW_NUMBER as the primary key (customer_key)
sql_customers = """
SELECT 
    ROW_NUMBER() OVER (ORDER BY customer_id) AS customer_key,
    customer_id,
    account_number,
    customer_type,
    modified_date,
    territory_id
FROM customers
"""


# Execute SQL query to get the updated DataFrame with the primary key
df_dim_customers = spark.sql(sql_customers)

# Reorder Columns
ordered_columns = ['customer_key', 'customer_id', 'account_number', 'customer_type', 
                   'modified_date', 'territory_id']


df_dim_customers = df_dim_customers.select(*ordered_columns)

# Unit test: Show first two rows
df_dim_customers.toPandas().head(2)

# Save the DataFrame as a table in MySQL (data mart)
df_dim_customers.write.saveAsTable(f"{dest_database}.dim_customers", mode="overwrite")

# Unit test: Describe the table
spark.sql(f"DESCRIBE EXTENDED {dest_database}.dim_customers").show()

# Preview the first 2 rows in the table
spark.sql(f"SELECT * FROM {dest_database}.dim_customers LIMIT 2").toPandas()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|        customer_key|                 int|   NULL|
|         customer_id|              bigint|   NULL|
|      account_number|              string|   NULL|
|       customer_type|              string|   NULL|
|       modified_date|              string|   NULL|
|        territory_id|              bigint|   NULL|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|             Catalog|       spark_catalog|       |
|            Database|       data_mart_dlh|       |
|               Table|       dim_customers|       |
|        Created Time|Fri May 09 16:58:...|       |
|         Last Access|             UNKNOWN|       |
|          Created By|         Spark 3.5.5|       |
|                Type|             MANAGED|       |
|            Provider|             parquet|       |
|           

Unnamed: 0,customer_key,customer_id,account_number,customer_type,modified_date,territory_id
0,1,1,AW00000001,S,2004-10-13 11:15:07,1
1,2,2,AW00000002,S,2004-10-13 11:15:07,1


#### MySQL (Note: Product Data and Date Data)

In [10]:
# Define SQL query to fetch all product data from the MySQL 'product' table
sql_dim_products = f"SELECT * FROM {mysql_args['db_name']}.product"

# Execute the query and load the result into a Spark DataFrame
df_dim_products = get_mysql_dataframe(spark, sql_dim_products, **mysql_args)

# Display the schema of the DataFrame for verification
df_dim_products.printSchema()
# Preview the first two rows to validate data
df_dim_products.toPandas().head(2)

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Define window specification for row numbering based on ProductID
window_spec = Window.orderBy("ProductID")

# Add a new surrogate key column 'product_key' using ROW_NUMBER()
df_dim_products = df_dim_products.withColumn("product_key", row_number().over(window_spec))


root
 |-- ProductID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- ProductNumber: string (nullable = true)
 |-- MakeFlag: boolean (nullable = true)
 |-- FinishedGoodsFlag: boolean (nullable = true)
 |-- Color: string (nullable = true)
 |-- SafetyStockLevel: integer (nullable = true)
 |-- ReorderPoint: integer (nullable = true)
 |-- StandardCost: double (nullable = true)
 |-- ListPrice: double (nullable = true)
 |-- Size: string (nullable = true)
 |-- SizeUnitMeasureCode: string (nullable = true)
 |-- WeightUnitMeasureCode: string (nullable = true)
 |-- Weight: decimal(8,2) (nullable = true)
 |-- DaysToManufacture: integer (nullable = true)
 |-- ProductLine: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- Style: string (nullable = true)
 |-- ProductSubcategoryID: integer (nullable = true)
 |-- ProductModelID: integer (nullable = true)
 |-- SellStartDate: timestamp (nullable = true)
 |-- SellEndDate: timestamp (nullable = true)
 |-- DiscontinuedDat

In [11]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Use ProductID as the natural sort column
window_spec = Window.orderBy("ProductID")

# Add surrogate primary key
df_dim_products = df_dim_products.withColumn("product_key", row_number().over(window_spec))

ordered_columns = ["product_key", "ProductID", "Name", "ProductNumber", "ListPrice"]

# Reorder
df_dim_products = df_dim_products.select(*ordered_columns)

# Show top 2 rows
df_dim_products.toPandas().head(2)



Unnamed: 0,product_key,ProductID,Name,ProductNumber,ListPrice
0,1,1,Adjustable Race,AR-5381,0.0
1,2,2,Bearing Ball,BA-8327,0.0


In [12]:
# Save the transformed DataFrame as a table in the local Spark SQL warehouse
df_dim_products.write.saveAsTable(f"{dest_database}.dim_products", mode="overwrite")

In [13]:
# Describe to verify
spark.sql(f"DESCRIBE EXTENDED {dest_database}.dim_products").show()

# Preview rows
spark.sql(f"SELECT * FROM {dest_database}.dim_products LIMIT 2").toPandas()


+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|         product_key|                 int|   NULL|
|           ProductID|                 int|   NULL|
|                Name|         varchar(50)|   NULL|
|       ProductNumber|         varchar(25)|   NULL|
|           ListPrice|              double|   NULL|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|             Catalog|       spark_catalog|       |
|            Database|       data_mart_dlh|       |
|               Table|        dim_products|       |
|        Created Time|Fri May 09 16:58:...|       |
|         Last Access|             UNKNOWN|       |
|          Created By|         Spark 3.5.5|       |
|                Type|             MANAGED|       |
|            Provider|             parquet|       |
|            Location|file:/Users/vanee...|       |
+-----------

Unnamed: 0,product_key,ProductID,Name,ProductNumber,ListPrice
0,1,1,Adjustable Race,AR-5381,0.0
1,2,2,Bearing Ball,BA-8327,0.0


Populate the Date Dimension

In [14]:
sql_dim_date = f"SELECT * FROM {mysql_args['db_name']}.dim_date"
df_dim_date = get_mysql_dataframe(spark, sql_dim_date, **mysql_args)

df_dim_date.write.saveAsTable(f"{dest_database}.dim_date", mode="overwrite")

In [15]:
spark.sql(f"DESCRIBE EXTENDED {dest_database}.dim_date;").show()
spark.sql(f"SELECT * FROM {dest_database}.dim_date LIMIT 2").toPandas()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|            date_key|      int|   NULL|
|           full_date|     date|   NULL|
|           date_name| char(11)|   NULL|
|        date_name_us| char(11)|   NULL|
|        date_name_eu| char(11)|   NULL|
|         day_of_week|  tinyint|   NULL|
|    day_name_of_week| char(10)|   NULL|
|        day_of_month|  tinyint|   NULL|
|         day_of_year|      int|   NULL|
|     weekday_weekend| char(10)|   NULL|
|        week_of_year|  tinyint|   NULL|
|          month_name| char(10)|   NULL|
|       month_of_year|  tinyint|   NULL|
|is_last_day_of_month|  char(1)|   NULL|
|    calendar_quarter|  tinyint|   NULL|
|       calendar_year|      int|   NULL|
| calendar_year_month| char(10)|   NULL|
|   calendar_year_qtr| char(10)|   NULL|
|fiscal_month_of_year|  tinyint|   NULL|
|      fiscal_quarter|  tinyint|   NULL|
+--------------------+---------+-------+
only showing top

Unnamed: 0,date_key,full_date,date_name,date_name_us,date_name_eu,day_of_week,day_name_of_week,day_of_month,day_of_year,weekday_weekend,...,is_last_day_of_month,calendar_quarter,calendar_year,calendar_year_month,calendar_year_qtr,fiscal_month_of_year,fiscal_quarter,fiscal_year,fiscal_year_month,fiscal_year_qtr
0,20000101,2000-01-01,2000/01/01,01/01/2000,01/01/2000,7,Saturday,1,1,Weekend,...,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
1,20000102,2000-01-02,2000/01/02,01/02/2000,02/01/2000,1,Sunday,2,2,Weekend,...,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3


#### CSV File Using PySpark (Note: Vendor Data)

In [16]:
# 1). Get a reference to the 'vendor.csv' file located in your batch directory.
vendor_csv = "/Users/vaneeshagupta/Desktop/capstone/vendor.csv"
print(vendor_csv)

# 2). Use Spark to read the CSV file into the 'df_dim_vendor' DataFrame.
#     Set header=True to use the first row as column names, and infer the schema automatically.
df_dim_vendor = spark.read.format('csv') \
    .option('header', 'true') \
    .option('inferSchema', 'true') \
    .load(vendor_csv)

# 3). Unit Test: Convert to Pandas and show the first two rows for verification.
df_dim_vendor.toPandas().head(2)


/Users/vaneeshagupta/Desktop/capstone/vendor.csv


Unnamed: 0,VendorID,AccountNumber,Name,CreditRating,PreferredVendorStatus,ActiveFlag,PurchasingWebServiceURL,ModifiedDate
0,1,INTERNAT0001,International,1,1,1,,2002-02-25
1,2,ELECTRON0002,Electronic Bike Repair & Supplies,1,1,1,,2002-02-17


In [17]:
# ----------------------------------------------------------------------------------
# Rename columns to follow consistent naming conventions (snake_case)
# ----------------------------------------------------------------------------------
df_dim_vendor = df_dim_vendor \
    .withColumnRenamed("VendorID", "vendor_id") \
    .withColumnRenamed("Name", "vendor_name") \
    .withColumnRenamed("CreditRating", "credit_rating") \
    .withColumnRenamed("ActiveFlag", "active_flag") \
    .withColumnRenamed("ModifiedDate", "modified_date")

# ----------------------------------------------------------------------------------
# Add Primary Key column using SQL Windowing function: ROW_NUMBER()
# ----------------------------------------------------------------------------------
df_dim_vendor.createOrReplaceTempView("vendor")

sql_vendor = """
    SELECT 
        ROW_NUMBER() OVER (ORDER BY vendor_id) AS vendor_key,
        vendor_id,
        vendor_name,
        credit_rating,
        active_flag,
        modified_date
    FROM vendor
"""

df_dim_vendor = spark.sql(sql_vendor)

# ----------------------------------------------------------------------------------
# Reorder Columns and display the first two rows in a Pandas dataframe
# ----------------------------------------------------------------------------------
ordered_columns = [
    'vendor_key', 'vendor_id', 'vendor_name',
    'credit_rating', 'active_flag', 'modified_date'
]

df_dim_vendor = df_dim_vendor.select(*ordered_columns)
df_dim_vendor.toPandas().head(2)


Unnamed: 0,vendor_key,vendor_id,vendor_name,credit_rating,active_flag,modified_date
0,1,1,International,1,1,2002-02-25
1,2,2,Electronic Bike Repair & Supplies,1,1,2002-02-17


In [18]:
# ----------------------------------------------------------------------------------
# Save the dimension table to the data mart (local Spark SQL warehouse)
# ----------------------------------------------------------------------------------
df_dim_vendor.write.saveAsTable(f"{dest_database}.dim_vendor", mode="overwrite")


In [19]:
# ----------------------------------------------------------------------------------
# Unit Test: Describe and Preview Table
# ----------------------------------------------------------------------------------
spark.sql(f"DESCRIBE EXTENDED {dest_database}.dim_vendor").show()
spark.sql(f"SELECT * FROM {dest_database}.dim_vendor LIMIT 2").toPandas()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|          vendor_key|                 int|   NULL|
|           vendor_id|                 int|   NULL|
|         vendor_name|              string|   NULL|
|       credit_rating|                 int|   NULL|
|         active_flag|                 int|   NULL|
|       modified_date|           timestamp|   NULL|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|             Catalog|       spark_catalog|       |
|            Database|       data_mart_dlh|       |
|               Table|          dim_vendor|       |
|        Created Time|Fri May 09 16:58:...|       |
|         Last Access|             UNKNOWN|       |
|          Created By|         Spark 3.5.5|       |
|                Type|             MANAGED|       |
|            Provider|             parquet|       |
|           

Unnamed: 0,vendor_key,vendor_id,vendor_name,credit_rating,active_flag,modified_date
0,1,1,International,1,1,2002-02-25
1,2,2,Electronic Bike Repair & Supplies,1,1,2002-02-17


### Fact Table

In [20]:
from pyspark.sql.functions import col, to_date

# Load source fact data
df_sales_order_header = spark.read \
    .format("jdbc") \
    .option("url", f"jdbc:mysql://{mysql_args['host_name']}:{mysql_args['port']}/{mysql_args['db_name']}") \
    .option("driver", mysql_args['conn_props']['driver']) \
    .option("dbtable", "SalesOrderHeader") \
    .option("user", mysql_args['conn_props']['user']) \
    .option("password", mysql_args['conn_props']['password']) \
    .load() \
    .select("SalesOrderID", "CustomerID", "OrderDate", "TotalDue")

df_sales_order_detail = spark.read \
    .format("jdbc") \
    .option("url", f"jdbc:mysql://{mysql_args['host_name']}:{mysql_args['port']}/{mysql_args['db_name']}") \
    .option("driver", mysql_args['conn_props']['driver']) \
    .option("dbtable", "SalesOrderDetail") \
    .option("user", mysql_args['conn_props']['user']) \
    .option("password", mysql_args['conn_props']['password']) \
    .load() \
    .select("SalesOrderID", "ProductID", "OrderQty", "LineTotal")

# Join header and detail tables
df_fact_orders = df_sales_order_header.join(df_sales_order_detail, on="SalesOrderID", how="left")

# Join with product-vendor bridge
df_product_vendor = spark.read \
    .format("jdbc") \
    .option("url", f"jdbc:mysql://{mysql_args['host_name']}:{mysql_args['port']}/{mysql_args['db_name']}") \
    .option("driver", mysql_args['conn_props']['driver']) \
    .option("dbtable", "productvendor") \
    .option("user", mysql_args['conn_props']['user']) \
    .option("password", mysql_args['conn_props']['password']) \
    .load() \
    .select("ProductID", "VendorID")

df_fact_orders = df_fact_orders.join(df_product_vendor, on="ProductID", how="left")

# Load dimension tables
df_dim_customers = spark.table("data_mart_dlh.dim_customers") \
    .select("customer_id", "account_number", "customer_type", "territory_id")

df_dim_products = spark.table("data_mart_dlh.dim_products") \
    .select("ProductID", "Name", "ProductNumber", "ListPrice")

df_dim_vendor = spark.table("data_mart_dlh.dim_vendor") \
    .select("vendor_id", "vendor_name")

df_dim_date = spark.table("data_mart_dlh.dim_date") \
    .select("date_key", "full_date")

# Standardize columns
df_fact_orders = df_fact_orders \
    .withColumnRenamed("SalesOrderID", "sales_order_id") \
    .withColumnRenamed("CustomerID", "customer_id") \
    .withColumnRenamed("ProductID", "product_id") \
    .withColumnRenamed("VendorID", "vendor_id") \
    .withColumnRenamed("OrderDate", "order_date") \
    .withColumnRenamed("OrderQty", "order_qty") \
    .withColumnRenamed("LineTotal", "line_total") \
    .withColumnRenamed("TotalDue", "total_due")

# Join with dim_date on order_date
df_fact_orders = df_fact_orders \
    .withColumn("order_date", to_date("order_date")) \
    .join(df_dim_date.withColumn("full_date", to_date("full_date")), 
          col("order_date") == col("full_date"), 
          how="left") \
    .drop("full_date") \
    .withColumnRenamed("date_key", "order_date_key")

# Fill missing vendor_id and order_qty
df_fact_orders = df_fact_orders.fillna({
    "vendor_id": 0,
    "order_qty": 0
})

# Drop duplicates by sales_order_id (if needed)
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

window_spec = Window.partitionBy("sales_order_id").orderBy("sales_order_id")
df_fact_orders = df_fact_orders.withColumn("row_num", row_number().over(window_spec)) \
                               .filter("row_num = 1") \
                               .drop("row_num")

# Select and reorder final columns
final_columns = [
    "sales_order_id", "customer_id", "order_date", "order_date_key",
    "product_id", "vendor_id", "order_qty", "line_total", "total_due"
]
df_fact_orders = df_fact_orders.select(final_columns)

# Write to table
df_fact_orders.write \
    .mode("overwrite") \
    .saveAsTable("data_mart_dlh.fact_orders")

spark.sql("SELECT * FROM data_mart_dlh.fact_orders LIMIT 10").show(truncate=False)


+--------------+-----------+----------+--------------+----------+---------+---------+----------+----------+
|sales_order_id|customer_id|order_date|order_date_key|product_id|vendor_id|order_qty|line_total|total_due |
+--------------+-----------+----------+--------------+----------+---------+---------+----------+----------+
|43659         |676        |2001-06-30|20010630      |773       |0        |2        |4079.988  |27231.5495|
|43671         |200        |2001-06-30|20010630      |732       |0        |2        |713.796   |10784.9873|
|43674         |83         |2001-06-30|20010630      |758       |0        |3        |2624.382  |3479.9306 |
|43687         |269        |2001-06-30|20010630      |768       |0        |1        |419.4589  |1668.6076 |
|43688         |161        |2001-06-30|20010630      |758       |0        |2        |1749.588  |16891.0829|
|43692         |221        |2001-06-30|20010630      |732       |0        |3        |1070.694  |51056.4965|
|43697         |21768      |

#### Split and Export Fact Data to JSON

In [21]:
from pyspark.sql.functions import monotonically_increasing_id
import os

# 1. Create the destination directory for JSON mini-batches
output_dir = "/Users/vaneeshagupta/Desktop/capstone/stream_source/fact_orders"
os.makedirs(output_dir, exist_ok=True)

# 2. Add a row index to split evenly
df_fact_orders = df_fact_orders.withColumn("row_id", monotonically_increasing_id())

# 3. Compute total row count and batch size
row_count = df_fact_orders.count()
batch_size = row_count // 3

# 4. Create and write the 3 mini-batches
df_batch_1 = df_fact_orders.filter(f"row_id < {batch_size}")
df_batch_2 = df_fact_orders.filter(f"row_id >= {batch_size} AND row_id < {2 * batch_size}")
df_batch_3 = df_fact_orders.filter(f"row_id >= {2 * batch_size}")

# 5. Drop the helper column
df_batch_1 = df_batch_1.drop("row_id")
df_batch_2 = df_batch_2.drop("row_id")
df_batch_3 = df_batch_3.drop("row_id")

# 6. Write each batch as JSON files into separate folders (simulate streaming intervals)
df_batch_1.write.mode("overwrite").json(f"{output_dir}/batch1")
df_batch_2.write.mode("overwrite").json(f"{output_dir}/batch2")
df_batch_3.write.mode("overwrite").json(f"{output_dir}/batch3")


#### Use PySpark Structured Streaming to Process (Hot Path) Fact Data

#### Verify the location of the source data files on the file system

In [22]:
import os

orders_stream_dir = "stream_source/fact_orders"
files = os.listdir(orders_stream_dir)

for file in files:
    file_path = os.path.join(orders_stream_dir, file)
    size = os.path.getsize(file_path)
    mtime = os.path.getmtime(file_path)
    print(f"{file:<30} size={size} bytes   modified={mtime}")


batch1                         size=192 bytes   modified=1746824293.2371776
batch2                         size=192 bytes   modified=1746824293.9083464
batch3                         size=640 bytes   modified=1746824294.5005665


#### Bronze

In [23]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, DateType

# Define the schema manually
fact_schema = StructType([
    StructField("sales_order_id", IntegerType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("order_date", DateType(), True),
    StructField("order_date_key", IntegerType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("vendor_id", DoubleType(), True),
    StructField("order_qty", IntegerType(), True),
    StructField("line_total", DoubleType(), True),
    StructField("total_due", DoubleType(), True)
])

# Use recursiveFileLookup so it reads nested batch1, batch2, batch3 folders
df_orders_bronze = (
    spark.readStream
    .option("schemaLocation", orders_output_bronze)
    .option("recursiveFileLookup", "true")
    .option("maxFilesPerTrigger", 1)
    .option("multiLine", "true")
    .schema(fact_schema)
    .json(orders_stream_dir)
)

df_orders_bronze.isStreaming


True

In [24]:
orders_checkpoint_bronze = os.path.join(orders_output_bronze, "_checkpoint")

orders_bronze_query = (
    df_orders_bronze
    .withColumn("receipt_time", current_timestamp())
    .withColumn("source_file", input_file_name())
    .writeStream
    .format("parquet")
    .outputMode("append")
    .queryName("orders_bronze")
    .trigger(availableNow=True)  # complete one pass through all available files
    .option("checkpointLocation", orders_checkpoint_bronze)
    .option("compression", "snappy")
    .start(orders_output_bronze)
)

print(f"Query ID: {orders_bronze_query.id}")
print(f"Query Name: {orders_bronze_query.name}")
print(f"Query Status: {orders_bronze_query.status}")

# Wait for the stream to finish
orders_bronze_query.awaitTermination()


Query ID: 29116b00-44d9-40bf-b1c6-8d103b82d9c1
Query Name: orders_bronze
Query Status: {'message': 'Initializing sources', 'isDataAvailable': False, 'isTriggerActive': False}


In [25]:
bronze_path = "/Users/vaneeshagupta/Desktop/capstone/bronze/fact_orders/part-*.parquet"

df_bronze = spark.read.parquet(bronze_path)
df_bronze.show(5, truncate=False)



+--------------+-----------+----------+--------------+----------+---------+---------+----------+----------+-----------------------+-----------------------------------------------------------------------------------------------------------------------------------+
|sales_order_id|customer_id|order_date|order_date_key|product_id|vendor_id|order_qty|line_total|total_due |receipt_time           |source_file                                                                                                                        |
+--------------+-----------+----------+--------------+----------+---------+---------+----------+----------+-----------------------+-----------------------------------------------------------------------------------------------------------------------------------+
|43661         |442        |2001-06-30|20010630      |773       |0.0      |2        |4079.988  |43561.4424|2025-05-09 15:16:00.046|file:///Users/vaneeshagupta/Desktop/test/stream_source/fact_orders/batch3/par

#### Silver

In [26]:
# Prepare dimensions table

from pyspark.sql.functions import col
from pyspark.sql.types import DateType, IntegerType, LongType
import os

# Rename keys to match fact table for clean joins
df_dim_customer = df_dim_customers.withColumnRenamed("CustomerID", "customer_id")
df_dim_vendor = df_dim_vendor.withColumnRenamed("VendorID", "vendor_id")
df_dim_product = df_dim_products.withColumnRenamed("ProductID", "product_id")

# Prepare role-playing dimensions
df_dim_order_date = df_dim_date.select(
    col("date_key").alias("order_date_key"),
    col("full_date").alias("order_full_date")
)


In [27]:
# Defining Paths
orders_output_bronze = "/Users/vaneeshagupta/Desktop/capstone/bronze/fact_orders"
orders_output_silver = "/Users/vaneeshagupta/Desktop/capstone/silver/fact_orders"
orders_checkpoint_silver = os.path.join(orders_output_silver, "_checkpoint")


In [28]:
# Reading and transforming streaming data
df_orders_silver = (
    spark.readStream.format("parquet").load(orders_output_bronze)
    .withColumn("order_date", col("order_date").cast(DateType()))
    .join(df_dim_customer, on="customer_id", how="left")
    .join(df_dim_product, on="product_id", how="left")
    .join(df_dim_vendor, on="vendor_id", how="left")
    .join(
        df_dim_order_date,
        col("order_date") == df_dim_order_date["order_full_date"].cast(DateType()),
        how="left"
    )
    .select(
        col("sales_order_id").cast(LongType()),
        col("customer_id").cast(LongType()),
        col("product_id").cast(LongType()),
        col("vendor_id").cast(LongType()),
        df_dim_order_date["order_date_key"].cast(LongType()),
        col("order_qty").cast(IntegerType()),
        col("line_total").cast("double"),
        col("total_due").cast("double"),
        col("order_date")
    )
)


In [29]:
df_orders_silver.printSchema()

root
 |-- sales_order_id: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- product_id: long (nullable = true)
 |-- vendor_id: long (nullable = true)
 |-- order_date_key: long (nullable = true)
 |-- order_qty: integer (nullable = true)
 |-- line_total: double (nullable = true)
 |-- total_due: double (nullable = true)
 |-- order_date: date (nullable = true)



In [30]:
# Write to Silver
orders_output_silver = "/Users/vaneeshagupta/Desktop/test/silver/fact_orders"
orders_checkpoint_silver = "/Users/vaneeshagupta/Desktop/test/silver/_checkpoint"

orders_silver_query = (
    df_orders_silver.writeStream
    .format("parquet")
    .outputMode("append")
    .option("checkpointLocation", orders_checkpoint_silver)
    .option("compression", "snappy")
    .queryName("orders_silver")
    .trigger(availableNow=True)
    .start(orders_output_silver)
)

# Unit Test: Implement Query Monitoring
print(f"Query ID: {orders_silver_query.id}")
print(f"Query Name: {orders_silver_query.name}")
print(f"Query Status: {orders_silver_query.status}")

orders_silver_query.awaitTermination()


Query ID: 899b0c53-6a9d-4948-a034-3c0a34ef5944
Query Name: orders_silver
Query Status: {'message': 'Initializing sources', 'isDataAvailable': False, 'isTriggerActive': False}


In [31]:
orders_silver_query.awaitTermination()

#### Gold

Total Revenue and Quantity by Customer and Month

In [32]:
from pyspark.sql.functions import col, month, year, sum, avg
from pyspark.sql.types import DateType

# Join Silver fact_orders with dim_date
df_orders_enriched = (
    spark.readStream.format("parquet").load(orders_output_silver)
    .join(df_dim_date, col("order_date_key") == df_dim_date.date_key, "left")
    .withColumn("month", month(col("full_date").cast(DateType())))
    .withColumn("year", year(col("full_date").cast(DateType())))
)

# Aggregate by customer
df_orders_gold = (
    df_orders_enriched
    .groupBy("year", "month", "customer_id")
    .agg(
        sum("line_total").alias("total_revenue"),
        avg("order_qty").alias("avg_order_qty"),
        sum("order_qty").alias("total_items")
    )
)


In [33]:
orders_gold_query = (
    df_orders_gold.writeStream
    .format("memory")
    .queryName("fact_orders_by_customer_month")
    .outputMode("complete")
    .start()
)



In [38]:
import time
while not orders_gold_query.isActive:
    time.sleep(1)

spark.sql("SELECT * FROM fact_orders_by_customer_month ORDER BY year, month, customer_id").show(2)


+----+-----+-----------+-------------+-------------+-----------+
|year|month|customer_id|total_revenue|avg_order_qty|total_items|
+----+-----+-----------+-------------+-------------+-----------+
|2001|    6|        117|      874.794|          1.0|          1|
|2001|    6|        203|      713.796|          2.0|          2|
+----+-----+-----------+-------------+-------------+-----------+
only showing top 2 rows



In [35]:
# Final selection
df_fact_orders_by_customer_month_final = df_orders_gold.selectExpr(
    "year as `Year`",
    "month as `Month`",
    "customer_id as `Customer ID`",
    "total_revenue as `Total Revenue`",
    "avg_order_qty as `Avg Order Qty`",
    "total_items as `Total Items`"
)


In [42]:
orders_gold_query = (
    df_fact_orders_by_customer_month_final.writeStream
    .format("memory")
    .queryName("fact_orders_by_customer_month_final_1")
    .outputMode("complete")
    .start()
)
import time
while not orders_gold_query.isActive:
    time.sleep(1)

df_static_final = spark.sql("SELECT * FROM fact_orders_by_customer_month_final")

df_static_final.write.saveAsTable(
    f"{dest_database}.fact_orders_by_customer_month_final",
    mode="overwrite"
)



In [43]:
spark.sql(f"SELECT * FROM {dest_database}.fact_orders_by_customer_month_final").toPandas()

Unnamed: 0,Year,Month,Customer ID,Total Revenue,Avg Order Qty,Total Items
0,2001,6,510,419.4589,1.0,1
1,2001,6,646,2039.994,1.0,1
2,2001,6,117,874.794,1.0,1
3,2001,6,442,4079.988,2.0,2
4,2001,6,227,356.898,1.0,1
5,2001,6,203,713.796,2.0,2
6,2001,6,676,4079.988,2.0,2
7,2001,6,618,5.1865,1.0,1
