In [0]:
# Function to create the staging table with provided schema and load data from raw tables created in previous step
def load_stg_table(table_name, schema, table_prefix="silver_sb.test2.",
                   raw_prefix="b_", raw_suffix="_dlm"):
    """Create a staging table with the given schema and fill it from a raw table.

    The staging table is named ``{table_prefix}stg_{stem}``, where ``stem`` is
    ``table_name`` with ``raw_prefix`` and ``raw_suffix`` removed (for example
    ``b_hier_hldy_dlm`` -> ``stg_hier_hldy``). Any existing staging table with
    that name is dropped first, so re-running the call rebuilds it.

    Args:
        table_name: Unqualified raw table name, e.g. ``"b_hier_hldy_dlm"``.
        schema: Column definitions and constraints placed verbatim between the
            parentheses of the ``CREATE TABLE`` statement.
        table_prefix: Qualifier (with trailing dot) prepended to both the raw
            and the staging table names.
        raw_prefix: Leading marker of raw table names that is stripped.
        raw_suffix: Trailing marker of raw table names that is stripped.

    Returns:
        None. Progress and the final row count are printed.

    Raises:
        ValueError: If ``table_name`` does not follow the
            ``{raw_prefix}<stem>{raw_suffix}`` convention with a non-empty
            stem. The previous positional slice silently produced a wrong
            staging name in that case, which was then dropped and recreated.
    """
    has_affixes = table_name.startswith(raw_prefix) and table_name.endswith(raw_suffix)
    # Slice by the affix lengths (not by fixed offsets) so custom affixes work
    # and an overlapping prefix/suffix yields an empty stem that is rejected.
    stem = table_name[len(raw_prefix):len(table_name) - len(raw_suffix)] if has_affixes else ""
    if not stem:
        raise ValueError(
            f"raw table name {table_name!r} must look like "
            f"'{raw_prefix}<name>{raw_suffix}'"
        )

    stg_table_name = f"{table_prefix}stg_{stem}"
    spark.sql(f"DROP TABLE IF EXISTS {stg_table_name}")
    print('dropped and creating table')
    spark.sql(f"CREATE TABLE {stg_table_name} ({schema})")
    # SELECT * relies on the raw table's column order matching `schema`.
    spark.sql(f"INSERT INTO {stg_table_name} SELECT * FROM {table_prefix}{table_name}")
    count = spark.sql(f"SELECT COUNT(*) FROM {stg_table_name}").collect()[0][0]
    print(f'table {stg_table_name} created and populated with {count} rows')

The following cells define the schemas of the 10 staging tables. The first nine declare a primary key; the transactions fact table (`schema10`) does not.

In [0]:
# Staging schema for raw table b_hier_hldy_dlm: hldy_id is the non-null,
# single-column primary key. The call below rebuilds and loads the staging table.
schema1 = """hldy_id STRING NOT NULL,
  hldy_label STRING,
  PRIMARY KEY (hldy_id)"""
load_stg_table("b_hier_hldy_dlm", schema1)

dropped and creating table
table silver_sb.test2.stg_hier_hldy created and populated


In [0]:
# Staging schema for raw table b_hier_clnd_dlm: fscldt_id is the single-column
# primary key, declared inline on the column.
# NOTE(review): fsclyr_label is INT while every other *_label column in this
# schema is STRING -- confirm against the raw data that this is intended.
schema2 = """fscldt_id INT PRIMARY KEY,
fscldt_label STRING,
fsclwk_id INT,
fsclwk_label STRING,
fsclmth_id INT,
fsclmth_label STRING,
fsclqrtr_id INT,
fsclqrtr_label STRING,
fsclyr_id INT,
fsclyr_label INT,
ssn_id STRING,
ssn_label STRING,
ly_fscldt_id INT,
lly_fscldt_id INT,
fscldow INT,
fscldom INT,
fscldoq INT,
fscldoy INT,
fsclwoy INT,
fsclmoy INT,
fsclqoy INT,
date DATE"""
load_stg_table("b_hier_clnd_dlm", schema2)

dropped and creating table
table silver_sb.test2.stg_hier_clnd created and populated


In [0]:
# Staging schema for raw table b_hier_pricestate_dlm, with a composite primary
# key on (substate_id, state_id); both key columns are declared NOT NULL.
schema3 = """
substate_id STRING NOT NULL,
substate_label STRING,
state_id STRING NOT NULL,
state_label STRING,
PRIMARY KEY(substate_id, state_id)
"""
load_stg_table("b_hier_pricestate_dlm", schema3)

dropped and creating table
table silver_sb.test2.stg_hier_pricestate created and populated


In [0]:
# Staging schema for raw table b_hier_rtlloc_dlm: str is the single-column
# primary key, declared inline on the column.
schema4 = """
str INT PRIMARY KEY,
str_label STRING,
dstr INT,
dstr_label STRING,
rgn INT,
rgn_label STRING
"""
load_stg_table("b_hier_rtlloc_dlm", schema4)

dropped and creating table
table silver_sb.test2.stg_hier_rtlloc created and populated


In [0]:
# Staging schema for raw table b_hier_invloc_dlm: loc is the single-column
# primary key, declared inline on the column.
schema5 = """
loc INT PRIMARY KEY,
loc_label STRING,
loctype STRING,
loctype_label STRING
"""
load_stg_table("b_hier_invloc_dlm", schema5)

dropped and creating table
table silver_sb.test2.stg_hier_invloc created and populated


In [0]:
# Staging schema for raw table b_hier_possite_dlm: site_id is the single-column
# primary key, declared inline on the column.
schema6 = """
site_id STRING PRIMARY KEY,
site_label STRING,
subchnl_id STRING,
subchnl_label STRING,
chnl_id STRING,
chnl_label STRING
"""
load_stg_table("b_hier_possite_dlm", schema6)

dropped and creating table
table silver_sb.test2.stg_hier_possite created and populated


In [0]:
# Staging schema for raw table b_hier_invstatus_dlm: code_id is the
# single-column primary key, declared inline on the column.
schema7 = """
code_id STRING PRIMARY KEY,
code_label STRING,
bckt_id STRING,
bckt_label STRING,
ownrshp_id STRING,
ownrshp_label STRING
"""
load_stg_table("b_hier_invstatus_dlm", schema7)

dropped and creating table
table silver_sb.test2.stg_hier_invstatus created and populated


In [0]:
# Staging schema for raw table b_hier_prod_dlm: sku_id is the single-column
# primary key, declared inline on the column.
schema8 = """
sku_id STRING PRIMARY KEY,
sku_label STRING,
stylclr_id STRING,
stylclr_label STRING,
styl_id STRING,
styl_label STRING,
subcat_id INT,
subcat_label STRING,
cat_id INT,
cat_label STRING,
dept_id INT,
dept_label STRING,
issvc INT,
isasmbly INT,
isnfs INT
"""
load_stg_table("b_hier_prod_dlm", schema8)

dropped and creating table
table silver_sb.test2.stg_hier_prod created and populated


In [0]:
# Staging schema for raw table b_fact_averagecosts_dlm, with a composite
# primary key on (fscldt_id, sku_id); both key columns are declared NOT NULL.
schema9 = """
fscldt_id INT NOT NULL,
sku_id STRING NOT NULL,
average_unit_standardcost DOUBLE,
average_unit_landedcost DOUBLE,
PRIMARY KEY(fscldt_id, sku_id)
"""
# No FOREIGN KEY constraint is declared here: per the original author it was not
# supported on Databricks. On other databases the following line could be added
# to the schema.
# NOTE(review): the original comment referenced `stg_prod_dlm`; the product
# staging table built by this notebook is `stg_hier_prod`, so that name is used.
# CONSTRAINT avg_sk FOREIGN KEY(sku_id) REFERENCES stg_hier_prod
load_stg_table("b_fact_averagecosts_dlm", schema9)

dropped and creating table
table silver_sb.test2.stg_fact_averagecosts created and populated with 740805 rows


In [0]:
# Staging schema for raw table b_fact_transactions_dlm.
# NOTE(review): unlike the other schema cells, no PRIMARY KEY is declared here;
# (order_id, line_id) looks like a candidate key -- confirm against the data.
schema10 = """
order_id BIGINT,
line_id INT,
type STRING,
dt TIMESTAMP,
pos_site_id STRING,
sku_id STRING,
fscldt_id INT,
price_substate_id STRING,
sales_units INT,
sales_dollars DOUBLE,
discount_dollars DOUBLE,
original_order_id BIGINT,
original_line_id INT
"""
load_stg_table("b_fact_transactions_dlm", schema10)

dropped and creating table
table silver_sb.test2.stg_fact_transactions created and populated with 4503108 rows


In [0]:
# Function to check the foreign key between given tables
def foreign_key_check(table_name1, table_name2, foreign_key1, foreign_key2):
    """Check that every non-null child key has a matching row in the parent table.

    Counts rows of ``table_name1`` whose ``foreign_key1`` value is not NULL and
    has no equal ``foreign_key2`` value in ``table_name2``, prints whether the
    relationship holds, and returns the verdict.

    Args:
        table_name1: Referencing (child) table name.
        table_name2: Referenced (parent) table name.
        foreign_key1: Column in ``table_name1`` holding the reference.
        foreign_key2: Column in ``table_name2`` being referenced.

    Returns:
        True when no orphaned child rows exist, otherwise False.
    """
    # A correlated NOT EXISTS is used instead of `NOT IN (subquery)`: with
    # NOT IN, a single NULL in the referenced column makes the predicate
    # unknown for every row, so the orphan count would be 0 and the check
    # would wrongly pass. NULL child keys are skipped explicitly, which
    # matches what NOT IN did for them.
    orphan_query = (
        f"select count(*) from {table_name1} fk_child "
        f"where fk_child.{foreign_key1} is not null "
        f"and not exists (select 1 from {table_name2} fk_parent "
        f"where fk_parent.{foreign_key2} = fk_child.{foreign_key1})"
    )
    fails = spark.sql(orphan_query).collect()[0][0]
    is_foreign = fails == 0
    if is_foreign:
        print(f"Foreign keys {foreign_key1} for {table_name1} and {table_name2} are satisfied")
    else:
        print(f"Foreign keys {foreign_key1} for {table_name1} and {table_name2} are not satisfied")
    return is_foreign

In [0]:
# Foreign key replacement: Databricks constraints are not relied on here, so
# each relationship is verified with an explicit query instead.
# Each entry is (child table, parent table, child column, parent column).
_fk_relationships = (
    ("silver_sb.test2.stg_fact_averagecosts", "silver_sb.test2.stg_hier_prod", "sku_id", "sku_id"),
    ("silver_sb.test2.stg_fact_transactions", "silver_sb.test2.stg_hier_possite", "pos_site_id", "site_id"),
    ("silver_sb.test2.stg_fact_transactions", "silver_sb.test2.stg_hier_prod", "sku_id", "sku_id"),
    ("silver_sb.test2.stg_fact_transactions", "silver_sb.test2.stg_hier_pricestate", "price_substate_id", "substate_id"),
)
for _fk_args in _fk_relationships:
    print(foreign_key_check(*_fk_args))

"""
sku_id is foreign key in stg_fact_averagecosts referencing to sku_id in stg_hier_prod
pos_site_id is foreign key in stg_fact_transactions referencing to site_id in stg_hier_possite
sku_id is foreign key in stg_fact_transactions referencing to sku_id in stg_hier_prod
price_substate_id is foreign key in stg_fact_transactions referencing to substate_id in stg_hier_pricestate
fscldt_id is not foreign key in both tables
"""

Foreign keys price_substate_id for silver_sb.test2.stg_fact_transactions and silver_sb.test2.stg_hier_pricestate are satisfied
True


In [0]:
%sql
-- select count(distinct fscldt_id) from silver_sb.test2.stg_fact_transactions ; --1545
-- select count(distinct fscldt_id) from silver_sb.test2.stg_hier_clnd; --1820
-- select count(*) from silver_sb.test2.stg_fact_transactions where fscldt_id not in (select fscldt_id from silver_sb.test2.stg_hier_clnd); --2206380
-- select count(*) from silver_sb.test2.stg_hier_clnd where fscldt_id not in (select fscldt_id from silver_sb.test2.stg_fact_transactions); --1010