In [44]:
#Setup
import pandas as pd
from db import get_engine

engine = get_engine()

#Statements issued once, in this order, on a single connection:
#enable foreign-key enforcement, then attach the two database files under
#the schema names used by the queries below
#NOTE(review): ATTACH applies to the connection it runs on; whether later
#pd.read_sql(..., engine) calls see these schemas depends on how get_engine()
#pools connections - confirm in db.py
setup_statements = (
    "PRAGMA foreign_keys = ON;",
    "ATTACH DATABASE 'raw.db' AS raw;",
    "ATTACH DATABASE 'relational.db' AS relational;",
)

with engine.begin() as connection:
    for statement in setup_statements:
        connection.exec_driver_sql(statement)

In [45]:
#Validating that all required staging views exist before performing sanity checks
#Failing indicates that transformation (02_transformation) was not performed
tables_needed = [
    'stg_crm_customers',
    'stg_crm_products',
    'stg_crm_sales',
    'stg_erp_customers',
    'stg_erp_locations',
    'stg_erp_product_categories',
]

#Names of every view currently defined in the attached raw database
tables_loaded = pd.read_sql("""
SELECT name
FROM raw.sqlite_master
WHERE type = 'view'
;""", engine)['name'].tolist()

missing = set(tables_needed) - set(tables_loaded)

if missing:
    #Sorted so the report is stable between runs (set iteration order is arbitrary)
    missing_sorted = sorted(missing)
    print('Missing Tables:')
    for m in missing_sorted:
        print(m)
    #The missing names are repeated in the exception so the traceback alone is actionable
    raise RuntimeError(
        'Run 02_transformation.ipynb first; missing staging views: '
        + ', '.join(missing_sorted))

### Sanity Checks

#### stg_crm_customers

In [46]:
#Preview: up to 5 rows of raw.stg_crm_customers, all columns
pd.read_sql(
    sql="""
SELECT *
FROM raw.stg_crm_customers
LIMIT 5
;""",
    con=engine,
)

Unnamed: 0,cst_id,cst_key,cst_firstname,cst_lastname,cst_marital_status,cst_gndr,cst_create_date
0,11000,AW00011000,Jon,Yang,Married,Male,2025-10-06
1,11001,AW00011001,Eugene,Huang,Single,Male,2025-10-06
2,11002,AW00011002,Ruben,Torres,Married,Male,2025-10-06
3,11003,AW00011003,Christy,Zhu,Single,Female,2025-10-06
4,11004,AW00011004,Elizabeth,Johnson,Single,Female,2025-10-06


In [47]:
#Rows whose cst_id is NULL; an empty result means none remain in the view
pd.read_sql(
    sql="""
SELECT *
FROM raw.stg_crm_customers
WHERE cst_id IS NULL
;""",
    con=engine,
)

Unnamed: 0,cst_id,cst_key,cst_firstname,cst_lastname,cst_marital_status,cst_gndr,cst_create_date


In [48]:
#cst_id values that occur on more than one row; an empty result means cst_id is unique
pd.read_sql(
    sql="""
SELECT cst_id
FROM raw.stg_crm_customers
GROUP BY cst_id
HAVING COUNT(*) > 1
;""",
    con=engine,
)

Unnamed: 0,cst_id


In [49]:
#Rows whose cst_create_date is later than today's date as returned by SQLite's DATE('now')
pd.read_sql(
    sql="""
SELECT *
FROM raw.stg_crm_customers
WHERE cst_create_date > DATE('now');
""",
    con=engine,
)

Unnamed: 0,cst_id,cst_key,cst_firstname,cst_lastname,cst_marital_status,cst_gndr,cst_create_date


#### stg_crm_products

In [50]:
#Preview: up to 5 rows of raw.stg_crm_products, all columns
pd.read_sql(
    sql="""
SELECT *
FROM raw.stg_crm_products
LIMIT 5
;""",
    con=engine,
)

Unnamed: 0,prd_id,cat_id,prd_key,prd_nm,prd_cost,prd_line,prd_start_dt,prd_end_dt
0,478,AC-BC,BC-M005,Mountain Bottle Cage,4,Mountain,2013-07-01,
1,479,AC-BC,BC-R205,Road Bottle Cage,3,Road,2013-07-01,
2,477,AC-BC,WB-H098,Water Bottle - 30 oz.,2,Other,2013-07-01,
3,483,AC-BR,RA-H123,Hitch Rack - 4-Bike,45,Other,2013-07-01,
4,486,AC-BS,ST-1401,All-Purpose Bike Stand,59,Mountain,2013-07-01,


In [51]:
#Checking start and end date of records of the same product
#Only product names that appear on more than one row are listed
#Rows are ordered explicitly: without ORDER BY, SQLite does not guarantee which
#10 rows LIMIT returns, nor that records of one product appear next to each other
pd.read_sql("""
SELECT cat_id, prd_key, prd_nm, prd_start_dt, prd_end_dt
FROM raw.stg_crm_products
WHERE prd_nm IN (
    SELECT prd_nm
    FROM raw.stg_crm_products
    GROUP BY prd_nm
    HAVING COUNT(*)>1)
ORDER BY cat_id, prd_key, prd_start_dt
LIMIT 10;""", engine)

Unnamed: 0,cat_id,prd_key,prd_nm,prd_start_dt,prd_end_dt
0,AC-HE,HL-U509,Sport-100 Helmet- Black,2011-07-01,2012-06-30
1,AC-HE,HL-U509,Sport-100 Helmet- Black,2012-07-01,2013-06-30
2,AC-HE,HL-U509,Sport-100 Helmet- Black,2013-07-01,
3,AC-HE,HL-U509-B,Sport-100 Helmet- Blue,2011-07-01,2012-06-30
4,AC-HE,HL-U509-B,Sport-100 Helmet- Blue,2012-07-01,2013-06-30
5,AC-HE,HL-U509-B,Sport-100 Helmet- Blue,2013-07-01,
6,AC-HE,HL-U509-R,Sport-100 Helmet- Red,2011-07-01,2012-06-30
7,AC-HE,HL-U509-R,Sport-100 Helmet- Red,2012-07-01,2013-06-30
8,AC-HE,HL-U509-R,Sport-100 Helmet- Red,2013-07-01,
9,BI-MB,BK-M68B-38,Mountain-200 Black- 38,2012-07-01,2013-06-30


In [52]:
#Count of product rows whose start date falls after their end date
#(rows with a NULL prd_end_dt never satisfy the comparison, so they are not counted)
pd.read_sql(
    sql="""
SELECT COUNT(*)
FROM raw.stg_crm_products
WHERE DATE(prd_start_dt) > DATE(prd_end_dt)
;""",
    con=engine,
)

Unnamed: 0,COUNT(*)
0,0


#### stg_crm_sales

In [31]:
#Sales rows that break any of these rules:
#sales, quantity and price must each be present and greater than zero,
#and quantity * price must equal sales
pd.read_sql(
    sql="""
SELECT sls_sales, sls_quantity, sls_price
FROM raw.stg_crm_sales
WHERE sls_sales <= 0
OR sls_sales IS NULL
OR sls_quantity <= 0
OR sls_quantity IS NULL
OR sls_price <= 0
OR sls_price IS NULL
OR sls_quantity * sls_price != sls_sales;""",
    con=engine,
)

Unnamed: 0,sls_sales,sls_quantity,sls_price


#### stg_erp_customers

In [32]:
#Preview: up to 5 rows of raw.stg_erp_customers, all columns
pd.read_sql(
    sql="""
SELECT *
FROM raw.stg_erp_customers
LIMIT 5
;""",
    con=engine,
)

Unnamed: 0,CID,BDATE,GEN
0,AW00011000,1971-10-06,Male
1,AW00011001,1976-05-10,Male
2,AW00011002,1971-02-09,Male
3,AW00011003,1973-08-14,Female
4,AW00011004,1979-08-05,Female


In [33]:
#Checking CID for formatting issues
#Lists every CID that is missing or does not start with the 'AW00' prefix
#NULL is tested explicitly because `NULL NOT LIKE ...` evaluates to NULL, so
#rows with a missing CID would otherwise be silently left out of the result
#Note: SQLite's LIKE is case-insensitive for ASCII by default, so lowercase
#prefixes such as 'aw00' are not reported by this check
pd.read_sql("""
SELECT CID
FROM raw.stg_erp_customers
WHERE CID IS NULL
OR CID NOT LIKE 'AW00%';""", engine)

Unnamed: 0,CID


In [34]:
#Checking BDATE for invalid (futuristic) dates
#The full date is compared rather than only the year, so a birth date later in
#the current year is reported too
#DATE('now') is evaluated by SQLite in UTC; rows where DATE(BDATE) is NULL
#(missing or unparseable values) never satisfy the comparison
pd.read_sql("""
SELECT BDATE
FROM raw.stg_erp_customers
WHERE DATE(BDATE) > DATE('now')
LIMIT 5;""", engine)

Unnamed: 0,BDATE


#### stg_erp_locations

In [35]:
#Preview: up to 5 rows of raw.stg_erp_locations, all columns
pd.read_sql(
    sql="""
SELECT *
FROM raw.stg_erp_locations
LIMIT 5
;""",
    con=engine,
)

Unnamed: 0,CID,CNTRY
0,AW00011000,Australia
1,AW00011001,Australia
2,AW00011002,Australia
3,AW00011003,Australia
4,AW00011004,Australia


In [36]:
#Checking CID for formatting issues
#Lists every CID that is missing or does not start with the 'AW00' prefix
#NULL is tested explicitly because `NULL NOT LIKE ...` evaluates to NULL, so
#rows with a missing CID would otherwise be silently left out of the result
#Note: SQLite's LIKE is case-insensitive for ASCII by default, so lowercase
#prefixes such as 'aw00' are not reported by this check
pd.read_sql("""
SELECT CID
FROM raw.stg_erp_locations
WHERE CID IS NULL
OR CID NOT LIKE 'AW00%';""", engine)

Unnamed: 0,CID


In [37]:
#Every distinct CNTRY value in the view, listed so spelling or formatting variants can be spotted by eye
pd.read_sql(
    sql="""
SELECT DISTINCT CNTRY
FROM raw.stg_erp_locations;""",
    con=engine,
)

Unnamed: 0,CNTRY
0,Australia
1,United States
2,Canada
3,Germany
4,United Kingdom
5,France
6,


#### stg_erp_product_categories

In [38]:
#Preview: up to 5 rows of raw.stg_erp_product_categories, all columns
pd.read_sql(
    sql="""
SELECT *
FROM raw.stg_erp_product_categories
LIMIT 5
;""",
    con=engine,
)

Unnamed: 0,ID,CAT,SUBCAT,MAINTENANCE
0,AC-BR,Accessories,Bike Racks,Yes
1,AC-BS,Accessories,Bike Stands,No
2,AC-BC,Accessories,Bottles and Cages,No
3,AC-CL,Accessories,Cleaners,Yes
4,AC-FE,Accessories,Fenders,No


In [39]:
#Checking ID for formatting issues
#Lists every category row whose ID is missing or contains no '-' separator
#NULL is tested explicitly because `NULL NOT LIKE ...` evaluates to NULL, so
#rows with a missing ID would otherwise be silently left out of the result
pd.read_sql("""
SELECT *
FROM raw.stg_erp_product_categories
WHERE ID IS NULL
OR ID NOT LIKE '%-%'
;""", engine)

Unnamed: 0,ID,CAT,SUBCAT,MAINTENANCE
