In [172]:
#Setup
import pandas as pd
from db import get_engine

engine = get_engine()

# Statements issued once, in this order, on a single connection:
# enable foreign-key enforcement, then attach the two database files
# under the schema names `raw` and `relational`.
# NOTE(review): in SQLite both PRAGMA foreign_keys and ATTACH apply to one
# connection only; the later cells presumably rely on get_engine() handing
# back that same connection - confirm its pool configuration.
setup_statements = (
    "PRAGMA foreign_keys = ON;",
    "ATTACH DATABASE 'raw.db' AS raw;",
    "ATTACH DATABASE 'relational.db' AS relational;",
)

with engine.begin() as connection:
    for statement in setup_statements:
        connection.exec_driver_sql(statement)

### Creating Tables Holding Raw CSV Data

In [173]:
#Dropping tables if they already exist
# Names of the tables in the attached `raw` schema, one per source CSV file.
raw_tables = [
    'crm_customers',
    'crm_products',
    'crm_sales',
    'erp_customers',
    'erp_locations',
    'erp_product_categories',
]

# All drops share one transaction; IF EXISTS makes a missing table a no-op.
with engine.begin() as connection:
    for table_name in raw_tables:
        drop_statement = f"DROP TABLE IF EXISTS raw.{table_name};"
        connection.exec_driver_sql(drop_statement)

In [174]:
#Creating a table to hold raw data for each CSV file
# One CREATE TABLE statement per raw table. Every column is declared TEXT,
# and IF NOT EXISTS makes each statement a no-op when the table is present.
raw_table_ddl = (
    """
    CREATE TABLE IF NOT EXISTS raw.crm_customers (
    cst_id TEXT,
    cst_key TEXT,
    cst_firstname TEXT,
    cst_lastname TEXT,
    cst_marital_status TEXT,
    cst_gndr TEXT,
    cst_create_date TEXT)""",
    """
    CREATE TABLE IF NOT EXISTS raw.crm_products (
    prd_id TEXT,
    prd_key TEXT,
    prd_nm TEXT,
    prd_cost TEXT,
    prd_line TEXT,
    prd_start_dt TEXT,
    prd_end_dt TEXT)""",
    """
    CREATE TABLE IF NOT EXISTS raw.crm_sales (
    sls_ord_num TEXT,
    sls_prd_key TEXT,
    sls_cust_id TEXT,
    sls_order_dt TEXT,
    sls_ship_dt TEXT,
    sls_due_dt TEXT,
    sls_sales TEXT,
    sls_quantity TEXT,
    sls_price TEXT)""",
    """
    CREATE TABLE IF NOT EXISTS raw.erp_customers (
    CID TEXT,
    BDATE TEXT,
    GEN TEXT)""",
    """
    CREATE TABLE IF NOT EXISTS raw.erp_locations (
    CID TEXT,
    CNTRY TEXT)""",
    """
    CREATE TABLE IF NOT EXISTS raw.erp_product_categories (
    ID TEXT,
    CAT TEXT,
    SUBCAT TEXT,
    MAINTENANCE TEXT)""",
)

# Run the statements in order inside a single transaction.
with engine.begin() as connection:
    for ddl in raw_table_ddl:
        connection.exec_driver_sql(ddl)

In [175]:
#Loading data from CSV files into the raw tables
# Maps each raw table to the CSV file that feeds it.
raw_csv_files = {
    'crm_customers': 'datasets/CRM/cust_info.csv',
    'crm_products': 'datasets/CRM/prd_info.csv',
    'crm_sales': 'datasets/CRM/sales_details.csv',
    'erp_customers': 'datasets/ERP/CUST_AZ12.csv',
    'erp_locations': 'datasets/ERP/LOC_A101.csv',
    'erp_product_categories': 'datasets/ERP/PX_CAT_G1V2.csv',
}

# Why "append" and not "replace": to_sql(if_exists="replace") drops the target
# table before inserting, which throws away the all-TEXT schema declared by the
# CREATE TABLE cell and lets pandas infer column types instead.
# Why dtype=str: it stops pandas from reinterpreting values while parsing
# (e.g. an integer column with gaps becoming floats) before they reach the
# TEXT columns.
# NOTE(review): appending requires the CSV header names to match the declared
# column names; a mismatch raises instead of silently redefining the table.
with engine.begin() as connection:
    for table, path in raw_csv_files.items():
        # Empty the table first so re-running this cell does not duplicate rows.
        connection.exec_driver_sql(f"DELETE FROM raw.{table};")
        raw_frame = pd.read_csv(path, dtype=str)
        raw_frame.to_sql(table, connection, schema='raw',
                         if_exists="append", index=False)

37

### Sanity Checks

In [176]:
#Ensuring each table has the same column names, in the same order, as its CSV
sources = {
    'crm_customers': 'datasets/CRM/cust_info.csv',
    'crm_products': 'datasets/CRM/prd_info.csv',
    'crm_sales': 'datasets/CRM/sales_details.csv',
    'erp_customers': 'datasets/ERP/CUST_AZ12.csv',
    'erp_locations': 'datasets/ERP/LOC_A101.csv',
    'erp_product_categories': 'datasets/ERP/PX_CAT_G1V2.csv',
}

# One entry per table whose column list differs from its CSV header.
column_mismatches = []

with engine.begin() as connection:
    for table, path in sources.items():
        csv_columns = pd.read_csv(path).columns.tolist()
        # PRAGMA table_info returns one row per column; `name` is the column name.
        table_info = pd.read_sql(f"PRAGMA raw.table_info({table});", connection)
        db_columns = table_info['name'].to_list()

        if csv_columns == db_columns:
            continue
        column_mismatches.append({'table': table,
                                  'expected': csv_columns,
                                  'actual': db_columns})

# Report every mismatch before failing so all problems show up in one run.
for entry in column_mismatches:
    print(f"Column mismatch in {entry['table']}")
    print(f"Expected: {entry['expected']}")
    print(f"Found: {entry['actual']}\n")

if column_mismatches:
    raise RuntimeError('Columns not properly loaded')

print('All tables loaded with expected columns!')

All tables loaded with expected columns!


In [177]:
#Ensuring proper number of rows per table
sources = {
    'crm_customers': 'datasets/CRM/cust_info.csv',
    'crm_products': 'datasets/CRM/prd_info.csv',
    'crm_sales': 'datasets/CRM/sales_details.csv',
    'erp_customers': 'datasets/ERP/CUST_AZ12.csv',
    'erp_locations': 'datasets/ERP/LOC_A101.csv',
    'erp_product_categories': 'datasets/ERP/PX_CAT_G1V2.csv',
}

# One entry per table whose row count differs from its CSV file.
row_count_mismatches = []

with engine.begin() as connection:
    for table, path in sources.items():
        csv_row_count = len(pd.read_csv(path))
        count_result = connection.exec_driver_sql(f"SELECT COUNT(*) FROM raw.{table}")
        db_row_count = count_result.scalar()

        if csv_row_count == db_row_count:
            continue
        row_count_mismatches.append({'table': table,
                                     'expected': csv_row_count,
                                     'actual': db_row_count})

# Report every mismatch before failing so all problems show up in one run.
for entry in row_count_mismatches:
    print(f"Row count mismatch in {entry['table']}")
    print(f"Expected: {entry['expected']}")
    print(f"Actual: {entry['actual']}\n")

if row_count_mismatches:
    raise RuntimeError('Rows not properly loaded')

print('All tables loaded with expected rows!')

All tables loaded with expected rows!
