In [1]:
import os
import glob
import pandas as pd
from sqlalchemy import create_engine

In [2]:
def check_fact_student_mobilty_count(data_dir='cleaned_data/students_2008_2012', con=None):
    """ Checks row count of fact table

    Checks if the fact table ``fact_student_mobility`` has the same number
    of rows as the CSV input files. Every ``*.csv`` file found in
    ``data_dir`` or any of its sub-directories is read with ``;`` as
    separator and its data rows are summed up.

    Args:
        data_dir: Root directory that is walked for CSV input files.
        con: Optional connection or engine accepted by
            ``pandas.read_sql_query``. When omitted, an engine for the
            local ``studentdb`` PostgreSQL database is created and
            disposed again before the function returns.

    Raises:
        ValueError: If no CSV file is found below ``data_dir`` or if the
            two row counts differ.
    """
    # collect CSV files first: os.walk silently yields nothing for a
    # missing directory, which would otherwise be counted as 0 rows
    csv_files = [
        path
        for root, _dirs, _files in os.walk(data_dir)
        for path in glob.glob(os.path.join(root, '*.csv'))
    ]
    if not csv_files:
        raise ValueError("Data quality check failed. No CSV input files found in '{}'".format(data_dir))
    # get row count from CSV files
    csv_count = sum(len(pd.read_csv(path, sep=';')) for path in csv_files)
    # get row count from fact table; only dispose an engine created here,
    # a connection handed in by the caller stays under the caller's control
    owns_engine = con is None
    if owns_engine:
        con = create_engine('postgresql://tag:@localhost:5432/studentdb')
    try:
        row_count = pd.read_sql_query(
            "SELECT COUNT(*) AS row_cnt FROM fact_student_mobility;", con
        )['row_cnt'].iloc[0]
    finally:
        if owns_engine:
            con.dispose()
    # compare (the check fails for too few AND too many rows)
    if csv_count != row_count:
        raise ValueError("Data quality check failed. Fact table row count ({}) does not match input files ({})".format(
            row_count, csv_count
        ))
    print("Data quality check passed. Fact table and input file contain same number of rows: {}".format(row_count))

In [3]:
def check_host_institution_integrity(con=None):
    """ Checks quality of home institution codes

    Counts the distinct ``home_institution_code`` values that are used in
    the fact table ``fact_student_mobility`` but have no matching
    ``institution_code`` in the dimension table ``dim_institution``.

    NOTE(review): the function name says "host", but the query and the
    messages deal with home institution codes - confirm which of the two
    is intended.

    Args:
        con: Optional connection or engine accepted by
            ``pandas.read_sql_query``. When omitted, an engine for the
            local ``studentdb`` PostgreSQL database is created and
            disposed again before the function returns.

    Raises:
        ValueError: If at least one code is missing in the dimension table.
    """
    # anti-join: unmatched fact rows come back with a NULL join key;
    # testing the key column (instead of the whole row alias) is portable SQL
    sql_stmt = """
    SELECT COUNT(DISTINCT fsm.home_institution_code) AS row_cnt
    FROM fact_student_mobility AS fsm
    LEFT JOIN dim_institution AS di_home
        ON fsm.home_institution_code = di_home.institution_code
    WHERE di_home.institution_code IS NULL
    """
    # get row count from query; only dispose an engine created here,
    # a connection handed in by the caller stays under the caller's control
    owns_engine = con is None
    if owns_engine:
        con = create_engine('postgresql://tag:@localhost:5432/studentdb')
    try:
        row_count = pd.read_sql_query(sql_stmt, con)['row_cnt'].iloc[0]
    finally:
        if owns_engine:
            con.dispose()
    # compare
    if row_count != 0:
        raise ValueError("Data quality check failed. Fact table contains institution codes ({}) not available in dimension table".format(
            row_count
        ))
    print("Data quality check passed. All home institution codes are available in dimension table")

In [4]:
# run every data quality check in order; the first failing check raises
# ValueError and stops the remaining ones
for quality_check in (check_fact_student_mobilty_count, check_host_institution_integrity):
    quality_check()

Data quality check passed. Fact table and input file contain same number of rows: 1163508


ValueError: Data quality check failed. Fact table contains institution codes (181) not available in dimension table