https://towardsdatascience.com/how-to-do-data-validation-on-your-data-on-pandas-with-pytest-d5dda51ad0e4

https://github.com/BlueBrain/data-validation-framework

In [4]:
# data_processing.py
# ! pip install pytest
import pandas as pd
from pandas import DataFrame


def read_raw_data(file_path: str, chunk_size: int = 1000) -> DataFrame:
    csv_reader = pd.read_csv(file_path, chunksize=chunk_size)
    processed_chunks = []

    for chunk in csv_reader:
        chunk = chunk.loc[chunk["Order ID"] != "Order ID"].dropna()
        processed_chunks.append(chunk)

    return pd.concat(processed_chunks, axis=0)


def split_purchase_address(df_to_process: DataFrame) -> DataFrame:
    df_address_split = df_to_process["Purchase Address"].str.split(
        ",", n=3, expand=True
    )
    df_address_split.columns = ["Street Name", "City", "State and Postal Code"]

    df_state_postal_split = (
        df_address_split["State and Postal Code"]
        .str.strip()
        .str.split(" ", n=2, expand=True)
    )
    df_state_postal_split.columns = ["State Code", "Postal Code"]

    return pd.concat([df_to_process, df_address_split, df_state_postal_split], axis=1)


def extract_product_pack_information(df_to_process: DataFrame) -> DataFrame:
    df_to_process["Pack Information"] = (
        df_to_process["Product"].str.extract(r".*\((.*)\).*").fillna("Not Pack")
    )

    return df_to_process


def one_hot_encode_product_column(df_to_process: DataFrame) -> DataFrame:
    return pd.get_dummies(df_to_process, columns=["Product"])


def process_raw_data(file_path: str, chunk_size: int) -> DataFrame:
    df = read_raw_data(file_path=file_path, chunk_size=chunk_size)

    return (
        df.pipe(split_purchase_address)
        .pipe(extract_product_pack_information)
        .pipe(one_hot_encode_product_column)
    )

Collecting pytest
  Downloading pytest-7.3.1-py3-none-any.whl (320 kB)
                                              0.0/320.5 kB ? eta -:--:--
     ----                                  41.0/320.5 kB 991.0 kB/s eta 0:00:01
     --------------                         122.9/320.5 kB 1.4 MB/s eta 0:00:01
     -----------------------                194.6/320.5 kB 1.5 MB/s eta 0:00:01
     --------------------------------       276.5/320.5 kB 1.5 MB/s eta 0:00:01
     -------------------------------------- 320.5/320.5 kB 1.5 MB/s eta 0:00:00
Collecting iniconfig (from pytest)
  Downloading iniconfig-2.0.0-py3-none-any.whl (5.9 kB)
Collecting pluggy<2.0,>=0.12 (from pytest)
  Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB)
Installing collected packages: pluggy, iniconfig, pytest
Successfully installed iniconfig-2.0.0 pluggy-1.0.0 pytest-7.3.1


In [7]:
import pandas as pd
import numpy as np
import pytest
from pandas import DataFrame
#from data_processing import (
#    read_raw_data,
#   split_purchase_address,
#    extract_product_pack_information,
#    one_hot_encode_product_column,
#)
from pandas.testing import assert_series_equal, assert_index_equal

In [8]:
def test_read_raw_data():
    """Testing output of raw table read in is DataFrame"""
    test_df = read_raw_data(file_path="Updated_sales.csv", chunk_size=1000)
    assert isinstance(test_df, DataFrame)  # checking if it's a DataFrame

In [9]:
def test_pipe_functions_output_df():
    """Testing output of raw table read in is DataFrame"""
    test_df = read_raw_data(file_path="Updated_sales.csv", chunk_size=1000)
    all_pipe_functions = [
        split_purchase_address,
        extract_product_pack_information,
        one_hot_encode_product_column,
    ]
    for function in all_pipe_functions:
        assert isinstance(function(test_df), DataFrame)

In [10]:
test_df = read_raw_data(file_path="Updated_sales.csv", chunk_size=1000)
display(test_df)

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558,USB-C Charging Cable,2,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
2,176559,Bose SoundSport Headphones,1,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
3,176560,Google Phone,1,600,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
4,176560,Wired Headphones,1,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
5,176561,Wired Headphones,1,11.99,04/30/19 09:27,"333 8th St, Los Angeles, CA 90001"
...,...,...,...,...,...,...
30389,248146,Bose SoundSport Headphones,1,99.99,08/29/19 22:19,"868 Hickory St, San Francisco, CA 94016"
30390,248147,AAA Batteries (4-pack),3,2.99,08/31/19 16:26,"206 Lakeview St, Boston, MA 02215"
30391,248148,AA Batteries (4-pack),1,3.84,08/02/19 07:25,"568 13th St, Seattle, WA 98101"
30392,248149,USB-C Charging Cable,1,11.95,08/08/19 12:10,"495 Walnut St, San Francisco, CA 94016"


In [11]:
@pytest.fixture
def test_df() -> DataFrame:
    return read_raw_data(file_path="Updated_sales.csv", chunk_size=1000)


def test_read_raw_data(test_df):
    """Testing output of raw table read in is DataFrame"""
    assert isinstance(test_df, DataFrame)  # checking if it's a DataFrame


def test_pipe_functions_output_df(test_df):
    """Testing output of raw table read in is DataFrame"""
    all_pipe_functions = [
        split_purchase_address,
        extract_product_pack_information,
        one_hot_encode_product_column,
    ]
    for function in all_pipe_functions:
        assert isinstance(function(test_df), DataFrame)

In [12]:
def test_split_purchase_address(test_df):
    """Testing multiple columns in output and rows unchanged"""
    split_purchase_address_df = split_purchase_address(test_df)
    assert len(split_purchase_address_df.columns) > len(test_df.columns)
    assert split_purchase_address_df.index.__len__() == test_df.index.__len__()
    assert_index_equal(split_purchase_address_df.index, test_df.index)  # using the Pandas testing

In [13]:
def test_extract_product_pack_information(test_df):
    """Test specific output column in new DataFrame"""
    product_pack_df = extract_product_pack_information(test_df)
    assert "Pack Information" in product_pack_df.columns

In [14]:
def test_one_hot_encode_product_column(test_df):
    """Testing if column types are correct"""
    encoded_df = one_hot_encode_product_column(test_df)
    encoded_columns = [column for column in encoded_df.columns if "_" in column]
    for encoded_column in encoded_columns:
        assert encoded_df[encoded_column].dtype == np.dtype("uint8")

In [15]:
def test_process_raw_data(test_df):
    """Testing the final output DataFrame as a final sanity check"""
    processed_df = (
        test_df.pipe(split_purchase_address)
        .pipe(extract_product_pack_information)
        .pipe(one_hot_encode_product_column)
    )

    # check if all original columns are still in DataFrame
    for column in test_df.columns:
        if column not in processed_df.columns:
            raise AssertionError(f"COLUMN -- {column} -- not in final DataFrame")

    assert all(
        element in list(test_df.columns) for element in list(processed_df.columns)
    )

    # check if final DataFrame doesn't have duplicates
    assert assert_series_equal(
        processed_df["Order ID"].drop_duplicates(), test_df["Order ID"]
    )

In [16]:
pytest --verbose

NameError: name 'verbose' is not defined