Skip to content

Commit

Permalink
Tests with new tutorial
Browse files Browse the repository at this point in the history
- Tests for COLLATE and arrays in CATEGORISE
- New tutorial to test date and number variations, plus new ACTIONS
  • Loading branch information
turukawa committed Dec 12, 2023
1 parent 8f501b0 commit 068200b
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 0 deletions.
18 changes: 18 additions & 0 deletions tests/test_action.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@
SOURCE_DATA_PORTSMOUTH = SOURCE_DIRECTORY / "test_portsmouth_source.data"
SOURCE_SCHEMA_PORTSMOUTH = SOURCE_DIRECTORY / "test_portsmouth_source.schema"
DESTINATION_SCHEMA_PORTSMOUTH = SOURCE_DIRECTORY / "test_portsmouth_destination.schema"
# BASILDON
# Basildon ratepayer data consist of dates in US format, and numbers as currency strings.
# These fixtures are passed to `_test_script_action` by the array-based CATEGORISE
# checks and by the COLLATE test in this module.
SOURCE_DATA_BASILDON = SOURCE_DIRECTORY / "test_basildon_source.data"
SOURCE_SCHEMA_BASILDON = SOURCE_DIRECTORY / "test_basildon_source.schema"
DESTINATION_SCHEMA_BASILDON = SOURCE_DIRECTORY / "test_basildon_destination.schema"


def _test_script_action(script, schema_source, schema_destination, data_source):
Expand Down Expand Up @@ -81,6 +86,7 @@ def test_calculate(self):
)

def test_categorise(self):
# As values
script = [
"CATEGORISE > 'occupation_state'::False < 'Current Relief Type'::['Empty Property Rate Non-Industrial', 'Empty Property Rate Industrial', 'Empty Property Rate Charitable']",
"CATEGORISE > 'occupation_state_reliefs'::'small_business' < 'Current Relief Type'::['Small Business Relief England', 'Sbre Extension For 12 Months', 'Supporting Small Business Relief']",
Expand All @@ -91,6 +97,18 @@ def test_categorise(self):
assert _test_script_action(
script, SOURCE_SCHEMA_PORTSMOUTH, DESTINATION_SCHEMA_PORTSMOUTH, SOURCE_DATA_PORTSMOUTH
)
# As arrays
script = [
"CATEGORISE > 'occupation_state_reliefs'::'exempt' < 'MandRlfCd'::['CASC','EDUC80','MAND80','PCON','POSTO2']",
"CATEGORISE > 'occupation_state_reliefs'::'discretionary' < 'DiscRlfCd'::['DIS100','DISC10','DISC15','DISC30','DISC40','DISC50','DISCXX','POSTOF']",
"CATEGORISE > 'occupation_state_reliefs'::'retail' < 'AddRlfCd'::['RETDS3']",
"CATEGORISE > 'occupation_state_reliefs'::'small_business' < 'SBRFlag'::['yes']",
]
assert _test_script_action(script, SOURCE_SCHEMA_BASILDON, DESTINATION_SCHEMA_BASILDON, SOURCE_DATA_BASILDON)

def test_collate(self):
    """Run a single COLLATE action against the Basildon fixtures.

    The script collates the three relief columns, plus the `~` term, into
    the `prop_ba_rates` destination field; the test passes when the shared
    `_test_script_action` helper returns a truthy result.
    """
    collate_script = "COLLATE > 'prop_ba_rates' < ['MandRlf', 'DiscRlf', 'AdditionalRlf', ~]"
    outcome = _test_script_action(
        collate_script,
        SOURCE_SCHEMA_BASILDON,
        DESTINATION_SCHEMA_BASILDON,
        SOURCE_DATA_BASILDON,
    )
    assert outcome

def test_deblank(self):
script = "DEBLANK"
Expand Down
75 changes: 75 additions & 0 deletions tests/test_tutorial_4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from pathlib import Path

import whyqd as qd
from whyqd.parsers import CoreParser

# Fixture locations, resolved relative to this test module.
SOURCE_DIRECTORY = Path(__file__).resolve().parent.joinpath("data")
SOURCE_DATA = SOURCE_DIRECTORY.joinpath("raw-e07000066-tutorial-4.xlsx")

# Mimetype of the source workbook, and of the transformed output that is saved and validated.
MIMETYPE = "xlsx"
DESTINATION_MIMETYPE = "csv"

# Destination schema loaded by the test.
SCHEMA_NAME = "test_schema.json"
SCHEMA_DESTINATION = SOURCE_DIRECTORY.joinpath(SCHEMA_NAME)

# Source columns that get category terms assigned (when present in the data).
CATEGORY_FIELDS = [
    "MandRlfCd",
    "DiscRlfCd",
    "AddRlfCd",
    "SBRFlag",
    "ChgType",
]

# Crosswalk action scripts, added to the crosswalk in this order.
SCRIPTS = [
    # New and renamed fields
    "NEW > 'la_code' < ['E07000066']",
    "RENAME > 'ba_ref' < ['PlaceRef']",
    "RENAME > 'occupant_name' < ['FOIName']",
    "RENAME > 'occupation_state_date' < ['LiabStart']",
    # Address parts united into a single field
    "UNITE > 'postcode' < ', '::['PropAddress1','PropAddress2','PropAddress3','PropAddress4','PropAddress5','PropPostCode']",
    # Relief codes and flags mapped onto destination categories
    "CATEGORISE > 'occupation_state_reliefs'::'exempt' < 'MandRlfCd'::['CASC','EDUC80','MAND80','PCON','POSTO2']",
    "CATEGORISE > 'occupation_state_reliefs'::'discretionary' < 'DiscRlfCd'::['DIS100','DISC10','DISC15','DISC30','DISC40','DISC50','DISCXX','POSTOF']",
    "CATEGORISE > 'occupation_state_reliefs'::'retail' < 'AddRlfCd'::['RETDS3']",
    "CATEGORISE > 'occupation_state_reliefs'::'small_business' < 'SBRFlag'::['yes']",
    "CATEGORISE > 'occupation_state'::False < 'ChgType'::['V']",
    # Relief amounts collated into an array field
    "COLLATE > 'prop_ba_rates' < ['MandRlf', 'DiscRlf', 'AdditionalRlf', ~]",
]


class TestTutorialVariations:
    """End-to-end tutorial test: derive schemas, crosswalk, transform, validate, explode."""

    def test_tutorial_basildon_rates_data_variations(self, tmp_path):
        """Basildon ratepayer data consist of dates in US format, and numbers as currency strings.

        Demonstrating create method, date and number coercions, array collations, and explode.
        """
        working_dir = tmp_path
        CoreParser().check_path(directory=working_dir)

        # 1. Import a data source and derive a source schema
        datasource = qd.DataSourceDefinition()
        datasource.derive_model(source=SOURCE_DATA, mimetype=MIMETYPE)
        schema_source = qd.SchemaDefinition()
        schema_source.derive_model(data=datasource.get)
        # Override the derived dtypes for the date column and the relief amount columns.
        date_columns = ["LiabStart"]
        numeric_columns = ["MandRlf", "DiscRlf", "AdditionalRlf"]
        for source_field in schema_source.fields.get_all():
            if source_field.name in date_columns:
                source_field.dtype = "usdate"
            if source_field.name in numeric_columns:
                source_field.dtype = "number"
        # Assign category terms only for the category columns actually present in the data.
        for category_name in CATEGORY_FIELDS:
            if category_name not in datasource.get_data().columns:
                continue
            schema_source.fields.set_categories(name=category_name, terms=datasource.get_data())

        # 2. Import and modify a destination schema
        schema_destination = qd.SchemaDefinition()
        schema_destination.set(schema=SCHEMA_DESTINATION)
        array_columns = ["occupation_state_reliefs", "prop_ba_rates"]
        for destination_field in schema_destination.fields.get_all():
            if destination_field.name in array_columns:
                destination_field.dtype = "array"

        # 3. Define a Crosswalk
        crosswalk = qd.CrosswalkDefinition()
        crosswalk.set(schema_source=schema_source, schema_destination=schema_destination)
        crosswalk.actions.add_multi(terms=SCRIPTS)
        crosswalk.save(directory=working_dir)

        # 4. Transform a data source
        transform = qd.TransformDefinition(crosswalk=crosswalk, data_source=datasource.get)
        transform.process()
        transform.save(directory=working_dir, mimetype=DESTINATION_MIMETYPE)

        # 5. Validate a data source
        destination_data = working_dir / transform.model.dataDestination.name
        transform_path = working_dir / f"{transform.model.name}.transform"
        valiform = qd.TransformDefinition()
        valiform.validate(
            transform=transform_path,
            data_destination=destination_data,
            mimetype_destination=DESTINATION_MIMETYPE,
        )

        # 6. Explode the array data columns
        # Work on a copy so the transform's own data is left untouched.
        exploded = transform.data.copy().explode(array_columns)
        exploded = exploded.dropna(subset=array_columns).drop_duplicates()
        assert len(exploded) == 1127

0 comments on commit 068200b

Please sign in to comment.