Tests with new tutorial

- Tests for COLLATE and arrays in CATEGORISE
- New tutorial to test date and number variations, plus new ACTIONS
whythawk · Dec 12, 2023 · 068200b · 068200b
1 parent 8f501b0
commit 068200b
Show file tree

Hide file tree

Showing 2 changed files with 93 additions and 0 deletions.
diff --git a/tests/test_action.py b/tests/test_action.py
@@ -20,6 +20,11 @@
 SOURCE_DATA_PORTSMOUTH = SOURCE_DIRECTORY / "test_portsmouth_source.data"
 SOURCE_SCHEMA_PORTSMOUTH = SOURCE_DIRECTORY / "test_portsmouth_source.schema"
 DESTINATION_SCHEMA_PORTSMOUTH = SOURCE_DIRECTORY / "test_portsmouth_destination.schema"
+# BASILDON
+# Basildon ratepayer data consist of dates in US format, and numbers as currency strings.
+SOURCE_DATA_BASILDON = SOURCE_DIRECTORY / "test_basildon_source.data"
+SOURCE_SCHEMA_BASILDON = SOURCE_DIRECTORY / "test_basildon_source.schema"
+DESTINATION_SCHEMA_BASILDON = SOURCE_DIRECTORY / "test_basildon_destination.schema"
 
 
 def _test_script_action(script, schema_source, schema_destination, data_source):
@@ -81,6 +86,7 @@ def test_calculate(self):
         )
 
     def test_categorise(self):
+        # As values
         script = [
             "CATEGORISE > 'occupation_state'::False < 'Current Relief Type'::['Empty Property Rate Non-Industrial', 'Empty Property Rate Industrial', 'Empty Property Rate Charitable']",
             "CATEGORISE > 'occupation_state_reliefs'::'small_business' < 'Current Relief Type'::['Small Business Relief England', 'Sbre Extension For 12 Months', 'Supporting Small Business Relief']",
@@ -91,6 +97,18 @@ def test_categorise(self):
         assert _test_script_action(
             script, SOURCE_SCHEMA_PORTSMOUTH, DESTINATION_SCHEMA_PORTSMOUTH, SOURCE_DATA_PORTSMOUTH
         )
+        # As arrays
+        script = [
+            "CATEGORISE > 'occupation_state_reliefs'::'exempt' < 'MandRlfCd'::['CASC','EDUC80','MAND80','PCON','POSTO2']",
+            "CATEGORISE > 'occupation_state_reliefs'::'discretionary' < 'DiscRlfCd'::['DIS100','DISC10','DISC15','DISC30','DISC40','DISC50','DISCXX','POSTOF']",
+            "CATEGORISE > 'occupation_state_reliefs'::'retail' < 'AddRlfCd'::['RETDS3']",
+            "CATEGORISE > 'occupation_state_reliefs'::'small_business' < 'SBRFlag'::['yes']",
+        ]
+        assert _test_script_action(script, SOURCE_SCHEMA_BASILDON, DESTINATION_SCHEMA_BASILDON, SOURCE_DATA_BASILDON)
+
+    def test_collate(self):
+        script = "COLLATE > 'prop_ba_rates' < ['MandRlf', 'DiscRlf', 'AdditionalRlf', ~]"
+        assert _test_script_action(script, SOURCE_SCHEMA_BASILDON, DESTINATION_SCHEMA_BASILDON, SOURCE_DATA_BASILDON)
 
     def test_deblank(self):
         script = "DEBLANK"

diff --git a/tests/test_tutorial_4.py b/tests/test_tutorial_4.py
@@ -0,0 +1,75 @@
+from pathlib import Path
+
+import whyqd as qd
+from whyqd.parsers import CoreParser
+
+SOURCE_DIRECTORY = Path(__file__).resolve().parent / "data"
+SOURCE_DATA = SOURCE_DIRECTORY / "raw-e07000066-tutorial-4.xlsx"
+MIMETYPE = "xlsx"
+DESTINATION_MIMETYPE = "csv"
+SCHEMA_NAME = "test_schema.json"
+SCHEMA_DESTINATION = SOURCE_DIRECTORY / SCHEMA_NAME
+CATEGORY_FIELDS = ["MandRlfCd", "DiscRlfCd", "AddRlfCd", "SBRFlag", "ChgType"]
+SCRIPTS = [
+    "NEW > 'la_code' < ['E07000066']",
+    "RENAME > 'ba_ref' < ['PlaceRef']",
+    "RENAME > 'occupant_name' < ['FOIName']",
+    "RENAME > 'occupation_state_date' < ['LiabStart']",
+    "UNITE > 'postcode' < ', '::['PropAddress1','PropAddress2','PropAddress3','PropAddress4','PropAddress5','PropPostCode']",
+    "CATEGORISE > 'occupation_state_reliefs'::'exempt' < 'MandRlfCd'::['CASC','EDUC80','MAND80','PCON','POSTO2']",
+    "CATEGORISE > 'occupation_state_reliefs'::'discretionary' < 'DiscRlfCd'::['DIS100','DISC10','DISC15','DISC30','DISC40','DISC50','DISCXX','POSTOF']",
+    "CATEGORISE > 'occupation_state_reliefs'::'retail' < 'AddRlfCd'::['RETDS3']",
+    "CATEGORISE > 'occupation_state_reliefs'::'small_business' < 'SBRFlag'::['yes']",
+    "CATEGORISE > 'occupation_state'::False < 'ChgType'::['V']",
+    "COLLATE > 'prop_ba_rates' < ['MandRlf', 'DiscRlf', 'AdditionalRlf', ~]"
+]
+
+
+class TestTutorialVariations:
+    def test_tutorial_basildon_rates_data_variations(self, tmp_path):
+        """Basildon ratepayer data consist of dates in US format, and numbers as currency strings.
+
+        Demonstrates the create method, date and number coercions, array collations, and explode."""
+        DIRECTORY = tmp_path
+        CoreParser().check_path(directory=DIRECTORY)
+        # 1. Import a data source and derive a source schema
+        datasource = qd.DataSourceDefinition()
+        datasource.derive_model(source=SOURCE_DATA, mimetype=MIMETYPE)
+        schema_source = qd.SchemaDefinition()
+        schema_source.derive_model(data=datasource.get)
+        for field in schema_source.fields.get_all():
+            if field.name in ["LiabStart"]:
+                field.dtype = "usdate"
+            if field.name in ["MandRlf", "DiscRlf", "AdditionalRlf"]:
+                field.dtype = "number"
+        for cat_field in CATEGORY_FIELDS:
+            if cat_field in datasource.get_data().columns:
+                schema_source.fields.set_categories(name=cat_field, terms=datasource.get_data())
+        # 2. Import and modify a destination schema
+        schema_destination = qd.SchemaDefinition()
+        schema_destination.set(schema=SCHEMA_DESTINATION)
+        for field in schema_destination.fields.get_all():
+            if field.name in ["occupation_state_reliefs", "prop_ba_rates"]:
+                field.dtype = "array"
+        # 3. Define a Crosswalk
+        crosswalk = qd.CrosswalkDefinition()
+        crosswalk.set(schema_source=schema_source, schema_destination=schema_destination)
+        crosswalk.actions.add_multi(terms=SCRIPTS)
+        crosswalk.save(directory=DIRECTORY)
+        # 4. Transform a data source
+        transform = qd.TransformDefinition(crosswalk=crosswalk, data_source=datasource.get)
+        transform.process()
+        transform.save(directory=DIRECTORY, mimetype=DESTINATION_MIMETYPE)
+        # 5. Validate a data source
+        DESTINATION_DATA = DIRECTORY / transform.model.dataDestination.name
+        TRANSFORM = DIRECTORY / f"{transform.model.name}.transform"
+        valiform = qd.TransformDefinition()
+        valiform.validate(
+            transform=TRANSFORM, data_destination=DESTINATION_DATA, mimetype_destination=DESTINATION_MIMETYPE
+        )
+        # 6. Explode the array data columns
+        df = transform.data.copy()
+        df = df.explode(["occupation_state_reliefs", "prop_ba_rates"])
+        df.dropna(subset=["occupation_state_reliefs", "prop_ba_rates"], inplace=True)
+        df.drop_duplicates(inplace=True)
+        assert len(df) == 1127