The following describes the steps taken to update the `dict.txt` file per the issue [Update dict.txt - remove xylazine terms](https://github.com/suzytamang/clever-rockies/issues/19).

This will treat each modification as a separate and distinct series of actions (deletes), which will be used in turn to inform case [Support Feature: Renumbering and Validation Script](https://github.com/suzytamang/clever-rockies/issues/22).

In [25]:
# Imports, pandas display options, and shared display helper


from abc import ABC
from dataclasses import dataclass

from typing import Callable, List
import pandas as pd
from math import floor
from tabulate import tabulate
from typing import Self
from typing import Literal
from typing import cast
import functools
from static_frame.core.frame import Frame
from typing import List


# Pandas display options for this notebook: show every column, use a 200
# character wide console, and cap printed rows / sequence items.
pd.set_option("display.max_columns", None)
# pd.set_option("display.max_columns", 20)
pd.set_option("display.width", 200)
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_seq_items", 2000)

# Passed to TerminologyManager as ``ignore_validation_errors``; when True the
# manager's guarded methods remain callable even if validation found problems.
IGNORE_VALIDATION_ERRORS = True


class FrameHelper:
    """Console helpers for displaying data frames."""

    @staticmethod
    def show(title: str, df: pd.DataFrame):
        """Print a titled, psql-formatted table of ``df`` followed by a blank line.

        Args:
            title (str): heading printed above the table
            df (pd.DataFrame): frame to render; its index is not shown
        """
        print(title)
        rendered_table = tabulate(
            df,
            headers="keys",
            tablefmt="psql",
            showindex=False,
        )
        print(rendered_table)
        print()

In [26]:
# Constants
class ColumnLabels(ABC):
    """Base class for groups of column-name constants.

    Subclasses declare each column name as a public (non-underscore) class
    attribute.
    """

    @classmethod
    def all_values(cls) -> List[str]:
        """Return every label declared on ``cls`` or inherited from its bases.

        Labels are returned base-class first, in declaration order. Methods,
        properties and underscore-prefixed attributes are not labels and are
        skipped.

        Returns:
            List[str]: the label values
        """
        labels = {}
        # ``vars(cls)`` only exposes the class's own namespace, so walk the MRO
        # (base first) to pick up inherited labels; keying on the attribute
        # name lets a subclass override an inherited label.
        for klass in reversed(cls.__mro__):
            for name, value in vars(klass).items():
                if name.startswith("_"):
                    continue
                if callable(value) or isinstance(
                    value, (classmethod, staticmethod, property)
                ):
                    continue
                labels[name] = value
        return list(labels.values())


class HashLabels(ColumnLabels):
    """Names of the columns that Term.calc_hash_key combines into a hash."""

    TERM_LABEL = "term"
    SUBCLASS_LABEL = "subclass_name"
    CLASS_LABEL = "class_name"


class TermLabels(HashLabels):
    """Column names of a terminology entry: the hash columns plus the id."""

    ID_LABEL = "id"


class NumberingDetails(ColumnLabels):
    """Names of helper columns added to the terminology frame.

    ``id_chunk`` is written by TerminologyRenumberStrategy and ``hash_code``
    by TerminologyManager.
    """

    ID_CHUNK_LABEL = "id_chunk"
    HASH_LABEL = "hash_code"


class ValidationFlags(ColumnLabels):
    """Names of the boolean flag columns written during validation."""

    # Result of the term-level rules for each row.
    IS_VALID_TERM_LABEL = "is_valid_term"
    # Flag column used by DuplicateIdsCheckingStrategy.
    HAS_DUPLICATE_ID_LABEL = "has_duplicate_id"
    # Flag column used by DuplicateTermsCheckingStrategy.
    DUPLICATE_TERM_CLASS_SUBCLASS_LABEL = "duplicate_term_class_subclass"

In [27]:

class IResetChanged(ABC):
    """Interface for objects whose "changed" state can be cleared."""

    def reset(self):
        """Clear the changed state; this base implementation does nothing."""
        return None


class ISetChanged(ABC):
    """Interface for objects that can be marked as changed and queried."""

    def set_changed(self):
        """Mark the object as changed; this base implementation does nothing."""
        return None

    @property
    def is_changed(self) -> bool:
        """Whether a change was recorded; this base implementation yields None."""
        return None


class ChangedFlag(ISetChanged, IResetChanged):
    """Class to manage changed flag; controls access to only set as changed

    The flag starts cleared. Code holding the object as an ``ISetChanged`` can
    only raise the flag, while code holding it as an ``IResetChanged`` can
    only clear it.
    """

    def __init__(self) -> None:
        self._is_changed = False

    def set_changed(self):
        """Raise the flag."""
        self._is_changed = True

    def reset(self):
        """Clear the flag."""
        self._is_changed = False

    # Defined once; the original declared this property twice, the second
    # definition silently replacing the first.
    @property
    def is_changed(self) -> bool:
        """True when ``set_changed`` was called since the last ``reset``."""
        return self._is_changed

In [28]:
from typing import Union


@dataclass
class Term:
    """A single terminology entry: id, term text, subclass and class."""

    id: int
    term: str
    subclass_name: str
    class_name: str

    @staticmethod
    def calc_hash_key(row: Union[pd.Series, 'Term']):
        """Hash the term / class / subclass triple of a Term or a frame row.

        The id takes no part in the hash. The value comes from the built-in
        ``hash`` of a string, which Python salts per process unless
        PYTHONHASHSEED is fixed, so hash codes are only comparable within a
        single interpreter session.

        Args:
            row (Union[pd.Series, Term]): a Term, or a row indexable by the
                HashLabels column names

        Returns:
            int: hash of ``"<term>_<class>_<subclass>"``
        """
        if isinstance(row, Term):
            text, cls_name, subcls_name = (
                row.term,
                row.class_name,
                row.subclass_name,
            )
        else:
            text = row[HashLabels.TERM_LABEL]
            cls_name = row[HashLabels.CLASS_LABEL]
            subcls_name = row[HashLabels.SUBCLASS_LABEL]
        return hash(f"{text}_{cls_name}_{subcls_name}")

    def hash_key(self):
        """Return this term's hash as computed by ``calc_hash_key``."""
        return Term.calc_hash_key(self)

    def __hash__(self):
        return self.hash_key()

    @staticmethod
    def deserialize(term_string: str) -> "Term":
        """Build a Term from a pipe-delimited ``id|term|subclass|class`` string.

        Args:
            term_string (str): the serialized entry

        Returns:
            Term: the parsed entry, with the id converted to ``int``
        """
        fields = term_string.split("|")
        return Term(int(fields[0]), fields[1], fields[2], fields[3])

    @staticmethod
    def deserialize_terms(term_strings: List[str]) -> List["Term"]:
        """Deserialize each string in ``term_strings``, preserving order."""
        return list(map(Term.deserialize, term_strings))

In [29]:
# Term-level validator
class TermValidator:
    """Callable that checks a single term against a set of rules."""

    def __init__(self, *rules: Callable[[Term], bool]) -> None:
        """Store the rules to apply.

        Args:
            *rules (Callable[[Term], bool]): predicates a valid term must
                satisfy; with none given, every term is treated as valid
        """
        # Fall back to the always-true rule when no rules were supplied.
        self._rules = list(rules) if rules else [TermValidator.noop]

    def __call__(self, term: Term) -> bool:
        """Return True only when every rule accepts ``term``."""
        for rule in self._rules:
            if not rule(term):
                return False
        return True

    @staticmethod
    def noop(x) -> bool:
        """Rule that accepts anything."""
        return True

In [30]:
# Duplicate-checking strategies and the terminology-level validator


@dataclass
class DuplicateCheckResults:
    """Outcome of one duplicate check over the terminology frame."""

    # True when at least one row was flagged as a duplicate.
    duplicates_found: bool
    # Name of the strategy that produced this result.
    strategy_name: str
    # Columns that were compared to detect duplicates.
    labels_to_check: List[str]
    # Name of the flag column the check wrote to the frame.
    flag_label: str
    # The rows that were flagged as duplicates.
    duplicate_entries: pd.DataFrame


class DuplicateCheckingStrategy(ABC):
    """Callable that flags rows duplicating earlier rows on chosen columns."""

    def __init__(
        self,
        strategy_name: str,
        labels_to_check: str | List[str],
        flag_label: str,
    ) -> None:
        """Configure the check.

        Args:
            strategy_name (str): name reported in the results
            labels_to_check (str | List[str]): column, or columns, to compare
            flag_label (str): name of the flag column written to the frame
        """
        super().__init__()

        self._strategy_name: str = strategy_name
        # A single column name is normalized to a one-element list.
        self._labels_to_check: List[str] = (
            [labels_to_check]
            if isinstance(labels_to_check, str)
            else labels_to_check
        )
        self._flag_label: str = flag_label

    def __call__(self, terms_df: pd.DataFrame) -> DuplicateCheckResults:
        """Run ``duplicate_check`` with the configured columns and flag."""
        return self.duplicate_check(
            terms_df, self._labels_to_check, self._flag_label
        )

    @property
    def strategy_name(self):
        """Name of this strategy."""
        return self._strategy_name

    def duplicate_check(
        self,
        terms_df: pd.DataFrame,
        labels_to_check: List[str],
        flag_label: str,
    ) -> DuplicateCheckResults:
        """Check to see if duplicates exist for a vector of columns

        The flag column is written onto ``terms_df`` itself. Only repeat
        occurrences are flagged; the first occurrence of a value combination
        is not (default behaviour of ``DataFrame.duplicated``).

        Args:
            terms_df (pd.DataFrame): terminology to check; modified in place
            labels_to_check (List[str]): vector of columns to check
            flag_label (str): label to add to dataframe for this flag

        Returns:
            DuplicateCheckResults: whether duplicates were found, plus the
                flagged rows
        """
        is_repeat: pd.Series = terms_df[labels_to_check].duplicated()
        terms_df[flag_label] = is_repeat
        flagged_rows = terms_df[terms_df[flag_label]]
        return DuplicateCheckResults(
            duplicates_found=any(is_repeat),
            strategy_name=self.strategy_name,
            labels_to_check=labels_to_check,
            flag_label=flag_label,
            duplicate_entries=flagged_rows,
        )


class DuplicateIdsCheckingStrategy(DuplicateCheckingStrategy):
    """Duplicate check over the id column."""

    def __init__(self) -> None:
        super().__init__(
            strategy_name="ids",
            labels_to_check=TermLabels.ID_LABEL,
            flag_label=ValidationFlags.HAS_DUPLICATE_ID_LABEL,
        )


class DuplicateTermsCheckingStrategy(DuplicateCheckingStrategy):
    """Duplicate check over the (term, class, subclass) combination."""

    def __init__(self) -> None:
        compared_columns = [
            TermLabels.TERM_LABEL,
            TermLabels.CLASS_LABEL,
            TermLabels.SUBCLASS_LABEL,
        ]
        super().__init__(
            strategy_name="term, class, subclass",
            labels_to_check=compared_columns,
            flag_label=ValidationFlags.DUPLICATE_TERM_CLASS_SUBCLASS_LABEL,
        )


class TerminologyValidator:
    """Validate a whole terminology frame.

    Applies the term-level validator to every row and then runs each
    duplicate-checking strategy.
    """

    def __init__(
        self,
        term_validator: TermValidator,
        *duplicate_checking: DuplicateCheckingStrategy,
    ) -> None:
        """Store the validators.

        Args:
            term_validator (TermValidator): applied to each row
            *duplicate_checking (DuplicateCheckingStrategy): duplicate checks
                to run, in order; may be empty
        """
        self._term_validator: TermValidator = term_validator
        # Varargs always arrive as a tuple (never None), so a plain list()
        # conversion covers the "no strategies" case as well.
        self._duplicate_checking_strategies: List[
            DuplicateCheckingStrategy
        ] = list(duplicate_checking)

    def __call__(self, terms_df: pd.DataFrame) -> bool:
        """Validate ``terms_df``, writing the flag columns onto it.

        Rows flagged by a duplicate check are printed.

        Args:
            terms_df (pd.DataFrame): terminology to validate; modified in
                place (validation flag columns are added)

        Returns:
            bool: True when every row passed the term-level rules and no
                duplicate check found duplicates
        """

        # validate each term with any term-level rules
        # (each row reaches the validator as a pd.Series, not a Term)
        terms_df[ValidationFlags.IS_VALID_TERM_LABEL] = terms_df.apply(
            lambda term: self._term_validator(term), axis=1
        )

        terminology_is_valid = all(
            terms_df[ValidationFlags.IS_VALID_TERM_LABEL]
        )

        for duplicate_check in self._duplicate_checking_strategies:
            check_results: DuplicateCheckResults = duplicate_check(terms_df)

            if check_results.duplicates_found:
                terminology_is_valid = False
                FrameHelper.show(
                    f"Duplicates found for {check_results.labels_to_check}",
                    check_results.duplicate_entries,
                )

        return terminology_is_valid

    def does_term_exist(
        self, terminology_df: pd.DataFrame, term: Term
    ) -> bool:
        """Check to see if the term string exists exactly once

        Only the term text is compared; class and subclass are ignored.

        NOTE(review): a term string that occurs more than once (for example
        under two subclasses) is reported as not existing - confirm this is
        the intended contract for callers such as delete_term.

        Args:
            terminology_df (pd.DataFrame): terminology to search
            term (Term): term whose text is looked up

        Returns:
            bool: True when exactly one row has this term text
        """
        matches = terminology_df[TermLabels.TERM_LABEL] == term.term
        term_count = int(matches.sum())
        if term_count > 1:
            # The original passed "%s" and the term as separate print()
            # arguments, so the placeholder was printed literally.
            print(f"More than one instance of term found: {term}")

        return term_count == 1

In [31]:
class TerminologyRenumberStrategy:
    """This strategy assumes the following:

    - Terms ids are in 1k chunks
    - Deleting a term drops the id and renumbers
        all following terms up to the next 1k boundary

    """

    ID_BLOCK_SIZE = 1000

    def __init__(self) -> None:
        # Optional change monitor that is reset after each renumbering.
        self._monitor: IResetChanged | None = None

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compacts the ids inside each 1k block so that they increase
        by one, starting at the block boundary (the first block starts
        at 1 instead of 0).

        Example:

        | Original ID | Updated ID |
        |-------------|------------|
        | 1           | 1          |
        | 2           | 2          |
        | 10          | 3          |
        | 100         | 4          |
        | 1001        | 1000       |
        | 1002        | 1001       |
        | 1010        | 1002       |

        The input frame is left untouched. The result is ordered by block
        (rows keep their relative order inside a block), has a fresh
        0..n-1 index, carries the ``id_chunk`` helper column and has the
        id as its first column. Rows with negative ids are kept and
        numbered from their own block boundary. An empty frame is
        returned empty.

        Args:
            df (pd.DataFrame): current copy of the terminology

        Returns:
            pd.DataFrame: terminology with updated ids
        """
        id_col = TermLabels.ID_LABEL
        chunk_col = NumberingDetails.ID_CHUNK_LABEL
        block_size = self.ID_BLOCK_SIZE

        # Work on a copy so the caller's frame (possibly a slice of another
        # frame) is never written to.
        working_df: pd.DataFrame = df.copy()

        # Index of the 1k block each id falls into.
        working_df[chunk_col] = (working_df[id_col] // block_size).astype(
            "int64"
        )

        # A stable sort groups the blocks while keeping the row order
        # inside each block.
        working_df = working_df.sort_values(
            by=chunk_col, kind="stable"
        ).reset_index(drop=True)

        # 0-based position of each row inside its block.
        position_in_block = working_df.groupby(chunk_col).cumcount()

        # Ids in the first block start at 1; every other block starts at
        # its boundary (1000, 2000, ...).
        first_block_offset = (working_df[chunk_col] == 0).astype("int64")
        working_df[id_col] = (
            working_df[chunk_col] * block_size
            + position_in_block
            + first_block_offset
        )

        # Keep the id as the first column.
        ordered_columns = [id_col] + [
            column for column in working_df.columns if column != id_col
        ]
        working_df = working_df[ordered_columns]

        # The helper chunk column is deliberately left in the result.

        # A monitor is optional: only reset it when one was registered.
        if self._monitor is not None:
            self._monitor.reset()

        return working_df

    def set_monitor(self, monitor: IResetChanged):
        """Register the change monitor to reset after each renumbering."""
        self._monitor = monitor

In [32]:
# Primary management class for terminology actions


class NotAllowed(Exception):
    """Raised by TerminologyManager's guard when a guarded method is called
    while its access condition is not met."""

    pass


class TerminologyManager:
    """Manage additions, deletions, and updates to terminology

    The manager keeps two frames: the terminology as loaded (validated, with
    validation flag columns) and a working copy that receives the changes.
    The working copy is renumbered lazily, the next time it is read after a
    change, and when the manager is used as a context manager, on exit.
    """

    def __init__(
        self,
        terms: pd.DataFrame | str,
        terminology_validator: TerminologyValidator,
        renumber_strategy: TerminologyRenumberStrategy,
        ignore_validation_errors: bool = False,
    ) -> None:
        """Load, hash and validate the terminology.

        Args:
            terms (pd.DataFrame | str): terminology frame, or path of a
                header-less pipe-delimited file with the columns
                id|term|subclass|class
            terminology_validator (TerminologyValidator): validates the
                loaded terminology
            renumber_strategy (TerminologyRenumberStrategy): renumbers ids
                after changes
            ignore_validation_errors (bool): when True, guarded methods stay
                callable even if validation failed
        """
        self._is_terminology_valid: bool = False
        self._ignore_validation_errors: bool = ignore_validation_errors
        self._change_monitor: ISetChanged = ChangedFlag()
        if isinstance(terms, str):
            # NOTE(review): read_csv's default NA handling turns values such
            # as "NA" or "null" into NaN - confirm no term uses those strings.
            terms = pd.read_csv(
                terms,
                delimiter="|",
                names=[
                    TermLabels.ID_LABEL,
                    TermLabels.TERM_LABEL,
                    TermLabels.SUBCLASS_LABEL,
                    TermLabels.CLASS_LABEL,
                ],
            )
        else:
            # Copy so that adding the hash column does not modify the
            # caller's frame.
            terms = terms.copy()

        terms[NumberingDetails.HASH_LABEL] = terms.apply(
            Term.calc_hash_key, axis=1
        )

        self._updated_terminology_df: pd.DataFrame = terms.copy()
        self._original_terminology_df: pd.DataFrame = terms.copy()
        self._terminology_validator: TerminologyValidator = (
            terminology_validator
        )
        self._renumber_strategy: TerminologyRenumberStrategy = (
            renumber_strategy
        )
        # The strategy gets the reset-only view of the shared change flag.
        self._renumber_strategy.set_monitor(
            cast(IResetChanged, self._change_monitor)
        )

        # Validation writes its flag columns onto the original frame only.
        self._is_terminology_valid = self._terminology_validator(
            self._original_terminology_df
        )

    def __enter__(self) -> Self:
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Renumber on leaving the ``with`` block; exceptions raised inside
        # the block are not suppressed.
        self.renumber()

    def renumber(self):
        """Renumber the ids of the updated terminology using the strategy."""
        self._updated_terminology_df = self._renumber_strategy(
            self._updated_terminology_df
        )

    def guard(condition):
        """Decorator factory: allow the call only when ``condition(self)``
        is truthy, otherwise raise NotAllowed.

        Used as a plain function while the class body executes, which is why
        it takes no ``self``.
        """

        def decorator(func):
            @functools.wraps(func)
            def wrapper(self, *args, **kwargs):
                if not condition(self):
                    raise NotAllowed(
                        f"Cannot call {func}; terminology is not valid"
                    )
                return func(self, *args, **kwargs)

            return wrapper

        return decorator

    @staticmethod
    def as_readonly(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` so callers cannot modify internal state."""
        # return sf.Frame.from_pandas(df)
        return df.copy()

    def is_terminology_valid(self) -> bool:
        """Whether the loaded terminology passed validation."""
        return self._is_terminology_valid

    def ignoring_errors(self) -> bool:
        """Whether validation errors are being ignored."""
        return self._ignore_validation_errors

    def can_access_method(self) -> bool:
        """Whether guarded methods may be called."""
        return self.is_terminology_valid() or self.ignoring_errors()

    def find(self, source: Literal["original"] | Literal["updated"], **kwargs):
        """Search for entries in the original or updated terminology

        Reading the updated terminology renumbers it first when a change is
        pending.

        Args:
            source (Literal["original"] | Literal["updated"]): which dataset
                to pull from; any value other than "original" selects the
                updated terminology
            **kwargs
                ids: an id or list-like of ids
                hash_codes: a hash code or list-like of hash codes for terms
                (when both are given, ``ids`` wins)

        Returns:
            pd.DataFrame: the rows whose id / hash code is among the
                requested values

        Raises:
            ValueError: when neither ``ids`` nor ``hash_codes`` is given
        """
        df: pd.DataFrame

        if source == "original":
            df = self.get_original_terminology()
        else:
            df = self.get_updated_terminology()

        search_column: str
        search_key: str

        if "ids" in kwargs:
            search_column = TermLabels.ID_LABEL
            search_key = "ids"
        elif "hash_codes" in kwargs:
            search_column = NumberingDetails.HASH_LABEL
            search_key = "hash_codes"
        else:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError(
                "find() requires either the 'ids' or the 'hash_codes' keyword"
            )

        search_terms = kwargs[search_key]
        # Wrap scalars; lists, tuples, ranges, sets, arrays and Series are
        # passed to isin() as they are.
        if not pd.api.types.is_list_like(search_terms):
            search_terms = [search_terms]

        return df[df[search_column].isin(search_terms)]

    def get_original_terminology(self) -> pd.DataFrame:
        """Return a copy of the original terminology

        Returns:
            pd.DataFrame: copy of the terminology as loaded, including the
                validation flag columns
        """
        return TerminologyManager.as_readonly(self._original_terminology_df)

    def get_updated_terminology(self) -> pd.DataFrame:
        """Return the updated terminology, renumbering if a change was made

        Returns:
            pd.DataFrame: copy of the updated terminology
        """
        if self._change_monitor.is_changed:
            self.renumber()

        return TerminologyManager.as_readonly(self._updated_terminology_df)

    @guard(can_access_method)
    def delete_term(self, term: Term) -> Self | None:
        """Remove a term from the terminology

        The term is located by its hash (term, class and subclass); its id
        is not used.

        Args:
            term (Term): Delete a term from the terminology

        Returns:
            Self | None: the manager (for chaining), or None when the
                validator reports that the term does not exist

        Raises:
            NotAllowed: when the terminology is invalid and errors are not
                being ignored
            LookupError: when no entry has the term's hash
            ValueError: when more than one entry has the term's hash
        """
        if not self._terminology_validator.does_term_exist(
            self._updated_terminology_df, term
        ):
            # The original printed the "%s" placeholder literally.
            print(f"Term {term} does not exist")
            return None
        df: pd.DataFrame = self._updated_terminology_df

        term_mask: pd.Series = (
            df[NumberingDetails.HASH_LABEL] == term.hash_key()
        )
        match_count = int(term_mask.sum())

        if match_count == 0:
            raise LookupError(f"Could not locate term in dict: {term}")

        if match_count > 1:
            raise ValueError(f"Found more than one instance of term: {term}")

        # Everything but the selected term. Copy so the stored frame is
        # independent of the frame it was sliced from and can be written to.
        self._updated_terminology_df = df.loc[~term_mask, :].copy()
        self._change_monitor.set_changed()
        return self

    @guard(can_access_method)
    def add_term(self, term: Term) -> Self | None:
        """Add a term to the terminology (not implemented yet).

        Raises:
            NotImplementedError: always
        """
        raise NotImplementedError("add_term")

    @guard(can_access_method)
    def update_term(self, term: Term) -> Self | None:
        """Update a term of the terminology (not implemented yet).

        Raises:
            NotImplementedError: always
        """
        raise NotImplementedError("update_term")

Load terminology

In [33]:
# Load the dictionary and validate it (term-level rules, duplicate ids,
# duplicate term/class/subclass combinations).
# Forward slashes keep the relative path usable on every platform; the
# backslash form only resolves on Windows.
terminology_manager = TerminologyManager(
    "../../res/dicts/dict.txt",
    terminology_validator=TerminologyValidator(
        TermValidator(),
        DuplicateIdsCheckingStrategy(),
        DuplicateTermsCheckingStrategy(),
    ),
    renumber_strategy=TerminologyRenumberStrategy(),
    ignore_validation_errors=IGNORE_VALIDATION_ERRORS,
)

Duplicates found for ['term', 'class_name', 'subclass_name']
+-------+--------+-----------------+--------------+----------------------+-----------------+--------------------+---------------------------------+
|    id | term   | subclass_name   | class_name   |            hash_code | is_valid_term   | has_duplicate_id   | duplicate_term_class_subclass   |
|-------+--------+-----------------+--------------+----------------------+-----------------+--------------------+---------------------------------|
| 19004 | basdai | FSDA            | assessment   | -4489148185848859167 | True            | False              | True                            |
+-------+--------+-----------------+--------------+----------------------+-----------------+--------------------+---------------------------------+



Remove entries

In [34]:
# if terminology_manager.is_terminology_valid():
# Ids 3000 through 4000 inclusive: the window of entries to observe.
search_windows_ids = list(range(3000, 4001))

# Snapshot of the window taken from the original (unmodified) terminology.
original_df: pd.DataFrame = terminology_manager.find(
    ids=search_windows_ids, source="original"
)

# Hash codes identify the same entries after their ids have been renumbered.
window_hash_codes = original_df[NumberingDetails.HASH_LABEL].tolist()




# Entries to delete, in the dict.txt format id|term|subclass|class.
terms: List[Term] = Term.deserialize_terms(
    [
        "3001|tranq|XYLA|drug",
        "3002|tranq dope|XYLA|drug",
        "3003|zylazine|XYLA|drug",
    ]
)

# Show the rows of the window whose hash matches one of the terms to delete.
FrameHelper.show(
    "Terms which will be deleted",
    original_df[original_df[NumberingDetails.HASH_LABEL].isin([x.hash_key() for x in terms])]
)

FrameHelper.show(
    "Window of terms to observe including surrounding terms",
    original_df,
)

Terms which will be deleted
+------+------------+-----------------+--------------+----------------------+-----------------+--------------------+---------------------------------+
|   id | term       | subclass_name   | class_name   |            hash_code | is_valid_term   | has_duplicate_id   | duplicate_term_class_subclass   |
|------+------------+-----------------+--------------+----------------------+-----------------+--------------------+---------------------------------|
| 3001 | tranq      | XYLA            | drug         | -1802980043639505019 | True            | False              | False                           |
| 3002 | tranq dope | XYLA            | drug         | -5111535385192017806 | True            | False              | False                           |
| 3003 | zylazine   | XYLA            | drug         |  8072201324078923795 | True            | False              | False                           |
+------+------------+-----------------+--------------+------------

Delete terms

In [35]:
# Delete the terms one at a time and show the observed window after each
# deletion.
for term in terms:
    
    terminology_manager.delete_term(term)

    # Reading the "updated" terminology renumbers it when a change is pending;
    # the window is looked up by hash code because the ids may have changed.
    updated_terminology_df = terminology_manager.find(hash_codes=window_hash_codes, source="updated")
    FrameHelper.show(
        f'Showing subset updated after deletion of "{term.term}", "{term.subclass_name}", "{term.class_name}"',
        updated_terminology_df,
    )

Showing subset updated after deletion of "tranq", "XYLA", "drug"
+------+-----------------------------------+-----------------+--------------+----------------------+------------+
|   id | term                              | subclass_name   | class_name   |            hash_code |   id_chunk |
|------+-----------------------------------+-----------------+--------------+----------------------+------------|
| 3000 | xylazine                          | XYLA            | drug         |  5312780436728734500 |          3 |
| 3001 | tranq dope                        | XYLA            | drug         | -5111535385192017806 |          3 |
| 3002 | zylazine                          | XYLA            | drug         |  8072201324078923795 |          3 |
| 4000 | for controlled substances outside | PDMP            | response1    | -2068336252037976051 |          4 |
+------+-----------------------------------+-----------------+--------------+----------------------+------------+

Showing subset updated

Get renumbered terminology

In [36]:
# Compare the first ten rows of the terminology before and after the changes.
FrameHelper.show("Original terminology", terminology_manager.get_original_terminology().head(10))

# Returns a copy of the updated terminology (renumbered if a change is pending).
updated_df: pd.DataFrame = terminology_manager.get_updated_terminology()
FrameHelper.show("Updated terminology with renumbered ids", updated_df.head(10))

Original terminology
+------+---------+-----------------+--------------+----------------------+-----------------+--------------------+---------------------------------+
|   id | term    | subclass_name   | class_name   |            hash_code | is_valid_term   | has_duplicate_id   | duplicate_term_class_subclass   |
|------+---------+-----------------+--------------+----------------------+-----------------+--------------------+---------------------------------|
|    1 | .       | DOT             | boundry      |   813019279694751451 | True            | False              | False                           |
|    2 | ;       | DOT             | boundry      | -9192791342654108884 | True            | False              | False                           |
|   10 | old     | HX              | modifier     |  1468161944162412334 | True            | False              | False                           |
|   11 | hx      | HX              | modifier     |  5978150931564632931 | True            