In [21]:
import re

TERM_TYPE_TINY_TO_LONG_MAPPING = {
    "SK": "standalone_keyterm",
    "PK": "parented_keyterm",
    "SN": "standalone_named_entity",
}

def remove_all_annotations_from_text(annotated_text):
    new_text = re.sub(
        r"\((.*?)\|(((PK|SN) .+?)|(SK))\)",
        lambda match: match.group(1),
        annotated_text,
        flags=re.DOTALL,
    )
    return new_textextract_annotations_as_list(u'This (is|PK are) (a|SK) (test|SUBJECT).')

def extract_annotations_as_list(
    annotated_text, term_types_to_extract=None, entity_names_to_extract=None
):

    """ Returns all annotations from an annotated text as a list of dictionaries.
    
        - Valid types to extract: 'standalone_keyterm', 'parented_keyterm', 'named_entity'.        
        - If types_to_extract and/or entity_names_to_extract are None, all types/entities are extracted.
        - The returned position ranges ignore other annotations, ie. as if the other annotations did not exist.
        - Beware that the returned positions are meant to be used as ranges. annotated_text[5:14] might return
          the desired result while annotated_text[14] may encounter an index out of range exception.
    """

    # ensure that types_to_extract has valid entries
    if term_types_to_extract:
        for type_to_extract in term_types_to_extract:
            if type_to_extract not in [
                "standalone_keyterm",
                "parented_keyterm",
                "standalone_named_entity",
            ]:
                raise ValueError(
                    "At least one entry in param 'types_to_extract' is invalid. Ensure that only valid types are used."
                )

    result = []
    parenthesis_level = 0
    buffer = ''
    for index, char in enumerate(annotated_text):
        if char == '(': parenthesis_level += 1
        if parenthesis_level > 0:
            buffer += char
        if char == ')':
            if parenthesis_level == 1:
                result.appe
                print(buffer)
                buffer = ''
            parenthesis_level -= 1    
    
    return result

In [22]:
extract_annotations_as_list(u'This (is|PK are) (a|SK) (test|SUBJECT).')

(is|PK are)
(a|SK)
(test|SUBJECT)


[]

In [8]:
import re

TERM_TYPE_TINY_TO_LONG_MAPPING = {
    "SK": "standalone_keyterm",
    "PK": "parented_keyterm",
    "SN": "standalone_named_entity",
}

def remove_all_annotations_from_text(annotated_text):
    new_text = re.sub(
        r"\((.*?)\|(((PK|SN) .+?)|(SK))\)",
        lambda match: match.group(1),
        annotated_text,
        flags=re.DOTALL,
    )
    return new_textextract_annotations_as_list(u'This (is|PK are) (a|SK) (test|SUBJECT).')

def extract_annotations_as_list(
    annotated_text, term_types_to_extract=None, entity_names_to_extract=None
):

    """ Returns all annotations from an annotated text as a list of dictionaries.
    
        - Valid types to extract: 'standalone_keyterm', 'parented_keyterm', 'named_entity'.        
        - If types_to_extract and/or entity_names_to_extract are None, all types/entities are extracted.
        - The returned position ranges ignore other annotations, ie. as if the other annotations did not exist.
        - Beware that the returned positions are meant to be used as ranges. annotated_text[5:14] might return
          the desired result while annotated_text[14] may encounter an index out of range exception.
    """

    # ensure that types_to_extract has valid entries
    if term_types_to_extract:
        for type_to_extract in term_types_to_extract:
            if type_to_extract not in [
                "standalone_keyterm",
                "parented_keyterm",
                "standalone_named_entity",
            ]:
                raise ValueError(
                    "At least one entry in param 'types_to_extract' is invalid. Ensure that only valid types are used."
                )

    result = []
    for match in re.finditer(
        r"\((?P<term>[^()]+?)\|(?P<term_type_tiny>(SK|PK|SN))( (?P<postfix>[^()]+?))?\)",
        annotated_text,
        flags=re.DOTALL,
    ):
        # compute full result
        term = match.group("term")
        term_type_long = TERM_TYPE_TINY_TO_LONG_MAPPING.get(
            match.group("term_type_tiny")
        )
        postfix = match.group("postfix")
        start_position = len(
            remove_all_annotations_from_text(annotated_text[: match.start()])
        )
        end_position = start_position + len(
            remove_all_annotations_from_text(
                annotated_text[match.start() : match.end()]
            )
        )
        dict_to_add = {
            "term": term,
            "start": start_position,
            "end": end_position,
            "term_type_long": term_type_long,
        }
        if term_type_long == "parented_keyterm":
            dict_to_add["parent_terms"] = postfix
        if term_type_long == "standalone_named_entity":
            dict_to_add["entity_name"] = postfix
        result.append(dict_to_add)
        # apply filters if needed
        if term_types_to_extract:
            result = [
                item
                for item in result
                if item["term_type_long"] in term_types_to_extract
            ]
        if entity_names_to_extract:
            result = [
                item
                for item in result
                if item["term_type_long"] != "standalone_named_entity"
                or (
                    item["term_type_long"] == "standalone_named_entity"
                    and "entity_name" in item
                    and item["entity_name"] in entity_names_to_extract
                )
            ]
    return result

In [9]:
extract_annotations_as_list(u'This (is|PK are) (a|SK) (test|SUBJECT).')

[{'term': 'is',
  'start': 5,
  'end': 7,
  'term_type_long': 'parented_keyterm',
  'parent_terms': 'are'},
 {'term': 'a', 'start': 8, 'end': 9, 'term_type_long': 'standalone_keyterm'}]