In [14]:
import spacy


def create_pattern(text_input):
    """Translate a compact pattern string into a spaCy ``Matcher`` token pattern.

    The input is a whitespace-separated sequence of parts, one per token to
    match, each written as ``token(rule|rule|...)``.  ``token`` may be empty
    when no text attribute is used.  Recognised rules:

    * a bare text attribute (``orth``, ``text``, ``norm``, ``lower``,
      ``lemma``): the attribute is set to ``token`` (lower-cased for
      ``lower``);
    * ``attr=value`` for ``pos``, ``tag``, ``ent_type`` or ``ent_iob``: the
      value is upper-cased, so ``ent_type=person`` becomes ``PERSON``;
    * ``attr=value`` for ``morph``, ``dep``, ``shape``, ``ent_id`` or
      ``ent_kb_id``: the value is kept exactly as written, because these
      labels are case-sensitive (``nsubj``, ``Xxxx``, ``Number=Sing``);
    * ``op=value``: the quantifier is passed through unchanged and is not
      validated here.

    Rule names are case-insensitive.  Unrecognised rules are ignored, and a
    part that yields no attributes contributes nothing to the result.
    Because ``|`` separates rules and whitespace separates parts, neither can
    appear inside a token or a value (so a multi-feature morph string cannot
    be expressed).

    Args:
        text_input (str): The pattern description, e.g.
            ``"(ent_type=person|op={2}) run(lemma)"``.

    Returns:
        list[dict]: One attribute dictionary per matched part, in input order.
        Malformed parts or rules are reported with ``print`` and skipped.
    """
    base_patterns = ("orth", "text", "norm", "lower", "lemma")
    grammar_patterns = (
        "pos", "tag", "morph", "dep", "shape",
        "ent_type", "ent_iob", "ent_id", "ent_kb_id",
    )
    # Only these label sets are normalised to upper case; every other
    # grammar value is case-sensitive and must reach the matcher untouched.
    uppercase_values = ("pos", "tag", "ent_type", "ent_iob")

    full_sequence = []

    for part in text_input.split():
        # Split on the LAST "(" so the token text may itself contain "(".
        token, paren, rules = part[:-1].rpartition("(")
        if not (paren and part.endswith(")")):
            print(f"Error in part format: {part}")
            continue

        token_attributes = {}
        for rule in rules.split("|"):
            # Split on the first "=" only: values such as "Number=Sing"
            # legitimately contain "=" themselves.
            key, equals, value = rule.partition("=")
            key = key.lower()

            if key in base_patterns and not equals:
                token_attributes[key.upper()] = token.lower() if key == "lower" else token
            elif key in grammar_patterns or key == "op":
                if not equals:
                    # A grammar/op rule needs a value; report it instead of crashing.
                    print(f"Error in rule format: {rule}")
                    continue
                if key in uppercase_values:
                    value = value.upper()
                token_attributes[key.upper()] = value

        if token_attributes:
            full_sequence.append(token_attributes)

    return full_sequence

def build_matcher(nlp, patterns, name="PATTERN_NAME"):
    """Create a spaCy ``Matcher`` that holds a single token pattern.

    Args:
        nlp: Loaded spaCy pipeline; its ``vocab`` is handed to the matcher.
        patterns (list[dict]): One token pattern, i.e. a list with one
            attribute dictionary per token (the shape ``create_pattern``
            returns).
        name (str): Key under which the pattern is registered.  Defaults to
            ``"PATTERN_NAME"``, the key this function always used before the
            parameter existed.

    Returns:
        spacy.matcher.Matcher: A matcher with the pattern registered.
    """
    from spacy.matcher import Matcher

    matcher = Matcher(nlp.vocab)
    # Matcher.add takes a list of alternative patterns, hence the wrapping list.
    matcher.add(name, [patterns])
    return matcher

def query_docs(docs, matcher):
    """Apply ``matcher`` to every document and gather the results.

    Args:
        docs: Iterable of documents to search.
        matcher: Callable invoked once per document.

    Returns:
        list: The value ``matcher`` returned for each document, in the order
        the documents were supplied.
    """
    return [matcher(doc) for doc in docs]

# Load the small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Parse the sample sentences in one batch.
texts = ["John Smith ran fast.", "But Joe ran faster."]
docs = list(nlp.pipe(texts))

# Two PERSON-entity tokens followed by a token whose lemma is "run".
pattern = create_pattern("(ent_type=person|op={2}) run(lemma)")
print(pattern)

# Build the matcher and collect the matches for each document.
matcher = build_matcher(nlp, pattern)
matched_docs = query_docs(docs, matcher)
print(matched_docs)

[{'ENT_TYPE': 'PERSON', 'OP': '{2}'}, {'LEMMA': 'run'}]
[[(8128502578493265141, 0, 3)], []]


In [10]:
import spacy
from spacy.matcher import Matcher

class SpacyEx:
    """
    Convenience wrapper that builds spaCy Matcher patterns from a compact
    string syntax and runs them over documents.
    """

    def __init__(self, nlp):
        """
        Initialize the wrapper with a spaCy language model.

        Args:
        nlp (spacy.Language): A spaCy language model.
        """
        self.nlp = nlp
        # The matcher is built on the pipeline's vocabulary.
        self.matcher = Matcher(nlp.vocab)

    def create_pattern(self, text_input):
        """
        Process the input string to generate patterns for the spaCy Matcher.

        The input is a whitespace-separated sequence of parts, one per token,
        each written as ``token(rule|rule|...)``.  Recognised rules:

        - a text attribute (orth, text, norm, lower, lemma): set to ``token``
          (lower-cased for ``lower``); any ``=value`` suffix is ignored;
        - ``attr=value`` for pos, tag, ent_type, ent_iob: value is upper-cased;
        - ``attr=value`` for morph, dep, shape, ent_id, ent_kb_id: value is
          kept exactly as written because these labels are case-sensitive;
        - ``op=value``: passed through unchanged, not validated here.

        Rule names are case-insensitive and unrecognised rules are ignored.
        Malformed parts or rules are reported with ``print`` and skipped.

        Args:
        text_input (str): The input string defining the patterns.

        Returns:
        list: A list of dictionaries containing token attributes for matching.
        """
        base_patterns = ("orth", "text", "norm", "lower", "lemma")
        grammar_patterns = (
            "pos", "tag", "morph", "dep", "shape",
            "ent_type", "ent_iob", "ent_id", "ent_kb_id",
        )
        # Only these label sets are normalised to upper case; the remaining
        # grammar values are case-sensitive and are forwarded untouched.
        uppercase_values = ("pos", "tag", "ent_type", "ent_iob")
        full_sequence = []

        for part in text_input.split():
            # Split on the LAST "(" so the token text may itself contain "(".
            token, paren, rules = part[:-1].rpartition("(")
            if not (paren and part.endswith(")")):
                print(f"Error in part format: {part}")
                continue

            token_attributes = {}
            for rule in rules.split("|"):
                # Only the first "=" separates key from value.
                key, equals, value = rule.partition("=")
                key_lower = key.lower()

                if key_lower in base_patterns:
                    token_attributes[key.upper()] = token.lower() if key_lower == "lower" else token

                elif key_lower in grammar_patterns or key_lower == "op":
                    if not equals:
                        # Without "=" there is no value to match on; report it
                        # rather than emitting an empty-string attribute.
                        print(f"Error in rule format: {rule}")
                        continue
                    if key_lower in uppercase_values:
                        value = value.upper()
                    token_attributes[key.upper()] = value

            if token_attributes:
                full_sequence.append(token_attributes)

        return full_sequence

    def add_patterns(self, pattern_name, patterns):
        """
        Add the specified patterns to the matcher.

        Args:
        pattern_name (str): The identifier for these patterns.
        patterns (list): The patterns to add to the matcher.
        """
        # Matcher.add takes a list of alternative patterns, hence the wrapping list.
        self.matcher.add(pattern_name, [patterns])

    def query_docs(self, docs):
        """
        Match patterns against a sequence of documents.

        Args:
        docs (Iterable[spacy.tokens.Doc]): Documents to match against.

        Returns:
        list: Match results for each document.
        """
        return [self.matcher(doc) for doc in docs]

# Usage example
nlp = spacy.load("en_core_web_sm")
# The wrapper class in this notebook is SpacyEx; the name SpacyMatcher is not
# defined anywhere and raises NameError on a fresh kernel.
matcher_tool = SpacyEx(nlp)

# Three PERSON-entity tokens, a token whose lemma is "run", then an adverb.
pattern = "(ent_type=person|op={3}) run(lemma) fast(pos=ADV)"
patterns = matcher_tool.create_pattern(pattern)
print(patterns)

matcher_tool.add_patterns("PATTERN_NAME", patterns)
texts = ["John Smith ran fast.", "But Joe ran faster.", "John Jacob Smith ran fast."]
docs = list(nlp.pipe(texts))
matches = matcher_tool.query_docs(docs)
print(matches)


[{'ENT_TYPE': 'PERSON', 'OP': '{3}'}, {'LEMMA': 'run'}, {'POS': 'ADV'}]
[[], [], [(8128502578493265141, 0, 5)]]
