In [2]:
from collections import deque
from typing import List, Tuple

from pydantic import BaseModel, Field
from spondee.search import nlp_pipeline, identify_statements




In [44]:
class LeafLabel(BaseModel):
    id: int
    text: str
    upos: str
    xpos: str
    feats: str = Field(default="")
    start_char: int
    end_char: int
    misc: str = Field(default="")
    

In [4]:
def extract_node_label(node):
    path = []
    stack = [ node ]
    prev_label = None
    while stack:
        node = stack.pop()
        if len(node.children) == 0:
            path.append((prev_label, node.label))

        stack.extend(node.children)
        prev_label = node.label

    path.reverse()
    return path

def extract_noun_phrases(node):
    paths = []
    q = deque([ node ])
    while q:
        node = q.popleft()
        if node.label == "NP":
            paths.append(extract_node_label(node))

        else:
            q.extend(node.children)

    return paths

def filter_noun_phrases(extracted_np):
    noun_tags = set(["NN", "NNS", "NNP", "NNPS"])
    extract_tags = set([t for t, _ in extracted_np])

    if len(noun_tags & extract_tags) == 0:
        return []

    first_tag, _ = extracted_np[0]
    if first_tag == "DT" or first_tag[:3] == "PRP":
        return extracted_np[1:]

    return extracted_np

In [5]:
def nounphrase_metadata(npm, extract):
    meta_q = deque(npm)
    extract_q = deque(extract)

    found = []
    while extract_q:
        tag, s = extract_q.popleft()
        
        while meta_q:
            _meta = meta_q.popleft()
            if _meta["xpos"] == tag and _meta['text'] == s:
                found.append(_meta)
                break

    return found

In [6]:
def sentence_metadata(statements, simple_sentence:List[dict]):
    paths = []
    stack = statements
    q = deque(simple_sentence)
    while stack:
        noun_phrase, verb_phrase = stack.pop()

        _tagged_np = [ filter_noun_phrases(r) for r in extract_noun_phrases(noun_phrase) ]        
        _tagged_vp = [ filter_noun_phrases(r) for r in extract_noun_phrases(verb_phrase) ]

        npq = deque(noun_phrase.leaf_labels())
        vpq = deque(verb_phrase.leaf_labels())

        _np = []
        _vp = []

        while npq or vpq:
            _leaf = q.popleft()

            if npq and npq[0] == _leaf['text']:
                _np.append(_leaf)
                npq.popleft()

            elif vpq and vpq[0] == _leaf['text']:
                _vp.append(_leaf)
                vpq.popleft()

        _found_np = []
        for tagged in _tagged_np:
            _meta = nounphrase_metadata(_np, tagged)
            if len(_meta) > 0:
                _found_np.append(_meta)

        _found_vp = []
        for tagged in _tagged_vp:
            _meta = nounphrase_metadata(_vp, tagged)
            if len(_meta) > 0:
                _found_vp.append(_meta)
            
        paths.append((_np, _vp, _found_np, _found_vp))

    return paths

In [7]:
nlp = nlp_pipeline()

2024-07-07 00:45:08 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-07 00:45:08 INFO: Downloaded file to /Users/tylerbrown/stanza_resources/resources.json
2024-07-07 00:45:08 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2024-07-07 00:45:08 INFO: Using device: cpu
2024-07-07 00:45:08 INFO: Loading: tokenize
2024-07-07 00:45:09 INFO: Loading: mwt
2024-07-07 00:45:09 INFO: Loading: pos
2024-07-07 00:45:09 INFO: Loading: constituency
2024-07-07 00:45:09 INFO: Done loading processors!


In [8]:
txt = "".join(
        [
            "Gavin Sheets hit his first career grand slam, and the ",
            "Chicago White Sox won their second straight after a ",
            "franchise-record 14-game losing streak, beating the ",
            "Boston Red Sox 6-1 on Saturday.",
        ]
    )

In [9]:
#search_text(txt, nlp)

In [46]:
docs = nlp(txt)

In [48]:
for sentence in docs.sentences:
    ...

In [50]:
#sentence

In [11]:
tree = docs.sentences[0].constituency
statements = identify_statements(tree)

In [12]:
_paths = sentence_metadata(statements, docs.sentences[0].to_dict())

In [21]:
for grp in _paths[1][3]:
    for m in grp:
        LeafLabel.model_validate(m)

ValidationError: 1 validation error for LeafLabel
feats
  Field required [type=missing, input_value={'id': 22, 'text': '-', '...'misc': 'SpaceAfter=No'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing

In [20]:
len(_paths)

2

In [29]:
_paths[1][2][

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (1495596285.py, line 1)

In [45]:
for grp in _paths[1][3]:
    for m in grp:
        try:
            LeafLabel.model_validate(m)
        except Exception as exc:
            print(m, exc)