In [26]:
from collections import deque
from typing import List, Tuple

from pydantic import BaseModel, Field
from spondee.search import nlp_pipeline, search_text, identify_statements, nounphrase_text

In [10]:
class LeafLabel(BaseModel):
    """Typed record for the metadata of a single leaf label.

    NOTE(review): the field names look like per-token keys of the dicts that
    ``simple_sentence_metadata`` receives (it reads ``'text'``) -- confirm
    against the pipeline's ``to_dict()`` output. The class is not referenced
    by any other cell visible in this notebook.
    """
    id: int  # presumably the token's index within its sentence -- TODO confirm
    text: str  # surface form of the leaf
    upos: str  # presumably the universal POS tag -- TODO confirm
    xpos: str  # presumably the treebank-specific POS tag -- TODO confirm
    feats: str  # presumably morphological features -- TODO confirm
    start_char: int  # presumably a character offset into the source text -- TODO confirm
    end_char: int  # presumably a character offset into the source text -- TODO confirm
    

In [11]:
def simple_sentence_metadata(statements, simple_sentence:List[dict]):
    """Pair each statement's noun and verb phrase with the sentence's token dicts.

    The tokens of ``simple_sentence`` are consumed from left to right through a
    single queue shared by all statements. Statements are taken from the END of
    ``statements`` first, so the last statement must be the one that appears
    first in the sentence.

    Args:
        statements: sequence of ``(noun_phrase, verb_phrase)`` pairs. Both
            elements must offer ``leaf_labels()`` returning their words in
            order. The sequence itself is left untouched.
        simple_sentence: token dicts in reading order; each needs a ``'text'``
            key.

    Returns:
        A list with one ``(np_tokens, vp_tokens)`` tuple per statement, in the
        order the statements were processed (last statement first). Each
        element is the list of token dicts matched to that phrase. If the
        tokens run out before a phrase is fully matched, the partial match is
        returned instead of raising.
    """
    paths = []
    # Work on a copy: popping from the caller's list would silently empty it
    # and break any later use of `statements`.
    pending = list(statements) if statements else []
    tokens = deque(simple_sentence)
    while pending:
        noun_phrase, verb_phrase = pending.pop()

        npq = deque(noun_phrase.leaf_labels())
        vpq = deque(verb_phrase.leaf_labels())

        _np = []
        _vp = []

        # Stop as soon as both phrases are matched, or when no tokens are left
        # (the latter avoids an IndexError from popleft() on an empty deque).
        while (npq or vpq) and tokens:
            _leaf = tokens.popleft()

            # The noun phrase has priority when both phrases expect this word;
            # tokens matching neither phrase are dropped.
            if npq and npq[0] == _leaf['text']:
                _np.append(_leaf)
                npq.popleft()

            elif vpq and vpq[0] == _leaf['text']:
                _vp.append(_leaf)
                vpq.popleft()

        paths.append((_np, _vp))

    return paths

In [12]:
# Build the pipeline via spondee's helper. Per the log output below, this
# checks resources.json for updates and loads the English tokenize, mwt, pos
# and constituency processors on CPU.
nlp = nlp_pipeline()

2024-07-06 22:38:46 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-06 22:38:46 INFO: Downloaded file to /Users/tylerbrown/stanza_resources/resources.json
2024-07-06 22:38:47 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2024-07-06 22:38:47 INFO: Using device: cpu
2024-07-06 22:38:47 INFO: Loading: tokenize
2024-07-06 22:38:47 INFO: Loading: mwt
2024-07-06 22:38:47 INFO: Loading: pos
2024-07-06 22:38:47 INFO: Loading: constituency
2024-07-06 22:38:47 INFO: Done loading processors!


In [13]:
# Example sentence analysed by the rest of the notebook. Adjacent string
# literals are concatenated by the parser, so no join is needed.
txt = (
    "Gavin Sheets hit his first career grand slam, and the "
    "Chicago White Sox won their second straight after a "
    "franchise-record 14-game losing streak, beating the "
    "Boston Red Sox 6-1 on Saturday."
)

In [14]:
#search_text(txt, nlp)

In [15]:
# Run the pipeline over the example sentence; later cells read
# `docs.sentences[0]`.
docs = nlp(txt)

In [16]:
# Show the constituency parse of the first sentence. Read it straight from
# `docs` so this cell does not rely on `tree`, which is only assigned in a
# later cell and would be undefined on a fresh Restart & Run All.
docs.sentences[0].constituency

(ROOT (S (S (NP (NNP Gavin) (NNP Sheets)) (VP (VBD hit) (NP (PRP$ his) (JJ first) (NN career) (JJ grand) (NN slam)))) (, ,) (CC and) (S (NP (DT the) (NNP Chicago) (NNP White) (NNPS Sox)) (VP (VBD won) (NP (PRP$ their) (JJ second) (NN straight)) (PP (IN after) (NP (DT a) (NML (NN franchise) (HYPH -) (NN record)) (NML (CD 14) (HYPH -) (NN game)) (NN losing) (NN streak))) (, ,) (S (VP (VBG beating) (NP (DT the) (NNP Boston) (NNP Red) (NNPS Sox)) (NP (NP (CD 6)) (PP (SYM -) (NP (CD 1)))) (PP (IN on) (NP (NNP Saturday))))))) (. .)))

In [28]:
# Constituency parse of the first sentence, then the statements spondee
# identifies in it. Other cells unpack each statement as a
# (noun_phrase, verb_phrase) pair.
tree = docs.sentences[0].constituency
statements = identify_statements(tree)

In [19]:
# Pass a shallow copy so `statements` is still intact for the cells below,
# whether or not the helper consumes the list it is given.
_paths = simple_sentence_metadata(list(statements), docs.sentences[0].to_dict())

In [30]:
# Second element of the first statement, i.e. the part that
# simple_sentence_metadata unpacks as the verb phrase.
_node = statements[0][1]

In [42]:
def extract_node_label(node):
    """Collect ``(parent_label, leaf_label)`` pairs for every leaf under ``node``.

    Args:
        node: tree node exposing ``label`` and an iterable ``children``; a node
            with no children is treated as a leaf.

    Returns:
        A list of ``(parent_label, leaf_label)`` tuples in the same order as
        the leaves appear in ``children`` (first child first). If ``node``
        itself is a leaf, its parent label is ``None``.
    """
    path = []
    # Carry each node's parent label on the stack. Remembering only the
    # previously visited node mislabels a leaf whenever it has siblings.
    stack = [ (node, None) ]
    while stack:
        current, parent_label = stack.pop()
        if len(current.children) == 0:
            path.append((parent_label, current.label))

        stack.extend((child, current.label) for child in current.children)

    # The stack visits the last child first; flip to restore child order.
    path.reverse()
    return path

def extract_noun_phrases(node):
    """Find the outermost ``NP`` subtrees under ``node``, breadth first.

    Each node labelled ``"NP"`` is handed to ``extract_node_label`` and is not
    searched any further, so noun phrases nested inside another ``NP`` are not
    reported separately. Any other node has its children queued for inspection.

    Returns:
        A list with one ``extract_node_label`` result per ``NP`` found, in
        breadth-first order.
    """
    pending = deque((node,))
    phrases = []
    while pending:
        current = pending.popleft()
        if current.label != "NP":
            # Not a noun phrase: keep looking inside it.
            pending.extend(current.children)
            continue
        phrases.append(extract_node_label(current))
    return phrases

def filter_noun_phrases(extracted_np):
    """Keep a phrase only if it contains a noun, dropping a leading DT/PRP* word.

    Args:
        extracted_np: list of ``(tag, word)`` pairs.

    Returns:
        ``(None, False)`` when none of the tags is NN, NNS, NNP or NNPS.
        Otherwise ``(pairs, True)``, where ``pairs`` is the input without its
        first pair if that pair's tag is ``"DT"`` or starts with ``"PRP"``,
        and the input unchanged if not.
    """
    noun_tags = {"NN", "NNS", "NNP", "NNPS"}
    tags_present = {tag for tag, _ in extracted_np}
    if noun_tags.isdisjoint(tags_present):
        return None, False

    leading_tag = extracted_np[0][0]
    drop_first = leading_tag == "DT" or leading_tag[:3] == "PRP"
    kept = extracted_np[1:] if drop_first else extracted_np
    return kept, True

In [43]:
# Noun phrases inside the second element of the first statement that pass
# filter_noun_phrases, each as a list of (tag, word) pairs.
found = [
    tags
    for tags, status in map(filter_noun_phrases, extract_noun_phrases(statements[0][1]))
    if status
]

In [45]:
# Display the filtered noun phrases, each a list of (tag, word) pairs.
found

[[('JJ', 'second'), ('NN', 'straight')],
 [('NN', 'franchise'),
  ('HYPH', '-'),
  ('NN', 'record'),
  ('CD', '14'),
  ('HYPH', '-'),
  ('NN', 'game'),
  ('NN', 'losing'),
  ('NN', 'streak')],
 [('NNP', 'Boston'), ('NNP', 'Red'), ('NNPS', 'Sox')],
 [('NNP', 'Saturday')]]

In [None]:
#tree.leaf_labels()