In [61]:
from collections import deque
from typing import Tuple

from spondee.search import nlp_pipeline, search_text, identify_statements

In [2]:
nlp = nlp_pipeline()

2024-07-05 21:34:09 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-05 21:34:10 INFO: Downloaded file to /Users/tylerbrown/stanza_resources/resources.json
2024-07-05 21:34:10 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2024-07-05 21:34:10 INFO: Using device: cpu
2024-07-05 21:34:10 INFO: Loading: tokenize
2024-07-05 21:34:10 INFO: Loading: mwt
2024-07-05 21:34:10 INFO: Loading: pos
2024-07-05 21:34:10 INFO: Loading: constituency
2024-07-05 21:34:11 INFO: Done loading processors!


In [6]:
txt = "".join(
        [
            "Gavin Sheets hit his first career grand slam, and the ",
            "Chicago White Sox won their second straight after a ",
            "franchise-record 14-game losing streak, beating the ",
            "Boston Red Sox 6-1 on Saturday.",
        ]
    )

In [7]:
search_text(txt, nlp)

[Sentence(sidx=0, subject=['Chicago White Sox'], subject_text=['the', 'Chicago', 'White', 'Sox'], predicate=['second straight', 'franchise-record 14-game losing streak', 'Boston Red Sox', 'Saturday'], predicate_text=['won', 'their', 'second', 'straight', 'after', 'a', 'franchise', '-', 'record', '14', '-', 'game', 'losing', 'streak', ',', 'beating', 'the', 'Boston', 'Red', 'Sox', '6', '-', '1', 'on', 'Saturday']),
 Sentence(sidx=0, subject=['Gavin Sheets'], subject_text=['Gavin', 'Sheets'], predicate=['first career grand slam'], predicate_text=['hit', 'his', 'first', 'career', 'grand', 'slam'])]

In [8]:
docs = nlp(txt)

In [12]:
tree = docs.sentences[0].constituency

In [16]:
noun_phrase, verb_phrase = identify_statements(tree)

In [17]:
verb_phrase

((NP (NNP Gavin) (NNP Sheets)),
 (VP (VBD hit) (NP (PRP$ his) (JJ first) (NN career) (JJ grand) (NN slam))))

In [75]:
 def nounphrase_text(node) -> Tuple[str, bool]:                                  
    """Depth first search recovers child node and parent label."""              
    txt = []                                                                    
    stack = [node]                                                              
    prev_label = node.label                                                     
    while stack:                                                                
        node = stack.pop()                                                      
        if len(node.children) == 0:                                             
            txt.append((prev_label, node.label))                                
            
        stack.extend(node.children)                                             
        prev_label = node.label                                                 
                                                                        
    txt.reverse()
    return txt

In [76]:
def extract_np(tree):
    paths = []
    q = deque([tree])
    while q:
        node = q.popleft()
        print(node)
        if node.label == "NP":
            paths.append(nounphrase_text(node))

        else:
            q.extend(node.children)

    return paths

In [77]:
verb_phrase[1]

(VP (VBD hit) (NP (PRP$ his) (JJ first) (NN career) (JJ grand) (NN slam)))

In [78]:
hmm = extract_np(verb_phrase[1])

(VP (VBD hit) (NP (PRP$ his) (JJ first) (NN career) (JJ grand) (NN slam)))
(VBD hit)
(NP (PRP$ his) (JJ first) (NN career) (JJ grand) (NN slam))
hit


In [79]:
hmm

[[('PRP$', 'his'),
  ('JJ', 'first'),
  ('NN', 'career'),
  ('JJ', 'grand'),
  ('NN', 'slam')]]

In [105]:
q = deque([ d for d in docs.sentences[0].to_dict() ])

In [106]:
paths = []
while q:
    node = q.popleft()
    k = (node['xpos'], node['text'])

    if k == hmm[0][0]:
        grp = [ node ]
        ok = all([ hmm[0][i] == (q[i-1]['xpos'], q[i-1]['text']) for i in range(1, len(hmm[0])) ])
        if ok:
            for i in range(1, len(hmm[0])):
                node = q.popleft()
                grp.append(node)

        paths.append(grp)

In [107]:
paths

[[{'id': 4,
   'text': 'his',
   'upos': 'PRON',
   'xpos': 'PRP$',
   'feats': 'Case=Gen|Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs',
   'start_char': 17,
   'end_char': 20},
  {'id': 5,
   'text': 'first',
   'upos': 'ADJ',
   'xpos': 'JJ',
   'feats': 'Degree=Pos|NumType=Ord',
   'start_char': 21,
   'end_char': 26},
  {'id': 6,
   'text': 'career',
   'upos': 'NOUN',
   'xpos': 'NN',
   'feats': 'Number=Sing',
   'start_char': 27,
   'end_char': 33},
  {'id': 7,
   'text': 'grand',
   'upos': 'ADJ',
   'xpos': 'JJ',
   'feats': 'Degree=Pos',
   'start_char': 34,
   'end_char': 39},
  {'id': 8,
   'text': 'slam',
   'upos': 'NOUN',
   'xpos': 'NN',
   'feats': 'Number=Sing',
   'start_char': 40,
   'end_char': 44,
   'misc': 'SpaceAfter=No'}]]