In [10]:
import stanza

# English pipeline with every processor the later cells rely on.
# NOTE(review): download_method=None presumably disables model downloads, so
# the models must already be installed locally -- confirm against stanza docs.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos, lemma, constituency, depparse',
    download_method=None,
)

2023-02-15 20:14:04 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| lemma        | combined |
| depparse     | combined |
| constituency | wsj      |

2023-02-15 20:14:04 INFO: Use device: cpu
2023-02-15 20:14:04 INFO: Loading: tokenize
2023-02-15 20:14:04 INFO: Loading: pos
2023-02-15 20:14:05 INFO: Loading: lemma
2023-02-15 20:14:05 INFO: Loading: depparse
2023-02-15 20:14:05 INFO: Loading: constituency
2023-02-15 20:14:07 INFO: Done loading processors!


In [2]:
### ENTER EXAMPLE TEXT
# Keep the raw text in its own variable so it is easy to swap out.
example_text = 'I threw the ball to Mary. She liked it.'
doc = nlp(example_text)

In [3]:
from lxml import etree

def parse_string_to_xml(node, constituent):
    """Mirror a constituency (sub)tree as XML elements under ``node``.

    Every child of ``constituent`` becomes one sub-element of ``node``:

    * a pre-terminal (a tag node whose only child is a word) becomes
      ``<terminal POS="TAG">word</terminal>``;
    * any other child becomes an element named after its label, and its
      own children are converted recursively.

    The label of ``constituent`` itself is not emitted, so passing the ROOT
    node places its children directly under ``node``.

    Labels and words are read from the ``label`` / ``children`` attributes
    of the tree nodes instead of being parsed back out of ``str(child)``,
    so the result does not depend on how the tree is printed (tokens that
    contain spaces or end in ``)`` are kept intact).

    Parameters
    ----------
    node : element that receives the new sub-elements.
    constituent : tree node exposing ``label`` and ``children``
        (e.g. ``sentence.constituency``).

    Returns
    -------
    The same ``node`` that was passed in, now populated.
    """
    for child in constituent.children:
        grandchildren = child.children
        # Pre-terminal: exactly one child, and that child is a leaf (the word).
        is_preterminal = len(grandchildren) == 1 and not grandchildren[0].children

        if is_preterminal:
            elem = etree.SubElement(node, 'terminal')
            elem.set('POS', child.label)
            elem.text = grandchildren[0].label
            continue

        try:
            elem = etree.SubElement(node, child.label)
        except ValueError:
            # The label is not usable as an XML tag name; fall back to a
            # fixed tag so the conversion can continue.
            elem = etree.SubElement(node, 'PUNCT')

        parse_string_to_xml(elem, child)

    return node
  
### CREATING A TREE FOR ONE SENTENCE
# Only the first sentence of the document is converted here.
sentence = doc.sentences[0]
print("SENTENCE:", sentence.constituency, end="\n\n")

# parse_string_to_xml returns the node it was handed, so `tree` and `root`
# both name the same <sentence> element.
root = etree.Element("sentence")
tree = parse_string_to_xml(root, sentence.constituency)

print("XML TREE:")
etree.dump(tree)

SENTENCE: (ROOT (S (NP (PRP I)) (VP (VBD threw) (NP (DT the) (NN ball)) (PP (IN to) (NP (NNP Mary)))) (. .)))

XML TREE:
<sentence>
  <S>
    <NP>
      <terminal POS="PRP">I</terminal>
    </NP>
    <VP>
      <terminal POS="VBD">threw</terminal>
      <NP>
        <terminal POS="DT">the</terminal>
        <terminal POS="NN">ball</terminal>
      </NP>
      <PP>
        <terminal POS="IN">to</terminal>
        <NP>
          <terminal POS="NNP">Mary</terminal>
        </NP>
      </PP>
    </VP>
    <terminal POS=".">.</terminal>
  </S>
</sentence>


In [7]:
# Feature 1
# Get the full constituent that contains a token (here 'ball'): both the
# tokens and the POS tags of every terminal in that constituent.
def get_constituent(tree, token):
    """Return ``(tokens, pos_tags)`` for the constituent containing ``token``.

    The constituent is the parent element of the matching ``<terminal>``.
    All terminals below that parent are collected in document order,
    including terminals nested inside sub-phrases.  If ``token`` occurs more
    than once, the constituents of all occurrences are concatenated.  Both
    lists are empty when the token is not in the tree.
    """
    tokens, pos_tags = [], []
    for terminal in tree.iter('terminal'):
        if terminal.text != token:
            continue
        parent = terminal.getparent()
        if parent is None:
            # A <terminal> that is itself the root has no enclosing phrase.
            continue
        for member in parent.iter('terminal'):
            tokens.append(member.text)
            pos_tags.append(member.attrib['POS'])
    return tokens, pos_tags


constituent_tokens, constituent_pos = get_constituent(tree, 'ball')

print(constituent_tokens)
print(constituent_pos)

['the', 'ball']
['DT', 'NN']


In [8]:
# Feature 5
### getting phrase type of a token
def get_phrase_type(tree, token):
    """Return the tag of the element directly above the first match of ``token``.

    Elements are visited in the order produced by ``tree.iter()``; the first
    one whose ``text`` equals ``token`` decides the result.  When no element
    matches, the string ``'Token not in tree'`` is returned instead.
    """
    matches = (node for node in tree.iter() if node.text == token)
    first_match = next(matches, None)

    if first_match is None:
        return 'Token not in tree'

    return first_match.getparent().tag

# Example lookup: tag of the element directly above the 'Mary' terminal.
get_phrase_type(tree, token='Mary')

'NP'

In [9]:
# Feature 6
# One entry per terminal, in document order:
#   'VP'  - the terminal's parent is an NP located inside a VP
#   'S'   - the terminal's parent is an NP outside every VP
#   None  - the terminal's parent is not an NP

# Looked up once instead of once per terminal.  Covers every VP in the
# sentence (not just the first) and is simply empty when there is no VP.
nps_inside_vp = root.findall('.//VP//NP')

feature_6 = []
for elem in root.findall('.//terminal'):
    parent = elem.getparent()
    if parent.tag != 'NP':
        feature_6.append(None)
    elif parent in nps_inside_vp:
        feature_6.append('VP')
    else:
        feature_6.append('S')

print(feature_6)

['S', None, 'VP', 'VP', None, 'VP', None]
