In [1]:
import nltk
import sys

TERMINALS = """
Adj -> "country" | "dreadful" | "enigmatical" | "little" | "moist" | "red"
Adv -> "down" | "here" | "never"
Conj -> "and" | "until"
Det -> "a" | "an" | "his" | "my" | "the"
N -> "armchair" | "companion" | "day" | "door" | "hand" | "he" | "himself"
N -> "holmes" | "home" | "i" | "mess" | "paint" | "palm" | "pipe" | "she"
N -> "smile" | "thursday" | "walk" | "we" | "word"
P -> "at" | "before" | "in" | "of" | "on" | "to"
V -> "arrived" | "came" | "chuckled" | "had" | "lit" | "said" | "sat"
V -> "smiled" | "tell" | "were"
"""

NONTERMINALS = """
S -> NP VP | S Conj S | S Conj VP
PP -> P NP
NP -> N | Adj NP | Det NP | NP PP
VP -> V | VP NP | VP Adv | Adv VP | VP PP

"""

grammar = nltk.CFG.fromstring(NONTERMINALS + TERMINALS)
parser = nltk.ChartParser(grammar)


def main():

    # If filename specified, read sentence from file
    if len(sys.argv) == 2:
        with open(sys.argv[1]) as f:
            s = f.read()

    # Otherwise, get sentence as input
    else:
        s = input("Sentence: ")

    # Convert input into list of words
    s = preprocess(s)

    # Attempt to parse sentence
    try:
        trees = list(parser.parse(s))
    except ValueError as e:
        print(e)
        return
    if not trees:
        print("Could not parse sentence.")
        return

    # Print each tree with noun phrase chunks
    for tree in trees:
        tree.pretty_print()

        print("Noun Phrase Chunks")
        for np in np_chunk(tree):
            print(" ".join(np.flatten()))


def preprocess(sentence):
    """
    Convert `sentence` to a list of its words.
    Pre-process sentence by converting all characters to lowercase
    and removing any word that does not contain at least one alphabetic
    character.
    """
    words = nltk.word_tokenize(sentence)
    processed_words = []
    for word in words:
        alphanumeric_count = 0
        clean_word = []
        for char in word:
            if char.isalnum():
                alphanumeric_count = alphanumeric_count + 1
                clean_word.append(char)
        clean_word = ''.join(clean_word)
        if alphanumeric_count > 0:
            processed_words.append(clean_word.lower())
    return processed_words


def np_chunk(tree):
    """
    Return a list of all noun phrase chunks in the sentence tree.
    A noun phrase chunk is defined as any subtree of the sentence
    whose label is "NP" that does not itself contain any other
    noun phrases as subtrees.
    """
    return []




In [73]:
s = 'Holmes lit a pipe'
s = preprocess(s)

# Attempt to parse sentence
try:
    trees = list(parser.parse(s))
except ValueError as e:
    print(e)

if not trees:
    print("Could not parse sentence.")


In [74]:
trees

[Tree('S', [Tree('NP', [Tree('N', ['holmes'])]), Tree('VP', [Tree('VP', [Tree('V', ['lit'])]), Tree('NP', [Tree('Det', ['a']), Tree('NP', [Tree('N', ['pipe'])])])])])]

In [75]:
trees[1].label()

IndexError: list index out of range

In [108]:
subs = trees[0].subtrees(lambda s: s.height() == 0)
for s in subs:
    for s in sub.subtrees(lambda s: s.height() == 3):
        if s.label() == 'NP':
            contain_subtrees = False
            for s2 in s.subtrees():
                if s2.label() == 'NP':
                    contain_subtrees = True
            #if contain_subtrees == False:
            print(s)

In [77]:

def is_np_chunk(tree):
    """
    Returns true if given tree is a NP chunk.
    A noun phrase chunk is defined as any subtree of the sentence
    whose label is "NP" that does not itself contain any other
    noun phrases as subtrees.
    """
    if tree.label() == 'NP' and \
            not list(tree.subtrees(lambda t: t.label() == 'NP' and t != tree)):
        return True
    else:
        return False

In [78]:
subs = trees[0].subtrees()


In [79]:
subtree_np_check(trees[0])

False

In [80]:
subs = trees[0].subtrees()


In [81]:
subs

<generator object Tree.subtrees at 0x7fee528f0f50>

In [94]:
subs = trees[0].subtrees()

i = 0
for s in subs:
    if i == 3:
        s2s = s.subtrees()
        for s2 in s2s:
            print(s2)
            s2.pretty_print()
    i = i + 1
    #print(s)

(VP (VP (V lit)) (NP (Det a) (NP (N pipe))))
     VP         
  ___|___        
 |       NP     
 |    ___|___    
 VP  |       NP 
 |   |       |   
 V  Det      N  
 |   |       |   
lit  a      pipe

(VP (V lit))
 VP
 |  
 V 
 |  
lit

(V lit)
 V 
 |  
lit

(NP (Det a) (NP (N pipe)))
     NP     
  ___|___    
 |       NP 
 |       |   
Det      N  
 |       |   
 a      pipe

(Det a)
Det
 |  
 a 

(NP (N pipe))
 NP 
 |   
 N  
 |   
pipe

(N pipe)
 N  
 |   
pipe



In [88]:
subs = trees[0].subtrees()
for s in subs:
    if s.label() == 'NP':
        print(subtree_np_check(s))
        
        #if contain_subtrees == False:
        print(s)

this is here
(NP (N holmes))
True
(NP (N holmes))
this is here
(NP (Det a) (NP (N pipe)))
True
(NP (Det a) (NP (N pipe)))
this is here
(NP (N pipe))
True
(NP (N pipe))
