In [1]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel (a bare !pip may resolve to a different Python installation).
%pip install nltk

You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
from nltk import CFG, ChartParser
from nltk.treeprettyprinter import TreePrettyPrinter
import re

import json

# [Production rules](https://en.wikipedia.org/wiki/Production_(computer_science))

In [3]:
# Grammar source text that a later cell hands to CFG.fromstring().
# Everything between the triple quotes, including the lines that start with
# "#", is part of the string passed to the grammar reader; those lines are
# not Python comments.
production_rules = """
# CFG that derives "some cats are mysterious"

S   -> DP VP

# Determiner Phrase
DP   -> Det NP
DP   -> NP

# Noun Phrase
NP   -> N

# Verb Phrase
VP   -> V
VP   -> V AP

# Adjective Phrase
AP   -> A

# NOTE: terminals *must* be quoted!
V    -> "is"
V    -> "are"

Det  -> "the"
# we can use | for disjunctions
Det  -> "an" | "a"
Det  -> "some" | "any" | "every" | "all"

N    -> "cat" | "cats"

A    -> "mysterious"
"""

# Chart parsing

In [4]:
# Build the grammar object from the production-rule text defined above.
cfg = CFG.fromstring(production_rules)

In [5]:
cfg.productions()

[S -> DP VP,
 DP -> Det NP,
 DP -> NP,
 NP -> N,
 VP -> V,
 VP -> V AP,
 AP -> A,
 V -> 'is',
 V -> 'are',
 Det -> 'the',
 Det -> 'an',
 Det -> 'a',
 Det -> 'some',
 Det -> 'any',
 Det -> 'every',
 Det -> 'all',
 N -> 'cat',
 N -> 'cats',
 A -> 'mysterious']

Initialize a chart parser

In [6]:
parser = ChartParser(cfg)

In [7]:
parser.grammar().productions()

[S -> DP VP,
 DP -> Det NP,
 DP -> NP,
 NP -> N,
 VP -> V,
 VP -> V AP,
 AP -> A,
 V -> 'is',
 V -> 'are',
 Det -> 'the',
 Det -> 'an',
 Det -> 'a',
 Det -> 'some',
 Det -> 'any',
 Det -> 'every',
 Det -> 'all',
 N -> 'cat',
 N -> 'cats',
 A -> 'mysterious']

In [8]:
sent = ["some", "cats", "are", "mysterious"]

# For every parse of the sentence, list the productions it uses as JSON and
# then draw the tree as ASCII art.
for tree in parser.parse(sent):
    productions = list(map(str, tree.productions()))
    rules_json = json.dumps(productions, indent=4)
    print(f"Production rules: {rules_json}\n")
    tree.pretty_print()

Production rules: [
    "S -> DP VP",
    "DP -> Det NP",
    "Det -> 'some'",
    "NP -> N",
    "N -> 'cats'",
    "VP -> V AP",
    "V -> 'are'",
    "AP -> A",
    "A -> 'mysterious'"
]

               S                    
       ________|_______              
      DP               VP           
  ____|___          ___|______       
 |        NP       |          AP    
 |        |        |          |      
Det       N        V          A     
 |        |        |          |      
some     cats     are     mysterious



In [9]:
# Keep only the first parse of the sentence. next() raises StopIteration if
# the parser yields no tree at all.
tree = next(parser.parse(["some", "cats", "are", "mysterious"]))

In [10]:
# Subtrees whose yield is exactly the single token "cats"
cats = [st for st in tree.subtrees() if st.leaves() == ["cats"]]
cats[0].pretty_print()

# Subtrees rooted in an adjective phrase (label "AP")
adjp = list(filter(lambda st: st.label() == "AP", tree.subtrees()))
adjp[0].pretty_print()

 NP 
 |   
 N  
 |   
cats

    AP    
    |      
    A     
    |      
mysterious



### Render `nltk.Tree` using LaTeX

#### [`qtree`](https://ctan.org/pkg/qtree) package

In [11]:
print(tree.pformat_latex_qtree())

\Tree [.S
        [.DP [.Det some ] [.NP [.N cats ] ] ]
        [.VP [.V are ] [.AP [.A mysterious ] ] ] ]


#### [`forest`](https://ctan.org/pkg/forest) package

In [12]:
def to_latex_forest(tree):
    '''
    Convert an nltk.Tree into LaTeX source for the forest package
    (see https://github.com/sasozivanovic/forest).

    The tree is serialized with pformat_latex_qtree() and the resulting
    qtree bracket notation is rewritten into forest bracket notation:
    the leading Tree macro and the "." label prefix are removed, a "_BAR"
    suffix becomes a prime, and every terminal gets its own brackets.

    :param tree: object exposing pformat_latex_qtree() (e.g. an nltk.Tree)
    :returns: str containing a complete forest environment
    '''
    # All backslashes are escaped explicitly: sequences such as "\T", "\s",
    # "\i" and "\e" are invalid escapes in ordinary string literals and
    # trigger warnings on current Python versions.
    res = tree.pformat_latex_qtree() \
           .replace("[.", "[") \
           .replace("\\Tree", "") \
           .replace("_BAR", "'")
    # format terminals
    # Lookarounds test for the surrounding whitespace without consuming it,
    # so two terminals separated by a single space are both wrapped.
    res = re.sub(r"(?<=\s)([^\s\]\[]+)(?=\s)", r"[\1]", res)
    # forest env
    # terminal alignment solution: https://tex.stackexchange.com/a/214657
    return f"""
\\begin{{forest}}
  for tree={{
    font=\\scshape,
    grow=south,
    parent anchor=south, 
    child anchor=north,
    if n children={{0}}{{tier=terminal, no edge}}{{}},
  }},
  for leaves={{font=\\itshape}}
  {res}
\\end{{forest}}
""".strip()

In [13]:
print(to_latex_forest(tree))

\begin{forest}
  for tree={
    font=\scshape,
    grow=south,
    parent anchor=south, 
    child anchor=north,
    if n children={0}{tier=terminal, no edge}{},
  },
  for leaves={font=\itshape}
   [S
        [DP [Det [some] ] [NP [N [cats] ] ] ]
        [VP [V [are] ] [AP [A [mysterious] ] ] ] ]
\end{forest}


In [14]:
prods = tree.productions()

## Recovering partial parses

We can recover partial parses from our chart.

In [15]:
# Same sentence without the determiner. chart_parse() hands back the chart
# object, whose edges are inspected in the following cells.
sent  = ["cats", "are", "mysterious"]
chart = parser.chart_parse(sent)

In [16]:
# get the chart as a dot (https://en.wikipedia.org/wiki/DOT_(graph_description_language))
# !pip install graphviz
# see https://graphviz.readthedocs.io/en/stable/manual.html#using-raw-dot
#chart.dot_digraph()

In [17]:
chart.edges()

[[Edge: [0:1] 'cats'],
 [Edge: [1:2] 'are'],
 [Edge: [2:3] 'mysterious'],
 [Edge: [0:1] N  -> 'cats' *],
 [Edge: [0:1] NP -> N *],
 [Edge: [0:1] DP -> NP *],
 [Edge: [0:1] S  -> DP * VP],
 [Edge: [1:2] V  -> 'are' *],
 [Edge: [1:2] VP -> V *],
 [Edge: [1:2] VP -> V * AP],
 [Edge: [0:2] S  -> DP VP *],
 [Edge: [2:3] A  -> 'mysterious' *],
 [Edge: [2:3] AP -> A *],
 [Edge: [1:3] VP -> V AP *],
 [Edge: [0:3] S  -> DP VP *]]

In [18]:
# Pick the second-to-last edge of the chart; in the edge listing above that
# is the complete VP edge spanning [1:3]. The index is purely positional.
tree_edge = chart.edges()[-2]
tree_edge

[Edge: [1:3] VP -> V AP *]

In [19]:
# What nonterminal does this production capture?
tree_edge.lhs()

VP

In [20]:
# What does the nonterminal map/expand to?
tree_edge.rhs()

(V, AP)

In [21]:
tree_edge.is_complete()

True

In [22]:
# Tokens of the sentence covered by the edge, from start() up to end().
sent[tree_edge.start():tree_edge.end()]

['are', 'mysterious']

# Data
A non-exhaustive list of treebanks for English and other languages:
 - https://en.wikipedia.org/wiki/Treebank#Syntactic_treebanks