In [1]:
from nltk.parse.generate import generate, demo_grammar

In [2]:
from nltk import CFG

In [3]:
# Read the grammar definition from disk and echo it so the rules are visible
# in the notebook. The encoding is pinned so the read does not depend on the
# platform's default locale encoding.
with open('grammarku.txt', 'r', encoding='utf-8') as myfile:
    data = myfile.read()
print(data)

S -> NP VP
NP -> Det N | NP PP
PP -> P NP
VP -> V NP | VP PP
Det -> 'the' | 'a'
N -> 'cat' | 'dog'
V -> 'chased' | 'sat'
P -> 'in' | 'on'



In [4]:
# Build an nltk CFG from the grammar text held in `data`; printing it lists
# one production per line, with "|" alternatives split into separate rules.
gram = CFG.fromstring(data)
print(gram)

Grammar with 14 productions (start state = S)
    S -> NP VP
    NP -> Det N
    NP -> NP PP
    PP -> P NP
    VP -> V NP
    VP -> VP PP
    Det -> 'the'
    Det -> 'a'
    N -> 'cat'
    N -> 'dog'
    V -> 'chased'
    V -> 'sat'
    P -> 'in'
    P -> 'on'


In [5]:
# Second grammar for comparison: the demo_grammar string imported from
# nltk.parse.generate.
g = CFG.fromstring(demo_grammar)

In [6]:
# Show the productions of the demo grammar.
print(g)

Grammar with 13 productions (start state = S)
    S -> NP VP
    NP -> Det N
    PP -> P NP
    VP -> 'slept'
    VP -> 'saw' NP
    VP -> 'walked' PP
    Det -> 'the'
    Det -> 'a'
    N -> 'man'
    N -> 'park'
    N -> 'dog'
    P -> 'in'
    P -> 'with'


In [7]:
# Enumerate sentences derivable from `gram`, stopping after the first ten.
# Each sentence is a list of word tokens; print() joins them with spaces.
for sentence in generate(gram, n=10):
    print(*sentence)

the cat chased the cat
the cat chased the dog
the cat chased a cat
the cat chased a dog
the cat chased the cat in the cat
the cat chased the cat in the dog
the cat chased the cat in a cat
the cat chased the cat in a dog
the cat chased the cat in the cat in the cat
the cat chased the cat in the cat in the dog


In [8]:
# Enumerate every sentence of `gram` reachable within a derivation depth of 5.
# Each sentence is a list of word tokens; print() joins them with spaces.
for sentence in generate(gram, depth=5):
    print(*sentence)

the cat chased the cat
the cat chased the dog
the cat chased a cat
the cat chased a dog
the cat sat the cat
the cat sat the dog
the cat sat a cat
the cat sat a dog
the dog chased the cat
the dog chased the dog
the dog chased a cat
the dog chased a dog
the dog sat the cat
the dog sat the dog
the dog sat a cat
the dog sat a dog
a cat chased the cat
a cat chased the dog
a cat chased a cat
a cat chased a dog
a cat sat the cat
a cat sat the dog
a cat sat a cat
a cat sat a dog
a dog chased the cat
a dog chased the dog
a dog chased a cat
a dog chased a dog
a dog sat the cat
a dog sat the dog
a dog sat a cat
a dog sat a dog


In [9]:
# Number of sentences the demo grammar `g` generates, counted one at a time
# instead of building the whole list first.
sum(1 for _ in generate(g))

114

In [10]:
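# Presumably left disabled because `gram` is recursive (NP -> NP PP,
# VP -> VP PP): without an `n=` or `depth=` bound there is no finite list
# of sentences for generate() to enumerate.
# for sentence in generate(gram):
#     print(' '.join(sentence))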

# NLTK Book

In [11]:
import nltk

# Toy grammar in which a prepositional phrase can attach either to the verb
# phrase (VP -> VP PP) or to the noun phrase (NP -> Det N PP), so a sentence
# ending in a PP can receive more than one parse tree.
groucho_grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)

In [12]:
# Chart-parse the sentence with groucho_grammar and print every tree found;
# each printed tree is one reading of the sentence.
parser = nltk.ChartParser(groucho_grammar)
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
trees = parser.parse(sent)
for tree in trees:
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [13]:
# Small English CFG: phrase rules for S, VP, PP and NP, followed by a lexicon
# of verbs, proper names, determiners, nouns and prepositions.
grammar1 = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | V NP PP
    PP -> P NP
    V -> "saw" | "ate" | "walked"
    NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
    Det -> "a" | "an" | "the" | "my"
    N -> "man" | "dog" | "cat" | "telescope" | "park"
    P -> "in" | "on" | "by" | "with"
""")

In [14]:
# Parse "Mary saw Bob" with a recursive-descent parser built from grammar1
# and print each resulting tree.
rd_parser = nltk.RecursiveDescentParser(grammar1)
sent = "Mary saw Bob".split()
trees = rd_parser.parse(sent)
for tree in trees:
    print(tree)

(S (NP Mary) (VP (V saw) (NP Bob)))


In [15]:
# Parse "Mary saw Bob" again, this time with a grammar loaded from the
# mygrammar.cfg file rather than from an inline string.
grammar2 = nltk.data.load('file:mygrammar.cfg')
rd_parser = nltk.RecursiveDescentParser(grammar2)
sent = "Mary saw Bob".split()
trees = rd_parser.parse(sent)
for tree in trees:
    print(tree)

(S (NP Mary) (VP (V saw) (NP Bob)))


In [16]:
# CFG with a recursive nominal (Nom -> Adj Nom) and sentential complements
# (VP -> V S).
# NOTE(review): no later cell in this view uses grammar3 — confirm it is
# still needed.
grammar3 = nltk.CFG.fromstring("""
  S  -> NP VP
  NP -> Det Nom | PropN
  Nom -> Adj Nom | N
  VP -> V Adj | V NP | V S | V NP PP
  PP -> P NP
  PropN -> 'Buster' | 'Chatterer' | 'Joe'
  Det -> 'the' | 'a'
  N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
  Adj  -> 'angry' | 'frightened' |  'little' | 'tall'
  V ->  'chased'  | 'saw' | 'said' | 'thought' | 'was' | 'put'
  P -> 'on'
  """)

In [17]:
# Parse "Mary saw a dog" with a recursive-descent parser built from grammar1
# and print each resulting tree.
sent = "Mary saw a dog".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
trees = rd_parser.parse(sent)
for tree in trees:
    print(tree)

(S (NP Mary) (VP (V saw) (NP (Det a) (N dog))))


In [18]:
from nltk.corpus import treebank
# Print the first parsed sentence of the treebank file wsj_0001.mrg.
wsj_sents = treebank.parsed_sents('wsj_0001.mrg')
t = wsj_sents[0]
print(t)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


In [19]:
# Hand-written grammar as a plain dict: each key is a non-terminal (marked by
# a leading underscore) and each value lists its alternative productions as
# space-separated symbol strings. Words without an underscore are terminals.
grammar = {
    # --- phrase-structure rules ---
    "_S": [
        "_NP _VP",
    ],
    "_NP": [
        "_NN",
        "_NN _DT",
        "_NN _JJ",
    ],
    "_VP": [
        "_VBI _NP",
        "_VBI _IN _NP",
        "_VBI _NP _PP",
    ],
    "_PP": [
        "_IN _NP",
    ],
    # --- lexicon ---
    "_NN": [
        "shelter", "kampung", "pantai", "marina",
        "pelangi", "jalan", "simpang",
    ],
    "_VBI": ["naik", "turun"],
    "_JJ": ["deket"],
    "_IN": ["di", "ke", "dari"],
    "_DT": ["ini", "itu"],
}

In [20]:
# Dict grammar with coordination (_CC) and adjective phrases (_JJP). Keys are
# non-terminals (leading underscore); values list alternative productions as
# space-separated symbol strings.
# NOTE: this rebinds `grammar1`, which an earlier cell bound to an nltk CFG.
grammar1 = {
    # --- phrase-structure rules ---
    "_JJP": [
        "_JJ",
        "_JJ _CC _JJ",
    ],
    "_S": [
        "_NP _VP",
    ],
    "_NP": [
        "_NP _CC _NP",
        "_NN _DT",
        "_NN _JJP",
        "_NN",
    ],
    "_VP": [
        "_PP",
    ],
    "_PP": [
        "_IN _NP",
    ],
    # --- lexicon ---
    "_NN": [
        "simpang", "arus", "menit", "lintas",
        "tugu", "parkir", "kawasan", "bawah",
    ],
    "_JJ": ["lalu", "muda"],
    "_IN": ["di", "ke", "dari"],
    "_DT": ["ini", "itu"],
    "_CC": ["dan", "atau"],
}

In [19]:
# Dict grammar with verbs, coordination (_CC) and adjective phrases (_JJP).
# Keys are non-terminals (leading underscore); values list alternative
# productions as space-separated symbol strings.
# NOTE: this rebinds `grammar2`, which an earlier cell bound to the grammar
# loaded from mygrammar.cfg.
grammar2 = {
    # --- phrase-structure rules ---
    "_S": [
        "_NP _VP",
    ],
    "_NP": [
        "_NP _CC _NP",
        "_NN _DT",
        "_NN _JJP",
        "_NN",
    ],
    "_VP": [
        "_VBI _NP",
        "_VBI _NP _PP",
    ],
    "_PP": [
        "_IN _NP",
    ],
    "_JJP": [
        "_JJ",
        "_JJ _CC _JJ",
    ],
    # --- lexicon ---
    "_VBI": ['memakan', 'melihat'],
    "_NN": ["sapi", "kambing", "manusia"],
    "_JJ": ["gemuk", "kurus"],
    "_IN": ["di", "ke", "dari"],
    "_DT": ["ini", "itu"],
    "_CC": ["dan", "atau"],
}

In [20]:
import random
def is_terminal(token):
    """Return True when ``token`` is a terminal (a literal word).

    Non-terminal symbols in the grammar dictionaries are marked with a
    leading underscore (e.g. ``"_NP"``); anything else is a terminal.

    An empty string carries no underscore marker, so it is reported as a
    terminal rather than raising ``IndexError`` the way ``token[0]`` would.
    """
    return not token.startswith("_")

In [21]:
def expand(grammar, tokens):
    """Randomly rewrite ``tokens`` until only terminals remain.

    Symbols are processed left to right. Each non-terminal is replaced by
    one of its productions, chosen with ``random.choice``; the production
    string is split on whitespace and its symbols are processed in turn.

    Args:
        grammar: dict mapping each non-terminal symbol to a list of
            production strings (space-separated symbols).
        tokens: list of symbols to start from. It is not modified.

    Returns:
        A new list containing only terminal tokens.

    Raises:
        KeyError: if a non-terminal has no entry in ``grammar``.
        IndexError: if a non-terminal maps to an empty list of productions
            (raised by ``random.choice``).
    """
    # Reversed copy used as a stack: the leftmost unprocessed symbol is
    # always on top, and the caller's list is never touched.
    pending = list(reversed(tokens))
    sentence = []

    # A loop instead of one recursive call per rewrite, so long derivations
    # cannot run into Python's recursion limit and already-emitted terminals
    # are not rescanned after every rewrite.
    while pending:
        symbol = pending.pop()

        # Terminals go straight to the output.
        if is_terminal(symbol):
            sentence.append(symbol)
            continue

        # Non-terminal: choose one of its productions at random.
        production = random.choice(grammar[symbol])

        # Always split the production into symbols. Deciding from its first
        # character alone would leave a production such as "word _NP" as a
        # single token whose non-terminal never gets expanded.
        pending.extend(reversed(production.split()))

    return sentence

In [22]:
def generate_sentence(grammar):
    """Produce one random sentence from ``grammar`` as a list of tokens.

    Expansion starts from the single start symbol ``"_S"``.
    """
    start_symbols = ["_S"]
    return expand(grammar, start_symbols)

In [24]:
# Seed the random module so that re-running this cell prints the same ten
# sentences; change or remove the seed to sample different ones.
random.seed(42)

# Print ten random sentences drawn from grammar2, one per line.
for i in range(10):
    print(' '.join(generate_sentence(grammar2)))

kambing melihat kambing itu di manusia itu
sapi melihat sapi ini atau sapi dan sapi
manusia atau sapi kurus atau gemuk atau kambing kurus atau kurus dan sapi ini atau kambing kurus dan gemuk melihat sapi gemuk dari sapi
manusia kurus melihat kambing gemuk
manusia itu melihat kambing itu
manusia melihat manusia itu atau kambing gemuk dan gemuk dan kambing itu ke sapi kurus atau kurus atau manusia kurus dan kurus
kambing kurus atau gemuk memakan kambing ini di sapi itu
kambing itu dan sapi gemuk dan kurus melihat manusia gemuk
manusia memakan sapi
manusia itu melihat manusia gemuk ke sapi
