# Context-free grammar text generator

###### Choose which corpora to work with

In [1]:
pathroot = 'corpora_and_rules/frankenstein/'
filename_root = pathroot+'frankenstein'

###### Clean and tag a corpus

In [3]:
import nltk
import string
import glob
import random
# import cPickle as pickle
import pickle
  
path = filename_root+'.txt'   
corpus_members=glob.glob(path)
corpus = ''

# get rid of random line breaks and exclude troublesome expressions like quotes
for member in corpus_members:
    with open (member, "r") as openfile:
        data = openfile.read()
        for badchar in ['\t','\n','-\n','\'','\"','`','|','--']:
            data = data.replace(badchar, ' ')
        data = data.replace('.','. ')
        data = data.replace(',',', ')
        data = data.replace(';','; ')
        data = data.replace(':',': ')
    corpus = corpus + ' '+ data
tokens = nltk.word_tokenize(corpus)

# looks at each word in the context of sentence and tags it
pos_tagged_tokens = nltk.pos_tag(tokens)
pickle.dump( pos_tagged_tokens, open( filename_root+"_tagged.pkl", "wb" ) )

#### Record all terminal characters in a new file

In [4]:
with open(filename_root+"_tagged.pkl", 'rb') as handle:
    pos_tagged_tokens = pickle.load(handle)

# clear file
open(filename_root+"_rules.txt", 'w').close()

file = open(filename_root+"_rules.txt", 'a')

tags = list({tupe[1] for tupe in pos_tagged_tokens})
# dollar signs make everything go wrong
#tags = [item.replace('$','x')for item in tags]
badtags = ['#','$',',','-NONE-','.',':','TO','POS',"''"] # try to get NONE back if possible
tags = [item for item in tags if item not in badtags]
for tag in tags:
#     if tag in ['#','$',',','-NONE-','.',':']:
#         continue
#     else:
    allsyms = [('\'' + tupe[0] + '\'') for tupe in pos_tagged_tokens if tupe[1]== tag]
    gr_rule = (tag + " -> ")
    gr_rule += ' | '.join(allsyms)
    gr_rule += '\n'
    gr_rule = gr_rule.replace('PRP$','PRPx')
    gr_rule = gr_rule.replace('WP$','WPx')
    gr_rule = gr_rule.replace('-NONE-','xNONEx')
    file.write(gr_rule)

# specify these guys when creating grammar
# still need to work with apostrophe, hypens ``
file.write('''xperiod -> '.'\n''')
file.write('''xcomma -> ','\n''')
file.write('''xcolon -> ':'\n''')
file.write('''xsemicolon -> ';'\n''')
file.write('''openparen -> '('\n''')
file.write('''closeparen -> ')'\n''')
file.write('''xapostrophe -> "\'"\n''')
file.write('''xquote -> "''"\n''')

# Save some compiling time
file.write('''TO -> 'to'\n''')

# need to fix this
#file.write('''POS -> "\'s"\n''')


file.close()


###### Pick an random sentence from corpus and print syntax tree

In [5]:
import nltk
from nltk import CFG, ChartParser
from random import choice

from stat_parser import Parser
parser = Parser()
# pick a random sentence to parse
# leave out the period at the end of the sentence
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
rsent = choice(tokenizer.tokenize(corpus))
print (rsent)
parsee=parser.parse(rsent)

rules = ""
to_replace = [',','.',':',';',"''",'(',')','$','+']
replacements = ['xcomma','xperiod','xcolon','xsemicolon',\
                'xquote','openparen','closeparen','x','x']
 
# possibly add: brackets, double quotes

for production in parsee.productions():
    rules += str(production) + '\n'

# now re-tag special characters
swappairs = zip(to_replace, replacements)
for member in swappairs:
    rules = rules.replace(member[0],member[1])

print (rules)


We accordingly lay to,  hoping that some change would take place in the atmosphere and weather.
SBARxS -> NP VP
NP -> PRP
PRP -> 'we'
VP -> ADVP VB SxVP
ADVP -> RB
RB -> 'accordingly'
VB -> 'lay'
SxVP -> TO VP
TO -> 'to'
VP -> xcomma SxVP
xcomma -> 'xcomma'
SxVP -> VBG SBAR
VBG -> 'hoping'
SBAR -> IN S
IN -> 'that'
S -> NP VP xperiod
NP -> DT NN
DT -> 'some'
NN -> 'change'
VP -> MD VB NP
MD -> 'would'
VB -> 'take'
NP -> NP PP
NP -> NN
NN -> 'place'
PP -> IN NP
IN -> 'in'
NP -> NP CC NN
NP -> DT NN
DT -> 'the'
NN -> 'atmosphere'
CC -> 'and'
NN -> 'weather'
xperiod -> 'xperiod'



### Run grammar model

pick the nonterminal grammar rules that you want to use. Usually these need to be manually made by parsing single sentences and removing torublesome characters

In [8]:
rules_file = 'corpora_and_rules/ALL_NT_RULES/homemade_NTrules.txt'

In [9]:
import nltk

from nltk import CFG, ChartParser, Nonterminal
from random import choice

def is_terminal(item):
    """
    Return true a symbol (str) is terminal
    """
    return hasattr(item, '__hash__') and not isinstance(item, Nonterminal)

def produce(grammar, symbol):
    words = []
    productions = grammar.productions(lhs = symbol)
    production = choice(productions)
    for sym in production.rhs():
        if is_terminal(sym):
            words.append(sym)
        # recursion
        else:
            words.extend(produce(grammar, sym))
    return words

# read nonterminal rules from treebank file
file = open(rules_file, 'r')
cfg_str = file.read()
file.close()

# read leaves from corpus
file = open(filename_root+"_rules.txt", 'r')
cfg_str += '\n' + file.read()
file.close()

grammar = CFG.fromstring(cfg_str)

parser = ChartParser(grammar)

gr = parser.grammar()

# make an S symbol to start the fun
tgr = ChartParser(CFG.fromstring('''S -> NP VP''')).grammar()
tgr.start()
# gr.max_len()

S

### Generate some random sentences

In [11]:
#message = produce(gr, tgr.start())
for ii in xrange(10):
    print ' '.join(produce(gr, tgr.start())) 


# # tokenize by sentence and pick some random sentences to use?
# # really need to get punctuation working
# from stat_parser import Parser
# parser = Parser()
# kk=parser.parse("At the end of the nineteenth century the well-tested mechanical principles of Isaac Newton were the very heart of physical theory.")

sailors rested that it that enemies of the imagination
feelings often , deathbed although first fiend in but past of myself .
which did few fate , forward to not was the same university , until this horror was the nature for me : the wretchedness the building with world along the moral proposition , the mountain of I against Beware each trade is creature of Hampden slaughter- . : your meal is content combat with her mother .
which to taste I , again dated and pleasing of him on dearest with I : The letter as This miserable arm upon The reflection of its thousand and the death of the wood make called by the murderer : her fearful much opened nature , in the north religion and the progress , and to your pain , Nature In been into Below .
Have is dictate as breathless
next kindness her black , light of It I wished as our perish certainty .
horrible wretch that memory , their lake I to me , which the glad unhappiness of the paroxysm of a several engagement of Clerval in her apartment and a

## Appendix

### create a grammar file from Treebank

In [None]:
parser = ChartParser(tbank_grammar)
gr = parser.grammar()
print ' '.join(produce(gr, gr.start()))