# Context-free grammar text generator

In [87]:
import nltk

from nltk import data, CFG, ChartParser, Nonterminal
from random import choice

import string
import glob
import random
# import cPickle as pickle
# import pickle
import os.path
import warnings

from stat_parser import Parser



def clean_corpus(path):
    '''
    Load a corpus from disk and blank out the characters and expressions
    that just aren't worth dealing with (tabs, line breaks, quotes,
    backticks, pipes, double dashes and hyphenated line breaks)

    Parameters
    ----------

    path : str
        Path to a .txt file containing a corpus. The value is handed to
        glob.glob(), so a wildcard pattern matching several files also
        works; the cleaned files are then concatenated.

    Returns
    -------

    corpus : str
        The cleaned text of every matching file, each one preceded by a
        single space. An empty string if nothing matches path.

    '''
    # Every entry is replaced by one space. The two-character entries must be
    # handled before the single characters they contain: once '\n' has been
    # blanked, a hyphenated line break ('-\n') can no longer be found and the
    # dangling hyphen would stay in the text. '--' goes first so that a dash
    # sitting at the end of a line is still removed as a whole.
    badchars = ['--', '-\n', '\t', '\n', '\r', '\'', '\"', '`', '|']

    cleaned = []
    for member in glob.glob(path):
        with open(member, "r") as openfile:
            data = openfile.read()
        for badchar in badchars:
            data = data.replace(badchar, ' ')
        cleaned.append(data)

    # each file contributes ' ' + its text, as a plain running concatenation would
    return ''.join(' ' + data for data in cleaned)
        
def tag_corpus(corpus):
    '''
    Label every word of a corpus with its linguistic function
    (part of speech) using NLTK

    Parameters
    ----------

    corpus : str
        The corpus text, already stripped of all troublesome
        characters by the clean_corpus() function

    Returns
    -------

    pos_tagged_tokens : list of tuples
        One tuple per token, with the word in position 1 and
        its function within the sentence in position 2

    '''
    # split into word tokens first, then tag the whole token list in one call
    return nltk.pos_tag(nltk.word_tokenize(corpus))

def make_terminal_rules(pos_tagged_tokens):
    '''
    Turn a list of tagged words into the terminal rules of a grammar,
    one rule per tag listing the words that carry that tag

    Parameters
    ----------

    pos_tagged_tokens : list of tuples
        (word, tag) pairs, such as the output of tag_corpus()

    Returns
    -------

    all_rules : str
        One line per tag of the form  TAG -> 'word' | 'word' | ...
        followed by a fixed set of rules for punctuation and 'to'.
        Every occurrence of a word is listed, so repeated words
        appear repeatedly in a rule.

    '''
    # tags that are not turned into a rule of their own (bad terminal tags)
    skipped = ['#','$',',','-NONE-','.',':','TO','POS',"''",'(',')']
    wanted = [tag for tag in list({pair[1] for pair in pos_tagged_tokens})
              if tag not in skipped]

    # gather the quoted words of every tag in a single pass, in corpus order
    quoted_by_tag = {}
    for pair in pos_tagged_tokens:
        quoted_by_tag.setdefault(pair[1], []).append('\'' + pair[0] + '\'')

    # '$' and '-' are swapped for an 'x' in these names, across the whole rule
    renames = (('PRP$','PRPx'), ('WP$','WPx'), ('-NONE-','xNONEx'))

    pieces = []
    for tag in wanted:
        rule = tag + " -> " + ' | '.join(quoted_by_tag[tag]) + '\n'
        for old, new in renames:
            rule = rule.replace(old, new)
        pieces.append(rule)

    # fixed rules for punctuation marks and for 'to'
    pieces.extend((
        '''xperiod -> '.'\n''',
        '''xcomma -> ','\n''',
        '''xcolon -> ':'\n''',
        '''xsemicolon -> ';'\n''',
        '''openparen -> '('\n''',
        '''closeparen -> ')'\n''',
        '''xapostrophe -> "\'"\n''',
        '''xquote -> "''"\n''',
        '''TO -> 'to'\n''',
    ))

    return ''.join(pieces)


def parse_sentence(my_sentence, parser=None):
    '''
    Parse a sentence and return the productions of its parse tree as
    grammar rules, with special characters swapped for plain names

    Parameters
    ----------

    my_sentence : str
        A single sentence (str)

    parser : object, optional
        An object with a parse() method, used to parse the sentence.
        A new stat_parser Parser is created when this is left out;
        pass one in to reuse it across calls.

    Returns
    -------

    rules : str
        One production per line. The swap is applied to the whole text
        of each rule, so a punctuation terminal such as ',' comes out
        as 'xcomma' on both sides of the arrow.

    '''
    if parser is None:
        parser = Parser()
    parsee = parser.parse(my_sentence)

    rules = ''.join(str(production) + '\n' for production in parsee.productions())

    # Possessive tags are renamed first, to the same names that
    # make_terminal_rules() gives them (PRPx / WPx). Left to the generic
    # '$' swap they would become PRPxdollar / WPxdollar and never match
    # the terminal rules built from the corpus.
    # possibly add: brackets, double quotes
    swappairs = [
        ('PRP$', 'PRPx'),
        ('WP$', 'WPx'),
        (',', 'xcomma'),
        ('.', 'xperiod'),
        (':', 'xcolon'),
        (';', 'xsemicolon'),
        ("''", 'xquote'),
        ('(', 'openparen'),
        (')', 'closeparen'),
        ('$', 'xdollar'),
        ('+', 'xplus'),
    ]

    # now re-tag special characters
    for special, name in swappairs:
        rules = rules.replace(special, name)

    return rules


def is_terminal(symb):
    '''Tell whether a grammar symbol is a terminal

    A symbol counts as terminal when it has a __hash__ attribute and
    is not an NLTK Nonterminal.

    Parameters
    ----------

    symb : object
        A symbol taken from the right-hand side of a production

    Returns
    -------

    out : bool
        True if symb is terminal, False otherwise

    '''
    if not hasattr(symb, '__hash__'):
        return False
    return not isinstance(symb, Nonterminal)


def produce(grammar, symbol, depth=0, maxdepth=25):
    '''
    Expand a symbol into a list of words by picking, at random, one of
    the grammar's productions for it and expanding every nonterminal on
    its right-hand side in the same way

    Parameters
    ----------

    grammar : nltk.grammar.CFG

    symbol : nltk.grammar.Nonterminal
        The symbol to expand

    depth : int
        The depth of the recursive tree search

    maxdepth : int
        The maximum allowed recursion depth before throwing a
        ValueError

    Returns
    -------

    words : list
        The terminal symbols reached, in left-to-right order

    TODO: make a custom UserError type

    '''
    if depth > maxdepth:
        raise ValueError('Recursion went too deep, one of the example syntax sentences might be poorly formed or poorly parsed')

    # one production for this symbol, drawn uniformly from those available
    chosen = choice(grammar.productions(lhs = symbol))

    words = []
    for sym in chosen.rhs():
        if not is_terminal(sym):
            # nonterminal: expand it one level deeper and splice the result in
            words += produce(grammar, sym, depth=depth+1, maxdepth=maxdepth)
            continue
        words.append(sym)
    return words

def make_sentence(cfg_str):
    '''
    Generate one random sentence from a grammar, starting from the
    left-hand side of the first rule in the string

    Parameters
    ----------

    cfg_str : str
        a string containing a context free grammar

    Returns
    -------

    out_txt : str
        The generated words joined by single spaces, with the
        placeholder names turned back into special characters

    '''
    gr = ChartParser(CFG.fromstring(cfg_str)).grammar()

    # the start symbol is whatever precedes the first arrow in the string
    arrow_at = cfg_str.find(' ->')
    startpt = nltk.grammar.Nonterminal(cfg_str[:arrow_at])

    out_txt = ' '.join(produce(gr, startpt))

    # placeholder name -> the special character it stands for
    placeholders = (
        ('xcomma', ','),
        ('xperiod', '.'),
        ('xcolon', ':'),
        ('xsemicolon', ';'),
        ('xquote', "''"),
        ('openparen', '('),
        ('closeparen', ')'),
        ('xdollar', '$'),
        ('xplus', '+'),
    )
    for placeholder, special in placeholders:
        out_txt = out_txt.replace(placeholder, special)

    return out_txt


def make_sentence2(corpus, term_rules, maxdepth=25, max_attempts=30):
    '''

    Generate sentences with random structure and word choice
    using a context-free grammar

    A sentence is drawn at random from the corpus and parsed; its
    productions are appended to the terminal rules to form the grammar.
    The start point is taken from the sentence itself (the left-hand
    side of its first production). If generation fails, another
    sentence is drawn.

    Parameters
    ----------

    corpus : str
        a string containing the full, cleaned corpus

    term_rules : str
        a string containing all the terminal rules

    maxdepth : int
        The maximum allowed recursion depth before throwing a
        ValueError

    max_attempts : int
        How many sentences to try before giving up

    Returns
    -------

    out_txt : str
        The generated words joined by single spaces, with the
        placeholder names turned back into special characters

    Raises
    ------

    ValueError
        If no sentence could be generated within max_attempts tries

    '''
    # Loading the tokenizer and splitting the corpus give the same result on
    # every attempt, so both are done once rather than on each retry
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(corpus)

    out_txt = None
    for _ in range(max_attempts):
        rsent = choice(sentences)
        parsed_syntax = parse_sentence(rsent)
        cfg_str = term_rules + parsed_syntax
        startpt = parsed_syntax[:parsed_syntax.find(' ->')]
        startpt = nltk.grammar.Nonterminal(startpt)
        try:
            grammar = CFG.fromstring(cfg_str)
            out_txt = ' '.join(produce(grammar, startpt, maxdepth=maxdepth))
            break
        except (ValueError, IndexError):
            # ValueError: the grammar could not be built, or produce() went
            # deeper than maxdepth. IndexError: produce() reached a symbol with
            # no production, so choice() had nothing to pick from.
            warnings.warn('Badly formed sentence encountered, resampling the corpus.')

    if out_txt is None:
        # without this, the code below would fail on an unassigned out_txt
        raise ValueError('Could not generate a sentence in %d attempts' % max_attempts)

    from_replace = [',','.',':',';',"''",'(',')','$','+']
    replacements = ['xcomma','xperiod','xcolon','xsemicolon',
                    'xquote','openparen','closeparen','xdollar','xplus']

    # now turn the placeholder names back into special characters
    for name, special in zip(replacements, from_replace):
        out_txt = out_txt.replace(name, special)

    return out_txt

In [None]:
TODO: Why do sentences seem to stop abruptly? Is it the lack of a period? Check the structure.
TODO: Deal with empty productions in the grammar.

In [54]:
# Scratch: build a grammar from the single rule S -> NP VP; the cells that
# follow inspect it (kk3.start() and its type)
kk3 = ChartParser(CFG.fromstring('''S -> NP VP''')).grammar()


In [68]:
type(kk3.start())

nltk.grammar.Nonterminal

In [69]:
nltk.grammar.Nonterminal('S')

S

In [67]:
kk3

AttributeError: 'list' object has no attribute 'start'

In [88]:
# Print ten generated sentences; make_sentence2 draws a new corpus sentence
# for the structure on every call.
# mycorp and termrules_mycorp are assigned in the "random sentences with
# fixed grammar" cell further down, so that cell has to be run first.
for ii in range(10):
    print(make_sentence2(mycorp, termrules_mycorp))
    print('\n')

PRPx
the whole wretchedness became of The with the the and in shrill and lips as all impertinent account by the that dangerous but smiles of one cousin for that joy science of You human and friends But slaked which must desired by the with the the either arrived which can be replenished of the tried abhorrence . . .


PRPx
which magistrate ?


PRPx
When I bloomed , for all kindness although the boundless , of the love passed , of the aim of my father of I , If blood for the perceived The evil was hear to ten before us the man was dress to felt with I not right down kind and against I had despairing , that yourself drunk , but fallen now more innocent in the destruction probably yet forever sun and of him knew you , by the evening own , and passed I not more wide from I .


PRPx
monster was little from the steeples , the peaceful brightness were various of the cheerfulness


PRPx
you trembled hushed a effect for I She had young to was brought every triumph that promise


PRPx
little amu

In [74]:
# random sentences with fixed grammar

# Load and clean the corpus, tag every word, and build the terminal rules
mycorp = clean_corpus('/Users/william/python_files/cfgen/full_books/frankenstein.txt')
tagged_corpus = tag_corpus(mycorp)
termrules_mycorp = make_terminal_rules(tagged_corpus)

# Pick one sentence of the corpus at random and print it for reference
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
rsent = choice(tokenizer.tokenize(mycorp))
print (rsent)

# Parse that sentence once; the same parsed rules are used for every sentence below
kk1 = parse_sentence(rsent)

# Generate ten sentences from the terminal rules plus that single parse
for ii in range(10):
    print(make_sentence(termrules_mycorp+kk1))
    print('\n')

When I recovered I found myself surrounded by the people of the inn; their countenances expressed a breathless terror, but the horror of others appeared only as a mockery, a shadow of the feelings that oppressed me.
PRPx
, But the time seen answer led that returns


PRPx
every a property for me was they


PRPx
, and I as I had vices


PRPx
I had the delight .


PRPx
which seemed looks : The happy appearance resolved I


PRPx
a the calmer in the misery stole we acquainted pursue softened off the poor s .


PRPx
the girl were the neighbourhood .


PRPx
She approved a snow went the business .


PRPx
him for These case which were anticipations


PRPx
how all family was longer of some dear sea . ; this horror sparkled he




### TODO

+ Remove capital letters on all words that start a sentence (identify them with the regular expression '\. .')
+ + Put them back into the final format using the same operation inverted
+ + Need to identify words that are capitalized wherever they appear, not only at the start of a sentence (e.g. first names). Maybe make a list of all words that appear in both capitalized and non-capitalized form?
+ Fix the weird double period/comma problem, figure out where that's coming from.