# Context-free grammar text generator

**Pack everything up nice and neatly, get generator working as well as it can **

+ Replace pyStatParser with something Python 3 compatible (attempt to write an efficient CYK parser)
    + a very good option is [disco-dop](https://github.com/andreasvc/disco-dop)
+ When preparing corpus, delete lines that are shorter than most (likely chapter headings)
+ feed a slightly modified corpus into the kgram dictionary generator in order to account for periods and commas and other punctuation
+ Need to identify words that are capital both when they start the sentence and whenever they appear (first names). Maybe make a list of all words that appear in both capital and non capital form?

+ If pyStatParser fails, select sentence rules from a library of already-processed sentences (this might actually work better)
+ If a generated sentence has bad grammar, try again.
+ The 3-markov sentences of a given length really should have the largest similarity scores


**Next Research ideas**
+ compare different authors using bootstrapping
+ generate text using words from one author and syntax from a different author (style swapper)
+ can I 1. observe zipf's law in Frankenstein, and 2. observe it in text generated from Markov and CFG markov? (could be a way to bootstrap)

In [6]:
import sys
sys.path.append("..")
# from app import object
from cfgen import *

%load_ext autoreload
%autoreload 2
# %autoreload 0

In [38]:

from stat_parser import Parser

sents = [
    "No mortal could support the horror of that countenance.",
    "You will smile at my allusion, but I will disclose a secret.",
    "Before this I was not unacquainted with the more obvious laws of electricity.",
    "I passed the night wretchedly.",
    "The cow jumped over the moon, and the man rode a horse.",
    "The monster ran out the door, and the creator rode a horse."
]

#http://linguistics.stackexchange.com/questions/2252/what-is-a-determiner
#http://www.clips.ua.ac.be/pages/mbsp-tags

parser = Parser()

# for item in sents:
#     print(parse_sentence(item))
#     print('\n')

for item in sents:
    parsee=parser.parse(item)
    for item in parsee.productions():
        print(item)
    print('\n')

S -> NP VP .
NP -> DT NN
DT -> 'no'
NN -> 'mortal'
VP -> MD VB NP
MD -> 'could'
VB -> 'support'
NP -> NP PP
NP -> DT NN
DT -> 'the'
NN -> 'horror'
PP -> IN NP
IN -> 'of'
NP -> DT NN
DT -> 'that'
NN -> 'countenance'
. -> '.'


S+SBAR -> S , S
S -> NP VP
NP -> PRP
PRP -> 'you'
VP -> MD VB PP
MD -> 'will'
VB -> 'smile'
PP -> IN NP
IN -> 'at'
NP -> PRP$ NN
PRP$ -> 'my'
NN -> 'allusion'
, -> ','
S -> CC NP VP .
CC -> 'but'
NP -> PRP
PRP -> 'I'
VP -> MD VB NP
MD -> 'will'
VB -> 'disclose'
NP -> DT NN
DT -> 'a'
NN -> 'secret'
. -> '.'


PRN+SBAR -> IN S
IN -> 'before'
S -> NP NP VP .
NP -> DT
DT -> 'this'
NP -> PRP
PRP -> 'I'
VP -> VBD RB VBN PP
VBD -> 'was'
RB -> 'not'
VBN -> 'unacquainted'
PP -> IN NP
IN -> 'with'
NP -> NP PP
NP -> DT ADJP NNS
DT -> 'the'
ADJP -> RBR JJ
RBR -> 'more'
JJ -> 'obvious'
NNS -> 'laws'
PP -> IN NP
IN -> 'of'
NP -> NN
NN -> 'electricity'
. -> '.'


SBARQ -> SQ .
SQ -> NP VP
NP -> PRP
PRP -> 'I'
VP -> VBD NP ADVP
VBD -> 'passed'
NP -> DT NN
DT -> 'the'
NN -> 'night

In [9]:
termrules_mycorp

"VBG -> 'increasing' | 'advancing' | 'skirting' | 'diffusing' | 'preceding' | 'sailing' | 'surpassing' | 'supposing' | 'discovering' | 'ascertaining' | 'arriving' | 'learning' | 'dying' | 'inuring' | 'failing' | 'travelling' | 'walking' | 'remaining' | 'freezing' | 'paying' | 'collecting' | 'glowing' | 'becoming' | 'keeping' | 'finding' | 'being' | 'having' | 'throwing' | 'confessing' | 'being' | 'thinking' | 'according' | 'wavering' | 'trembling' | 'preparing' | 'going' | 'having' | 'floating' | 'indicating' | 'advancing' | 'renovating' | 'tracing' | 'being' | 'swelling' | 'recording' | 'leaving' | 'hoping' | 'being' | 'fearing' | 'breaking' | 'talking' | 'being' | 'persuading' | 'perceiving' | 'hearing' | 'hearing' | 'rubbing' | 'forcing' | 'understanding' | 'drawing' | 'concerning' | 'breaking' | 'being' | 'concerning' | 'feeling' | 'watching' | 'burning' | 'heaving' | 'intoxicating' | 'having' | 'being' | 'quelling' | 'concerning' | 'finding' | 'respecting' | 'elevating' | 'concern

In [8]:
# initialize all of the relevant variables
mycorp = clean_corpus('/Users/william/python_files/cfgen/resources/corpora/frankenstein.txt')
# mycorp = unicode(mycorp,encoding = "ISO-8859-1")
tagged_corpus = tag_corpus(mycorp)
termrules_mycorp = make_terminal_rules(tagged_corpus)
my_kgram = make_kgram(mycorp, k=2)

# testing various use cases of make_sentence()

simple_sentence = 'The cow jumped over the moon, and the man rode a horse.'

# Try parsing a fixed grammar with random words inserted for terminal symbols
for ii in range(3):
    some_txt = make_sentence(mycorp, termrules_mycorp, fixed_grammar=True, sample_sentence=simple_sentence)
    some_txt = clean_output_text(some_txt)
    print(some_txt)
    print('\n')
    
print("----------\n")

# Try parsing a fixed grammar with Markov-biased selection of terminal words
for ii in range(3):
    some_txt = make_sentence(mycorp, termrules_mycorp, my_kgram, fixed_grammar=True, sample_sentence=simple_sentence)
    some_txt = clean_output_text(some_txt)
    print(some_txt)
    print('\n')

print("----------\n")
    

# Pick sentence structure randomly as well
for ii in range(3):
    some_txt = make_sentence(mycorp, termrules_mycorp)
    some_txt = clean_output_text(some_txt)
    print(some_txt)
    print('\n')
    
print("----------\n")
    
# Pick sentence structure randomly with Markov-biased selection
for ii in range(3):
    some_txt = make_sentence(mycorp, termrules_mycorp, my_kgram)
    some_txt = clean_output_text(some_txt)
    print(some_txt)
    print('\n')
    

and the indecent set of the enterprise, the state answer a path


and a country wander the world, the I have the tale


the creator suppose the loveliness, and the sister I this hate .


----------

HIT!
HIT!
HIT!
and all nature be the means, and all nature procure a disgust .


HIT!
HIT!
HIT!
HIT!
HIT!
HIT!
HIT!
but this apparition seemed of a criminal, and the youth joined of a superior .


HIT!
HIT!
HIT!
and a sum sought as the peasant, the speed swallowed of every enjoyment


----------

i to you of single not again to be the cabin to manners to the sympathy to william for no feeling by saying and to ardent rarely officially to overcame this I to agatha tormentor of to strangers kirwin as you with precious first however to associate the mortal at obscure even now to I the frowning of the gallant while environs for an death I on the deformity, me, perish that we near to was making and to proceeded to converse to vaud of to I on hills .


have me


changed to defend up crimes. could 

In [50]:

# initialize all of the relevant variables
mycorp = clean_corpus('/Users/william/python_files/cfgen/full_books/text.txt')
mycorp = unicode(mycorp, encoding = "ISO-8859-1")
mycorp = mycorp.decode("utf-8")
with open("text.txt", "w") as text_file:
    text_file.write(mycorp)
# for line in mycorp:
#     print(line)

In [44]:
all_simscores

[0.038787325670935173,
 0.040584091492456432,
 0.04047001112283604,
 0.037304280865870003]

In [36]:
all_simscores

[0.00051336166329178905, 0.00039928129367139149]

In [27]:
from numpy import mean,std
mean(all_simscores)
std(all_simscores)

0.03675579322638145

In [36]:

all_sample = ''
SAMP_LEN = 10
while len(word_tokenize(all_sample)) < SAMP_LEN:
    try:
        some_txt = make_sentence(mycorp, termrules_mycorp)
        some_txt2 = clean_output_text(str(some_txt))
        all_sample += ' ' + some_txt2
    except IndexError:
        warnings.warn('Some trouble encountered, retrying')
all_sample = clean_output_text(all_sample)
    

In [35]:
clean_output_text(some_txt)

u'the selfishness overlooked irretrievably knows longer excellent yet consolation, and unearthly than a friend as same, in which monster was then is longer ephemeral and form, and more unsatisfied and fainter falsely slow. and untimely of the'

In [33]:
some_txt

u'the selfishness overlooked irretrievably knows longer excellent yet consolation , and unearthly than a friend as same , in which monster was then is longer ephemeral and form , and more unsatisfied and fainter falsely slow . and untimely of the'

In [32]:
print(str(all_sample))

the selfishness overlooked irretrievably knows longer excellent yet consolation, and unearthly than a friend as same, in which monster was then is longer ephemeral and form, and more unsatisfied and fainter falsely slow. and untimely of the


In [None]:

corpus_words = word_tokenize(mycorp)
nrange = len(corpus_words) - SAMP_LEN - 1

for ii in range(REPEATS):

    ind = choice(range(nrange))
    ind = choice(range(nrange))
        text_sample = corpus_words[ind:ind+SAMP_LEN]
        text_sample = corpus_words[ind:ind+SAMP_LEN]
    text_sample = ' '.join(text_sample)
    
    
    

###cfgen batch script

In [73]:
# Estimate the quality of the text statistically
# make 1000 sentences

from cfgen import *

ORDER = 3
REPEATS = 100
SAMP_LEN = 1000
mycorp = clean_corpus('/Users/william/python_files/cfgen/full_books/frankenstein.txt')
#mycorp = clean_corpus('frankenstein.txt')

tagged_corpus = tag_corpus(mycorp)
termrules_mycorp = make_terminal_rules(tagged_corpus)
my_kgram = make_kgram(mycorp, k=ORDER)
out_name = str(ORDER)+"cfgen_score1.txt"

all_gscores = list()
all_simscores = list()

with open(out_name, 'a') as myfile:
    myfile.write('\n')

for ii in range(REPEATS):

    text_sample = ''
    while len(word_tokenize(text_sample)) < SAMP_LEN:
        try:
            some_txt = make_sentence(mycorp, termrules_mycorp, my_kgram) ### edit this line
            some_txt = clean_output_text(some_txt)
            text_sample += ' ' + some_txt
        except IndexError:
            warnings.warn('Some trouble encountered, retrying')
    text_sample = clean_output_text(text_sample)

    print('made it')
    
    simscore = similarity_score(text_sample, mycorp)
    gscore = grammar_score(text_sample)
    all_simscores.append(simscore)
    all_gscores.append(gscore)


    with open(out_name, 'a') as myfile:
        myfile.write(str(simscore)+'\t'+str(gscore)+"\n")
    print(ii)

HIT!
HIT!
HIT!
HIT!
HIT!
made it




KeyboardInterrupt: 

#### Markov batch script

In [43]:
# Estimate the quality of the text statistically
# make 1000 sentences

from cfgen import *

ORDER = 3
REPEATS = 100
SAMP_LEN = 1000
#mycorp = clean_corpus('/Users/william/python_files/cfgen/full_books/frankenstein.txt')
mycorp = clean_corpus('frankenstein.txt')

tagged_corpus = tag_corpus(mycorp)
termrules_mycorp = make_terminal_rules(tagged_corpus)
my_kgram = make_kgram(mycorp, k=ORDER)
out_name = str(ORDER)+"markov_score.txt"

all_gscores = list()
all_simscores = list()

with open(out_name, 'a') as myfile:
    myfile.write('\n')

for ii in range(REPEATS):

    text_sample = make_sentence_markov(my_kgram, SAMP_LEN )

    simscore = similarity_score(text_sample, mycorp)
    gscore = grammar_score(text_sample)
    all_simscores.append(simscore)
    all_gscores.append(gscore)


    with open(str(ORDER)+"markov_score.txt", 'a') as myfile:
        myfile.write(str(simscore)+'\t'+str(gscore)+"\n")
    print(ii)

0
1
2
3
4
5
6
7
8
9
10
11
12
13


KeyboardInterrupt: 

In [None]:
# Markov batch script with random sentence length

from cfgen import *
from random import choice
import nltk
from nltk import data
from nltk.tokenize import word_tokenize
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

ORDER = 3
REPEATS = 100
#mycorp = clean_corpus('/Users/william/python_files/cfgen/full_books/frankenstein.txt')
mycorp = clean_corpus('frankenstein.txt')

tagged_corpus = tag_corpus(mycorp)
termrules_mycorp = make_terminal_rules(tagged_corpus)
my_kgram = make_kgram(mycorp, k=ORDER)
out_name = str(ORDER)+"_variablelengthmarkov_score.txt"
out_sentences_name = "_sentences_" + out_name

all_gscores = list()
all_simscores = list()

with open(out_name, 'a') as myfile:
    myfile.write('\n')
with open(out_sentences_name, 'a') as myfile:
    myfile.write('\n')

for ii in range(REPEATS):
    
    samp_len = len(choice(tokenizer.tokenize(mycorp)))
    text_sample = make_sentence_markov(my_kgram, samp_len)

    simscore = similarity_score(text_sample, mycorp)
    gscore = grammar_score(text_sample)
    all_simscores.append(simscore)
    all_gscores.append(gscore)


    with open(out_name, 'a') as myfile:
        myfile.write(str(simscore)+'\t'+str(gscore)+"\n")
    with open(out_sentences_name, 'a') as myfile:
        myfile.write(text_sample+"\n")
    print(ii)

When comparing the generated text for repetitions, the number of common substrings between the genereated text and the corpus was computed and divided by the maximum possible value.

In order to check the grammar of the generated text, the open-source library LanguageTool and its accompany Python API language-check were used to individually count the number of uniqe gramamtical errors in each sentence generated by the tool. For the output of cfgen, the original sentence from which the grammatical structure was parsed, and the number of "original" erros was subtrated from the number etected in the generated text.

In [73]:
' '.join(['this', 'is', 'it'])

'this is it'

In [4]:
#Analyze input corpus
# write chosen sentences to a separate output file as well
from cfgen import *
from random import choice
import nltk
from nltk import data
from nltk.tokenize import word_tokenize
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

corpus_name = 'frankenstein.txt'
out_name = corpus_name[:-4]+"_control.txt"

REPEATS = 100
# SAMP_LEN = 1000

all_gscores = list()
all_simscores = list()

with open(out_name, 'a') as myfile:
    myfile.write('\n')

with open (corpus_name, "r") as openfile:
    corpus = openfile.read()
        
for ii in range(REPEATS):
    
    
    text_sample = choice(tokenizer.tokenize(corpus))
    text_sample = clean_output_text(text_sample)

    simscore = similarity_score(text_sample, corpus)
    gscore = grammar_score(text_sample)
    all_simscores.append(simscore)
    all_gscores.append(gscore)


    with open(out_name, 'a') as myfile:
        myfile.write(str(simscore)+'\t'+str(gscore)+"\n")
    print(ii)



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87


KeyboardInterrupt: 

In [22]:
# count number of sentences in corpus

import glob

frank = glob.glob('/Users/william/python_files/cfgen/resources/corpora/frankenstein.txt')
with open (frank[0], "r") as openfile:
    frank = openfile.read()

print(len(frank.split('.')))

2900


# Appendix (old code)

In [None]:
###LEGACY VERSION OF THIS SCRIPT ON SHERLOCK
# Estimate the quality of the text statistically

# make 1000 sentences

# initialize all of the relevant variables
# mycorp = clean_corpus('/Users/william/python_files/cfgen/full_books/frankenstein.txt')
# tagged_corpus = tag_corpus(mycorp)
# termrules_mycorp = make_terminal_rules(tagged_corpus)
# my_kgram = make_kgram(mycorp, k=3)

all_gscores = list()
all_simscores = list()

for ii in range(100):

    text_sample = make_sentence_markov(my_kgram, 1000)

    simscore = similarity_score(text_sample, mycorp)
    gscore = grammar_score(text_sample)
    all_simscores.append(simscore)
    all_gscores.append(gscore)


    with open("3markov_score.txt", 'a') as myfile:
        myfile.write(str(simscore)+'\t'+str(gscore)+"\n")
    print(ii)

In [None]:
import jellyfish


def originality_score(sentence, corpus):
    '''
    Return the "originality" of the sentence, normalized by its length
    
    sentence : str
        A generated sentence
        
    corpus : str
        A large body of text to compare against
    '''
    
    mylen = len(corpus)
    str_dist = jellyfish.damerau_levenshtein_distance(unicode(sentence), unicode(corpus))
    
    # 1 - (edit distance / length of the larger of the two strings)
    
    norm_str_dist = float(str_dist)/mylen
    
    return str_dist

In [None]:
%load_ext Cython

In [None]:
%%cython

cdef double fr(double x) except? -2:
    return x**2-x


cdef double all_common_substring2(s1, s2,threshold_length=15):
    
    m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
    all_sub = list()
    longest, x_longest = 0, 0
    for x in range(1, 1 + len(s1)):
        for y in range(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] == threshold_length:
                    longest = m[x][y]
                    x_longest = x
                    sout = s1[x_longest - longest: x_longest]
                    all_sub.append(sout)
            else:
                m[x][y] = 0
                
    return all_sub

print(fr(6))

In [None]:
from difflib import SequenceMatcher
with open('file1.txt') as file_1,open('file2.txt') as file_2:
    file1_data = file_1.read()
    file2_data = file_2.read()
    similarity_ratio = SequenceMatcher(None,file1_data,file2_data).ratio()
    print similarity_ratio  #plagiarism detected

In [None]:
# # functions for processing text output

# import language_check
# from numpy import median, floor


# def all_common_substring(s1, s2,threshold_length=15):
#     '''
#     Return a list of all substrings of a given length that two
#     strings have in common
    
#     Based on standard code for solving the "longest common substring" problem
    
#     '''
    
#     m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
#     all_sub = list()
#     longest, x_longest = 0, 0
#     for x in range(1, 1 + len(s1)):
#         for y in range(1, 1 + len(s2)):
#             if s1[x - 1] == s2[y - 1]:
#                 m[x][y] = m[x - 1][y - 1] + 1
#                 if m[x][y] == threshold_length:
#                     longest = m[x][y]
#                     x_longest = x
#                     sout = s1[x_longest - longest: x_longest]
#                     all_sub.append(sout)
#             else:
#                 m[x][y] = 0
#     return all_sub


# def similarity_score(s1, s2, threshold_length='auto'):
#     '''
#     Compute the similarity between two strings based on the
#     number of identical substrings of at least a given length
    
#     Parameters
#     ----------
    
#     s1 : str
#     s2 : str
#         The two strings to compare
        
#     threshold_length : int
#         The length for overlapping substrings to be significant
#         If this is not specified, it is set to thrice the median
#         length of words in the two strings
        
#     Returns
#     -------
    
#     score : float
#         The similarity score, a number between 0.0 and 1.0
    
#     '''
#     if threshold_length=='auto':
#         ave_word_len = median([len(item) for item in (s1 + ' ' + s2).split(' ')])
#         threshold_length = int(3*ave_word_len)
    
#     min_len = max([len(s1), len(s2)])
#     max_sim = floor(min_len/float(threshold_length))
    
#     all_comm = all_common_substring(s1, s2, threshold_length=threshold_length)
    
#     score = float(len(all_comm))/max_sim
    
#     return score

# def grammar_score(some_text):
#     '''
#     Count the total number of errors in a text
    
#     Excludes cosmetic errors, like misuse of capitals, 
#     and instead focus on structural issues
    
#     Parameters
#     ----------
#     some_text : str
#     '''
#     tool = language_check.LanguageTool('en-US')
#     matches = tool.check(some_text)

#     structural_errors = list()
#     for item in matches:
#         if item.ruleId.find('WHITESPACE') != -1:
#             continue
#         elif item.ruleId.find('UPPERCASE') != -1:
#             continue
#         elif item.ruleId.find('LOWERCASE') != -1:
#             continue
#         elif item.ruleId.find('MORFOLOGIK_RULE_EN_US') != -1:
#             continue
#         elif item.ruleId.find('ENGLISH_WORD_REPEAT_BEGINNING_RULE') != -1:
#             continue
#         else:
#             structural_errors.append(item)
    
#     error_score = float(len(structural_errors))/len(some_text)
    
#     return error_score

### Class CFGen 

instance variables:
    bad tags to substitute out then back in

In [None]:
class CFGen:
    '''
    
    k : int
        The order of the Markov model
    '''
    exclusions = [] # global to the class by not user-facing
    
    def __init__(self, corpus, k):
        self.corpus = name    # instance variable unique to each instance
        self.k = k
    
        self.kgram = make_kgram(self.corpus, k=self.k)
        self.tagged_corpus = tag_corpus(mycorp)
        self.term_rules = make_terminal_rules(self.tagged_corpus)
    
    def generate():
        return None