In [1]:
# import statements
import json
import os
import time
import re

In [2]:
# Define the keywords that will be used to extract the imperative sentences
# Rules are identified from the paper - https://www.aclweb.org/anthology/W14-2117.pdf
# 1. "find those sentences with a verb (in its base form) as the root 
#     in the phrase structure and this particular verb has no 
#.    subject child in the dependency structure" 
#.    Example: You must first discuss the matter there, and you need to be specific”
# 2. "recognize the use of a personal pronoun or noun (e.g., “you”, “we”,
#.     or a username) followed by a modal verb (e.g., “should”, “must”, “need”) as an imperative"


# Source: https://www.myenglishpages.com/site_php_files/grammar-lesson-modals.php
modal_verbs = ['can', 'could', 'may', 'might', 'will', 'would', 'shall','should', 'must', 'ought', 'dare', 'had better', 'ask']
modal_verbs = ['can', 'may', 'might', 'will', 'would', 'shall','should', 'must', 'ought', 'dare', 'had better', 'ask']

# Source: https://grammar.yourdictionary.com/parts-of-speech/pronouns/list-of-personal-pronouns.html
# personal_pronoun = ['I', 'me', 'we', 'us', 'you', 'he', 'she', 'her', 'him', 'it', 'they', 'them']
personal_pronoun = ['us', 'you', 'he', 'she', 'her', 'him', 'it', 'they', 'them']

In [3]:
# Check for rule 2

# Create combinations of possibilities for a sentence to be imperative
combinations = []
# form combinations of words and put them as a list
for pronoun in personal_pronoun:
    for verb in modal_verbs:
        combinations.append(pronoun + ' ' + verb)
        
print('Number of combinations for rule 2 =', len(combinations))

Number of combinations for rule 2 = 108


In [4]:
# Using rule 2 to check imperative snetences
def check_imperative_r2(combinations, sentence):
    if any(combo in sentence.lower() for combo in combinations):
        return True
    return False

In [9]:
IP_DIR = '../data/input/'
OP_DIR = '../data/output/'

dataset = 'guttenberg'

## Using gutenberg dataset

In [6]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

[nltk_data] Downloading package gutenberg to /Users/talat/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [7]:
gutenberg_files = gutenberg.fileids()

In [8]:
print("Names of the sample books from Gutenberg are:\n")

i = 1
for g_file in gutenberg_files:
    print(str(i)+'. ' + g_file)
    i += 1

Names of the sample books from Gutenberg are:

1. austen-emma.txt
2. austen-persuasion.txt
3. austen-sense.txt
4. bible-kjv.txt
5. blake-poems.txt
6. bryant-stories.txt
7. burgess-busterbrown.txt
8. carroll-alice.txt
9. chesterton-ball.txt
10. chesterton-brown.txt
11. chesterton-thursday.txt
12. edgeworth-parents.txt
13. melville-moby_dick.txt
14. milton-paradise.txt
15. shakespeare-caesar.txt
16. shakespeare-hamlet.txt
17. shakespeare-macbeth.txt
18. whitman-leaves.txt


# Rule 2

In [10]:
#filter sentences from gutenberg ebooks

sentences_g = []
imper_g = []

for f in gutenberg_files:
    sentences = gutenberg.sents(f)
#    print(f, 'has', len(sentences), 'sentences in total')
    for s in sentences:
        sent = " ".join(s)
        sentences_g.append(sent)
        if check_imperative_r2(combinations, sent):
            imper_g.append(sent)
            
    print(f, ' = (Total', len(sentences), '/', len(imper_g), 'imperative)')

austen-emma.txt  = (Total 7752 / 1261 imperative)
austen-persuasion.txt  = (Total 3747 / 1763 imperative)
austen-sense.txt  = (Total 4999 / 2481 imperative)
bible-kjv.txt  = (Total 30103 / 6259 imperative)
blake-poems.txt  = (Total 438 / 6265 imperative)
bryant-stories.txt  = (Total 2863 / 6471 imperative)
burgess-busterbrown.txt  = (Total 1054 / 6534 imperative)
carroll-alice.txt  = (Total 1703 / 6663 imperative)
chesterton-ball.txt  = (Total 4779 / 6969 imperative)
chesterton-brown.txt  = (Total 3806 / 7202 imperative)
chesterton-thursday.txt  = (Total 3742 / 7433 imperative)
edgeworth-parents.txt  = (Total 10230 / 8308 imperative)
melville-moby_dick.txt  = (Total 10059 / 8940 imperative)
milton-paradise.txt  = (Total 1851 / 9036 imperative)
shakespeare-caesar.txt  = (Total 2163 / 9144 imperative)
shakespeare-hamlet.txt  = (Total 3106 / 9265 imperative)
shakespeare-macbeth.txt  = (Total 1907 / 9323 imperative)
whitman-leaves.txt  = (Total 4250 / 9539 imperative)


In [11]:
imper_g[0:100]

['Mr . Weston is such a good - humoured , pleasant , excellent man , that he thoroughly deserves a good wife ;-- and you would not have had Miss Taylor live with us for ever , and bear all my odd humours , when she might have a house of her own ?"',
 "And as for James , you may be very sure he will always like going to Randalls , because of his daughter ' s being housemaid there .",
 'I only doubt whether he will ever take us anywhere else .',
 'It was very lucky , for I would not have had poor James think himself slighted upon any account ; and I am sure she will make a very good servant : she is a civil , pretty - spoken girl ; I have a great opinion of her .',
 'I am sure she will be an excellent servant ; and it will be a great comfort to poor Miss Taylor to have somebody about her that she is used to see .',
 'Whenever James goes over to see his daughter , you know , she will be hearing of us .',
 'He will be able to tell her how we all are ."',
 'I am afraid you must have had a s

In [15]:
op_filename = os.path.join(OP_DIR, dataset, 'imperative_rule2.txt')

with open(op_filename, 'w') as op:
    for sent in imper_g:
        op.write(sent+'\w')

# Rule 1

In [16]:
#load libraries and environments
import spacy
from spacy import displacy

import en_core_web_lg
nlp = en_core_web_lg.load()

#rule: Has a verb in its lemma (base) form and is the root and does not have any subject child in it's dependency structure
def root_verb_in_lemma_nosubj(sent):
    doc = nlp(sent)
    cond1 = False
    cond2 = True
    for token in doc:
        if token.dep_=='ROOT' and token.pos_=='VERB' and token.text.lower() == token.lemma_:
            cond1 = True
            #print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
        if token.dep_ == 'nsubj':
            cond2 = False   
    return (cond1 and cond2)

In [17]:
#filter sentences from gutenberg ebooks

sentences_g = []
imper_g = []

for f in gutenberg_files:
    sentences = gutenberg.sents(f)
#    print(f, 'has', len(sentences), 'sentences in total')
    for s in sentences:
        sent = " ".join(s)
        sentences_g.append(sent)
        if root_verb_in_lemma_nosubj(sent):
            imper_g.append(sent)
            
    print(f, ' = (Total', len(sentences), '/', len(imper_g), 'imperative)')

austen-emma.txt  = (Total 7752 / 101 imperative)
austen-persuasion.txt  = (Total 3747 / 137 imperative)
austen-sense.txt  = (Total 4999 / 160 imperative)
bible-kjv.txt  = (Total 30103 / 886 imperative)
blake-poems.txt  = (Total 438 / 909 imperative)
bryant-stories.txt  = (Total 2863 / 982 imperative)
burgess-busterbrown.txt  = (Total 1054 / 999 imperative)
carroll-alice.txt  = (Total 1703 / 1046 imperative)
chesterton-ball.txt  = (Total 4779 / 1119 imperative)
chesterton-brown.txt  = (Total 3806 / 1150 imperative)
chesterton-thursday.txt  = (Total 3742 / 1207 imperative)
edgeworth-parents.txt  = (Total 10230 / 1494 imperative)
melville-moby_dick.txt  = (Total 10059 / 1994 imperative)
milton-paradise.txt  = (Total 1851 / 2035 imperative)
shakespeare-caesar.txt  = (Total 2163 / 2200 imperative)
shakespeare-hamlet.txt  = (Total 3106 / 2387 imperative)
shakespeare-macbeth.txt  = (Total 1907 / 2521 imperative)
whitman-leaves.txt  = (Total 4250 / 2772 imperative)


In [18]:
imper_g

['Look at my shoes .',
 'Pray do not make any more matches ."',
 'Invite him to dinner , Emma , and help him to the best of the fish and the chicken , but leave him to chuse his own wife .',
 '" Only think of our happening to meet him !-- How very odd !',
 'Compare Mr . Martin with either of _them_ .',
 'Compare their manner of carrying themselves ; of walking ; of speaking ; of being silent .',
 '" Thank you .',
 'No , Mr . Knightley , do not foretell vexation from that quarter ."',
 'say beautiful rather .',
 "Keep your raptures for Harriet ' s face .",
 'Pray , pray attempt it .',
 'Look at the tree ."',
 'Dear Miss Woodhouse , do advise me ."',
 '" Thank you , thank you , my own sweet little friend .',
 '" My dear sir , do not make a stranger of me ."',
 '" Thank you .',
 '" Not Harriet \' s equal !"',
 'Take your own ."',
 "Approve my charade and my intentions in the same glance .'",
 'Do try to find it out , Miss Woodhouse .',
 'Do help me .',
 'Behold him there , the monarch of 

In [19]:
op_filename = os.path.join(OP_DIR, dataset, 'imperative_rule1.txt')

with open(op_filename, 'w') as op:
    for sent in imper_g:
        op.write(sent+'\w')