In [1]:
# import statements
import json
import os
import time

In [2]:
# Define the keywords that will be used to extract the imperative sentences
# Rules are identified from the paper - https://www.aclweb.org/anthology/W14-2117.pdf
# 1. "find those sentences with a verb (in its base form) as the root 
#     in the phrase structure and this particular verb has no 
#.    subject child in the dependency structure" 
#.    Example: You must first discuss the matter there, and you need to be specific”
# 2. "recognize the use of a personal pronoun or noun (e.g., “you”, “we”,
#.     or a username) followed by a modal verb (e.g., “should”, “must”, “need”) as an imperative"


# Source: https://www.myenglishpages.com/site_php_files/grammar-lesson-modals.php
modal_verbs = ['can', 'could', 'may', 'might', 'will', 'would', 'shall','should', 'must', 'ought', 'dare', 'had better', 'ask']
modal_verbs = ['can', 'may', 'might', 'will', 'would', 'shall','should', 'must', 'ought', 'dare', 'had better', 'ask']

# Source: https://grammar.yourdictionary.com/parts-of-speech/pronouns/list-of-personal-pronouns.html
# personal_pronoun = ['I', 'me', 'we', 'us', 'you', 'he', 'she', 'her', 'him', 'it', 'they', 'them']
personal_pronoun = ['us', 'you', 'he', 'she', 'her', 'him', 'it', 'they', 'them']

In [3]:
# Check for rule 2

# Create combinations of possibilities for a sentence to be imperative
combinations = []
# form combinations of words and put them as a list
for pronoun in personal_pronoun:
    for verb in modal_verbs:
        combinations.append(pronoun + ' ' + verb)
        
print('Number of combinations for rule 2 =', len(combinations))

Number of combinations for rule 2 = 108


In [4]:
# Using rule 2 to check imperative snetences
def check_imperative_r2(combinations, sentence):
    if any(combo in sentence.lower() for combo in combinations):
        return True
    return False

In [5]:
IP_DIR = '../data/input/'
OP_DIR = '../data/output/'

## Quora Dataset

In [6]:
# Read Quora Question Answer Dataset
def get_quora_sentences(ip_path):
    
    sentences = []

    with open(ip_path) as fp:
        data = json.load(fp)

        for qa_pair in data.items():
            q_id, answer = qa_pair
            
            for sent in answer['sentences']:
                # print(sent)
                split_sentences = list(sent.split('.'))
                for x in split_sentences:
                    if len(x) > 1:
                        sentences.append(x)
            '''
            # This part is one of the sentences from answer['sentences']
            split_sentences = list(sent.split('.'))
            for x in split_sentences:
                if len(x) > 1:
                    print('\t', x)
                    sentences.append(x)
            '''
                
    print('Number of sentences =', len(sentences))
    return sentences

In [7]:
sentences_q = []
dataset = 'Quora_QAD'

fname = 'Q2AD_v1_test.json'
ip_path = os.path.join(IP_DIR, dataset, fname)
sentences_q.extend(get_quora_sentences(ip_path))

fname = 'Q2AD_v1_train.json'
ip_path = os.path.join(IP_DIR, dataset, fname)
sentences_q.extend(get_quora_sentences(ip_path))

print('Total sentences available: ', len(sentences_q))

Number of sentences = 109
Number of sentences = 457
Total sentences available:  566


In [8]:
sentences_q

["It's a matter of economy of scale",
 'So in a buffet situation, you can buy larger quantities from vendors and save a huge amount',
 'The trend now is that beauty is all about the look',
 'The eye lashes, lip sticks, foundations and face cakes',
 "Because it's also the end of all joy and happiness and social interaction",
 "We don't fear death so much as we fear having to give up everything we know and love",
 'Table tennis is tremendously fun, and a much better source of exercise than people tend to give it credit for',
 'She doesn’t need to throw out useful things for no reason',
 'She received these things as gifts, they are still (presumably) functional and useful, why would she get rid of them?',
 'I get cultural and religious shocks once in a while',
 'Here on Quora, there are bold people who write explicitly and even if I appreciate the meat therein, I feel restrained to vote them up',
 'Mr',
 ' Mohandas Karam Chand Gandhi - Bapuji was a barrister by profession',
 'There are m

# Rule 2

In [9]:
# For each sentence, find if the combination exists
imper_q = []
for sentence in sentences_q:
    if check_imperative_r2(combinations, sentence):
        imper_q.append(sentence)
        
print(len(imper_q))

71


In [10]:
op_filename = os.path.join(OP_DIR, dataset, 'imperative_rule2.txt')

with open(op_filename, 'w') as op:
    for sent in imper_q:
        op.write(sent+'\w')

In [11]:
imper_q

['So in a buffet situation, you can buy larger quantities from vendors and save a huge amount',
 'If you go through the profiles and lives of JEE Advanced toppers of last few year, you will find half or them or more cracking IIT JEE without having any coaching',
 'IF the $130K minimum wage for H-1Bs is passed it would apply equally to all states',
 'It would be the same requirement for rural Mississippi as it would be for a high cost-of-living location like Palo Alto or Princeton',
 'if you’re somewhat strict about these things, you might conclude these companies were lawbreakers and did not grow ethically',
 "It would not be nearly as ubiquitous of an item if is wasn't for the sugar conspiracy",
 'The best thing you can do is read in English everyday - read outside of your interests in particular',
 'You would also experience a great deal of constipation due to a lack of fiber as well as a myriad of differing illness (potentially including but not limited to scurvy, rickets, and weigh

In [12]:
dataset = 'quora_QAD'
ip_path = os.path.join(IP_DIR, dataset)
op_path = os.path.join(OP_DIR, dataset)

op_filepath = os.path.join(op_path, 'imperatives.txt')
with open(op_filepath, 'w') as fp:
    for sent in imper_q:
        fp.write(str(sent) + '\n')

# Rule 1

In [13]:
#load libraries and environments
import spacy
from spacy import displacy

import en_core_web_lg
nlp = en_core_web_lg.load()

#rule: Has a verb in its lemma (base) form and is the root and does not have any subject child in it's dependency structure
def root_verb_in_lemma_nosubj(sent):
    doc = nlp(sent)
    cond1 = False
    cond2 = True
    for token in doc:
        if token.dep_=='ROOT' and token.pos_=='VERB' and token.text.lower() == token.lemma_:
            cond1 = True
            #print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
        if token.dep_ == 'nsubj':
            cond2 = False   
    return (cond1 and cond2)

In [14]:
# For each sentence, find if the combination exists
imper_q = []
for sentence in sentences_q:
    if root_verb_in_lemma_nosubj(sentence):
        imper_q.append(sentence)
        
print(len(imper_q))

14


In [15]:
imper_q

['To start with indulge in cardio',
 'Just practice programming thoroughly',
 'Stop looking outward and look inward, into your heart',
 'Sing your death song and die like a hero going home',
 "Take Bruce Lee's or the mixed martial artist's approach",
 'More specifically, study the martial arts that are used in MMA fighting',
 'Also, pay closer attention to calories on the nutritional facts',
 'Just go for Redmi Note 3 without any second thought ',
 'To prevent dictatorial excesses like the Cultural Revolution',
 'Go to work',
 'Any way, find a speech therapist and stick to them',
 'Read things all the time',
 'Avoid gossip and negativity AT ALL COSTS',
 'Start taking your interests and hobbies seriously']

In [16]:
op_filename = os.path.join(OP_DIR, dataset, 'imperative_rule1.txt')

with open(op_filename, 'w') as op:
    for sent in imper_q:
        op.write(sent+'\w')