In [6]:
# import statements
import json
import os
import re
import time

In [7]:
# Define the keywords that will be used to extract the imperative sentences
# Rules are identified from the paper - https://www.aclweb.org/anthology/W14-2117.pdf
# 1. "find those sentences with a verb (in its base form) as the root 
#     in the phrase structure and this particular verb has no 
#.    subject child in the dependency structure" 
#.    Example: You must first discuss the matter there, and you need to be specific”
# 2. "recognize the use of a personal pronoun or noun (e.g., “you”, “we”,
#.     or a username) followed by a modal verb (e.g., “should”, “must”, “need”) as an imperative"


# Source: https://www.myenglishpages.com/site_php_files/grammar-lesson-modals.php
modal_verbs = ['can', 'could', 'may', 'might', 'will', 'would', 'shall','should', 'must', 'ought', 'dare', 'had better', 'ask']
modal_verbs = ['can', 'may', 'might', 'will', 'would', 'shall','should', 'must', 'ought', 'dare', 'had better', 'ask']

# Source: https://grammar.yourdictionary.com/parts-of-speech/pronouns/list-of-personal-pronouns.html
# personal_pronoun = ['I', 'me', 'we', 'us', 'you', 'he', 'she', 'her', 'him', 'it', 'they', 'them']
personal_pronoun = ['us', 'you', 'he', 'she', 'her', 'him', 'it', 'they', 'them']

In [8]:
# Check for rule 2

# Create combinations of possibilities for a sentence to be imperative
combinations = []
# form combinations of words and put them as a list
for pronoun in personal_pronoun:
    for verb in modal_verbs:
        combinations.append(pronoun + ' ' + verb)
        
print('Number of combinations for rule 2 =', len(combinations))

Number of combinations for rule 2 = 108


In [9]:
# Using rule 2 to check imperative snetences
def check_imperative_r2(combinations, sentence):
    if any(combo in sentence.lower() for combo in combinations):
        return True
    return False

In [10]:
IP_DIR = '../data/input/'
OP_DIR = '../data/output/'

## Tweets

In [11]:
def preprocess_tweet(text):
    # text = text.replace('@', '')
    text = text.replace('\n', ' ')
    # text = text.replace('#', '')
    text = text.replace('RT ', '')
    text = text.translate ({ord(c): ' ' for c in "!@#$%^&*()[]{};:,/<>?\|`~-=_+"})
    
    removeSpecialChars = text.translate ({ord(c): " " for c in "!@#$%^&*()[]{};:,/<>?\|`~-=_+"})
    
    text = re.sub(r'http\S+', '', text) #Remove URLs
    text = re.sub('([^\x00-\x7F])+', '', text) # Remove non-english characters
    text = re.sub(' +', ' ', text) # Remove multiple spaces
    
    text = text.strip()
    
    return text

In [12]:
dataset = 'tweets/'
ip_path = os.path.join(IP_DIR, dataset)
op_path = os.path.join(OP_DIR, dataset)

text_key = 'text'
op_ext = '.txt'
op_filebase = 'processed_tweets_'

sentence = []
count = 0
file_count = 0

tic = time.time()

tweets_files = os.listdir(ip_path)
# print('Files to be processed =', len(tweets_files))

for filename in tweets_files:
    if filename.endswith('.tweet'): 
            
        ip_filename = os.path.join(ip_path, filename)

        with open(ip_filename, "r") as fp:
            for line in fp:
                tweet = json.loads(line.strip())
                
                if text_key in tweet.keys():
                    text = tweet['text']
                    text = preprocess_tweet(text)

                    if ((count%10000) == 0):
                        op_filename = op_filebase+str(count+1)+op_ext
                        op_filepath = os.path.join(op_path, op_filename)
                        #print('Writing to ', op_filepath, end='\r')
                        with open(op_filepath, 'w', encoding='utf-8') as fp:
                            fp.write(text+'\n')
                    else:
                        with open(op_filepath, 'a', encoding='utf-8') as fp:
                            fp.write(text+'\n')
                
                    # sentence.append(text)
                    
                    count += 1
    file_count += 1
    print('Files ('+str(file_count)+'/'+str(len(tweets_files))+') - Tweets = ' + str(count) \
            + ' - File = ' + op_filename, end='\r')
    
print('\nTotal time:', (time.time()-tic), 'seconds')

Files (40821/40821) - Tweets = 4068989 - File = processed_tweets_4060001.txt
Total time: 981.3293538093567 seconds


# Rule 2

In [13]:
dataset = 'tweets/'
ip_path = os.path.join(OP_DIR, dataset)
op_filename = os.path.join(OP_DIR, 'imperative_tweets_rule2.txt')

tic = time.time()

processed_tweets_files = os.listdir(ip_path)
print('Files to be processed =', len(processed_tweets_files))

imper_t = []
count = 0
file_count = 0

with open(op_filename, 'w', encoding='utf-8') as op:
    for filename in processed_tweets_files:
        if filename.endswith('.txt'): 
            ip_filename = os.path.join(ip_path, filename)
            
            with open(ip_filename, 'r', encoding='utf-8') as ip:

                for sentence in ip:
                    if check_imperative_r2(combinations, sentence):
                        imper_t.append(sentence)
                        op.write(sentence+'\n')

                        count += 1

            file_count += 1
        print('Files processed ('+str(file_count)+'/'+str(len(processed_tweets_files))+ \
                      ') - Total imperatives = ' + str(len(imper_t)), end='\r')

print('\nTotal time:', (time.time()-tic), 'seconds')

Files to be processed = 408
Files processed (407/408) - Total imperatives = 145726
Total time: 102.13355302810669 seconds


# Rule 1

In [14]:
#load libraries and environments
import spacy
from spacy import displacy

import en_core_web_lg
nlp = en_core_web_lg.load()

#rule: Has a verb in its lemma (base) form and is the root and does not have any subject child in it's dependency structure
def root_verb_in_lemma_nosubj(sent):
    doc = nlp(sent)
    cond1 = False
    cond2 = True
    for token in doc:
        if token.dep_=='ROOT' and token.pos_=='VERB' and token.text.lower() == token.lemma_:
            cond1 = True
            #print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
        if token.dep_ == 'nsubj':
            cond2 = False   
    return (cond1 and cond2)

In [15]:
dataset = 'tweets/'
ip_path = os.path.join(OP_DIR, dataset)
op_filename = os.path.join(OP_DIR, 'imperative_tweets_rule1.txt')

tic = time.time()

processed_tweets_files = os.listdir(ip_path)
print('Files to be processed =', len(processed_tweets_files))

imper_t = []
count = 0
file_count = 0

with open(op_filename, 'w', encoding='utf-8') as op:
    for filename in processed_tweets_files:
        if filename.endswith('.txt'): 
            ip_filename = os.path.join(ip_path, filename)
            
            with open(ip_filename, 'r', encoding='utf-8') as ip:

                for sentence in ip:
                    if root_verb_in_lemma_nosubj(sentence):
                        imper_t.append(sentence)
                        op.write(sentence+'\n')

                        count += 1

            file_count += 1
        print('Files processed ('+str(file_count)+'/'+str(len(processed_tweets_files))+ \
                      ') - Total imperatives = ' + str(len(imper_t)), end='\r')

print('\nTotal time:', (time.time()-tic), 'seconds')

Files to be processed = 408
Files processed (318/408) - Total imperatives = 49249

OSError: [Errno 28] No space left on device