In [1]:
import os
import spacy 
nlp = spacy.load("en_core_web_md",  disable=['textcat', "ner"]) 
from nltk.corpus import wordnet as wn
from nltk import FreqDist
from datetime import datetime, timedelta
import time
from collections import defaultdict

In [2]:
def get_syns(verb):
    """
    Collect the WordNet lemma names of every verb sense of a word.

    Multi-word lemmas come back from WordNet joined with underscores
    (e.g. 'pass_on'); the underscores are turned into spaces here.
    Lemmas shared by several synsets are repeated in the result.

    :param verb: string, the word to look up as a verb ('v')
    :return: list of synonyms
    """
    return [
        name.replace("_", " ")
        for sense in wn.synsets(verb, 'v')
        for name in sense.lemma_names()
    ]

# Seed verbs of speech whose WordNet synonyms we want to look for.
verbs = ["say", "tell", "communicate", "speak", "claim"]

# Pool the synonyms of every seed verb (repeats included) ...
syns = []
for verb in verbs:
    syns += get_syns(verb)

# ... then drop the repeats; the list order is whatever the set yields.
synonyms = list(set(syns))

print(
    "Total number of synonyms: {}\nSynonyms:\n{} ".format(len(synonyms), synonyms)
)

Total number of synonyms: 48
Synonyms:
['distinguish', 'recount', 'narrate', 'assure', 'commune', 'speak', 'pass', 'pronounce', 'state', 'enjoin', 'verbalize', 'evidence', 'verbalise', 'pass on', 'lay claim', 'recite', 'aver', 'take', 'allege', 'sound out', 'articulate', 'read', 'severalize', 'pass along', 'talk', 'secernate', 'convey', 'differentiate', 'claim', 'enunciate', 'intercommunicate', 'suppose', 'transmit', 'utter', 'arrogate', 'address', 'mouth', 'communicate', 'separate', 'severalise', 'tell apart', 'order', 'enounce', 'exact', 'tell', 'put across', 'say', 'secern'] 


In [3]:
def is_adverb(token):
    """
    Tell whether a token is an adverb ending in '-ly'.

    The suffix test ignores case, so an all-caps "DIRECTLY" is accepted
    as well as "directly".

    :param token: spaCy token (only its `tag_` and `text` attributes are read)
    :return: the token itself when its fine-grained tag is "RB" and its text
             ends with "ly", otherwise None
    """
    if token.tag_ == "RB" and token.text.lower().endswith("ly"):
        return token
    return None
    
# Quick check of is_adverb on one sentence: '-ly' adverbs are flagged,
# every other token is shown with its coarse part-of-speech tag.
headline = "Not overtly and not directly, but she will speak in code saying that Obama can't win."
doc = nlp(headline)
for token in doc:
    if is_adverb(token) is None:
        print("{} - {}".format(token, token.pos_))
    else:
        print("LY Adverb:", token)

Not - ADV
LY Adverb: overtly
and - CCONJ
not - ADV
LY Adverb: directly
, - PUNCT
but - CCONJ
she - PRON
will - VERB
speak - VERB
in - ADP
code - NOUN
saying - VERB
that - ADP
Obama - PROPN
ca - AUX
n't - ADV
win - VERB
. - PUNCT


In [4]:
def find_adverbs(headlines):
    """
    Find verbs of speech and the '-ly' adverbs that modify them.

    Every headline is parsed with the module-level spaCy pipeline `nlp`.
    For each token tagged VERB whose lemma is in the module-level `synonyms`
    list, every child that is an '-ly' adverb (see `is_adverb`) attached as
    'advmod' is recorded, together with any '-ly' adverb coordinated with it
    (a 'conj' child of that adverb).

    :param headlines: iterable of strings, one headline per item
    :return: tuple (advs, lines) - advs is a defaultdict(list) mapping a verb
             lemma to the lower-cased adverbs found with it (repeats are kept
             so they can be counted later); lines is the number of headlines
             processed
    """
    advs = defaultdict(list)
    # The membership test runs once per token, so look lemmas up in a set.
    targets = set(synonyms)
    lines = 0
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # paying the per-call overhead of nlp(line) for every headline.
    for doc in nlp.pipe(headlines):
        lines += 1
        for token in doc:
            # only verbs from the synonym list are of interest
            if token.pos_ != "VERB" or token.lemma_ not in targets:
                continue
            for child in token.children:
                # keep '-ly' adverbs that directly modify the verb
                if child.dep_ != 'advmod' or is_adverb(child) is None:
                    continue
                advs[token.lemma_].append(child.text.lower())
                for grandch in child.children:
                    # a coordinated adverb must pass the same '-ly' test,
                    # otherwise words like "well" or "not" would be counted
                    if grandch.dep_ == 'conj' and is_adverb(grandch) is not None:
                        advs[token.lemma_].append(grandch.text.lower())
    return advs, lines

In [None]:
# Start the wall-clock timer for this cell.
t1 = time.time()

# '__file__' is a quoted string here, so abspath() resolves it against the
# current working directory; dropping the last component leaves the
# components of that directory.
script_path = os.path.abspath('__file__')
path_list = script_path.split(os.sep)
script_directory = path_list[:-1]

# The data folder is looked up under the first four path components.
# NOTE(review): the depth of 4 looks specific to one machine's layout - confirm.
rel_path = "tasks/02-structural-linguistics/data/"
PATH = "{}/{}".format("/".join(script_directory[:4]), rel_path)

def get_collocations():
    """
    Extract verb / '-ly' adverb collocations from the blog headlines and
    write a report to 'collocations.txt' in the current working directory.

    Reads PATH + 'blog2008.txt' (one headline per line), runs find_adverbs
    over its lines, keeps the 10 most frequent adverbs for every verb found,
    then writes the report file and prints a short recap.

    :return: None
    """
    with open(PATH + 'blog2008.txt') as f:
        data = f.readlines()

    verbs = ["say", "tell", "communicate", "speak", "claim"]
    syns = []
    for verb in verbs:
        syns.extend(get_syns(verb))
    # Only used for the report below; find_adverbs is not passed this list.
    synonyms = list(set(syns))

    print("Finding adverbs ...")
    adverbs, headlines = find_adverbs(data)
    collocations = {
        verb: FreqDist(found).most_common(10) for verb, found in adverbs.items()
    }

    # Open the report only once the results exist, and let the context
    # manager close it: a failure while parsing then neither leaks the handle
    # nor leaves a truncated report behind.
    with open("collocations.txt", "w+") as report:
        report.write("Total headlines processed: {}\n".format(headlines))
        # The trailing newline keeps the next line from being glued onto the
        # synonym list.
        report.write("{} synonyms to look for:\n{} \n".format(len(synonyms), synonyms))
        report.write("Total number of unique synonyms: {}\n".format(len(adverbs)))
        report.write("Most common synonyms and their collocations:\n")
        for v, adv in collocations.items():
            report.write('{}: {}\n'.format(v, adv))

    print("Total headlines processed: {}".format(headlines))
    print("{} synonyms to look for:\n{} ".format(len(synonyms), synonyms))
    print("\nTotal number of synonyms in the headlines: {}".format(len(adverbs)))
    print("\nMost common synonyms and their collocations:")
    print("See results in 'collocations.txt'")
# Run the extraction, then report the time elapsed since t1 was taken.
get_collocations()
t2 = time.time()
end_time = t2 - t1
print("Time: ", timedelta(seconds=end_time))


# Finding adverbs ...
# Total headlines processed: 303994
# 48 synonyms to look for:
# ['read', 'enunciate', 'secern', 'claim', 'communicate', 'recite', 'state', 'recount', 'talk', 'sound out', 'tell', 'pass along', 'allege', 'pronounce', 'differentiate', 'arrogate', 'commune', 'suppose', 'severalize', 'secernate', 'utter', 'enjoin', 'articulate', 'exact', 'mouth', 'convey', 'intercommunicate', 'speak', 'narrate', 'severalise', 'verbalise', 'address', 'separate', 'enounce', 'verbalize', 'put across', 'evidence', 'aver', 'pass', 'lay claim', 'say', 'distinguish', 'tell apart', 'order', 'pass on', 'transmit', 'take', 'assure'] 
# Total number of synonyms in the headlines: 30
# Most common synonyms and their collocations:
# See results in 'collocations.txt'