In [1]:
from collections import defaultdict
import spacy

In [2]:
!ls prj-nlp-2020/tasks/02-structural-linguistics/data

blog2008.txt  examiner-headlines.txt  headlines-test-set.json  tyhrolovy.txt


In [3]:
with open('prj-nlp-2020/tasks/02-structural-linguistics/data/blog2008.txt') as f:
    blog_lines = [l.strip() for l in f.readlines()]
len(blog_lines)

303994

In [4]:
nlp = spacy.load("en_core_web_md")

In [5]:
verbs = [
    "say", "tell", "speak", "claim", "communicate", "announce", "declare",
    "inform", "notify", "chat", "shout", "voice", "whisper", "utter", "interact"
]

In [6]:
collocation_stats = {}
for verb in verbs:
    collocation_stats[verb] = defaultdict(int)

def get_token_children_ly(token):
    return [c for c in token.children if c.text.endswith('ly')]

docs = nlp.pipe(blog_lines)
for sentence in docs:
    for token in sentence:
        if token.pos_ == 'VERB' and token.lower_ in verbs:
            children = get_token_children_ly(token)
            if children:
                for child in children:
                    collocation_stats[token.lower_][child.lower_] += 1

In [7]:
def format_stats(stats):
    result = []
    for k, v in stats:
        result.append('({}, {})'.format(k, v))
    return ' '.join(result)

for verb, stats in collocation_stats.items():
    top_stats = [(k, v) for k, v in sorted(stats.items(), reverse=True, key=lambda item: item[1])][:10]
    print('{0}: {1}'.format(verb, format_stats(top_stats)))

say: (really, 17) (actually, 16) (only, 15) (simply, 11) (explicitly, 10) (publicly, 10) (honestly, 9) (probably, 7) (merely, 7) (recently, 6)
tell: (really, 5) (simply, 5) (probably, 3) (honestly, 3) (likely, 3) (only, 3) (exactly, 2) (possibly, 2) (usually, 2) (actually, 2)
speak: (directly, 12) (publicly, 9) (only, 5) (loudly, 4) (freely, 4) (briefly, 3) (plainly, 3) (probably, 2) (certainly, 2) (candidly, 2)
claim: (falsely, 5) (credibly, 3) (surely, 2) (brazenly, 1) (apparently, 1) (fiercely, 1) (annually, 1) (immediately, 1) (openly, 1) (remotely, 1)
communicate: (directly, 3) (effectively, 2) (freely, 1) (really, 1) (regularly, 1) (quickly, 1) (profoundly, 1) (finally, 1)
announce: (early, 5) (publicly, 3) (openly, 2) (shortly, 2) (quickly, 2) (essentially, 2) (reportedly, 2) (quietly, 1) (electronically, 1) (interestingly, 1)
declare: (publicly, 5) (probably, 1) (absolutely, 1) (clearly, 1) (explicitly, 1) (immediately, 1) (suddenly, 1) (summarily, 1) (duly, 1) (entirely, 1)
in