In [1]:
from collections import defaultdict
import spacy

In [2]:
!ls prj-nlp-2020/tasks/02-structural-linguistics/data

blog2008.txt  examiner-headlines.txt  headlines-test-set.json  tyhrolovy.txt


In [3]:
with open('prj-nlp-2020/tasks/02-structural-linguistics/data/blog2008.txt') as f:
    blog_lines = [l.strip() for l in f.readlines() if 'ly' in l]
len(blog_lines)

78188

In [4]:
nlp = spacy.load("en_core_web_md")

In [5]:
verbs = [
    "say", "tell", "speak", "claim", "communicate", "announce", "declare",
    "inform", "notify", "chat", "shout", "voice", "whisper", "utter", "interact"
]

In [6]:
collocation_stats = {}
for verb in verbs:
    collocation_stats[verb] = defaultdict(int)

def get_token_children_ly(token):
    return [c for c in token.children if c.text.endswith('ly')]

docs = nlp.pipe(blog_lines)
for sentence in docs:
    for token in sentence:
        if token.pos_ == 'VERB' and token.lemma_ in verbs:
            children = get_token_children_ly(token)
            if children:
                for child in children:
                    collocation_stats[token.lemma_][child.lower_] += 1

In [7]:
def format_stats(stats):
    result = []
    for k, v in stats:
        result.append('({}, {})'.format(k, v))
    return ' '.join(result)

for verb, stats in collocation_stats.items():
    top_stats = [(k, v) for k, v in sorted(stats.items(), reverse=True, key=lambda item: item[1])][:10]
    print('{0}: {1}'.format(verb, format_stats(top_stats)))

say: (actually, 76) (recently, 73) (repeatedly, 55) (simply, 46) (explicitly, 39) (publicly, 36) (basically, 34) (really, 31) (only, 25) (previously, 22)
tell: (recently, 25) (reportedly, 14) (privately, 12) (only, 10) (basically, 9) (finally, 9) (repeatedly, 8) (actually, 8) (really, 8) (specifically, 8)
speak: (directly, 33) (publicly, 15) (only, 12) (fiercely, 12) (briefly, 9) (recently, 8) (generally, 8) (openly, 8) (politically, 7) (loudly, 7)
claim: (falsely, 80) (o'reilly, 13) (previously, 9) (repeatedly, 8) (recently, 4) (initially, 3) (actually, 3) (absurdly, 3) (credibly, 3) (publicly, 3)
communicate: (effectively, 3) (directly, 3) (freely, 1) (really, 1) (verbally, 1) (apparently, 1) (loudly, 1) (hopefully, 1) (daily, 1) (actually, 1)
announce: (recently, 15) (officially, 11) (publicly, 8) (early, 5) (proudly, 4) (shortly, 4) (openly, 3) (previously, 3) (quickly, 3) (newly, 3)
declare: (publicly, 13) (falsely, 4) (suddenly, 4) (o'reilly, 4) (recently, 4) (officially, 3) (ope