-
Notifications
You must be signed in to change notification settings - Fork 1
/
collocation.py
51 lines (43 loc) · 1.87 KB
/
collocation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import sys
from collections import Counter
from nltk.tokenize import TweetTokenizer, word_tokenize
import re
def collocation(document, word, l, r, tokenizer="standard"):
    '''Collect collocation statistics for a search term in a document.

    Parameters
    ----------
    document : str
        Text to analyse; lower-cased internally.
    word : str
        Search term, interpreted as a regular expression (lower-cased
        before compiling). NOTE: matching uses ``re.match``, so the
        pattern matches token *prefixes* ("cat" also hits "catalog")
        unless the caller anchors it with ``$``.
    l, r : int
        Window sizes: how many tokens to the left / right of each match
        to count (clipped at the document boundaries).
    tokenizer : str
        "standard" (nltk ``word_tokenize``; tokens without any word
        character are dropped) or "tweet" (nltk ``TweetTokenizer``).

    Returns
    -------
    tuple of collections.Counter
        ``(full_counter, left_counter, right_counter)`` — token counts
        over the whole document (needed for log-likelihood etc.), and
        counts of the tokens appearing within ``l`` tokens left /
        ``r`` tokens right of each match.

    Raises
    ------
    ValueError
        If ``tokenizer`` is neither "standard" nor "tweet".
        (Previously an unknown value crashed with a ``NameError`` on
        the undefined ``word_list``.)
    '''
    # counter dicts of words left/right of the search term
    left_counter = Counter()
    right_counter = Counter()
    # compile the (lower-cased) search pattern once, up front
    pattern = re.compile(word.lower())
    # setting document to lower case
    document = document.lower()
    # select tokenizer ("standard" is default; otherwise "tweet")
    if tokenizer == "standard":
        word_list = word_tokenize(document)
        # drop tokens with no word character (pure punctuation)
        # TODO: this might lead to wrong results; better to use a stop
        # word list later on
        word_pat = re.compile(r"\w+")  # hoisted: compile once, not per token
        word_list = [tok for tok in word_list if word_pat.match(tok)]
    elif tokenizer == "tweet":
        word_list = TweetTokenizer().tokenize(document)
    else:
        raise ValueError(
            "tokenizer must be 'standard' or 'tweet', got %r" % (tokenizer,))
    # number of tokens in document
    length = len(word_list)
    # word count of the full document (for log-likelihood etc.)
    full_counter = Counter(word_list)
    # hoist window-size conversions out of the token loop
    l, r = int(l), int(r)
    # collect words left/right of each token matching the search term
    for i, token in enumerate(word_list):
        if pattern.match(token):
            # up to l tokens to the left, clipped at the document start
            for x in range(1, l + 1):
                if i - x >= 0:
                    left_counter[word_list[i - x]] += 1
            # up to r tokens to the right, clipped at the document end
            for x in range(1, r + 1):
                if i + x < length:
                    right_counter[word_list[i + x]] += 1
    return full_counter, left_counter, right_counter