In [10]:
from nltk.tokenize import wordpunct_tokenize 
from nltk.corpus import stopwords 
from collections import defaultdict, Counter

In [2]:
k0            = 1
k1            = 1
U0            = 10
max_distance  = 5

In [5]:
import nltk
nltk.download('stopwords')
eng_stopwords = set(stopwords.words('english'))
eng_symbols   = '{}"\'()[].,:;+!?-*/&|<>=~$'

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/VictorLin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
def ngram_is_valid(ngram):
    first, last = ngram[0], ngram[-1]
    if first in eng_stopwords or last in eng_stopwords: return False
    if any( num in first or num in last for num in '0123456789'): return False
    if any( eng_symbol in word for word in ngram for eng_symbol in eng_symbols): return False
    return True

In [26]:
# 求句子的n-gram    
def to_ngrams( unigrams, ngram):
    return zip(*[unigrams[i:] for i in range(ngram)])  


- max_distance 是 5 表示看 5 gram

In [27]:
# 可以發現只會配對到最短的長度
for i in zip([1,2,3],[2,3]):
    print(i)

(1, 2)
(2, 3)


In [28]:
ngram_counts = defaultdict(Counter)

text_file = open('citeseerx_descriptions_sents.txt.50000', 'r')

for index,line in enumerate(text_file): 
    words = wordpunct_tokenize(line)
    for n in range(2, max_distance + 1):
        item = filter(ngram_is_valid, to_ngrams(words, n))
        ngram_counts[n].update(item)

In [30]:
skip_bigram_info = defaultdict(lambda: defaultdict(Counter))
for n in range(2, max_distance + 1):
    for ngram, count in ngram_counts[n].items():
        skip_bigram_info[ngram[0]][ngram[-1]] += Counter({n-1: count})
        skip_bigram_info[ngram[-1]][ngram[0]] += Counter({1-n: count}) # 求负向距离，单词对调，距离求相反数即可

In [55]:
c = Counter({'a': 4, 'b': 2}) 

In [57]:
c += Counter({'a': 4, 'c': 2}) 

In [58]:
c

Counter({'a': 8, 'b': 2, 'c': 2})

In [60]:
import numpy as np
skip_bigram_abc = defaultdict(lambda: 0)
for word, vals in skip_bigram_info.items():
    count = []
    for coll, val in vals.items():
        c = val.values()
        c_bar = sum(c) / (2*max_distance)
        skip_bigram_abc[(word, coll, 'freq')] = sum(c)
        skip_bigram_abc[(word, coll, 'spread')] = (sum([x**2 for x in c]) - 2*c_bar*sum(c) + 2*max_distance*c_bar**2) / (2 * max_distance)
        count.append(sum(c))
    skip_bigram_abc[(word, 'avg_freq')] = np.mean(count)
    skip_bigram_abc[(word, 'dev')] = np.std(count)

In [62]:
import math

def skip_bigram_filter(skip_bigram_info, skip_bigram_abc):
    cc = []
    for word, vals in skip_bigram_info.items():
        f = skip_bigram_abc[(word, 'avg_freq')]
        for coll, val in vals.items():
            if skip_bigram_abc[(word, 'dev')]-0 < 1E-6:
                strength = 0
            else:
                strength = (skip_bigram_abc[(word, coll, 'freq')] - f) / skip_bigram_abc[(word, 'dev')]
            if strength < k0:
                continue
            spread = skip_bigram_abc[(word, coll, 'spread')]
            if spread < U0:
                continue
            c_bar = sum(val.values()) / (2*max_distance)
            peak = c_bar + k1 * math.sqrt(spread)
            for dist, count in val.items():
                if count >= peak:
                    cc.append((word, coll, dist, strength, spread, peak, count))
    return cc

cc = skip_bigram_filter(skip_bigram_info, skip_bigram_abc)

In [63]:
import pandas
collocations_df = pandas.DataFrame(cc,
                                   columns = ['base word', 'collocate', 'distance', 'strength', 'spread', 'peak', 'p'])
collocations_df = collocations_df.set_index(['base word', 'collocate', 'distance']).sort_index()

In [64]:
collocations_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,strength,spread,peak,p
base word,collocate,distance,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
#,#,-2,34.148031,1158.64,91.438801,107
#,#,2,34.148031,1158.64,91.438801,107
#,The,-3,1.668985,13.09,6.718011,10
#,The,-2,1.668985,13.09,6.718011,8
#,c,1,2.207311,114.20,14.686440,36
#,ciency,1,2.566196,181.44,18.069967,45
#,cient,1,8.906488,2049.16,60.467648,151
#,ciently,1,2.207311,128.60,15.340194,38
#,coe,-1,1.250286,43.04,8.960488,22
#,cult,1,1.848427,104.04,13.600000,34


text 太長可以用 +\ 來作為換段落

In [79]:
akl = dict([(x+'-n', True) for x in 'focus, ability, absence'.split(', ')]+\
           [(x+'-v', True) for x in 'accept, account'.split(', ')])

In [80]:
akl

{'ability-n': True,
 'absence-n': True,
 'accept-v': True,
 'account-v': True,
 'focus-n': True}

In [81]:
print( 'ability', 'n', ('ability', 'n') in akl)
print([ x for x in akl.keys() ][:7])

ability n False
['focus-n', 'ability-n', 'absence-n', 'accept-v', 'account-v']


In [82]:
akl.keys()

dict_keys(['focus-n', 'ability-n', 'absence-n', 'accept-v', 'account-v'])