In [12]:
import itertools

from nltk.lm import Vocabulary
from nltk.lm.models import MLE
from nltk.util import ngrams
import spacy

In [2]:
# n-gram order: used both to build the n-grams and as the `order` of the
# language models fitted below.
N = 3
# Text file that is read and tokenised below.
filepath = "text/peachy/peachy-4304732.txt"

In [13]:
# Load the spaCy pipeline registered as "ja_ginza"; it is used below to split
# text into tokens and to obtain each token's coarse POS tag (`token.pos_`).
nlp = spacy.load("ja_ginza")

In [18]:
# Preview the tokenizer on the first body line of the article.
# The line is read straight from `filepath` so that this cell works on a fresh
# kernel: `sentences` is only defined further down, and there it holds token
# lists rather than raw strings, so it cannot be fed to `nlp` here.
with open(filepath, "r", encoding="utf-8") as f:
    # Skip the first two lines of the file, then take the first non-blank
    # line after them ("" if the file has none).
    preview_line = next(
        (line.strip() for line in itertools.islice(f, 2, None) if line.strip()),
        "",
    )

for token in nlp(preview_line):
    print(token.text, token.pos_)

女性 NOUN
を ADP
潤す VERB
新た ADJ
な AUX
注目 NOUN
ワード NOUN
は ADP
“ PUNCT
アミノ NOUN
酸 NOUN
” PUNCT


In [20]:
# Tokenise every non-blank line of the file, producing two parallel lists:
#   sentences[i] -> surface forms of the tokens on line i
#   poss[i]      -> coarse POS tags (`token.pos_`) of the same tokens
sentences = []
poss = []
# The encoding is given explicitly: the file contains Japanese text, and the
# platform default (e.g. a legacy code page on Windows) may fail to decode it.
# NOTE(review): assumes the corpus file is UTF-8 — confirm.
with open(filepath, "r", encoding="utf-8") as f:
    # The first two lines are skipped (presumably header/metadata lines of the
    # corpus file — verify against the data format). islice streams the file
    # instead of loading every line into memory first.
    for line in itertools.islice(f, 2, None):
        line = line.strip()
        if not line:
            continue
        doc = nlp(line)
        sentences.append([token.text for token in doc])
        poss.append([token.pos_ for token in doc])

In [23]:
# Sanity check: token list produced for the first non-blank line.
sentences[0]

['女性', 'を', '潤す', '新た', 'な', '注目', 'ワード', 'は', '“', 'アミノ', '酸', '”']

In [42]:
# Vocabulary over every token of every sentence.
vocabulary = Vocabulary(itertools.chain.from_iterable(sentences))
# One list of N-grams per sentence. `ngrams` returns a one-shot iterator, so
# the n-grams are materialised here: otherwise the first `fit` call would
# exhaust them and re-running the fitting cell would silently train on nothing.
word_ngram = [list(ngrams(sentence, N)) for sentence in sentences]

In [25]:
# Show the first three N-grams of the first sentence (taken lazily, without
# building the full n-gram list first).
list(itertools.islice(ngrams(sentences[0], N), 3))

[('女性', 'を', '潤す'), ('を', '潤す', '新た'), ('潤す', '新た', 'な')]

In [43]:
# nltk MLE language model of order N over the word vocabulary, fitted on the
# per-sentence word N-grams held in `word_ngram`.
lm = MLE(order=N, vocabulary=vocabulary)
lm.fit(word_ngram)

In [49]:
# For a fixed two-word history, list each word counted after it together
# with the model score of that word given the history.
context = ("注目", "ワード")
observed_next = lm.context_counts(lm.vocab.lookup(context))
for word in observed_next:
    probability = lm.score(word, context)
    print(f"{word}: {probability}")

は: 1.0


In [51]:
# Vocabulary over every POS tag of every sentence.
pos_vocabulary = Vocabulary(itertools.chain.from_iterable(poss))
# One list of POS N-grams per sentence. `ngrams` returns a one-shot iterator,
# so the n-grams are materialised here: otherwise the first `fit` call would
# exhaust them and re-running the fitting cell would silently train on nothing.
pos_ngram = [list(ngrams(pos, N)) for pos in poss]

In [53]:
# Show the first three POS N-grams of the first sentence (taken lazily,
# without building the full n-gram list first).
list(itertools.islice(ngrams(poss[0], N), 3))

[('NOUN', 'ADP', 'VERB'), ('ADP', 'VERB', 'ADJ'), ('VERB', 'ADJ', 'AUX')]

In [55]:
# nltk MLE language model of order N over the POS-tag vocabulary, fitted on
# the per-sentence POS N-grams held in `pos_ngram`.
pos_lm = MLE(order=N, vocabulary=pos_vocabulary)
pos_lm.fit(pos_ngram)

In [54]:
# For a fixed two-tag history, list each POS tag counted after it together
# with the model score of that tag given the history.
context = ("NOUN", "ADP")
observed_next_pos = pos_lm.context_counts(pos_lm.vocab.lookup(context))
for pos in observed_next_pos:
    probability = pos_lm.score(pos, context)
    print(f"{pos}: {probability}")

NOUN: 0.45714285714285713
VERB: 0.35714285714285715
PUNCT: 0.07142857142857142
ADJ: 0.02857142857142857
ADP: 0.02857142857142857
ADV: 0.014285714285714285
AUX: 0.014285714285714285
PROPN: 0.014285714285714285
NUM: 0.014285714285714285


In [45]:
# Short test phrase: tokenise it with the same pipeline as the corpus and
# preview (up to) its first three N-grams.
text = "新たな注目ワード"
tokens = [token.text for token in nlp(text)]
list(ngrams(tokens, N))[:3]

[('新た', 'な', '注目'), ('な', '注目', 'ワード')]

In [46]:
# All N-grams of the test phrase, materialised as a list so they can be
# iterated over (and re-iterated) when scoring below.
test_contexts = list(ngrams(tokens, N))

In [47]:
# Per-token penalty: each N-gram whose last word has a model score at or
# below the threshold `tp` subtracts 1 from every token position it covers.
scores = [0] * len(tokens)
tp = 0
for start, context in enumerate(test_contexts):
    # Score of the final word given the preceding words of the N-gram.
    p = lm.score(context[-1], context[:-1])
    print(f"{context}: {p}")
    if p > tp:
        continue
    # The N-gram starting at `start` spans token positions start .. start+N-1.
    for position in range(start, start + N):
        scores[position] -= 1

('新た', 'な', '注目'): 1.0
('な', '注目', 'ワード'): 1.0
