In [10]:
from glob import glob
import itertools
import os
from typing import List, Tuple

from nltk.lm import Vocabulary
from nltk.lm.models import MLE
from nltk.util import ngrams
import spacy
from tqdm import tqdm

In [15]:
# Order of the n-gram language model built below (3 = trigrams).
N = 3
# Glob pattern matching the corpus text files to load.
filedir = "text/it-life-hack/*.txt"

In [12]:
# Load the spaCy pipeline named "ja_ginza"; load_sentences calls it to tokenize each line.
nlp = spacy.load("ja_ginza")

In [13]:
def load_sentences(filepath: str, tokenizer=None) -> List[List[str]]:
    """Read one text file and return its lines as lists of token strings.

    The first two lines of the file are skipped.
    NOTE(review): presumably these are per-article metadata headers rather
    than body text -- confirm against the corpus layout.
    Lines that are empty after stripping whitespace are ignored. Each
    remaining line is tokenized as a whole, so one "sentence" here is one
    non-empty line of the file.

    Args:
        filepath: Path of the text file to read; it is decoded as UTF-8.
        tokenizer: Callable taking a string and returning an iterable of
            objects that expose a ``.text`` attribute. When omitted, the
            module-level ``nlp`` pipeline is used.

    Returns:
        One list of token strings per non-empty line, in file order.
    """
    if tokenizer is None:
        tokenizer = nlp
    sentences = []
    # Pin the encoding so decoding does not depend on the platform's locale
    # default, which is not UTF-8 everywhere.
    with open(filepath, "r", encoding="utf-8") as f:
        # islice skips the two leading lines without loading the whole file.
        for line in itertools.islice(f, 2, None):
            line = line.strip()
            if not line:
                continue
            sentences.append([token.text for token in tokenizer(line)])
    return sentences

In [16]:
# Tokenize every file matched by the glob pattern, leaving out the license notice.
sentences = []
for filepath in tqdm(glob(filedir)):
    is_license = os.path.basename(filepath) == "LICENSE.txt"
    if not is_license:
        sentences += load_sentences(filepath)

100%|██████████| 871/871 [07:55<00:00,  1.83it/s]


In [17]:
# Number of tokenized lines collected from the corpus files.
len(sentences)

21286

In [18]:
# Vocabulary built from every token of every tokenized line.
vocabulary = Vocabulary(itertools.chain.from_iterable(sentences))
# ngrams() returns a one-shot iterator. Materialize each one as a list so that
# word_ngram can be consumed more than once (e.g. when the fitting cell is
# re-run) instead of silently yielding nothing the second time.
word_ngram = [list(ngrams(sentence, N)) for sentence in sentences]

In [19]:
# Preview the first three n-grams of the first tokenized line.
list(itertools.islice(ngrams(sentences[0], N), 3))

[('Ultrabook', 'を', 'パワーアップ'), ('を', 'パワーアップ', '！'), ('パワーアップ', '！', '\u3000')]

In [20]:
# Maximum-likelihood n-gram model of order N, trained on the per-line n-grams.
lm = MLE(N, vocabulary=vocabulary)
lm.fit(text=word_ngram)

In [33]:
# Score every word observed after the two-word context and show the ten most probable.
context = ("サービス", "の")
followers = lm.context_counts(lm.vocab.lookup(context))
probabilities = {}
for word in followers:
    probabilities[word] = lm.score(word, context)
ranked = sorted(probabilities.items(), key=lambda pair: pair[1])
# Reverse the ascending sort, then keep the first ten entries.
ranked[::-1][:10]

[('レビューアー', 0.08433734939759036),
 ('「', 0.07228915662650602),
 ('ほう', 0.04819277108433735),
 ('中', 0.04819277108433735),
 ('評価', 0.04819277108433735),
 ('普及', 0.04819277108433735),
 ('デファクトスタンダード', 0.04819277108433735),
 ('提供', 0.03614457831325301),
 ('純減', 0.03614457831325301),
 ('開始', 0.024096385542168676)]