In [1]:
import sys
from os import listdir, path
from pyknp import Jumanpp
from gensim import models
from gensim.models.doc2vec import LabeledSentence

In [2]:
def corpus_files():
    dirs = [path.join('./livedoor_news_corpus', x)
            for x in listdir('./livedoor_news_corpus') if not x.endswith('.txt')]
    docs = [path.join(x, y)
            for x in dirs for y in listdir(x) if not x.startswith('LICENSE')]
    return docs

In [3]:
def read_document(path):
    with open(path, 'r') as f:
        return f.read()

In [4]:
def split_into_words(text):
    result = Jumanpp().analysis(text)
    return [mrph.midasi for mrph in result.mrph_list()]

In [5]:
def doc_to_sentence(doc, name):
    words = split_into_words(doc)
    return LabeledSentence(words=words, tags=[name])

In [6]:
def corpus_to_sentences(corpus):
    docs   = [read_document(x) for x in corpus]
    for idx, (doc, name) in enumerate(zip(docs, corpus)):
        sys.stdout.write('\r前処理中 {}/{}'.format(idx, len(corpus)))
        yield doc_to_sentence(doc, name)

In [7]:
corpus = corpus_files()
sentences = corpus_to_sentences(corpus)

In [8]:
model = models.Doc2Vec(sentences, dm=0, size=300, window=15, alpha=.025,
        min_alpha=.025, min_count=1, sample=1e-6)

print('\n訓練開始')
for epoch in range(20):
    print('Epoch: {}'.format(epoch + 1))
    model.train(sentences, total_examples = model.corpus_count,  epochs = model.iter)
    model.alpha -= (0.025 - 0.0001) / 19
    model.min_alpha = model.alpha

前処理中 7375/7376
訓練開始
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20


In [9]:
model.save('doc2vec.model')
model = models.Doc2Vec.load('doc2vec.model')

In [14]:
model.docvecs.most_similar('./livedoor_news_corpus/livedoor-homme/livedoor-homme-5625149.txt', topn=1)

[('./livedoor_news_corpus/peachy/peachy-4493265.txt', 0.21087855100631714)]

In [12]:
model.docvecs.similarity('./livedoor_news_corpus/livedoor-homme/livedoor-homme-4700669.txt', './livedoor_news_corpus/movie-enter/movie-enter-5947726.txt')

0.034366556317362629