In [37]:
import glob
import os

from bs4 import BeautifulSoup
import esanpy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm

In [32]:
esanpy.stop_server()
esanpy.start_server()

In [10]:
natsume_files = glob.glob('data/natsume/files/*.html')

In [28]:
def get_soup(filepath):
    with open(filepath, encoding='shift_jis') as f:
        soup = BeautifulSoup(f, 'html.parser', from_encoding='shift_jis')
    return soup

def get_texts(filepaths):
    texts = [get_soup(filepath).body.text for filepath in tqdm(filepaths)]
    return texts

In [64]:
natsume_texts = get_texts(natsume_files)

100%|██████████| 118/118 [00:35<00:00,  3.30it/s]


In [40]:
def custom_tokenizer(text):
    return esanpy.analyzer(text, analyzer='kuromoji_neologd')

In [65]:
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, max_features=10000, min_df=2)
vectorizer.fit(natsume_texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function custom_tokenizer at 0x11de02ea0>, use_idf=True,
        vocabulary=None)

In [66]:
lda = LatentDirichletAllocation(n_components=20, max_iter=100, n_jobs=-1)
lda.fit(vectorizer.transform(natsume_texts))

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=100, mean_change_tol=0.001,
             n_components=20, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [67]:
len(vectorizer.get_feature_names())

10000

In [68]:
feature_names = vectorizer.get_feature_names()
for i, component in enumerate(lda.components_[:5]):
    print(f'component: {i}')
    indices = component.argsort()[::-1][:5]
    for index in indices:
        print(f'    {feature_names[index]}: {component[index]}')

component: 0
    中佐: 0.4915163072299102
    委員: 0.2443820897005533
    広瀬: 0.237095604848103
    言辞: 0.19279312662096215
    再: 0.08351966586996046
component: 1
    先生: 5.139771166967123
    つて: 4.645858098116799
    居る: 4.347464785597507
    僕: 3.6540210821724117
    子規: 2.4679426595934806
component: 2
    職業: 1.2276807050150358
    土: 1.190841391578364
    長塚: 1.0233290743840016
    ぜんざい: 1.0202090397695605
    趣味: 0.8677429116587403
component: 3
    2041 lancelot: 0.449905101373172
    エレーン: 0.33326199821746383
    アーサ: 0.253292414872814
    騎士: 0.20826200861936267
    兜: 0.1125297740990928
component: 4
    予告: 1.2371799118653202
    初出: 1.2162611466923299
    郎: 1.1328110896669208
    三四: 1.0515218393404302
    編集: 0.9369197350170019
