In [1]:
import glob
import os

from bs4 import BeautifulSoup
import esanpy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm

In [2]:
esanpy.stop_server()
esanpy.start_server()

Failed to stop Elasticsearch process: [Errno 3] No such process


In [3]:
natsume_files = glob.glob('data/natsume/files/*.html')

In [4]:
def get_soup(filepath):
    with open(filepath, encoding='shift_jis') as f:
        soup = BeautifulSoup(f, 'html.parser', from_encoding='shift_jis')
    return soup

def get_texts(filepaths):
    texts = [get_soup(filepath).body.text for filepath in tqdm(filepaths)]
    return texts

In [5]:
natsume_texts = get_texts(natsume_files)

100%|██████████| 118/118 [00:32<00:00,  3.65it/s]


In [6]:
def custom_tokenizer(text):
    return esanpy.analyzer(text, analyzer='kuromoji_neologd')

In [7]:
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, max_features=10000, min_df=2)
vectorizer.fit(natsume_texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function custom_tokenizer at 0x114c1ad90>, use_idf=True,
        vocabulary=None)

In [8]:
lda = LatentDirichletAllocation(n_components=20, max_iter=100, n_jobs=-1)
lda.fit(vectorizer.transform(natsume_texts))

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=100, mean_change_tol=0.001,
             n_components=20, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [9]:
len(vectorizer.get_feature_names())

10000

In [10]:
feature_names = vectorizer.get_feature_names()
for i, component in enumerate(lda.components_[:5]):
    print(f'component: {i}')
    indices = component.argsort()[::-1][:5]
    for index in indices:
        print(f'    {feature_names[index]}: {component[index]}')

component: 0
    東菊: 0.6361577407867479
    一輪: 0.3970190957173659
    碌: 0.32593282481022406
    區: 0.29417667753598215
    年数: 0.2798871998509871
component: 1
    要吉: 0.49468633087471386
    中佐: 0.49151537282734986
    煤煙: 0.3308586429419791
    広瀬: 0.2370970092099029
    低気圧: 0.20760511301335455
component: 2
    広瀬: 0.05000000000050258
    殊更: 0.05000000000046664
    焙炉: 0.05000000000046221
    うめこ: 0.050000000000456624
    まつすぐ: 0.05000000000044801
component: 3
    云う: 14.676303889139412
    事: 9.328946081694557
    人: 7.373100928005811
    自分: 7.298362691308151
    私: 7.1801829966692665
component: 4
    余: 8.781808339588675
    時: 5.120884220960272
    君: 5.080526438146087
    上: 4.682456076473297
    彼: 4.17499897478692
