In [None]:
import os
def collect_file_paths(base_dir, skip_names=()):
    """Return the paths of all files found under ``base_dir``, recursively.

    Args:
        base_dir: Directory to walk. A missing directory yields an empty list
            (``os.walk`` ignores listing errors by default).
        skip_names: File names (not paths) to leave out of the result.

    Returns:
        A list of paths, each built from the directory the file was actually
        found in, in a deterministic (sorted) traversal order.
    """
    collected = []
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place fixes the order in which subdirectories are visited,
        # so the document order does not depend on the filesystem.
        dirs.sort()
        for name in sorted(files):
            if name not in skip_names:
                # Join with ``root`` rather than ``base_dir``: os.walk descends
                # into subdirectories, and files found there live under ``root``.
                collected.append(os.path.join(root, name))
    return collected


train_data_path = 'data/train/2025-04-11'
skip_files = ['stopwords']
train_file_paths = collect_file_paths(train_data_path, skip_files)

print(train_file_paths)

In [None]:
from src.util.read_file import read_file

encoding = 'utf-8'
train_novels = list[tuple]()

# Load every training file as a (path, content) pair. Files that cannot be read
# are skipped, but the actual exception is printed so the cause is not hidden
# behind the generic label.
for file_path in train_file_paths:
    try:
        train_novels.append((file_path, read_file(file_path, encoding)))
    except Exception as exc:
        print('Decode Error ', file_path, repr(exc))

# One stop word per line. Strip the line terminator from each entry:
# readlines() would keep the trailing '\n', so an entry such as 'the\n' would
# never compare equal to the bare token 'the'.
# NOTE(review): assumes split_words compares tokens against bare words — confirm.
with open('data/stopwords', encoding='utf-8') as _f:
    stopwords = [line.rstrip('\r\n') for line in _f]

# Short previews only; guard against an empty result so the cell does not
# fail with IndexError when no file could be loaded.
if train_novels:
    print(str(train_novels[0])[:512], '...')
else:
    print('No training files were loaded.')
print(stopwords[:128])

In [None]:
from src.util.preprocess import split_words

# Segment every loaded novel, keeping the source path next to its
# split_words() output as a (path, segments) pair.
train_seg_lists = [
    (novel_path, split_words(novel_text, stopwords))
    for novel_path, novel_text in train_novels
]

# Preview only: printing the whole first entry would dump the full
# segmentation of a novel into the cell output. Uses the same 512-character
# truncation as the raw-text preview.
if train_seg_lists:
    print(str(train_seg_lists[0])[:512], '...')
else:
    print('No documents were segmented.')

In [None]:
# Drop the file paths and keep only the segmented output of each novel;
# these are the documents handed to the topic model.
train_documents = [segments for _path, segments in train_seg_lists]

In [None]:
from gensim import corpora

# Build the gensim Dictionary from all training documents.
dictionary = corpora.Dictionary(train_documents)

# Convert every document with the dictionary's doc2bow().
corpus = list(map(dictionary.doc2bow, train_documents))

print(corpus[:16])

In [None]:
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

num_topics = 10

# Training configuration gathered in one place; random_state is fixed so that
# repeated runs on the same corpus are comparable.
lda_kwargs = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=64,
    random_state=32,
)
lda_model = LdaModel(**lda_kwargs)

# Score the fitted model with the 'u_mass' coherence measure.
cm = CoherenceModel(
    model=lda_model,
    texts=train_documents,
    coherence='u_mass',
)
coherence = cm.get_coherence()
print(f"Coherence={coherence}")

In [None]:
# Prepare the visualisation data from the fitted model, corpus and dictionary,
# then render it as the cell's output.
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

In [None]:
# save model
model_path = 'model/lda/lda_model.pkl'
# Create the target directory first, so a missing folder does not make the
# save fail after the training run has already completed.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
lda_model.save(model_path)