In [None]:
# Enable autoreload so edits to imported modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Step one directory up so the `load` package imports below can resolve
# (presumably to the project root; confirm). The path is relative, so
# re-running this cell moves the working directory up another level.
%cd '..'

In [None]:
import json

from load.utils import load_comments
from load.constants import SEED, DATA_DIR

from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from gensim.models.callbacks import CoherenceMetric
import gensim.corpora as corpora

In [None]:
# Window of comments to load: a single year plus the start/stop month bounds
# that are forwarded to `load_comments`.
YEAR = 2016
START_MONTH, STOP_MONTH = 1, 1

# Number of topics requested from the LDA model; also used in output file names.
NUM_TOPICS = 7

In [None]:
# Fetch the comments for the configured window and keep only the values of
# the `body_cleaned` column.
load_window = dict(years=YEAR, start_month=START_MONTH, stop_month=STOP_MONTH)
comments = load_comments(**load_window)["body_cleaned"].values

In [None]:
# Whitespace-tokenise every cleaned comment into a list of tokens.
comments_corpus = [comment.split() for comment in comments]

# Build the token -> id vocabulary from the tokenised comments. Constructing
# `Dictionary()` with no documents leaves it empty, so every later `doc2bow`
# call would return an empty bag and the model would have no terms to fit.
id2word = corpora.Dictionary(comments_corpus)

# Persist the vocabulary alongside the other LDA artefacts in DATA_DIR.
id2word.save(f"{DATA_DIR}/lda.dict")


In [None]:
# Convert each tokenised comment into its bag-of-words representation using
# the `id2word` vocabulary.
corpus = list(map(id2word.doc2bow, comments_corpus))

In [None]:
# Coherence metric evaluated by gensim during training. `callbacks` needs
# configured metric *instances*; passing the bare `CoherenceMetric` class
# fails because the callback machinery reads instance attributes such as
# `.logger`. "u_mass" is used here because it only needs the bag-of-words
# corpus; with logger="shell" the value is reported through gensim's logging
# (presumably only visible if INFO logging is configured; confirm).
training_coherence = CoherenceMetric(
    corpus=corpus,
    coherence="u_mass",
    logger="shell",
)

# Fit the LDA topic model on the bag-of-words corpus with a fixed seed so the
# run is reproducible.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=id2word,
    iterations=1000,
    random_state=SEED,
    callbacks=[training_coherence],
)

# Persist the trained model; the file name records the topic count.
# NOTE(review): the metric holds a reference to `corpus` and is likely
# serialised with the model. Clear `lda_model.callbacks` before saving if the
# file size matters; confirm.
lda_model.save(f"{DATA_DIR}/lda_model_{NUM_TOPICS}.pickle")


In [None]:
# Display every fitted topic in gensim's formatted string representation
# (last expression of the cell, so the notebook renders the result).
lda_model.show_topics(num_topics=NUM_TOPICS, formatted=True)


In [None]:
# For every topic id, join the words returned by `show_topic` into one
# comma-separated string (the second element of each pair is discarded).
keywords_mapper = dict(
    (topic_id, ", ".join(term for term, _ in lda_model.show_topic(topic_id)))
    for topic_id in range(lda_model.num_topics)
)

# Write the topic -> keywords mapping as JSON next to the saved model.
with open(f"{DATA_DIR}/lda_model_{NUM_TOPICS}_words.json", "w") as words_file:
    json.dump(keywords_mapper, words_file)


In [None]:
# Score the trained model with the "c_v" coherence measure, computed from the
# tokenised comments and the shared vocabulary.
coherence_settings = {
    "model": lda_model,
    "texts": comments_corpus,
    "dictionary": id2word,
    "coherence": "c_v",
}
coherence_lda_model = CoherenceModel(**coherence_settings)
coherence_lda = coherence_lda_model.get_coherence()

print("Coherence Score: ", coherence_lda)
