In [1]:
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS as gensim_stopwords
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import os,pickle

In [None]:
# Subject identifiers; each one is interpolated into the dataset and
# model-result paths used by the cells of this notebook.
CS, MATH, PHYSICS = "cs", "math", "physics"

# Every subject handled by the per-subject loops.
LIST_SUBJECT = [CS, MATH, PHYSICS]

# Subject selected for the single-subject cells.
SUBJECT = CS

In [3]:
# One DataFrame per subject, read from that subject's v1 CSV file.
data = {
    subject: pd.read_csv(f"../../dataset/{subject}/emb/v1.csv")
    for subject in LIST_SUBJECT
}

# Build Model

In [4]:
# Whitespace-tokenise the "text" column of every subject:
# tokens[subject] is a list of token lists, one per row of data[subject].
tokens = {
    subject: [document.split() for document in data[subject]["text"].tolist()]
    for subject in LIST_SUBJECT
}

In [5]:
# Disabled alternative to the whitespace tokeniser: instead of str.split it
# parses every "text" entry with ast.literal_eval.
# NOTE(review): presumably meant for a dataset export in which "text" holds a
# Python list literal — confirm before re-enabling.
# import ast
# tokens = {}
# for i in LIST_SUBJECT:
#   token_list = data[i]["text"].tolist()
#   tokens[i] = [ast.literal_eval(x) for x in token_list]
  # print(token_list)

In [None]:
def preprocess(df: pd.DataFrame, token, no_below=15, no_above=0.5, keep_n=100000):
    """Build the bag-of-words corpus, dictionary and yearly time slices for one subject.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a "submitted_date" column parseable by ``pd.to_datetime``.
        Side effect: a "year" column is added to (or overwritten on) this frame.
    token : sequence of list of str
        Tokenised documents, positionally aligned with the rows of ``df``.
    no_below, no_above, keep_n
        Forwarded unchanged to ``Dictionary.filter_extremes``.

    Returns
    -------
    corpus_sorted : list
        ``doc2bow`` vectors ordered by ascending year (stable, so documents of
        the same year keep their input order). Consecutive chunks of sizes
        ``time_slices`` therefore hold exactly the documents of each year.
    dictionary : Dictionary
        Filtered and compactified vocabulary built from ``token``.
    time_slices : list of int
        Number of documents per year, in ascending year order.
    year_list : list
        The years matching ``time_slices``.

    Raises
    ------
    ValueError
        If ``token`` and ``df`` do not have the same length.
    """
    token = list(token)
    if len(token) != len(df):
        raise ValueError(
            f"token has {len(token)} documents but df has {len(df)} rows; "
            "they must be aligned one-to-one"
        )

    df['year'] = pd.to_datetime(df["submitted_date"]).dt.year
    slice_counts = df.groupby('year').size()
    time_slices = slice_counts.tolist()
    year_list = slice_counts.index.tolist()

    # The vocabulary is built from the documents in their input order.
    dictionary = Dictionary(token)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    # Order the documents chronologically so that cumulative slicing by
    # `time_slices` really yields one chunk per year. A stable sort leaves
    # already-ordered input untouched; rows with a missing year sort last
    # (groupby does not count them in `time_slices`).
    chronological = df['year'].to_numpy().argsort(kind="stable")
    corpus_sorted = [dictionary.doc2bow(token[pos]) for pos in chronological]

    print(f"Sample total: {len(df)} docs across {len(year_list)} years")
    print(f"Time slices: {time_slices}")

    return corpus_sorted, dictionary, time_slices, year_list


# Alias: this helper has also been referred to as `preprocess_for_dtm`;
# both names resolve to the same callable.
preprocess_for_dtm = preprocess

In [7]:
# Per-subject artefacts, each keyed by subject id.
corpus_sorted = {}
dictionary = {}
time_slices = {}
year_list = {}

# Dictionary.save does not create missing directories.
os.makedirs("model_results", exist_ok=True)

for i in LIST_SUBJECT:
    # `preprocess` is the name the helper is defined under in this notebook.
    corpus_sorted[i], dictionary[i], time_slices[i], year_list[i] = preprocess(data[i], tokens[i])
    dictionary[i].save(f"model_results/dictionary_{i}.gensim")

Sample total: 165756 docs across 26 years
Time slices: [488, 594, 648, 825, 948, 1000, 1000, 1000, 1000, 1000, 1362, 1622, 2254, 2719, 2989, 3345, 4280, 5534, 7470, 9549, 12352, 14096, 14993, 17825, 23529, 33334]
Sample total: 126192 docs across 26 years
Time slices: [1000, 1000, 1000, 1069, 1289, 1538, 1876, 2253, 2478, 2778, 3178, 3719, 4236, 4805, 5185, 5747, 6120, 6419, 6803, 7250, 7992, 8144, 8131, 8640, 10291, 13251]
Sample total: 28965 docs across 23 years
Time slices: [2, 2, 1, 6, 293, 520, 556, 798, 1000, 1000, 1000, 1000, 1000, 1000, 1163, 1936, 2845, 3202, 2220, 1929, 1986, 2322, 3184]


In [None]:
from gensim.models import LdaMulticore, LdaModel

# Topic counts to train for every subject: 100, 150, 200.
topic_range = range(100, 201, 50)

# Checkpoints are written here; create the directory on a fresh checkout.
os.makedirs("model_results", exist_ok=True)

for i in LIST_SUBJECT:
    for k in topic_range:
        ckpt_path = f"model_results/lda_global_{i}_num_topic_{k}.pkl"

        # Resume support: a checkpoint that already exists is not retrained,
        # so an interrupted run can simply be started again.
        # Delete the file to force retraining (e.g. after the corpus changed).
        if os.path.exists(ckpt_path):
            print(f"[Checkpoint found, skipping] {ckpt_path}")
            continue

        lda = LdaMulticore(
            corpus=corpus_sorted[i],
            id2word=dictionary[i],
            num_topics=k,
            passes=15,
            random_state=42,
            workers=5
        )

        # Write to a temporary file and rename it into place, so an interrupt
        # during the dump never leaves a truncated file under the final name
        # (which the resume check above would mistake for a finished model).
        tmp_path = f"{ckpt_path}.tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(lda, f)
        os.replace(tmp_path, ckpt_path)
        print(f"[Checkpoint saved] {ckpt_path}")

[Checkpoint saved] model_results/lda_global_cs_num_topic_100.pkl
[Checkpoint saved] model_results/lda_global_cs_num_topic_150.pkl
[Checkpoint saved] model_results/lda_global_cs_num_topic_200.pkl
[Checkpoint saved] model_results/lda_global_math_num_topic_100.pkl
[Checkpoint saved] model_results/lda_global_math_num_topic_150.pkl


Process ForkPoolWorker-26:
Process ForkPoolWorker-29:
Process ForkPoolWorker-30:
Process ForkPoolWorker-28:
Process ForkPoolWorker-27:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/nedo/.local/share/uv/python/cpython-3.13.11-linux-x86_64-gnu/lib/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/home/nedo/.local/share/uv/python/cpython-3.13.11-linux-x86_64-gnu/lib/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/home/nedo/.local/share/uv/python/cpython-3.13.11-linux-x86_64-gnu/lib/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/home/nedo/.local/share/uv/python/cpython-3.13.11-linux-x86_64-gnu/lib/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^

KeyboardInterrupt: 

# Visualize

In [None]:
# Topic count of the checkpoint to visualise.
# NOTE(review): the training loop in this notebook writes checkpoints for
# 100, 150 and 200 topics only; a 50-topic checkpoint has to come from
# another run — confirm this is the intended model.
NUM_TOPICS_TO_PLOT = 50
model_path = f"model_results/lda_global_{SUBJECT}_num_topic_{NUM_TOPICS_TO_PLOT}.pkl"
try:
    with open(model_path, "rb") as f:
        # pickle.load can execute arbitrary code: only load checkpoints
        # produced by a trusted run.
        loaded_lda_model = pickle.load(f)
except FileNotFoundError:
    print(f"ERROR: Model file not found at {model_path}")
    # Re-raise: continuing would leave `loaded_lda_model` undefined (or
    # pointing at a model from an earlier cell run) for the following cells.
    raise
else:
    print("LDA model loaded.")

LDA model loaded.


In [None]:
import numpy as np
import plotly.graph_objects as go
import textwrap

# Number of highest-weight topics drawn; the figure title is derived from the
# number actually plotted so the two cannot disagree.
N_TOP_TOPICS = 10

# Cut the subject's corpus into consecutive chunks, one per entry of
# time_slices[SUBJECT] (i.e. one per year in year_list[SUBJECT]).
split_corpus = []
offset = 0
for slice_size in time_slices[SUBJECT]:
    split_corpus.append(corpus_sorted[SUBJECT][offset: offset + slice_size])
    offset += slice_size

# Average topic distribution per year.
# Weights are accumulated by topic id rather than by list position:
# get_document_topics omits topics whose probability falls below its internal
# floor, so the returned lists are not guaranteed to have one entry per topic.
n_model_topics = loaded_lda_model.num_topics
topic_activity = []
for corpus_year in split_corpus:
    weight_sum = np.zeros(n_model_topics)
    for doc in corpus_year:
        for topic_id, prob in loaded_lda_model.get_document_topics(doc, minimum_probability=0):
            weight_sum[topic_id] += prob
    # max(..., 1) keeps an empty chunk at all-zero weights instead of NaN.
    topic_activity.append(weight_sum / max(len(corpus_year), 1))

# Rows are years, columns are topics.
topic_activity = np.array(topic_activity)

# Rank topics by their mean weight over all years and keep the strongest ones.
mean_activity = topic_activity.mean(axis=0)
top_topics_idx = [int(t) for t in np.argsort(mean_activity)[::-1][:N_TOP_TOPICS]]

# Hover text: the 15 top words of each plotted topic, five per line.
top_words = {}
for topic_idx in top_topics_idx:
    words = [w for w, _ in loaded_lda_model.show_topic(topic_idx, topn=15)]
    wrapped = "<br>".join(
        [", ".join(words[start:start + 5]) for start in range(0, len(words), 5)]
    )
    top_words[topic_idx] = wrapped

fig = go.Figure()

for topic_idx in top_topics_idx:
    fig.add_trace(go.Scatter(
        x=year_list[SUBJECT],
        y=topic_activity[:, topic_idx],
        mode='lines+markers',
        name=f"Topic {topic_idx}",
        hovertemplate=(
            f"<b>Topic {topic_idx}</b><br>"
            f"<b>Top words:</b><br>{top_words[topic_idx]}<br>"
            "Year: %{x}<br>"
            "Weight: %{y:.4f}<extra></extra>"
        )
    ))

fig.update_layout(
    title=f"Top {len(top_topics_idx)} Topics Over Time (LDA Global Model)",
    xaxis_title="Year",
    yaxis_title="Average Topic Weight",
    legend_title="Topics",
    template="plotly_white",
    hovermode="closest",
    width=950,
    height=550
)

fig.show()


# Metrics

In [10]:
import pickle
from gensim.models import CoherenceModel

In [11]:

# Report the C_v coherence of every saved LDA checkpoint of the listed subjects.
topic_range = range(100, 201, 50)
for subject in ['cs']:
    print(f"Coherence Score Subject: {subject}")

    # The reference texts and their dictionary depend on the subject only, so
    # they are built once per subject instead of once per topic count.
    # Local names are used on purpose: the notebook-level `tokens` dict is
    # left intact for the cells that index it by subject.
    coherence_texts = [text.split() for text in data[subject]["text"].tolist()]
    dictionary_bow = Dictionary(coherence_texts)

    for n_topics in topic_range:
        MODEL_PATH = f"model_results/lda_global_{subject}_num_topic_{n_topics}.pkl"
        # Path of the dictionary saved during preprocessing; not read here,
        # the coherence model uses `dictionary_bow` built above.
        DICTIONARY_PATH = f"model_results/dictionary_{subject}.gensim"
        try:
            with open(MODEL_PATH, "rb") as f:
                # pickle.load can execute arbitrary code: only load
                # checkpoints produced by a trusted run.
                candidate_model = pickle.load(f)
        except FileNotFoundError:
            print(f"ERROR: Model file not found at {MODEL_PATH}")
            # Skip this topic count rather than scoring whatever model was
            # loaded previously.
            continue
        print("LDA model loaded.")

        # Asking for at least as many topics as the model has returns all of them.
        topic_tuples = candidate_model.show_topics(
            num_topics=candidate_model.num_topics,
            num_words=10,
            formatted=False
        )
        # Keep only the words of each topic; probabilities are not needed.
        lda_static_topics = [
            [word for word, _ in words_probs]
            for _, words_probs in topic_tuples
        ]

        cm_lda_static = CoherenceModel(
            topics=lda_static_topics,
            texts=coherence_texts,
            dictionary=dictionary_bow,
            coherence='c_v',
            processes=5
        )
        coherence_score = cm_lda_static.get_coherence()
        print(f"Skor Koherensi LDA Statis (C_v) {n_topics} topics: {coherence_score:.4f}")
        

Coherence Score Subject: cs
LDA model loaded.
Skor Koherensi LDA Statis (C_v) 100 topics: 0.5332
LDA model loaded.
Skor Koherensi LDA Statis (C_v) 150 topics: 0.5164
LDA model loaded.
Skor Koherensi LDA Statis (C_v) 200 topics: 0.4825
