In [1]:
import tomotopy as tp
import pandas as pd
import tomotopy as tp
from tqdm.notebook import tqdm

In [2]:
CS = "cs"
MATH = "math"
STAT = "stat"

In [3]:
SUBJECT = CS
LIST_SUBJECT = [CS, MATH, STAT]

In [4]:
df = pd.read_csv(f"dataset/arxiv_{SUBJECT}_bow.csv")
df

Unnamed: 0,title,submitted_date,text,tag_text
0,Fault Detection using Immune-Based Systems and...,2000-10-03,"['detection', 'immune', 'system', 'formal', 'l...","Computational Engineering, Finance, and Scienc..."
1,Robust Classification for Imprecise Environments,2000-09-13,"['robust', 'classification', 'imprecise', 'env...",Machine Learning
2,Tagger Evaluation Given Hierarchical Tag Sets,2000-08-09,"['tagger', 'evaluation', 'hierarchical', 'tag'...",Computation and Language
3,Description of GADEL,2000-03-07,"['description', 'gadel', 'article', 'implement...","Artificial Intelligence, Logic in Computer Sci..."
4,The dynamics of iterated transportation simula...,2000-02-22,"['dynamic', 'transportation', 'simulation', 'r...","Adaptation and Self-Organizing Systems, Comput..."
...,...,...,...,...
159242,Mining the Long Tail: A Comparative Study of D...,2025-09-16,"['long_tail', 'comparative', 'study', 'data', ...","Robotics, Artificial Intelligence, Machine Lea..."
159243,Adaptive Data-Knowledge Alignment in Genetic P...,2025-10-01,"['adaptive', 'datum', 'knowledge', 'alignment'...","Molecular Networks, Artificial Intelligence, M..."
159244,Sobolev Training of End-to-End Optimization Pr...,2025-05-16,"['sobolev', 'training', 'end', 'optimization',...","Machine Learning, Optimization and Control"
159245,Beyond Unimodal Boundaries: Generative Recomme...,2025-03-30,"['unimodal', 'boundary', 'generative', 'recomm...","Information Retrieval, Artificial Intelligence..."


In [5]:
NUM_TOPICS = 50        
TRAIN_ITERATIONS = 500 
ITERATION_STEP = 50 

# Build

In [6]:
if not pd.api.types.is_datetime64_any_dtype(df["submitted_date"]):
    df["submitted_date"] = pd.to_datetime(df["submitted_date"])
df['year'] = df["submitted_date"].dt.year
min_year, max_year = df['year'].min(), df['year'].max()
all_years = sorted(df['year'].unique())
num_time_steps = len(all_years)
print(f"Data found from {min_year} to {max_year} ({num_time_steps} time steps).")
year_to_timestep = {year: i for i, year in enumerate(all_years)}
df['timestep'] = df['year'].map(year_to_timestep)

Data found from 2000 to 2025 (26 time steps).


In [7]:
import ast
token_list = df["text"].explode().tolist()
tokens = [ast.literal_eval(x) for x in token_list]
processed_docs = tokens
doc_timesteps = df['timestep'].tolist()
if len(processed_docs) != len(doc_timesteps):
    raise ValueError("Mismatch between processed docs and timesteps")

In [8]:
dt_model = tp.DTModel(
    t=num_time_steps,        
    k=NUM_TOPICS,            
    seed=42,
    
)


for i in tqdm(range(len(processed_docs)), desc="Adding docs"):
    if processed_docs[i]:
        dt_model.add_doc(processed_docs[i], timepoint=doc_timesteps[i])

print(f"\nMemulai training untuk {TRAIN_ITERATIONS} iterasi...")
for i in range(0, TRAIN_ITERATIONS, ITERATION_STEP):
    dt_model.train(ITERATION_STEP)
    print(f"Iteration: {i + ITERATION_STEP}/{TRAIN_ITERATIONS}\tLog-likelihood: {dt_model.ll_per_word:.4f}")

dt_model.save(f"model_results/dtm/dtm_{SUBJECT}_num_topic_{NUM_TOPICS}_v1.bin")

Adding docs:   0%|          | 0/159247 [00:00<?, ?it/s]


Memulai training untuk 500 iterasi...
Iteration: 50/500	Log-likelihood: -10.0884
Iteration: 100/500	Log-likelihood: -9.4770


  dt_model.train(ITERATION_STEP)


Iteration: 150/500	Log-likelihood: -9.1357
Iteration: 200/500	Log-likelihood: -8.9092
Iteration: 250/500	Log-likelihood: -8.7386
Iteration: 300/500	Log-likelihood: -8.5930
Iteration: 350/500	Log-likelihood: -8.4732
Iteration: 400/500	Log-likelihood: -8.3728
Iteration: 450/500	Log-likelihood: -8.2833
Iteration: 500/500	Log-likelihood: -8.2043


# Visualize

In [None]:
model_path = f"model_results/dtm/dtm_{SUBJECT}_num_topic_{NUM_TOPICS}.bin"
loaded_dt_model = tp.DTModel.load(model_path)

In [None]:
import pandas as pd
import plotly.graph_objects as go
import tomotopy as tp

df["submitted_date"] = pd.to_datetime(df["submitted_date"], errors="coerce")
df["year"] = df["submitted_date"].dt.year

years = sorted(df['year'].unique())
year_to_tp = {y: i for i, y in enumerate(years)}

num_times = loaded_dt_model.num_timepoints
assert len(years) == num_times, "Mismatch antara tahun dan timepoints di model"

In [None]:
import numpy as np

topic_time_probs = []

for t in range(loaded_dt_model.num_timepoints):
    logits = np.array(loaded_dt_model.alpha[t])
    probs = np.exp(logits) / np.exp(logits).sum()
    topic_time_probs.append(probs)

topic_time_probs = np.array(topic_time_probs)   # shape: time × topics


In [None]:
df_topic_time = pd.DataFrame(
    topic_time_probs.T,
    index=[f"Topic {i}" for i in range(loaded_dt_model.k)],
    columns=all_years
)


In [None]:
import plotly.graph_objects as go
import numpy as np

topic_means = topic_time_probs.mean(axis=0)
top5_idx = np.argsort(topic_means)[-5:]

def get_top_words(topic, timepoint, topn=8):
    words = loaded_dt_model.get_topic_words(topic, timepoint, top_n=topn)
    return ", ".join([w for w, p in words])

fig = go.Figure()

for k in top5_idx:
    y_values = topic_time_probs[:, k]

    htext = [
        f"Year: {all_years[t]}<br>"
        f"Topic {k}<br>"
        f"Top words: {get_top_words(k, t)}<br>"
        f"Proportion: {y_values[t]:.4f}"
        for t in range(len(all_years))
    ]

    fig.add_trace(go.Scatter(
        x=all_years,
        y=y_values,
        mode="lines+markers",
        name=f"Topic {k}",
        hovertext=htext,
        hoverinfo="text"
    ))

fig.update_layout(
    title="Top 5 Topics Over Time (DTM)",
    xaxis_title="Year",
    yaxis_title="Proportion",
    hovermode="closest",
    template="plotly_white"
)

fig.show()


# Metrics

In [9]:
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from tqdm.notebook import tqdm
from gensim.models import CoherenceModel
import tomotopy as tp
import pandas as pd
import numpy as np

In [10]:
model_path = f"model_results/dtm/dtm_{SUBJECT}_num_topic_{NUM_TOPICS}.bin"
loaded_dt_model = tp.DTModel.load(model_path)

In [11]:
df["submitted_date"] = pd.to_datetime(df["submitted_date"])
df['year'] = df["submitted_date"].dt.year
all_years = sorted(df['year'].unique())
num_time_steps = {year: i for i, year in enumerate(all_years)}

In [12]:
def tokenize_for_coherence(text):
    return [
        token for token in simple_preprocess(str(text), deacc=True)
    ]
texts_for_coherence = [tokenize_for_coherence(text) for text in tqdm(df['text'], desc="Tokenizing for Coherence")]
dictionary_coherence = Dictionary(texts_for_coherence)

Tokenizing for Coherence:   0%|          | 0/159247 [00:00<?, ?it/s]

In [13]:
import numpy as np
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from collections import Counter
from tqdm import tqdm

global_topics_words = []
top_n_words = 10
print("Extracting Global Topics...")
for topic_id in range(NUM_TOPICS):
    word_accumulator = Counter()
    for time_index in range(len(num_time_steps)):
        words_probs = loaded_dt_model.get_topic_words(
            topic_id=topic_id,
            timepoint=time_index,
            top_n=20 
        )
        
        for word, prob in words_probs:
            word_accumulator[word] += prob
            
    most_common_global = [word for word, score in word_accumulator.most_common(top_n_words)]
    global_topics_words.append(most_common_global)
    print(f"Global Topic {topic_id}: {most_common_global}")

print("Calculating Global Coherence...")
try:
    cm_global = CoherenceModel(
        topics=global_topics_words,      # List of list kata (versi global)
        texts=texts_for_coherence,              # Seluruh teks (bukan per tahun)
        dictionary=dictionary_coherence,
        coherence="c_v",
        processes=5
    )
    
    global_score = cm_global.get_coherence()
    print(f"\n=== GLOBAL COHERENCE SCORE (Aggregated): {global_score:.4f} ===")
    
    # Jika ingin melihat skor per topik
    coherence_per_topic = cm_global.get_coherence_per_topic()
    for i, score in enumerate(coherence_per_topic):
        print(f"Topic {i}: {score:.4f}")

except Exception as e:
    print(f"Error calculating global coherence: {e}")

Extracting Global Topics...
Global Topic 0: ['function', 'deep', 'loss', 'attack', 'aware', 'graph', 'set', 'available', 'decision', 'hybrid']
Global Topic 1: ['algorithm', 'neural', 'baseline', 'point', 'improvement', 'security', 'synthetic', 'visual', 'structure', 'effect']
Global Topic 2: ['complexity', 'network', 'global', 'communication', 'metric', 'crucial', 'local', 'device', 'segmentation', 'rate']
Global Topic 3: ['specific', 'setting', 'standard', 'sample', 'function', 'reward', 'efficient', 'code_available', 'optimal', 'algorithm']
Global Topic 4: ['algorithm', 'code', 'condition', 'metric', 'medical', 'technology', 'object', 'signal', 'perception', 'fusion']
Global Topic 5: ['information', 'datum', 'image', 'search', 'efficient', 'linear', 'machine_learning', 'average', 'environment', 'social']
Global Topic 6: ['language', 'dynamic', 'view', 'finding', 'architecture', 'level', 'computation', 'open', 'available', 'self']
Global Topic 7: ['dataset', 'behavior', 'information',

In [None]:
yearly_coherence_scores = [] 
yearly_scores_dict = {}    
for time_index in tqdm(range(len(num_time_steps)), desc="Calculating coherence per year"):
    current_year = all_years[time_index]
    texts_for_coherence_year = [tokenize_for_coherence(text) for text in tqdm(df[df['year'] == current_year]['text'])]
    dictionary_year =  Dictionary(texts_for_coherence_year)
    
    topics_for_this_year = []
    
    
    for topic_id in range(NUM_TOPICS):
        words_probs = loaded_dt_model.get_topic_words(
            topic_id=topic_id,
            timepoint=time_index,
            top_n=10
        )
        word_list = [word for word, prob in words_probs]
        topics_for_this_year.append(word_list)
        
    try:
        cm = CoherenceModel(
            topics=topics_for_this_year,      
            texts=texts_for_coherence_year,
            dictionary=dictionary_year,
            coherence="c_v",
            processes=5
        )
        
        score = cm.get_coherence()
        
        print(f"  Tahun: {current_year} (t={time_index}) -> Skor Koherensi: {score:.4f}")
        
        yearly_coherence_scores.append(score)
        yearly_scores_dict[current_year] = score

    except Exception as e:
        print(f"  Tahun: {current_year} (t={time_index}) -> GAGAL MENGHITUNG: {e}")
        yearly_coherence_scores.append(np.nan) 
        yearly_scores_dict[current_year] = np.nan

if yearly_coherence_scores:
    average_coherence = np.nanmean(yearly_coherence_scores) 
    print(f"\nAverage Score: {average_coherence}")
    print("\nSkor Koherensi per Tahun (Detail):")
    print(yearly_scores_dict)
else:
    print("Tidak ada skor koherensi yang dihitung.")

Calculating coherence per year:   0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/488 [00:00<?, ?it/s]

  Tahun: 2000 (t=0) -> Skor Koherensi: 0.3659


  0%|          | 0/594 [00:00<?, ?it/s]

  Tahun: 2001 (t=1) -> Skor Koherensi: 0.2786


  0%|          | 0/648 [00:00<?, ?it/s]

  Tahun: 2002 (t=2) -> Skor Koherensi: 0.2962


  0%|          | 0/825 [00:00<?, ?it/s]

  Tahun: 2003 (t=3) -> Skor Koherensi: 0.2851


  0%|          | 0/948 [00:00<?, ?it/s]

  Tahun: 2004 (t=4) -> Skor Koherensi: 0.2962


  0%|          | 0/1000 [00:00<?, ?it/s]

  Tahun: 2005 (t=5) -> Skor Koherensi: 0.3651


  0%|          | 0/1000 [00:00<?, ?it/s]

  Tahun: 2006 (t=6) -> Skor Koherensi: 0.3763


  0%|          | 0/1000 [00:00<?, ?it/s]

  Tahun: 2007 (t=7) -> Skor Koherensi: 0.3982


  0%|          | 0/1000 [00:00<?, ?it/s]

  Tahun: 2008 (t=8) -> Skor Koherensi: 0.3988


  0%|          | 0/1000 [00:00<?, ?it/s]

  Tahun: 2009 (t=9) -> Skor Koherensi: 0.3557


  0%|          | 0/1362 [00:00<?, ?it/s]

  Tahun: 2010 (t=10) -> Skor Koherensi: 0.3496


  0%|          | 0/1622 [00:00<?, ?it/s]

  Tahun: 2011 (t=11) -> Skor Koherensi: 0.3684


  0%|          | 0/2254 [00:00<?, ?it/s]

  Tahun: 2012 (t=12) -> Skor Koherensi: 0.3464


  0%|          | 0/2719 [00:00<?, ?it/s]

  Tahun: 2013 (t=13) -> Skor Koherensi: 0.3819


  0%|          | 0/2989 [00:00<?, ?it/s]

  Tahun: 2014 (t=14) -> Skor Koherensi: 0.3887


  0%|          | 0/3346 [00:00<?, ?it/s]

  Tahun: 2015 (t=15) -> Skor Koherensi: 0.3813


  0%|          | 0/4281 [00:00<?, ?it/s]

  Tahun: 2016 (t=16) -> Skor Koherensi: 0.3989


  0%|          | 0/5534 [00:00<?, ?it/s]

  Tahun: 2017 (t=17) -> Skor Koherensi: 0.3536


  0%|          | 0/7471 [00:00<?, ?it/s]

  Tahun: 2018 (t=18) -> Skor Koherensi: 0.3198


  0%|          | 0/9550 [00:00<?, ?it/s]

  Tahun: 2019 (t=19) -> Skor Koherensi: 0.3231


  0%|          | 0/12356 [00:00<?, ?it/s]

  Tahun: 2020 (t=20) -> Skor Koherensi: 0.3175


  0%|          | 0/14101 [00:00<?, ?it/s]

  Tahun: 2021 (t=21) -> Skor Koherensi: 0.3230


  0%|          | 0/15005 [00:00<?, ?it/s]

  Tahun: 2022 (t=22) -> Skor Koherensi: 0.3247


  0%|          | 0/17863 [00:00<?, ?it/s]

  Tahun: 2023 (t=23) -> Skor Koherensi: 0.3221


  0%|          | 0/23746 [00:00<?, ?it/s]

  Tahun: 2024 (t=24) -> Skor Koherensi: 0.3246


  0%|          | 0/26545 [00:00<?, ?it/s]

  Tahun: 2025 (t=25) -> Skor Koherensi: 0.3240

Average Score: 0.3447474302577734

Skor Koherensi per Tahun (Detail):
{2000: 0.36585294677034513, 2001: 0.27861537189668584, 2002: 0.2962242470178769, 2003: 0.285110423176869, 2004: 0.2961500379654354, 2005: 0.3651190144380205, 2006: 0.3763190734026932, 2007: 0.39819305250306625, 2008: 0.39881164954347187, 2009: 0.35569187273780445, 2010: 0.3495641440396596, 2011: 0.368358486582105, 2012: 0.3463780666820756, 2013: 0.38185199713652784, 2014: 0.38866577208099146, 2015: 0.3813048184245958, 2016: 0.39887941858042164, 2017: 0.3535910067865269, 2018: 0.31981068256121153, 2019: 0.3230911479338429, 2020: 0.3175114444447714, 2021: 0.32296954754798546, 2022: 0.3246607191701411, 2023: 0.32208591702656086, 2024: 0.32459165434423426, 2025: 0.3240306739081876}
