In [1]:
import os, re, argparse
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import PartOfSpeech, KeyBERTInspired, MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

try:
    from fugashi import Tagger
    tagger = Tagger()
except Exception:
    tagger = None

# ------------------------
# Tokenizer for Japanese
# ------------------------
JA_STOPWORDS = set("の は に を が で と も から まで など そして しかし また その この あの する なる いる ある".split())
PUNCT = re.compile(r"[。、・「」『』（）()［］\[\]【】〈〉《》…—\-.,!?]")

def ja_tokenize(text: str):
    if not isinstance(text, str):
        return []
    text = PUNCT.sub(" ", text)
    if tagger is None:
        return [t for t in re.split(r"\s+", text) if t]
    toks = []
    for w in tagger(text):
        pos = getattr(w.feature, "pos1", "")
        if pos in {"助詞","助動詞","記号","接続詞","連体詞","フィラー","感動詞"}:
            continue
        lemma = getattr(w.feature, "lemma", None) or w.surface
        lemma = lemma.strip()
        if not lemma or lemma in JA_STOPWORDS:
            continue
        toks.append(lemma)
    return toks

# ------------------------
# Build BERTopic model
# ------------------------
def build_model():
    vectorizer = CountVectorizer(
        tokenizer=ja_tokenize,
        token_pattern=None,
        ngram_range=(1,2),
        min_df=3, max_df=0.5
    )

    rep = [KeyBERTInspired(top_n_words=20),
           MaximalMarginalRelevance(diversity=0.3)]

    emb_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    ctfidf = ClassTfidfTransformer(reduce_frequent_words=True)

    model = BERTopic(
        embedding_model=emb_model,
        ctfidf_model=ctfidf,
        vectorizer_model=vectorizer,
        representation_model=rep,
        language="japanese",
        verbose=True,
        calculate_probabilities=True
    )
    return model

# ------------------------
# Evolution alignment
# ------------------------
def align_topics(year_topics, threshold=0.6):
    edges = []
    years = sorted(year_topics.keys())
    for i in range(len(years)-1):
        y1, y2 = years[i], years[i+1]
        nodes1, nodes2 = year_topics[y1], year_topics[y2]
        if not nodes1 or not nodes2:
            continue
        emb1 = np.vstack([n['embedding'] for n in nodes1])
        emb2 = np.vstack([n['embedding'] for n in nodes2])
        sims = cosine_similarity(emb1, emb2)
        for i1, n1 in enumerate(nodes1):
            for i2, n2 in enumerate(nodes2):
                w = sims[i1, i2]
                if w >= threshold:
                    edges.append({
                        'source': f"{y1}:{n1['topic']}",
                        'target': f"{y2}:{n2['topic']}",
                        'kind': 'continue',
                        'weight': w
                    })
    return edges

  from .autonotebook import tqdm as notebook_tqdm





In [None]:
# ------------------------
# Main (Modified for Jupyter)
# ------------------------

# Define input and output paths directly
input_file = 'Narou_global1w.csv'
output_dir = './ten_out'
min_docs_per_year = 10

os.makedirs(output_dir, exist_ok=True)

df = pd.read_csv(input_file)

# Debug: Check input DataFrame
print('[Debug] Input DataFrame info:')
print(df.info())
print('[Debug] First few rows of the DataFrame:')
print(df.head())

df = df.dropna(subset=['story', 'Year'])
print('[Debug] DataFrame after dropna:')
print(df.info())

# Debug: Check unique years
df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('Int64')
print('[Debug] Unique years in the data:', df['Year'].unique())

# Debug: Check document count per year
for y in sorted(df['Year'].unique()):
    print(f'[Debug] Year {y}: {len(df[df["Year"] == y])} documents')

docs = df['story'].astype(str).tolist()
years = df['Year'].astype(int).tolist()

[Debug] Input DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6071 entries, 0 to 6070
Data columns (total 56 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6071 non-null   int64  
 1   title              6071 non-null   object 
 2   ncode              6071 non-null   object 
 3   userid             6071 non-null   int64  
 4   writer             6071 non-null   object 
 5   story              6071 non-null   object 
 6   biggenre           6071 non-null   int64  
 7   genre              6071 non-null   int64  
 8   gensaku            0 non-null      float64
 9   keyword            6068 non-null   object 
 10  general_firstup    6071 non-null   object 
 11  general_lastup     6071 non-null   object 
 12  novel_type         6071 non-null   int64  
 13  end                6071 non-null   int64  
 14  general_all_no     6071 non-null   int64  
 15  length             6071 non-null   int64  

In [None]:
# Fit BERTopic
topic_model = build_model()
topics, probs = topic_model.fit_transform(docs)

# Debug: Check topics after fit_transform
print('[Debug] Topics:', set(topics))

# Topics over time
topics_over_time = topic_model.topics_over_time(docs, years, nr_bins=len(set(years)))

2025-09-03 18:29:14,845 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 190/190 [00:02<00:00, 67.80it/s]
2025-09-03 18:29:17,739 - BERTopic - Embedding - Completed ✓
Batches: 100%|██████████| 190/190 [00:02<00:00, 67.80it/s]
2025-09-03 18:29:17,739 - BERTopic - Embedding - Completed ✓
2025-09-03 18:29:17,740 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 18:29:17,740 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 18:29:29,346 - BERTopic - Dimensionality - Completed ✓
2025-09-03 18:29:29,346 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 18:29:29,346 - BERTopic - Dimensionality - Completed ✓
2025-09-03 18:29:29,346 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 18:29:29,797 - BERTopic - Cluster - Completed ✓
2025-09-03 18:29:29,797 - BERTopic - Representation - Extracting topics from clusters using representation

[Debug] Topics: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1}


14it [00:30,  2.21s/it]
14it [00:30,  2.21s/it]


In [23]:
# Build year->topic nodes
year_topics = {}
for y in sorted(set(years)):
    nodes = []
    df_y = topics_over_time[topics_over_time['Timestamp'].astype(int) == y]
    for _, row in df_y.iterrows():
        t = int(row['Topic'])
        if t == -1:
            continue
        emb = topic_model.topic_embeddings_[t]
        nodes.append({
            'year': y,
            'topic': t,
            'top_terms': row['Words'],
            'count': row['Frequency'],
            'embedding': emb
        })
    if len(nodes) >= min_docs_per_year:
        year_topics[y] = nodes

# Debug: Check year_topics content
for y, nodes in year_topics.items():
    print(f'[Debug] Year {y}: {len(nodes)} nodes')

[Debug] Year 2009: 15 nodes
[Debug] Year 2010: 26 nodes
[Debug] Year 2011: 35 nodes
[Debug] Year 2012: 38 nodes
[Debug] Year 2013: 41 nodes
[Debug] Year 2014: 40 nodes
[Debug] Year 2015: 41 nodes
[Debug] Year 2016: 41 nodes
[Debug] Year 2017: 47 nodes
[Debug] Year 2018: 47 nodes
[Debug] Year 2019: 49 nodes
[Debug] Year 2020: 44 nodes


In [25]:
# Build nodes.csv
all_nodes = []
for y, nodes in year_topics.items():
    for n in nodes:
        all_nodes.append({
            'id': f"{y}:{n['topic']}",
            'year': y,
            'topic': n['topic'],
            'top_terms': n['top_terms'],
            'count': n['count']
        })
pd.DataFrame(all_nodes).to_csv(os.path.join(output_dir,'nodes.csv'), index=False)

# Debug: Check all_nodes and edges before export
print('[Debug] All nodes:', all_nodes[:5])

[Debug] All nodes: [{'id': '2009:0', 'year': 2009, 'topic': 0, 'top_terms': 'ゲーム-game ライフ-life, ゲーム-game 舞台, ゲーム-game キャラクター-character, ライン-line ゲーム-game, 魔法 冒険', 'count': 3}, {'id': '2009:1', 'year': 2009, 'topic': 1, 'top_terms': '！ アニメ-animation, 読む 呉れる, ？ ヒロイン-heroine, 中 作画, 勘弁 ！', 'count': 3}, {'id': '2009:2', 'year': 2009, 'topic': 2, 'top_terms': '彼女 下, 伝説 聖女, 為る 彼女, 来る 物語, 一人 少女', 'count': 3}, {'id': '2009:3', 'year': 2009, 'topic': 3, 'top_terms': '日本 人, 現代 日本, 有る 日本, 幽霊, 世界 日本', 'count': 2}, {'id': '2009:4', 'year': 2009, 'topic': 4, 'top_terms': '魔術 師, 魔法 使い, 魔術 才能, 最強 魔術, 魔法 使う', 'count': 2}]


In [26]:
# Build edges.csv
edges = align_topics(year_topics, threshold=0.6)
pd.DataFrame(edges).to_csv(os.path.join(output_dir,'edges.csv'), index=False)

# Sankey
sankey = [{'source':e['source'], 'target':e['target'], 'value':e['weight']} for e in edges]
pd.DataFrame(sankey).to_csv(os.path.join(output_dir,'sankey.csv'), index=False)

# GraphML
with open(os.path.join(output_dir,'graph.graphml'),'w',encoding='utf-8') as f:
    f.write("<?xml version='1.0' encoding='UTF-8'?>\n")
    f.write("<graphml xmlns='http://graphml.graphdrawing.org/xmlns'>\n")
    f.write("<graph edgedefault='directed'>\n")
    for n in all_nodes:
        f.write(f"<node id='{n['id']}'><data key='label'>{n['top_terms']}</data></node>\n")
    for e in edges:
        f.write(f"<edge source='{e['source']}' target='{e['target']}'><data key='weight'>{e['weight']}</data></edge>\n")
    f.write("</graph></graphml>")

print("[Done] Exported nodes.csv, edges.csv, sankey.csv, graph.graphml")

[Done] Exported nodes.csv, edges.csv, sankey.csv, graph.graphml
