In [5]:
import pandas as pd
# Load the node table and group it once by time slice; both summaries below
# reuse the same grouping.
nodes = pd.read_csv("ten_out/nodes.csv")
by_year = nodes.groupby("time_from")

# Number of topics (non-null node_id values) per year
topics_per_year = by_year["node_id"].count()
print(topics_per_year)

# Mean topic coherence per year
mean_coherence_per_year = by_year["coherence"].mean()
print(mean_coherence_per_year)

time_from
2010    3
2011    3
2012    5
2013    4
2014    5
2015    4
2016    6
2017    4
2018    5
2019    4
2020    8
2021    2
Name: node_id, dtype: int64
time_from
2010    0.762197
2011    0.760838
2012    0.782821
2013    0.802874
2014    0.809090
2015    0.801830
2016    0.793973
2017    0.798535
2018    0.840083
2019    0.818210
2020    0.822959
2021    0.792390
Name: coherence, dtype: float64


In [6]:
# Load the edge table (one row per link between topics).
edges = pd.read_csv("ten_out/edges.csv")

# How many edges there are of each kind
kind_totals = edges["kind"].value_counts()
print(kind_totals)

# Edge counts per time slice, one column per kind. These are raw counts
# (non-null `source` values), not proportions; (time_from, kind) pairs that
# never occur are filled with 0.
kind_by_slice = edges.groupby(["time_from", "kind"])["source"].count()
print(kind_by_slice.unstack(fill_value=0))

kind
continue    20
merge        6
Name: count, dtype: int64
kind       continue  merge
time_from                 
2                 2      2
3                 2      0
4                 2      0
5                 1      0
6                 4      4
7                 1      0
8                 3      0
9                 2      0
10                1      0
11                1      0
12                1      0


In [7]:
import pandas as pd, plotly.graph_objects as go

# Read the pre-computed link list; the code below uses its
# source / target / value columns.
df = pd.read_csv("ten_out/sankey.csv")

# Every distinct endpoint becomes one Sankey node, in order of first appearance
# (all sources first, then targets not already seen).
endpoints = pd.concat([df["source"], df["target"]])
labels = pd.Index(endpoints.unique())

# Plotly addresses nodes by integer position, so translate the endpoints.
src, tgt = (labels.get_indexer(df[column]) for column in ("source", "target"))
val = df["value"]

sankey_trace = go.Sankey(
    node={"label": labels.astype(str).tolist()},
    link={"source": src, "target": tgt, "value": val},
)
fig = go.Figure(sankey_trace)
fig.update_layout(title="Topic Evolution Sankey")
fig.show()


In [8]:
# save as plot_sankey_with_labels.py
import pandas as pd
import plotly.graph_objects as go

# === Configuration ===
NODES_CSV = "ten_out/nodes.csv"
EDGES_CSV = "ten_out/edges.csv"
TOPK_TERMS = 4          # number of keywords shown per node
MAX_LABEL_LEN = 28      # labels longer than this are cut and given an ellipsis
WEIGHT_MIN = 0.65       # keep only edges whose weight is >= this (None disables)
MAX_OUT_EDGES = 3       # keep the N heaviest edges per source (None disables)
TITLE = "Topic Evolution Sankey (with Top Terms)"

# === Load ===
nodes = pd.read_csv(NODES_CSV)
edges = pd.read_csv(EDGES_CSV)

# Both filters below need a "weight" column; neither changes the set of
# columns, so the check is done once.
has_weight = "weight" in edges.columns

# Drop weak edges.
if has_weight and WEIGHT_MIN is not None:
    edges = edges.loc[edges["weight"] >= WEIGHT_MIN].copy()

# For each source keep only its top-N edges, heaviest first.
if has_weight and MAX_OUT_EDGES is not None:
    ranked = edges.sort_values(["source", "weight"], ascending=[True, False])
    edges = ranked.groupby("source").head(MAX_OUT_EDGES).reset_index(drop=True)

# === 构造更可读的节点标签 ===
# nodes.node_id 形如 "3:0"
def tidy_terms(s, topk=None, max_len=None):
    """Turn a ';'-separated term string into a short node label.

    The string is split on ';', each piece is stripped, empty pieces are
    dropped, the first ``topk`` pieces are joined with '、', and the result is
    shortened so that it never exceeds ``max_len`` characters (the trailing
    ellipsis is counted as one of them).

    Parameters
    ----------
    s : object
        Raw term string. Anything that is not a ``str`` (e.g. NaN, None)
        yields an empty label.
    topk : int, optional
        Number of terms to keep. Defaults to the module-level ``TOPK_TERMS``.
    max_len : int, optional
        Maximum label length, ellipsis included. Defaults to the module-level
        ``MAX_LABEL_LEN``.

    Returns
    -------
    str
        The label, or "" when ``s`` is not a string or holds no terms.
    """
    if not isinstance(s, str):
        return ""

    # Resolve defaults lazily so the module constants can be re-tuned
    # without redefining this function.
    if topk is None:
        topk = TOPK_TERMS
    if max_len is None:
        max_len = MAX_LABEL_LEN

    terms = [t for t in map(str.strip, s.split(";")) if t]
    label = "、".join(terms[:topk])

    if len(label) > max_len:
        # Reserve one character for the ellipsis so the final label honours
        # max_len, and avoid leaving a dangling separator before the ellipsis.
        keep = max(max_len - 1, 0)
        label = label[:keep].rstrip("、 ") + "…"
    return label

# Short keyword label per node; missing top_terms become an empty string
# before being passed to tidy_terms.
nodes["terms_label"] = nodes["top_terms"].fillna("").map(tidy_terms)

# Display label: "<time_from>:<tid> | <terms>", with a placeholder when a
# node has no terms.
slice_part = nodes["time_from"].astype(str)
topic_part = nodes["tid"].astype(str)
terms_part = nodes["terms_label"].replace("", "(no terms)")
nodes["pretty_label"] = slice_part + ":" + topic_part + " | " + terms_part

# === Map node identifiers to integer positions (Plotly addresses nodes by index) ===
# node_id serves as the lookup key; its position in this Index is the node's
# Sankey index.
labels_idx = pd.Index(nodes["node_id"])
def id_to_idx(x):
    """Return the position of node id ``x`` within ``labels_idx``.

    Delegates to ``labels_idx.get_loc``, so an id that is absent raises
    ``KeyError``.
    NOTE(review): assumes node ids are unique; with duplicates ``get_loc``
    would not return a single integer - confirm against the data.
    """
    position = labels_idx.get_loc(x)
    return position

# Build the link table: a copy of the edges with both endpoints translated to
# positional node indices.
df = edges.assign(
    source_idx=edges["source"].map(id_to_idx),
    target_idx=edges["target"].map(id_to_idx),
)

# Link thickness: the edge weight when available, otherwise 1 for every link
# so that only the structure is shown.
link_value = (
    df["weight"].astype(float)
    if "weight" in df.columns
    else pd.Series([1] * len(df))
)

# Link colour by edge kind; kinds not listed here fall back to a neutral grey.
kind2color = {
    "continue": "rgba(150,150,150,0.5)",
    "split": "rgba(255,140,0,0.6)",
    "merge": "rgba(0,150,255,0.6)",
}
fallback_color = "rgba(180,180,180,0.4)"
link_colors = df["kind"].map(kind2color).fillna(fallback_color)

# Node labels, looked up by node_id in the same order as labels_idx so that
# they line up with the indices used by the links.
pretty_by_id = nodes.set_index("node_id")["pretty_label"]
node_labels = pretty_by_id.loc[labels_idx].tolist()

# === Draw ===
sankey_trace = go.Sankey(
    node={"label": node_labels, "pad": 12, "thickness": 12},
    link={
        "source": df["source_idx"].to_list(),
        "target": df["target_idx"].to_list(),
        "value": link_value.to_list(),
        "color": link_colors.to_list(),
    },
)
fig = go.Figure(sankey_trace)
fig.update_layout(title=TITLE, font={"size": 12})
fig.show()