In [1]:
import pandas as pd
results = pd.read_csv('dgcn_clustering_results_2018.csv')
results

Unnamed: 0,n_clusters,diversity,coherence,cohenrences,topics_words,topic_ctfidfs
0,2,0.864,0.349166,"[0.4114547301956549, 0.286876395809927]","[['system', 'aceh', 'financial', 'design', 'mo...","[[164.73593709306485, 129.05863476260959, 127...."
1,3,0.822,0.351525,"[0.423030387012301, 0.344667325177772, 0.28687...","[['system', 'aceh', 'design', 'financial', 'mh...","[[150.00345491401026, 129.05863476260959, 113...."
2,4,0.788,0.491591,"[0.423030387012301, 0.344667325177772, 0.77871...","[['system', 'aceh', 'design', 'financial', 'mh...","[[150.00345491401026, 129.05863476260959, 113...."


In [2]:
best_row = 2
words = pd.DataFrame(eval(results.topics_words[best_row]))
ctfidfs = pd.DataFrame(eval(results.topic_ctfidfs[best_row]))
coherences = eval(results.cohenrences[best_row])

In [3]:
from collections import Counter

# 统计每个主题的词频分布
def get_topic_word_distribution(topic_words):
    """计算一个主题的词频分布（概率标准化）"""
    word_counts = Counter(topic_words)
    total_words = sum(word_counts.values())
    return {word: count / total_words for word, count in word_counts.items()}

# 获取所有主题的词频分布
topic_dists = []
for i in range(len(words)):
    topic_words = words.iloc[i].dropna().tolist()  # 提取词汇并去除NaN（如果有）
    topic_dist = get_topic_word_distribution(topic_words)
    topic_dists.append(topic_dist)

In [4]:
import numpy as np

# 计算紧密中心度(Closeness Centrality)

def hellinger_distance(dist1, dist2, words=words):
    """计算两个词频分布之间的Hellinger距离"""
    all_words = set(dist1.keys()).union(set(dist2.keys()))
    sqrt_p = np.array([np.sqrt(dist1.get(word, 0)) for word in all_words])
    sqrt_q = np.array([np.sqrt(dist2.get(word, 0)) for word in all_words])
    return np.sqrt(np.sum((sqrt_p - sqrt_q) ** 2)) / np.sqrt(2)

# 计算所有主题之间的Hellinger距离矩阵
distance_matrix = np.zeros((len(words), len(words)))
for i in range(len(words)):
    for j in range(len(words)):
        distance_matrix[i, j] = hellinger_distance(topic_dists[i], topic_dists[j])

# 计算紧密中心度
closeness_centrality = {}
for t in range(len(words)):
    sum_distances = np.sum(distance_matrix[t])  # 计算主题t到所有其他主题的距离和
    closeness_centrality[t] = 1 / sum_distances if sum_distances != 0 else 0

# 计算主题权重(Topic Weight)

total_coherence = sum(coherences)
topic_weights = {t: coh / total_coherence for t, coh in enumerate(coherences)}

In [12]:
import pickle
from collections import defaultdict

with open('dgcn_result_2018.pkl', 'rb') as f:
        G = pickle.load(f)

# 提取所有 seeds 节点（文档）
seeds_nodes = [n for n in G.nodes if G.nodes[n]['type'] == 'seed']

# 按主题分组文档
topic_docs = defaultdict(list)
for node in seeds_nodes:
    topic = G.nodes[node]['cluster']
    text = G.nodes[node]['bert_abstract']  # 文档内容
    topic_docs[topic].append(text)

# 检查主题数量
topics = list(topic_docs.keys())
print(f"找到的主题: {topics}")

找到的主题: [2, 0, 3, 1]


In [13]:
# 构建所有主题的共享词汇表
all_words = list(set([w for i in range(len(words)) for w in words.iloc[i]]))
word_to_idx = {word: i for i, word in enumerate(all_words)}

# 初始化协方差矩阵存储
topic_ac_scores = {}

for topic in topics:
    docs = topic_docs[topic]
    # 统计高频词在文档中的出现次数（二进制表示是否出现）
    word_occurrence = np.zeros((len(docs), len(all_words)))
    for i, doc in enumerate(docs):
        for word in doc.split():
            if word in word_to_idx:
                word_occurrence[i, word_to_idx[word]] = 1
    # 计算词汇协方差矩阵
    cov_matrix = np.cov(word_occurrence, rowvar=False)  # 变量是词汇，所以 rowvar=False
    # 计算均值和方差
    mean_cov = np.mean(cov_matrix)
    var_t = np.var(word_occurrence)
    # 避免除以零
    topic_ac_scores[topic] = mean_cov / var_t if var_t != 0 else 0

print("主题自相关性 AC(t):", topic_ac_scores)

主题自相关性 AC(t): {2: 0.14716305458933954, 0: 0.01551340066637781, 3: 0.22594441734252083, 1: 0.06597545022311942}


In [19]:
# 将结果整合到DataFrame中
calculation_results = pd.DataFrame({
    'Topic': topic_docs.keys(),
    'Docs': topic_docs.values(),
    'Words': [list(words.iloc[row]) for row in range(len(words))],
    'Ctfidfs': [list(ctfidfs.iloc[row]) for row in range(len(ctfidfs))],
    'Closeness_Centrality': closeness_centrality.values(),
    'Topic_Weight': topic_weights.values(),
    'Auto_Correleration': topic_ac_scores.values()
}).set_index('Topic')

from sklearn.preprocessing import MinMaxScaler
#calculation_results[['Min_Max_CC','Min_Max_TW','Min_Max_AC']] = MinMaxScaler().fit_transform(calculation_results[['Closeness_Centrality', 'Topic_Weight', 'Auto_Correleration']])

import math
#calculation_results['Min_Max_Weakness'] = [(calculation_results.Min_Max_CC[row] * calculation_results.Min_Max_TW[row]) / (1 + math.exp(-calculation_results.Min_Max_AC[row])) for row in range(len(calculation_results))]
calculation_results['Weakness'] = [(calculation_results.Closeness_Centrality[row] * calculation_results.Topic_Weight[row]) / (1 + math.exp(-calculation_results.Auto_Correleration[row])) for row in range(len(calculation_results))]

calculation_results['Weakness_Min_Max'] = MinMaxScaler().fit_transform(calculation_results[['Weakness']])
                                   
calculation_results = calculation_results.sort_index().reset_index()

calculation_results

Unnamed: 0,Topic,Docs,Words,Ctfidfs,Closeness_Centrality,Topic_Weight,Auto_Correleration,Weakness,Weakness_Min_Max
0,0,[intensification globalization rapid developme...,"[biometric, information, new, dt, model, segre...","[52.4487602698934, 45.8059798376516, 37.724831...",0.36941,0.175282,0.015513,0.042072,0.198547
1,1,[advent artificial intelligence way technology...,"[food, cloud, iot, plastic, intelligence, data...","[116.73559036766228, 99.09377689622501, 91.611...",0.381418,0.213565,0.065975,0.080199,1.0
2,2,[specialty grand challenge article front artif...,"[system, aceh, design, financial, mhpp, contra...","[150.00345491401026, 129.05863476260959, 113.0...",0.37415,0.215133,0.147163,0.032627,0.0
3,3,[advances artificial intelligence ( ai ) trans...,"[art, infertility, child, health, law, might, ...","[79.54152561568326, 69.39018841521411, 64.1453...",0.364071,0.39602,0.225944,0.043202,0.222303


In [20]:
calculation_results.to_csv('wsd_calculation_results_2018.csv', index=False)
calculation_results.to_excel('wsd_calculation_results_2018.xlsx', index=False)

In [22]:
weak_topics = [id for id in range(len(calculation_results)) if 0.01 <= calculation_results.Weakness_Min_Max[id] <= 0.1]
if weak_topics:
    print(f'弱主题有{weak_topics}。')
else:
    print('没有弱主题。')

没有弱主题。


In [23]:
citations = pd.read_csv('citations_final_abstract_2018.csv')
citations_count = citations.groupby('seed_paper_id').count()['id'].reset_index().rename(columns={'id':'citations_count'})
seeds = pd.read_csv('seeds_final_abstract_2018.csv')
citations_count['bert_abstract'] = [seeds.bert_final_abstract[list(seeds.id).index(id)] for id in citations_count.seed_paper_id]
citations_count['cluster'] = [n for bert_abstract in citations_count.bert_abstract for n, docs in enumerate(calculation_results.Docs) if bert_abstract in docs]
citations_count

Unnamed: 0,seed_paper_id,citations_count,bert_abstract,cluster
0,https://openalex.org/W2785943049,40,study examine two important aspect late techno...,2
1,https://openalex.org/W2790020274,31,advent electronic medical record ( emrs ) fuel...,3
2,https://openalex.org/W2792048062,16,rise artificial intelligence recently lead bot...,3
3,https://openalex.org/W2807786846,37,advent artificial intelligence way technology ...,1
4,https://openalex.org/W2809303504,1,artificial intelligence ( ai ) become primary ...,0
5,https://openalex.org/W2884382206,22,objective work present quick gbest guided arti...,2
6,https://openalex.org/W2885623676,282,advances artificial intelligence ( ai ) transf...,3
7,https://openalex.org/W2886383250,4,artificial neural networks ( anns ) genetic al...,0
8,https://openalex.org/W2888011899,6,development artificial intelligence technology...,2
9,https://openalex.org/W2889038430,1,common sign development financing cryptocurren...,2


In [24]:
for weak_topic in weak_topics:
    weak_signal_results = pd.DataFrame(data={'word': calculation_results.Words[weak_topic], 'ctfidf': calculation_results.Ctfidfs[weak_topic]})
    weak_signal_results['min_max_ctfidf'] = MinMaxScaler().fit_transform(weak_signal_results[['ctfidf']])
    weak_signal_results['weak_signal'] = [1 if 0.02 <= ctfidf <= 0.1 else 0 for ctfidf in weak_signal_results.min_max_ctfidf]
    print(f'弱主题（主题{weak_topic}）有{weak_signal_results.weak_signal.sum()}个弱信号。')
    weak_signals = [weak_signal_results.word[row] for row in range(len(weak_signal_results)) if weak_signal_results.weak_signal[row] == 1]
    print(', '.join(weak_signals[:min(15,len(weak_signals))]) + '...')
    weak_signal_results['citations_count'] = [sum([citations_count.citations_count[row] for row in citations_count[citations_count.cluster==weak_topic].index if word in citations_count.bert_abstract[row]]) for word in weak_signal_results.word]
    weak_signal_filter_results = weak_signal_results[weak_signal_results.weak_signal==1].sort_values(by='citations_count', ascending=False).reset_index(drop=True)
    weak_signal_filter_results['min_max_c_count'] = MinMaxScaler().fit_transform(weak_signal_filter_results[['citations_count']])
    weak_signal_filter_results['weak_signal_filter'] = [1 if 0.02 <= c_count <= 0.1 else 0 for c_count in weak_signal_filter_results.min_max_c_count]
    print(f'弱主题（主题{weak_topic}）有{weak_signal_filter_results.weak_signal_filter.sum()}个过滤后的弱信号。')
    weak_signals_filter = [weak_signal_filter_results.word[row] for row in range(len(weak_signal_filter_results)) if weak_signal_filter_results.weak_signal_filter[row] == 1]
    print(', '.join(weak_signals_filter))
    weak_signal_filter_results.to_csv(f'wsd_filter_results_{weak_topic}_2018.csv',index=False)
    weak_signal_filter_results.to_excel(f'wsd_filter_results_{weak_topic}_2018.xlsx',index=False)
    with open(f'weak_signals_{weak_topic}_2018.txt', 'w') as f:
        f.write(', '.join(weak_signals_filter))