In [6]:
import re
import time

import jieba
import jieba.posseg as pseg
import networkx as nx
import numpy as np

In [7]:
# Wall-clock start of the run; the last cell subtracts this from time.time() to print the elapsed time.
start_time = time.time()

## 读取自定义词典

In [8]:
# Register the user dictionary file names.txt with jieba before any text is segmented.
# Presumably it lists the novel's character names so they are kept as single tokens — verify the file contents.
jieba.load_userdict("names.txt")

## 获取词频最高的词，和每个段落出现的词

In [9]:
# Part-of-speech tags a token must carry to be counted in the frequency table.
# NOTE(review): in jieba's tag set these look like nr = person name, nz = other proper noun,
# ns = place name — confirm against the jieba POS tag table.
word_class = ["nr", "nz", "ns"]

In [10]:
words_dict = {}     # frequency table {word: count}
words_lines = []    # top words found in each paragraph [[word1, word2, ...], [...], ...]
maxn = 50           # how many of the most frequent words to keep

# Compiled once: matches every character outside the \u4e00-\u9fa5 range so it can be stripped.
non_chinese = re.compile(r"[^\u4e00-\u9fa5]")

# Tokens longer than one character, kept per input line so the text is segmented only once.
line_tokens = []

# NOTE(review): no encoding is passed to open(), so the platform default is used;
# confirm the encoding of the text file before running this on another machine.
with open("射雕英雄传.txt", "r") as file:
    for line in file:
        text = non_chinese.sub("", line)
        tokens = []
        for word, flag in pseg.cut(text.strip()):
            if len(word) <= 1:
                continue
            # Every multi-character token is cached (regardless of tag) because the
            # per-paragraph step below matches on the word only, not on its tag.
            tokens.append(word)
            # Only tokens tagged with one of word_class feed the frequency table.
            if flag in word_class:
                words_dict[word] = words_dict.get(word, 0) + 1
        if tokens:
            line_tokens.append(tokens)

# Sort by frequency, highest first (ties keep first-seen order because sorted() is stable).
words_dict = dict(sorted(words_dict.items(), key=lambda x: x[1], reverse=True))
top_words = list(words_dict.keys())[:maxn]  # keep the maxn most frequent words
print(top_words)

# For each paragraph, collect the distinct top words it contains.
top_word_set = set(top_words)   # constant-time membership test
for tokens in line_tokens:
    # dict.fromkeys removes duplicates while keeping first-occurrence order, so the
    # result is the same on every run (set() ordering depends on the hash seed).
    words_line = list(dict.fromkeys(word for word in tokens if word in top_word_set))
    if words_line:
        words_lines.append(words_line)


['郭靖', '黄蓉', '欧阳锋', '洪七公', '黄药师', '武功', '周伯通', '丘处机', '欧阳克', '梅超风', '柯镇恶', '裘千仞', '杨康', '铁木真', '成吉思汗', '完颜洪烈', '朱聪', '穆念慈', '完颜康', '蒙古', '陆冠英', '彭连虎', '拖雷', '杨铁心', '梁子翁', '江南', '王处一', '桃花岛', '靖哥哥', '包惜弱', '华筝', '黄蓉笑', '傻姑', '韩小莹', '韩宝驹', '丐帮', '老顽童', '侯通海', '马钰', '沙通天', '郭靖心', '明白', '黄老邪', '鲁有脚', '程瑶迦', '哲别', '尹志平', '全金发', '师哥', '陆庄主']


In [11]:
# Show the selected top words as this cell's output.
top_words

['郭靖',
 '黄蓉',
 '欧阳锋',
 '洪七公',
 '黄药师',
 '武功',
 '周伯通',
 '丘处机',
 '欧阳克',
 '梅超风',
 '柯镇恶',
 '裘千仞',
 '杨康',
 '铁木真',
 '成吉思汗',
 '完颜洪烈',
 '朱聪',
 '穆念慈',
 '完颜康',
 '蒙古',
 '陆冠英',
 '彭连虎',
 '拖雷',
 '杨铁心',
 '梁子翁',
 '江南',
 '王处一',
 '桃花岛',
 '靖哥哥',
 '包惜弱',
 '华筝',
 '黄蓉笑',
 '傻姑',
 '韩小莹',
 '韩宝驹',
 '丐帮',
 '老顽童',
 '侯通海',
 '马钰',
 '沙通天',
 '郭靖心',
 '明白',
 '黄老邪',
 '鲁有脚',
 '程瑶迦',
 '哲别',
 '尹志平',
 '全金发',
 '师哥',
 '陆庄主']

## 获取共现矩阵

In [12]:
def get_cocurrence_matrix(words_lines, maxn, top_words):
    """
    Count how often pairs of top words appear in the same paragraph.

    For every paragraph and every pair of positions i < j in it, the cell
    [index(word_i), index(word_j)] is incremented by one. Only that one
    direction is filled, so the matrix is generally NOT symmetric; the total
    co-occurrence of words a and b is matrix[a, b] + matrix[b, a].

    :param words_lines: words found in each paragraph [[word1, word2, ...], [...], ...]
    :param maxn: side length of the square matrix (number of top words)
    :param top_words: the top words; a word's position here is its row/column index
    :return: numpy array of shape (maxn, maxn) holding the pair counts
    :raises ValueError: if a paragraph with two or more words contains a word
        that is not in top_words
    """
    co_matrix = np.zeros((maxn, maxn))

    # Resolve word -> index once instead of calling top_words.index() inside the
    # innermost loop. setdefault keeps the first position, like list.index() does.
    index_of = {}
    for position, word in enumerate(top_words):
        index_of.setdefault(word, position)

    for words_line in words_lines:
        # A paragraph with fewer than two words has no pairs and needs no lookup.
        if len(words_line) < 2:
            continue
        try:
            indices = [index_of[word] for word in words_line]
        except KeyError as missing:
            # Keep the exception type that list.index() raised for unknown words.
            raise ValueError(f"{missing.args[0]!r} is not in top_words") from None

        for i in range(len(indices) - 1):
            for j in range(i + 1, len(indices)):
                co_matrix[indices[i], indices[j]] += 1
    return co_matrix

In [13]:
# Build the maxn x maxn pair-count matrix from the per-paragraph word lists.
co_matrix = get_cocurrence_matrix(words_lines=words_lines, maxn=maxn, top_words=top_words)

## 获取无向图

In [14]:
def get_graph(co_matrix,top_words):
    """
    Turn the co-occurrence matrix into an undirected weighted graph.

    Every word in top_words becomes a node. For each unordered pair of words
    the two mirrored matrix cells are added together, and an edge carrying
    that sum as its "weight" attribute is created when the sum is positive.

    :param co_matrix: pair-count matrix indexed by position in top_words
    :param top_words: the top words, in the order used to index co_matrix
    :return: networkx.Graph with one node per word and weighted edges
    """
    network = nx.Graph()
    network.add_nodes_from(top_words)

    total = len(top_words)
    for first, name_a in enumerate(top_words):
        # Only look at partners after `first` so each pair is visited once.
        for second in range(first + 1, total):
            strength = co_matrix[first, second] + co_matrix[second, first]
            if strength > 0:
                network.add_edge(name_a, top_words[second], weight=strength)

    return network

In [15]:
# Build the undirected weighted graph from the co-occurrence matrix.
graph = get_graph(co_matrix, top_words)

## 保存为gexf文件

In [16]:
# Write the graph to a GEXF file in the current working directory.
nx.write_gexf(graph, "射雕英雄传.gexf")

In [17]:
# Report the seconds elapsed since start_time was recorded (no trailing newline is written).
elapsed = time.time() - start_time
print(f"运行时间：{elapsed}s", end="")

运行时间：85.73619675636292s