# 一、高频分析

定义加载小说的函数

In [3]:
def load_novel(novel):
    """Read the full text of one novel from the local corpus.

    Args:
        novel: Title of the novel; used as the file stem under
            ``./file/novels/``.

    Returns:
        The whole content of ``./file/novels/<novel>.txt`` decoded as UTF-8.
    """
    novel_path = f'./file/novels/{novel}.txt'
    with open(novel_path, mode='rt', encoding='utf-8') as source:
        text = source.read()
    return text

# 二、主角分析

加载人物数据

In [19]:
# names.txt alternates lines: a novel title, then that novel's
# whitespace-separated character names on the following line.
with open('./file/data/names.txt', 'rt', encoding='utf-8') as f:
    data = [raw_line.rstrip() for raw_line in f]

# Pair every even-indexed line (title) with the odd-indexed line after it.
novel_names = {
    title: roster.split()
    for title, roster in zip(data[0::2], data[1::2])
}
del data  # the raw lines are not needed once the mapping is built

可以预览一下《天龙八部》中的前20个人物：

In [20]:
# Preview: the first 20 names listed for 天龙八部, joined with commas.
str.join(",", novel_names['天龙八部'][:20])

'刀白凤,丁春秋,马夫人,马五德,小翠,于光豪,巴天石,不平道人,邓百川,风波恶,甘宝宝,公冶乾,木婉清,包不同,天狼子,太皇太后,王语嫣,乌老大,无崖子,云岛主'

下面我们寻找每部小说的主角：统计每个人物的出场次数，显然次数越多，主角光环越强。先看看每部小说中出现次数最多的前五个人物：

In [25]:
from collections import Counter
def find_main_characters(novel, num=10, content=None):
    """Return the most frequently mentioned characters of a novel.

    Args:
        novel: Title of the novel. It is the key looked up in the
            module-level ``novel_names`` table and, when ``content`` is
            None, the title passed to ``load_novel``.
        num: Maximum number of characters to return.
        content: Full text of the novel. When None, the text is loaded
            with ``load_novel(novel)``.

    Returns:
        A list of ``(name, occurrences)`` tuples in descending order of
        occurrences, as produced by ``Counter.most_common(num)``.

    Raises:
        KeyError: If ``novel`` is not a key of ``novel_names``.
    """
    if content is None:
        content = load_novel(novel)
    # Fix: the character table is called `novel_names`; the previous
    # spelling `novels_names` raised NameError on every call.
    # NOTE: str.count counts plain substring matches, so a name that is
    # contained in a longer name is also counted inside the longer one.
    count = Counter({name: content.count(name) for name in novel_names[novel]})
    return count.most_common(num)

# Print every novel title followed by its five most-mentioned characters.
for novel in novel_names:
    print(novel, find_main_characters(novel, num=5))
    

书剑恩仇录 [('陈家洛', 2095), ('张召重', 760), ('徐天宏', 685), ('霍青桐', 650), ('余鱼同', 605)]
碧血剑 [('袁承志', 3028), ('何铁手', 306), ('温青', 254), ('阿九', 215), ('洪胜海', 200)]
射雕英雄传 [('郭靖', 5009), ('黄蓉', 3650), ('洪七公', 1041), ('黄药师', 868), ('周伯通', 654)]
神雕侠侣 [('杨过', 5991), ('小龙女', 2133), ('郭靖', 1431), ('黄蓉', 1428), ('李莫愁', 1016)]
雪山飞狐 [('胡斐', 230), ('曹云奇', 228), ('宝树', 225), ('苗若兰', 217), ('胡一刀', 207)]
飞狐外传 [('胡斐', 2761), ('程灵素', 765), ('袁紫衣', 425), ('苗人凤', 405), ('马春花', 331)]
白马啸西风 [('李文秀', 441), ('苏普', 270), ('阿曼', 164), ('苏鲁克', 147), ('陈达海', 106)]
倚天屠龙记 [('张无忌', 4665), ('赵敏', 1250), ('谢逊', 1211), ('张翠山', 1146), ('周芷若', 825)]
鸳鸯刀 [('萧中慧', 103), ('袁冠南', 82), ('卓天雄', 76), ('周威信', 74), ('林玉龙', 52)]
天龙八部 [('段誉', 3372), ('萧峰', 1786), ('虚竹', 1636), ('阿紫', 1150), ('乔峰', 1131)]
连城诀 [('狄云', 1433), ('水笙', 439), ('戚芳', 390), ('丁典', 364), ('万震山', 332)]
侠客行 [('石破天', 1804), ('石清', 611), ('丁珰', 446), ('白万剑', 446), ('丁不四', 343)]
笑傲江湖 [('令狐冲', 5838), ('岳不群', 1184), ('林平之', 926), ('岳灵珊', 919), ('仪琳', 729)]
鹿鼎记 [('韦小宝', 9731)

上述结果用文本展示了每部小说的前5个主角，但是不够直观，下面用pyecharts的矩形树图（TreeMap）展示一下：

In [38]:
from pyecharts import options as opts
from pyecharts.charts import TreeMap

# One top-level node per novel; its children are the five most-mentioned
# characters, sized by their occurrence count.
data = [
    {
        "name": novel,
        "children": [
            {"name": name, "value": count}
            for name, count in find_main_characters(novel, 5)
        ],
    }
    for novel in novel_names
]

# Build the chart step by step instead of one fluent chain.
c = TreeMap()
c.add(
    "",
    data,
    levels=[
        # First level: default options.
        opts.TreeMapLevelsOpts(),
        # Second level: explicit saturation, borders and an upper label.
        opts.TreeMapLevelsOpts(
            color_saturation=[0.4, 0.6],
            treemap_itemstyle_opts=opts.TreeMapItemStyleOpts(
                border_color_saturation=0.7, gap_width=5, border_width=10
            ),
            upper_label_opts=opts.LabelOpts(
                is_show=True, position='insideTopLeft', vertical_align='top'
            ),
        ),
    ],
)
c.set_global_opts(title_opts=opts.TitleOpts(title="金庸小说主角"))
c.render_notebook()