In [9]:
import os
import hanlp
from collections import Counter
import csv

# Load the pretrained HanLP models. Going by the model identifiers, the first
# is a coarse-grained Chinese tokenizer and the second an MSRA-trained NER model.
tokenizer = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH)

# Settings for this cell, gathered in one place so the loop has no magic values.
INPUT_DIR = '30-50'          # folder holding one .txt file per chapter
OUTPUT_DIR = '30-50_split'   # folder receiving the space-separated tokens
TXT_SUFFIX = '.txt'
PERSON_LABEL = 'PERSON'      # entity label that is kept
TOP_N = 5                    # number of most frequent persons kept per chapter

# Create the output folder (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Flat list of [chapter_name, person] rows, later written to the CSV.
chapter_persons = []

# Process the chapter files in sorted (lexicographic) filename order.
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith(TXT_SUFFIX):
        continue
    filepath = os.path.join(INPUT_DIR, filename)

    # Read the whole chapter as a single string.
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    # Tokenize the chapter.
    tokens = tokenizer(text)

    # Save the tokens, separated by single spaces, under the same filename.
    output_path = os.path.join(OUTPUT_DIR, filename)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(' '.join(tokens))

    # Run NER on the tokens and keep the text of entities labelled as persons.
    # Each entity is indexed as (text, label, ...); assumes that tuple layout
    # for this recognizer -- TODO confirm against the HanLP version in use.
    entities = recognizer(tokens)
    persons = [entity[0] for entity in entities if entity[1] == PERSON_LABEL]

    # Count mentions and keep the TOP_N most frequent names.
    person_count = Counter(persons)
    top_persons = [person for person, _ in person_count.most_common(TOP_N)]

    # Strip only the trailing extension. str.replace would also remove any
    # '.txt' occurring in the middle of the filename.
    chapter_name = filename[:-len(TXT_SUFFIX)]
    chapter_persons.extend([chapter_name, person] for person in top_persons)

# Export one (chapter, person) row per kept name. utf-8-sig writes a BOM.
with open('人物统计.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['章节', '人物'])
    writer.writerows(chapter_persons)

print('完成！')

                                   

完成！


In [10]:
import csv
from collections import defaultdict

# Read the (chapter, person) rows and group the names by chapter.
# Insertion order of names within a chapter follows the row order in the file.
chapter_persons = defaultdict(list)

# newline='' lets the csv module do its own newline handling, as the csv
# documentation recommends for files passed to csv.reader.
with open('人物统计.csv', 'r', encoding='utf-8-sig', newline='') as f:
    reader = csv.reader(f)
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(reader, None)
    for row in reader:
        # Blank lines come back as [] and short rows lack a person column;
        # skip them instead of failing with IndexError.
        if len(row) < 2:
            continue
        chapter = row[0]
        person = row[1]
        # Drop names that are only one character long.
        # NOTE(review): this filter runs on the rows already present in the
        # input file, so a chapter may end up with fewer names than it had.
        if len(person) > 1:
            chapter_persons[chapter].append(person)

# Write one row per chapter, with its names joined by ', ' in a single cell.
with open('人物统计_合并.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['章节', '人物'])
    # Chapters are emitted in sorted (lexicographic) order of their names.
    for chapter in sorted(chapter_persons):
        persons = ', '.join(chapter_persons[chapter])
        writer.writerow([chapter, persons])

print('完成！')

完成！
