Code for cleaning <i>Xianqin Han Wei Jin Nanbeichao shi</i>.
<br>Original source for Collections.txt: https://github.com/garychowcmu/daizhigev20/blob/master/%E8%AF%97%E8%97%8F/%E8%AF%97%E9%9B%86/%E5%85%88%E7%A7%A6%E6%B1%89%E9%AD%8F%E6%99%8B%E5%8D%97%E5%8C%97%E6%9C%9D%E8%AF%97.txt.
<br>Manual corrections were made after running the cleaning steps in this notebook.

In [None]:
import re
from collections import defaultdict

In [None]:
def light_clean_text(text):
    """
    Lightly clean the raw collection text and return the result.

    Removes spans enclosed in （...）, 〖...〗, {...} and <...>, private-use-area
    characters, the Unicode replacement character and □, and the ○ marker.
    Lines consisting of 1-6 CJK characters followed by 诗卷 and a Chinese
    numeral are blanked out (the line break itself is kept).
    〈...〉 and 【...】 are left in place; they mark authors and poems.
    """
    # (pattern, flags) pairs; they are applied strictly in this order.
    substitutions = (
        (r'（.*?）', re.DOTALL),
        (r'〖.*?〗', re.DOTALL),
        (r'{.*?}', re.DOTALL),
        # No DOTALL here: an angle-bracket span is only removed when it
        # opens and closes on the same line.
        (r'<.*?>', 0),
        (r'[\ue000-\uf8ff]', 0),
        (r'[�□]', 0),
        (r'^[\u4e00-\u9fff]{1,6}诗卷[一二三四五六七八九十]+$', re.MULTILINE),
    )

    cleaned = text
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    return cleaned.replace('○', '')


In [None]:
def parse_poetry_by_author(text):
    """
    Group the poems found in the collection text by author.

    An author section starts with a run of two or more CJK characters (the
    author name) followed, on the next line, by a 〈...〉 note. The section
    extends to the start of the next such header, or to the end of the text.
    Inside a section every 【...】 is a poem title and the text up to the
    next 【...】 is that poem's body. Anything before the first title in a
    section is ignored.

    The title 杂歌谣辞 is treated as a heading rather than a poem: it is not
    stored, and the remaining poems of that section are filed under the key
    '杂歌谣辞' instead of the section's author.

    Args:
        text: The collection text (the notebook passes the output of
            light_clean_text).

    Returns:
        A defaultdict(list) mapping author name to a list of
        {'title': ..., 'poem': ...} dicts in source order. Line breaks (and
        any whitespace following them) are removed from each poem body;
        poems whose body is empty are skipped.
    """
    poems_by_author = defaultdict(list)

    # Use 〈...〉 to identify an author header.
    # This is a loose match, and some ambiguous cases (e.g., 陆云公, 释宝月,
    # 陶潜, 班姬) require manual review.
    author_pattern = re.compile(
        r'(?P<author>[\u4e00-\u9fff]{2,})\s*\n\s*〈(?P<bio>.*?)〉',
        flags=re.DOTALL,
    )
    # Use 【...】 to identify a poem title (titles never span lines).
    title_pattern = re.compile(r'【(.*?)】')

    matches = list(author_pattern.finditer(text))

    for idx, match in enumerate(matches):
        start = match.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)

        # Splitting on a pattern with one capture group yields
        # [preamble, title1, body1, title2, body2, ...].
        parts = title_pattern.split(text[start:end])
        current_author = match.group("author").strip()

        for raw_title, raw_body in zip(parts[1::2], parts[2::2]):
            title = raw_title.strip()

            if title == '杂歌谣辞':
                # Heading, not a poem: re-file the rest of this section.
                current_author = title
                continue

            poem_body = re.sub(r'\n\s*', '', raw_body.strip())
            if poem_body:
                poems_by_author[current_author].append({
                    'title': title,
                    'poem': poem_body
                })

    return poems_by_author

In [None]:
# Load the raw collection text.
with open('../Data/Xianqin Han Wei Jin Nanbeichao shi/Collections.txt', 'r', encoding='utf-8') as raw_file:
    raw_text = raw_file.read()

# Apply the light cleaning pass and keep a copy of the cleaned text on disk.
cleaned_text = light_clean_text(raw_text)

with open('../Data/Xianqin Han Wei Jin Nanbeichao shi/cleaned_Collections.txt', 'w', encoding='utf-8') as cleaned_file:
    cleaned_file.write(cleaned_text)

# Group the poems in the cleaned text by author.
poems_by_author = parse_poetry_by_author(cleaned_text)

In [None]:
import json

# Save the author -> poems mapping as human-readable, non-ASCII-escaped JSON.
with open('../Data/Xianqin Han Wei Jin Nanbeichao shi/poems_by_author.json', 'w', encoding='utf-8') as json_file:
    json.dump(
        poems_by_author,
        json_file,
        ensure_ascii=False,
        indent=2,
    )

In [None]:
# Rank authors by how many poems were collected for them, largest first.
# The sort is stable, so authors with equal counts keep their original order.
sorted_authors = list(poems_by_author.items())
sorted_authors.sort(key=lambda entry: len(entry[1]), reverse=True)

for author, poems in sorted_authors:
    print(f"{author} number of poems: {len(poems)} ")

In [None]:
# Names used to route parsed authors into the test set: any parsed author
# string that contains one of these names as a substring goes to test_set,
# all others go to train_set.
# NOTE(review): presumably the poets discussed in Zhong Rong's Shipin (诗品)
# -- confirm the provenance of this list.
shipin_authors = [
    "李陵", "班姬", "曹植", "刘桢", "王粲", "阮籍", "陆机", "潘岳", "张协", "左思", "谢灵运",
    "徐淑", "曹丕", "嵇康", "张华", "何晏", "孙楚", "王赞", "张翰", "潘尼", "应璩", "陆云",
    "石崇", "曹摅", "何劭", "刘琨", "卢谌", "郭璞", "袁宏", "郭泰机", "顾恺之", "谢世基",
    "顾迈", "戴凯", "陶潜", "颜延之", "谢瞻", "谢混", "袁淑", "王微", "王僧达", "谢惠连",
    "鲍照", "谢朓", "江淹", "范云", "丘迟", "任昉", "沈约", "班固", "郦炎", "赵壹", "曹操",
    "曹叡", "曹彪", "徐幹", "阮瑀", "欧阳建", "应玚", "嵇含", "阮侃", "嵇绍", "枣据", "张载",
    "傅玄", "傅咸", "缪袭", "夏侯湛", "王济", "杜预", "孙绰", "许询", "戴逵", "殷仲文", "傅亮",
    "何长瑜", "羊曜璠", "范晔", "刘骏", "刘铄", "刘宏", "谢庄", "苏宝生", "陵修之", "任昙绪",
    "戴法兴", "区惠恭", "惠休", "道猷", "释宝月", "萧道成", "张永", "王俭", "谢超宗", "丘灵鞠",
    "刘祥", "檀超", "钟宪", "颜测", "顾则心", "毛伯成", "吴迈远", "许瑶之", "鲍令晖", "韩兰英",
    "张融", "孔稚珪", "王融", "刘绘", "江祏", "王屮", "卞彬", "卞铄", "袁嘏", "张欣泰", "范缜",
    "陆厥", "虞羲", "江洪", "鲍行卿", "孙察"
]

# Starts as the full name list; names are discarded from it as they are found
# inside a parsed author string, leaving the names that matched nothing.
unmatched_shipin_authors = set(shipin_authors)


def is_shipin_author(author, shipin_authors):
    """Return True if any name in shipin_authors occurs as a substring of author."""
    for candidate in shipin_authors:
        if candidate in author:
            return True
    return False

# Authors whose name contains an entry of shipin_authors go to the test set;
# every other author goes to the train set.
train_set = {}
test_set = {}

for author, poems in poems_by_author.items():
    if not is_shipin_author(author, shipin_authors):
        train_set[author] = poems
        continue

    test_set[author] = poems
    # Tick off only the first list entry contained in this author string.
    for name in shipin_authors:
        if name in author:
            unmatched_shipin_authors.discard(name)
            break

# Write both splits as indented, non-ASCII-escaped JSON (train first, then test).
for out_path, subset in (
    ('../Data/Xianqin Han Wei Jin Nanbeichao shi/train_set.json', train_set),
    ('../Data/Xianqin Han Wei Jin Nanbeichao shi/test_set.json', test_set),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(subset, f, ensure_ascii=False, indent=2)