#### Step 1 导入相关包

In [None]:
from xml.sax import handler, make_parser
import json
from tqdm import tqdm


# Record-level DBLP elements; each one becomes a single entry in the output.
paperTag = ('article', 'inproceedings')


class mHandler(handler.ContentHandler):
    """SAX content handler that collects DBLP records and writes JSON batches.

    Every element named in ``paperTag`` becomes one dict holding:

    * ``'type'`` - the element name, plus one key per XML attribute;
    * ``'authors'`` - text of all ``author`` and ``editor`` children;
    * ``'titles'`` - text of all ``title`` children;
    * one key per other captured child element, mapped to a list of text.

    Completed records accumulate in ``self.articles``. Whenever
    ``batch_size`` records have been collected they are written to the next
    numbered output file and the list is reset; whatever is left over is
    written when the document ends.

    Args:
        batch_size: Number of records per output file (must be positive).
        output_pattern: ``str.format`` pattern for output paths; it receives
            the zero-based index of the file being written.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    # Child elements whose text content is captured for the current record.
    _FIELD_TAGS = frozenset((
        'author', 'editor', 'title', 'booktitle', 'pages', 'year', 'address',
        'journal', 'volume', 'number', 'month', 'url', 'ee', 'cdrom', 'cite',
        'publisher', 'note', 'crossref', 'isbn', 'series', 'school',
        'chapter', 'publnr', 'stream', 'rel',
    ))
    # Both kinds of person element are merged into the 'authors' list.
    _PERSON_TAGS = ('author', 'editor')

    def __init__(self, batch_size=1000000,
                 output_pattern='../output/dblp_parsed_{}.json'):
        super().__init__()
        if batch_size <= 0:
            raise ValueError('batch_size must be a positive integer')
        self.articles = []            # records of the batch being built
        self.current_article = None   # record currently being parsed
        self.current_tag = None       # name of the element being processed
        self.current_list = None      # list receiving the current field text
        self.counter = 0              # total number of completed records
        self.batch_size = batch_size
        self.output_pattern = output_pattern
        self._text_parts = []         # text chunks of the field being read
        self._inline_depth = 0        # nesting depth of markup inside a field
        self._file_index = 0          # index of the next output file

    def startDocument(self):
        """Announce the start of parsing."""
        print('Document Start')

    def endDocument(self):
        """Write the records that did not fill a complete batch.

        The remainder goes to the next unused file index, so it can never
        overwrite a batch that was already written. An empty remainder is
        only written when no file exists yet (document without records).
        ``self.articles`` is left intact so the tail can still be inspected.
        """
        print('Document End')
        if self.articles or self._file_index == 0:
            self.save_to_json(self.output_pattern.format(self._file_index))
            self._file_index += 1
        # Print a summary rather than dumping every record to stdout.
        print(f'Parsed {self.counter} records into {self._file_index} file(s)')

    def startElement(self, name, attrs):
        """Open a new record or start capturing one of its fields."""
        if self.current_list is not None:
            # Markup nested inside a captured field (e.g. <i> within
            # <title>): keep collecting text into the enclosing field.
            self._inline_depth += 1
            return

        if name in paperTag:
            record = {'type': name}
            record.update(attrs.items())
            self.current_article = record
            self.current_tag = name
        elif name in self._FIELD_TAGS and self.current_article is not None:
            if name in self._PERSON_TAGS:
                target = self.current_article.setdefault('authors', [])
            elif name == 'title':
                target = self.current_article.setdefault('titles', [])
            else:
                target = []
                self.current_article[name] = target
            self.current_list = target
            self.current_tag = name
            self._text_parts = []
            self._inline_depth = 0

    def endElement(self, name):
        """Finish the current field or record; flush a batch when it is full."""
        if self.current_list is not None:
            if self._inline_depth > 0:
                # Closing nested inline markup; the field itself is still open.
                self._inline_depth -= 1
                return
            # characters() may deliver one text node in several chunks, so
            # the value is assembled and stripped only once the field closes.
            text = ''.join(self._text_parts).strip()
            if text:
                self.current_list.append(text)
            self._text_parts = []
            self.current_list = None
            self.current_tag = None
            return

        if name in paperTag and self.current_article is not None:
            self.articles.append(self.current_article)
            self.current_article = None
            self.counter += 1
            if self.counter % self.batch_size == 0:
                self.save_to_json(self.output_pattern.format(self._file_index))
                self._file_index += 1
                self.articles = []  # start a fresh batch
        self.current_tag = None

    def characters(self, content):
        """Buffer a chunk of text belonging to the field being captured."""
        if self.current_list is not None:
            self._text_parts.append(content)

    def save_to_json(self, output_file):
        """Dump ``self.articles`` to ``output_file`` as indented JSON.

        The parent directory is created when missing so that a long parse
        does not fail at its first write.
        """
        import os

        parent = os.path.dirname(output_file)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.articles, f, indent=4)


class _ProgressReader:
    """File-like wrapper that advances a progress bar by the bytes read."""

    def __init__(self, raw, pbar):
        self._raw = raw
        self._pbar = pbar

    def read(self, size=-1):
        data = self._raw.read(size)
        self._pbar.update(len(data))
        return data

    def __getattr__(self, attr):
        # Everything except read() (close, name, ...) is delegated unchanged.
        return getattr(self._raw, attr)


def parserDblpXml(xml_path='../data/dblp.xml'):
    """Parse a DBLP XML dump with ``mHandler`` while showing read progress.

    Progress is measured in bytes read from the input file, so the bar has
    an exact total without a separate pass to count elements.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        The ``mHandler`` instance used for parsing, so callers can inspect
        its ``articles`` and ``counter`` afterwards.
    """
    import os

    # Named content_handler so the imported xml.sax `handler` module is not shadowed.
    content_handler = mHandler()
    parser = make_parser()
    parser.setContentHandler(content_handler)
    with open(xml_path, 'rb') as f:
        with tqdm(total=os.path.getsize(xml_path), unit='B', unit_scale=True) as pbar:
            parser.parse(_ProgressReader(f, pbar))
    return content_handler


# Run the full parse only when this file is executed as a script, not on import.
if __name__ == '__main__':
    parserDblpXml()

In [None]:
# NOTE(review): in the code shown, `handler` at module scope is the
# `xml.sax.handler` module imported at the top of the file; no mHandler
# instance is bound to this name here, so this lookup raises AttributeError
# as written.
handler.articles[-1]