In [24]:
import markdown
from docx import Document

In [26]:
import xml.etree.ElementTree as ET
from collections import defaultdict

def analyze_xml_structure(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    tag_counts = defaultdict(int)
    tag_hierarchy = defaultdict(set)

    def traverse(element, parent=None):
        tag_counts[element.tag] += 1
        if parent:
            tag_hierarchy[parent].add(element.tag)
        for child in element:
            traverse(child, element.tag)

    traverse(root)

    return tag_counts, tag_hierarchy, root

In [27]:
def parse_element(element, tag_counts, tag_hierarchy, level=0):
    markdown = ""
    
    if element.tag in ["article", "front", "body", "back"]:
        for child in element:
            markdown += parse_element(child, tag_counts, tag_hierarchy, level)
    
    elif element.tag in ["sec", "abstract"]:
        title = element.find("title")
        if title is not None:
            markdown += f"{'#' * (level + 1)} {title.text}\n\n"
        for child in element:
            if child.tag != "title":
                markdown += parse_element(child, tag_counts, tag_hierarchy, level + 1)
    
    elif element.tag == "p":
        markdown += f"{element.text}\n\n" if element.text else "\n"
    
    elif element.tag == "list":
        for list_item in element.findall("list-item"):
            item_text = list_item.find('p').text if list_item.find('p') is not None else ""
            markdown += f"- {item_text}\n"
        markdown += "\n"
    
    elif element.tag == "table-wrap":
        markdown += "| | |\n|---|---|\n"
        for row in element.findall(".//tr"):
            cells = [cell.text if cell.text else "" for cell in row.findall("td") + row.findall("th")]
            markdown += f"| {' | '.join(cells)} |\n"
        markdown += "\n"
    
    elif element.tag == "fig":
        caption = element.find(".//caption")
        if caption is not None:
            markdown += f"*{caption.text}*\n\n"
    
    # tag_countsとtag_hierarchyを使用して、特定のタグに対する処理を追加できます
    # 例：
    # if element.tag in tag_hierarchy['specific_parent']:
    #     # 特別な処理
    
    return markdown

def convert_xml_to_markdown(xml_file):
    tag_counts, tag_hierarchy, root = analyze_xml_structure(xml_file)
    return parse_element(root, tag_counts, tag_hierarchy)
    
 

def markdown_to_word(markdown_content, output_word_file):
    """Convert Markdown content to a Word document."""
    doc = Document()
    
    # Split the markdown content by lines and parse accordingly
    for line in markdown_content.split("\n"):
        if line.startswith("### "):
            doc.add_heading(line[4:], level=3)
        elif line.startswith("#### "):
            doc.add_heading(line[5:], level=4)
        elif line.startswith("- "):
            doc.add_paragraph(line[2:], style='List Bullet')
        elif line.startswith("**"):
            doc.add_paragraph(line.strip("**"), style='Heading 2')
        elif line.startswith("*"):
            doc.add_paragraph(line.strip("*"), style='Heading 3')
        else:
            doc.add_paragraph(line)

    # Save the document
    doc.save(output_word_file)




In [28]:
# Example usage:
xml_file = r'C:YOUR-output-XML-PATH'


In [30]:
# XMLの構造を分析
tag_counts, tag_hierarchy, root = analyze_xml_structure(xml_file)


# Markdownに変換
markdown_output = parse_element(root, tag_counts, tag_hierarchy)
print(markdown_output[:500])  # 最初の500文字を表示

# Wordファイルに変換
output_word_file = 'output.docx'
markdown_to_word(markdown_output, output_word_file)
print(f"Markdown converted to Word file: {output_word_file}")

# Background

## Description of the condition

The coronavirus (COVID-19) epidemic (i.e. an unexpected increase in the number of disease cases in a specific geographical area) was first reported in China in December 2019 [

An epidemic of an emerging infectious disease may have a widespread negative impact on mental health. The Severe Acute Respiratory Syndrome (SARS) epidemic caused symptoms of post-traumatic stress disorder (PTSD) and depression in the general population [

## Description of t
Markdown converted to Word file: output.docx
