In [14]:
import xml.etree.ElementTree as ET
from collections import defaultdict
from docx import Document
from collections import deque

def analyze_xml_structure(xml_file):
    """Parse an XML file and record which child tags occur under each parent tag.

    Args:
        xml_file: Source handed directly to ``ET.parse``.

    Returns:
        A ``(tag_hierarchy, root)`` tuple. ``tag_hierarchy`` is a
        ``defaultdict(list)`` mapping a parent tag to the tags of its child
        elements, one entry per child element (repeated tags repeat), appended
        in depth-first document order. ``root`` is the parsed root element.
    """
    root = ET.parse(xml_file).getroot()
    tag_hierarchy = defaultdict(list)

    # Explicit depth-first walk; each entry pairs an element with its parent's tag.
    pending = [(root, None)]
    while pending:
        node, parent_tag = pending.pop()
        if parent_tag is not None:
            tag_hierarchy[parent_tag].append(node.tag)
        # Push children reversed so they are popped (visited) in document order.
        pending.extend((child, node.tag) for child in reversed(list(node)))

    return tag_hierarchy, root

def parse_element(element, tag_hierarchy, level=0):
    """Recursively render an XML element and its descendants as Markdown.

    A ``title`` element with text becomes a heading made of ``level + 1``
    ``#`` characters. Any other element (or a ``title`` without text) becomes
    a bullet indented by two spaces per level, showing its tag and, when
    present, its stripped text.

    Args:
        element: The ``xml.etree.ElementTree.Element`` to render.
        tag_hierarchy: Mapping of parent tag -> iterable of child tags, as
            returned by ``analyze_xml_structure``. A child is rendered only
            when its tag is listed under this element's tag.
        level: Nesting depth of ``element``; drives indentation and heading
            depth.

    Returns:
        The Markdown text for this subtree as a single string.
    """
    pieces = []
    indent = "  " * level
    # element.text is None for empty elements; normalise so "None" is never printed.
    text = (element.text or "").strip()

    if element.tag == "title" and text:
        pieces.append(f"{'#' * (level + 1)} {text}\n\n")
    elif text:
        pieces.append(f"{indent}- {element.tag}: {text}\n")
    else:
        pieces.append(f"{indent}- {element.tag}\n")

    # Walk the actual children once, in document order. The hierarchy holds one
    # entry per child element, so looping over it and calling findall() for each
    # entry rendered repeated tags several times and grouped them by tag.
    # .get() is used so a defaultdict does not grow a key as a side effect.
    allowed_child_tags = set(tag_hierarchy.get(element.tag, ()))
    for child in element:
        if child.tag in allowed_child_tags:
            pieces.append(parse_element(child, tag_hierarchy, level + 1))

    # The element's text is already part of its own line above, so it is not
    # emitted a second time for leaf elements.
    return "".join(pieces)

def convert_xml_to_markdown(xml_file):
    """Convert an XML file to a Markdown outline without recursion.

    Elements are visited depth-first in document order. A ``title`` element
    with text becomes a heading made of ``level + 1`` ``#`` characters; any
    other element (or a ``title`` without text) becomes a bullet indented by
    two spaces per level, showing its tag and, when present, its stripped
    text. Each attribute is written on its own ``name: value`` line below the
    element.

    Args:
        xml_file: Source handed directly to ``ET.parse``.

    Returns:
        The generated Markdown as a single string.
    """
    root = ET.parse(xml_file).getroot()

    # Collect fragments and join once instead of repeated string concatenation.
    parts = []
    stack = [(root, 0)]

    while stack:
        element, level = stack.pop()
        indent = "  " * level
        # element.text is None for empty elements; normalise so "None" is never printed.
        text = (element.text or "").strip()

        # Title handling: only a title that actually has text becomes a heading.
        if element.tag == "title" and text:
            parts.append(f"{'#' * (level + 1)} {text}\n\n")
        elif text:
            parts.append(f"{indent}- {element.tag}: {text}\n")
        else:
            parts.append(f"{indent}- {element.tag}\n")

        # Attribute handling
        for attr, value in element.attrib.items():
            parts.append(f"{'  ' * (level + 1)}  {attr}: {value}\n")

        # Push children in reverse so they are popped in document order.
        stack.extend((child, level + 1) for child in reversed(list(element)))

    return "".join(parts)


def markdown_to_word(markdown_content, output_word_file):
    """Convert Markdown content to a Word document.

    Recognised line types:

    * ``#`` .. ``#########`` followed by a space -> Word heading of the same
      level (capped at 9).
    * ``- item`` bullets, optionally indented by two spaces per nesting
      level -> ``List Bullet`` / ``List Bullet 2`` / ``List Bullet 3``
      (deeper levels reuse ``List Bullet 3``).
    * Blank lines are skipped; they only separate Markdown blocks.
    * Anything else is added unchanged as a normal paragraph.

    Args:
        markdown_content: The Markdown text to convert.
        output_word_file: Destination passed to ``Document.save``.
    """
    doc = Document()

    # The previous `line.startswith("")` checks were always true, which turned
    # every unmatched line (blank lines and indented bullets included) into a
    # "Heading 2" paragraph and made the plain-paragraph branch unreachable.
    for line in markdown_content.split("\n"):
        stripped = line.lstrip(" ")
        if not stripped.strip():
            continue

        # Length of the leading run of '#' characters (0 when there is none).
        hashes = len(stripped) - len(stripped.lstrip("#"))
        if hashes and stripped[hashes:hashes + 1] == " ":
            # add_heading only accepts levels 0-9, so clamp very deep headings.
            doc.add_heading(stripped[hashes + 1:].strip(), level=min(hashes, 9))
        elif stripped.startswith("- "):
            # Two spaces of indentation correspond to one nesting level.
            depth = (len(line) - len(stripped)) // 2
            style = "List Bullet" if depth == 0 else f"List Bullet {min(depth + 1, 3)}"
            doc.add_paragraph(stripped[2:], style=style)
        else:
            doc.add_paragraph(line)

    # Save the document
    doc.save(output_word_file)

In [None]:
# Main processing: convert the XML file to Markdown.
# NOTE: 'INPUT_YOUR_FILE.xml' is a placeholder; replace it with the real input path.
xml_file = r'INPUT_YOUR_FILE.xml'
markdown_output = convert_xml_to_markdown(xml_file)

# Print the result
print(markdown_output[:1000])  # show the first 1000 characters

# Convert to a Word file (uses the existing markdown_to_word function)
output_word_file = 'output.docx'
markdown_to_word(markdown_output, output_word_file)
print(f"Markdown converted to Word file: {output_word_file}")

In [None]:



# Build the tag hierarchy and obtain the root element first; `xml_file` is the
# path assigned in the main-processing cell above. Previously `root`,
# `tag_counts` and `tag_hierarchy` were never defined, and the hierarchy was
# passed in the position of parse_element's `level` argument.
tag_hierarchy, root = analyze_xml_structure(xml_file)

# Convert to Markdown
markdown_output = parse_element(root, tag_hierarchy)
print(markdown_output[:500])  # show the first 500 characters

# Convert to a Word file (same output path as the main cell, so it is overwritten)
output_word_file = 'output.docx'
markdown_to_word(markdown_output, output_word_file)
print(f"Markdown converted to Word file: {output_word_file}")