In [1]:
import os
import shutil
from pathlib import Path
from docx import Document
import comtypes.client  # 用于 Word 转换
import fitz  # pip install PyMuPDF，用于 PDF 转 DOCX
import pandas as pd

import markitdown  # 假设你已安装并配置好 MarkItDown
import glob

# 源目录和目标目录
source_dir = r"C:\\Users\\ghuang11\\work\\project\\cmmi5\\CMMI5\\02.Org-Cfg-Repo"
tmp_dir = r"C:\\Users\\ghuang11\\study\\cmmi5_training\\tmp"
target_dir = r"C:\\Users\\ghuang11\\study\\cmmi5_training\\output"



In [None]:

def is_hidden_or_temp(path):
    parts = Path(path).parts
    return any(
        part.lower() in ['02.组织培训库', '04.组织风险库', '05.经验教训库', '06.样例库', '07.产品库'] 
        or part.startswith('.') 
        or part.startswith('~') 
        for part in parts)

def convert_pdf_to_docx(pdf_path, docx_path):
    doc = fitz.open(pdf_path)
    text = "\n".join(page.get_text() for page in doc)
    doc.close()
    document = Document()
    document.add_paragraph(text)
    document.save(docx_path)

def convert_doc_to_docx(doc_path, docx_path, word):
    # word = comtypes.client.CreateObject('Word.Application')
    # word.Visible = False
    doc = word.Documents.Open(doc_path)
    doc.SaveAs(docx_path, FileFormat=16)  # 16 = wdFormatDocumentDefault (docx)
    doc.Close()
    # word.Quit()

def convert_xls_to_xlsx(xls_path, xlsx_path):
    df = pd.read_excel(xls_path, engine='xlrd')
    df.to_excel(xlsx_path, index=False)

In [None]:
def prepare_tmp_directory():
    os.makedirs(tmp_dir, exist_ok=True)
    word = comtypes.client.CreateObject('Word.Application')
    for root, dirs, files in os.walk(source_dir):
        if is_hidden_or_temp(root):
            continue
        for file in files:
            ext = file.lower().split('.')[-1]
            full_path = os.path.join(root, file)
            rel_path = os.path.relpath(full_path, source_dir)
            tmp_path = os.path.join(tmp_dir, rel_path)
            os.makedirs(os.path.dirname(tmp_path), exist_ok=True)

            try:
                if ext == 'docx':
                    shutil.copy2(full_path, tmp_path)
                elif ext == 'doc':
                    convert_doc_to_docx(full_path, tmp_path + '.docx', word)
                elif ext == 'pdf':
                    convert_pdf_to_docx(full_path, tmp_path + '.docx')
                elif ext == 'xls':
                    convert_xls_to_xlsx(full_path, tmp_path + '.xlsx')
                elif ext == 'xlsx':
                    shutil.copy2(full_path, tmp_path)
            except Exception as e:
                print(f"跳过文件 {full_path}，错误：{e}")
    word.Quit()

prepare_tmp_directory()

In [2]:
converter = markitdown.MarkItDown()
def convert_docx_to_md(docx_path, md_path):
    try:
        markdown_content = converter.convert(docx_path)  # 返回 Markdown 字符串
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content.text_content)
    except Exception as e:
        print(f"转换失败：{docx_path} -> {md_path}，错误：{e}")

In [3]:


def convert_tmp_to_target():
    count = 0
    fail_count = 0
    for root, dirs, files in os.walk(tmp_dir):
        for file in files:
            ext = file.lower().split('.')[-1]
            full_path = os.path.join(root, file)
            rel_path = os.path.relpath(full_path, tmp_dir)
            target_path = os.path.join(target_dir, rel_path)

            os.makedirs(os.path.dirname(target_path), exist_ok=True)

            count += 1
            try:
                if ext == 'docx':
                    md_path = os.path.splitext(target_path)[0] + '.md'
                    if count < 10:
                        print(f"Processing {full_path} -> {md_path}")
                    convert_docx_to_md(full_path, md_path)
                # elif ext == 'xlsx':
                #     df = pd.read_excel(full_path)
                #     csv_path = os.path.splitext(target_path)[0] + '.csv'
                #     df.to_csv(csv_path, index=False)
            except Exception as e:
                print(f"转换失败：{full_path}，错误：{e}")
                fail_count += 1
                if fail_count > 5 and fail_count / count > 0.8:
                    print(f"错误次数过多{fail_count}/{count}，停止转换。")
                    raise e

convert_tmp_to_target()

Processing C:\\Users\\ghuang11\\study\\cmmi5_training\\tmp\01.标准过程库\01.CMMI体系认证\01.体系概要\00-研发体系管理手册.docx -> C:\\Users\\ghuang11\\study\\cmmi5_training\\output\01.标准过程库\01.CMMI体系认证\01.体系概要\00-研发体系管理手册.md
Processing C:\\Users\\ghuang11\\study\\cmmi5_training\\tmp\01.标准过程库\01.CMMI体系认证\01.体系概要\01-软件生命周期模型.docx -> C:\\Users\\ghuang11\\study\\cmmi5_training\\output\01.标准过程库\01.CMMI体系认证\01.体系概要\01-软件生命周期模型.md
Processing C:\\Users\\ghuang11\\study\\cmmi5_training\\tmp\01.标准过程库\01.CMMI体系认证\02.过程管理\01 组织资产开发\组织资产开发.docx -> C:\\Users\\ghuang11\\study\\cmmi5_training\\output\01.标准过程库\01.CMMI体系认证\02.过程管理\01 组织资产开发\组织资产开发.md
Processing C:\\Users\\ghuang11\\study\\cmmi5_training\\tmp\01.标准过程库\01.CMMI体系认证\02.过程管理\01 组织资产开发\指南\01-组织资产库管理指南.docx -> C:\\Users\\ghuang11\\study\\cmmi5_training\\output\01.标准过程库\01.CMMI体系认证\02.过程管理\01 组织资产开发\指南\01-组织资产库管理指南.md
Processing C:\\Users\\ghuang11\\study\\cmmi5_training\\tmp\01.标准过程库\01.CMMI体系认证\02.过程管理\01 组织资产开发\指南\02-组织工作环境标准.docx -> C:\\Users\\ghuang11\\study\\c

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
