In [6]:
import subprocess
import os
from pathlib import Path

def find_libreoffice():
    """
    Locate a working LibreOffice executable.

    Each candidate path/command is probed by running it with ``--version``
    (10 second timeout). The first candidate that exits with status 0 is
    announced on stdout together with its version string and returned.

    Returns:
        str | None: The candidate that responded successfully, or ``None``
        when none of the candidates could be run.
    """
    possible_paths = [
        "/usr/local/bin/libreoffice",
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "/opt/homebrew/bin/libreoffice",
        "soffice"
    ]

    for path in possible_paths:
        try:
            result = subprocess.run([path, '--version'],
                                    capture_output=True, text=True, timeout=10)
        except (OSError, subprocess.TimeoutExpired):
            # OSError covers a missing binary (FileNotFoundError) as well as a
            # file that exists but cannot be executed (PermissionError, ...).
            # Either way this candidate is unusable, so keep searching instead
            # of aborting the whole lookup.
            continue

        if result.returncode == 0:
            print(f"找到 LibreOffice: {path}")
            print(f"版本: {result.stdout.strip()}")
            return path

    return None

def doc_to_markdown_via_html_fixed(doc_path):
    """
    Convert a legacy ``.doc`` file to Markdown via a LibreOffice HTML export.

    The document is converted to HTML with a headless LibreOffice run
    (30 second timeout) into the document's own directory, the HTML is read
    as UTF-8 and passed to ``html_to_markdown_improved``, and the
    intermediate HTML file is deleted afterwards.

    Args:
        doc_path: Path of the document to convert.

    Returns:
        str | None: The Markdown text, or ``None`` when LibreOffice cannot be
        found, the conversion fails or times out, the HTML file is not
        produced, or any other exception occurs (a message is printed in
        each of these cases).
    """
    try:
        # Locate the LibreOffice executable to drive the conversion
        libreoffice_path = find_libreoffice()
        if not libreoffice_path:
            print("未找到 LibreOffice，请确保已正确安装")
            return None

        # Write the HTML next to the source document (current dir if none given)
        output_dir = os.path.dirname(doc_path) or '.'
        # Swap only the real trailing extension. A plain str.replace('.doc', ...)
        # rewrites every '.doc' inside the name and misses upper-case '.DOC',
        # in which case the "HTML path" would be the source document itself.
        # NOTE(review): assumes LibreOffice names its output <stem>.html.
        html_filename = os.path.splitext(os.path.basename(doc_path))[0] + '.html'
        html_path = os.path.join(output_dir, html_filename)

        cmd = [
            libreoffice_path,
            '--headless',
            '--convert-to', 'html',
            '--outdir', output_dir,
            doc_path
        ]

        print(f"执行命令: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

        if result.returncode != 0:
            print(f"转换失败: {result.stderr}")
            return None

        # A zero exit status alone does not prove the file was written
        if not os.path.exists(html_path):
            print(f"HTML文件未生成: {html_path}")
            return None

        try:
            with open(html_path, 'r', encoding='utf-8') as f:
                html_content = f.read()

            # Parse the export and hand the serialized tree to the converter
            soup = BeautifulSoup(html_content, 'html.parser')
            markdown_content = html_to_markdown_improved(str(soup))
        finally:
            # Best-effort cleanup of the intermediate file, also when reading
            # or converting failed; a failed delete must not mask the result.
            try:
                os.remove(html_path)
            except OSError:
                pass

        return markdown_content

    except subprocess.TimeoutExpired:
        print("转换超时")
        return None
    except Exception as e:
        print(f"转换DOC到Markdown时出错: {e}")
        return None

def html_to_markdown_improved(html_content):
    """
    Convert an HTML string into simple Markdown-style text.

    ``script``/``style``/``meta``/``link`` elements are dropped, headings
    become ``#``-prefixed lines, non-empty paragraphs are followed by a blank
    line, and list items are prefixed with ``- `` (unordered) or ``N. ``
    (ordered). Finally every line is stripped and runs of blank lines are
    collapsed to a single blank line.

    Args:
        html_content: HTML markup as a string.

    Returns:
        str: The extracted text with Markdown markers.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove elements that never carry document text
    for tag in soup(["script", "style", "meta", "link"]):
        tag.decompose()

    # Headings: h1..h6 -> '#' * level
    for i in range(1, 7):
        for tag in soup.find_all(f'h{i}'):
            tag.string = f"{'#' * i} {tag.get_text()}\n\n"

    # Paragraphs: keep only those with visible text
    for p in soup.find_all('p'):
        if p.get_text().strip():
            p.string = f"{p.get_text()}\n\n"

    # Lists. Assigning ``li.string`` replaces all children of that item, so
    # nested lists must be rendered before their ancestors; reversed document
    # order visits descendants first. Each list numbers only the items it
    # owns: a recursive find_all('li') would let nested items consume numbers
    # of the outer <ol> (1., 3., ...) and would flatten nested items without
    # their markers.
    list_tags = ['ul', 'ol']
    for lst in reversed(soup.find_all(list_tags)):
        own_items = [
            li for li in lst.find_all('li')
            if li.find_parent(list_tags) is lst
        ]
        for index, li in enumerate(own_items, 1):
            marker = f"{index}." if lst.name == 'ol' else "-"
            li.string = f"{marker} {li.get_text()}\n"
        # Keep a nested list from running onto the text of its parent item
        if lst.find_parent('li') is not None:
            lst.insert_before("\n")

    text = soup.get_text()

    # Strip every line and collapse consecutive blank lines into one
    lines = text.split('\n')
    cleaned_lines = []
    prev_empty = False

    for line in lines:
        line = line.strip()
        if line:
            cleaned_lines.append(line)
            prev_empty = False
        elif not prev_empty:
            cleaned_lines.append('')
            prev_empty = True

    return '\n'.join(cleaned_lines)

def process_document_fixed(file_path):
    """
    Convert a ``.doc`` or ``.docx`` document and save the result next to it.

    ``.docx`` files are converted with ``docx_to_markdown`` and
    ``docx_to_json`` and saved as ``<name>.md`` and ``<name>.json``.
    ``.doc`` files are converted with ``doc_to_markdown_via_html_fixed`` and
    saved as ``<name>.md``. Progress and failures are reported on stdout.

    Args:
        file_path: Path of the document to process.

    Returns:
        None
    """
    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        return

    file_extension = Path(file_path).suffix.lower()
    # Output names are built from the path minus its real trailing extension.
    # str.replace('.doc', ...) is case-sensitive and touches every match, so
    # 'X.DOC' mapped onto itself (overwriting the source document) and names
    # containing '.doc' elsewhere were mangled.
    base_path = os.path.splitext(file_path)[0]

    if file_extension == '.docx':
        print(f"处理DOCX文档: {file_path}")

        # Markdown export
        markdown_content = docx_to_markdown(file_path)
        if markdown_content:
            markdown_file = base_path + '.md'
            with open(markdown_file, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
            print(f"Markdown文件已保存: {markdown_file}")

        # JSON export
        json_data = docx_to_json(file_path)
        if json_data:
            json_file = base_path + '.json'
            with open(json_file, 'w', encoding='utf-8') as f:
                # ensure_ascii=False keeps non-ASCII text readable in the file
                json.dump(json_data, f, ensure_ascii=False, indent=2)
            print(f"JSON文件已保存: {json_file}")

    elif file_extension == '.doc':
        print(f"处理DOC文档: {file_path}")

        markdown_content = doc_to_markdown_via_html_fixed(file_path)
        if markdown_content:
            markdown_file = base_path + '.md'
            with open(markdown_file, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
            print(f"Markdown文件已保存: {markdown_file}")
        else:
            print("DOC转换失败")

    else:
        print("不支持的文件格式，请使用.doc或.docx文件")

# Sanity check: confirm a LibreOffice executable can be located first
print("检查 LibreOffice 安装状态...")
libreoffice_path = find_libreoffice()

# Report either the resolved path or a hint to check the installation
print(
    f"LibreOffice 可用，路径: {libreoffice_path}"
    if libreoffice_path
    else "LibreOffice 未找到，请检查安装"
)

检查 LibreOffice 安装状态...
找到 LibreOffice: /Applications/LibreOffice.app/Contents/MacOS/soffice
版本: LibreOffice 25.8.2.2 d401f2107ccab8f924a8e2df40f573aab7605b6f
LibreOffice 可用，路径: /Applications/LibreOffice.app/Contents/MacOS/soffice


In [7]:
# Process the sample document with the fixed version of the pipeline
process_document_fixed("./yq2021-0602文娱产业正离文化越来越远.doc")

处理DOC文档: ./yq2021-0602文娱产业正离文化越来越远.doc
找到 LibreOffice: /Applications/LibreOffice.app/Contents/MacOS/soffice
版本: LibreOffice 25.8.2.2 d401f2107ccab8f924a8e2df40f573aab7605b6f
执行命令: /Applications/LibreOffice.app/Contents/MacOS/soffice --headless --convert-to html --outdir . ./yq2021-0602文娱产业正离文化越来越远.doc
Markdown文件已保存: ./yq2021-0602文娱产业正离文化越来越远.md
