In [78]:
import os
import tempfile
from html import escape

from spire.doc import Document
from spire.doc import FileFormat

In [82]:
def convert_docx_to_pdf(folder_path):
    """
    Converts every .docx file directly inside a folder to PDF.

    Each PDF is written next to its source file, keeping the same base name.
    The extension check is case-insensitive, so 'Report.DOCX' is converted too.

    Args:
        folder_path (str): The path to the folder containing the .docx files.
    """
    for filename in os.listdir(folder_path):
        # splitext only looks at the final extension, so a name such as
        # 'plan.docx.v2.docx' keeps its stem intact; str.replace would have
        # rewritten every '.docx' occurrence in the name.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != '.docx':
            continue

        docx_file = os.path.join(folder_path, filename)
        pdf_file = os.path.join(folder_path, stem + '.pdf')

        document = Document()
        try:
            # Load the .docx file and save it back out as a PDF
            document.LoadFromFile(docx_file)
            document.SaveToFile(pdf_file, FileFormat.PDF)
        finally:
            # Release the document even when loading or saving fails.
            document.Close()
        print(f'Converted {docx_file} to {pdf_file}')

In [None]:
# Folder whose .docx files are converted; the PDFs are written into the same folder.
# NOTE: later cells redefine convert_docx_to_pdf with a (docx_path, pdf_path)
# signature, so this call only works against the single-argument definition.
folder_path = '../data/Policies'
convert_docx_to_pdf(folder_path)

In [47]:
def convert_doc_to_docx(doc_path, docx_path):
    """
    Converts a .doc file to a .docx file using mammoth.

    The HTML produced by mammoth is saved next to the output file (same base
    name, '.html' extension) and is then written into a new document as a
    single paragraph, which is saved to ``docx_path``.

    NOTE(review): mammoth is documented as a .docx converter; confirm that it
    can actually read legacy .doc input before relying on this function.
    NOTE(review): add_paragraph()/save() look like the python-docx API, while
    the only visible import of ``Document`` in this notebook is spire.doc's;
    confirm which ``Document`` is in scope when this cell runs.

    Args:
        doc_path (str): The path to the .doc file.
        docx_path (str): The path where the .docx file will be saved.
    """
    with open(doc_path, "rb") as doc_file:
        result = mammoth.convert_to_html(doc_file)
    html = result.value  # The generated HTML

    # Swap only the final extension. str.replace would also rewrite a '.docx'
    # inside a directory name, and would leave the path unchanged (so the HTML
    # would be written to docx_path itself) when '.docx' is absent.
    html_path = os.path.splitext(docx_path)[0] + '.html'
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write(html)

    # NOTE(review): the markup is stored as the literal text of one paragraph,
    # it is not parsed into document structure — confirm this is intended.
    doc = Document()
    doc.add_paragraph(html)
    doc.save(docx_path)

In [48]:
def doc_to_html(doc):
    """
    Converts a Document object to an HTML string.

    Every paragraph becomes one ``<p>`` element. Paragraph text is HTML-escaped
    so that characters such as ``<`` and ``&`` appear literally instead of
    being interpreted as markup.

    Args:
        doc (Document): The Document object; only ``doc.paragraphs`` and each
            paragraph's ``text`` attribute are read.

    Returns:
        str: The HTML representation of the document.
    """
    # Quotes are harmless inside element content, so only &, < and > are escaped.
    paragraphs = ''.join(
        f'<p>{escape(para.text, quote=False)}</p>' for para in doc.paragraphs
    )
    return f'<html><body>{paragraphs}</body></html>'

In [49]:
def convert_docx_to_pdf(docx_path, pdf_path):
    """
    Converts a .docx file to a PDF.

    The document is rendered to HTML with doc_to_html(), written to a scratch
    file and handed to pdfkit. The scratch file lives in a temporary directory,
    so nothing beside the source document is overwritten or deleted, and it is
    cleaned up even when the PDF conversion raises.

    Args:
        docx_path (str): The path to the .docx file.
        pdf_path (str): The path where the PDF will be saved.
    """
    doc = Document(docx_path)

    # Keep the source's base name for the scratch file.
    stem = os.path.splitext(os.path.basename(docx_path))[0] or 'document'
    with tempfile.TemporaryDirectory() as scratch_dir:
        html_path = os.path.join(scratch_dir, stem + '.html')
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(doc_to_html(doc))
        pdfkit.from_file(html_path, pdf_path)

In [50]:
def convert_docs_in_folder(folder_path):
    """
    Converts all .doc and .docx files in a folder to PDFs and saves them in a 'converted' subfolder.

    .doc files are first converted to .docx (also stored in 'converted') and
    then to PDF; .docx files are converted to PDF directly. Extensions are
    matched case-insensitively and only the final extension is replaced, so
    names containing '.doc' elsewhere are not mangled.

    Args:
        folder_path (str): The path to the folder containing the .doc and .docx files.
    """
    converted_folder = os.path.join(folder_path, 'converted')
    os.makedirs(converted_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        stem, extension = os.path.splitext(filename)
        extension = extension.lower()
        pdf_path = os.path.join(converted_folder, stem + '.pdf')

        if extension == '.doc':
            doc_path = os.path.join(folder_path, filename)
            docx_path = os.path.join(converted_folder, stem + '.docx')
            convert_doc_to_docx(doc_path, docx_path)
            convert_docx_to_pdf(docx_path, pdf_path)
        elif extension == '.docx':
            docx_path = os.path.join(folder_path, filename)
            convert_docx_to_pdf(docx_path, pdf_path)
    print('Conversion complete.')

In [None]:
# Specify the folder containing the .doc and .docx files.
# Built with os.path.join instead of a backslash literal: '\d' and '\P' are
# invalid escape sequences, and backslashes only separate paths on Windows.
folder_path = os.path.join('..', 'data', 'Policies')
convert_docs_in_folder(folder_path)

In [19]:
def doc_to_html(doc):
    """
    Converts a Document object to an HTML string.

    Each paragraph is emitted as a ``<p>`` element whose text is HTML-escaped,
    so ``<``, ``>`` and ``&`` in the document show up literally rather than
    being read as markup.

    Args:
        doc (Document): The Document object; only ``doc.paragraphs`` and each
            paragraph's ``text`` attribute are read.

    Returns:
        str: The HTML representation of the document.
    """
    # Build all paragraph elements in one join instead of repeated += on a string.
    body = ''.join(
        f'<p>{escape(para.text, quote=False)}</p>' for para in doc.paragraphs
    )
    return f'<html><body>{body}</body></html>'

In [20]:
def convert_docx_to_pdf(docx_path, pdf_path):
    """
    Converts a .docx file to a PDF.

    The document is rendered to HTML with doc_to_html(), written to a scratch
    file and handed to pdfkit. The scratch file is created in a temporary
    directory rather than beside the source, so an existing '<name>.html' next
    to the document is never overwritten or deleted, and the scratch file is
    removed even when the PDF conversion raises.

    Args:
        docx_path (str): The path to the .docx file.
        pdf_path (str): The path where the PDF will be saved.
    """
    # Load the .docx file
    doc = Document(docx_path)

    # Keep the source's base name for the scratch file.
    stem = os.path.splitext(os.path.basename(docx_path))[0] or 'document'
    with tempfile.TemporaryDirectory() as scratch_dir:
        # Save the content to an HTML file
        html_path = os.path.join(scratch_dir, stem + '.html')
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(doc_to_html(doc))
        # Convert the HTML file to PDF; the scratch directory is removed on exit
        pdfkit.from_file(html_path, pdf_path)

In [21]:
def convert_doc_to_pdf(doc_path, pdf_path):
    """
    Converts a .doc file to a PDF using pypandoc.

    NOTE(review): confirm that the installed pandoc accepts legacy .doc input;
    its documented Word reader is for .docx.

    Args:
        doc_path (str): The path to the .doc file.
        pdf_path (str): The path where the PDF will be saved.

    Raises:
        AssertionError: If pypandoc returns anything other than an empty
            string for the file-output conversion.
    """
    # Convert the .doc file to PDF
    output = pypandoc.convert_file(doc_path, 'pdf', outputfile=pdf_path)
    # Raised explicitly (same exception type as the former assert statement) so
    # the check is not stripped when Python runs with -O.
    if output != "":
        raise AssertionError(
            f"pypandoc returned unexpected output for {doc_path!r}: {output!r}"
        )

In [22]:
def convert_docs_in_folder(folder_path):
    """
    Converts all .doc and .docx files in a folder to PDFs and saves them in a 'converted' subfolder.

    .docx files go through convert_docx_to_pdf() and .doc files through
    convert_doc_to_pdf(). Extensions are matched case-insensitively and only
    the final extension is swapped for '.pdf', so a name such as
    'v1.doc.old.doc' becomes 'v1.doc.old.pdf'.

    Args:
        folder_path (str): The path to the folder containing the .doc and .docx files.
    """
    # Create the 'converted' subfolder if it doesn't exist
    converted_folder = os.path.join(folder_path, 'converted')
    os.makedirs(converted_folder, exist_ok=True)

    # Map each supported extension to the converter that handles it.
    converters = {
        '.docx': convert_docx_to_pdf,
        '.doc': convert_doc_to_pdf,
    }

    for filename in os.listdir(folder_path):
        stem, extension = os.path.splitext(filename)
        convert = converters.get(extension.lower())
        if convert is None:
            continue  # not a Word document (this also skips 'converted' itself)
        source_path = os.path.join(folder_path, filename)
        pdf_path = os.path.join(converted_folder, stem + '.pdf')
        convert(source_path, pdf_path)
    print('Conversion complete.')

In [None]:
# Specify the folder containing the .doc and .docx files.
# Built with os.path.join instead of a backslash literal: "\d" and "\P" are
# invalid escape sequences, and backslashes only separate paths on Windows.
folder_path = os.path.join("..", "data", "Policies")
convert_docs_in_folder(folder_path)