## Word Document Parsing

In [1]:
from langchain_community.document_loaders import Docx2txtLoader

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
## Method : Using Docx2txtload 

word_doc_path = "data/pdf_files/dummy-word-pdf.docx"

try:
    word_loader = Docx2txtLoader(word_doc_path)
    word_docs = word_loader.load()

    print(f"len of word doc : {word_docs}")
    
    for index , chunk in enumerate(word_docs):
        print(f"Doc {index} Content :  {chunk.page_content}")
        print(f"Doc {index} metadata : {chunk.metadata}\n\n")
except Exception as e :
    print(f"Exception : {e}")

len of word doc : [Document(metadata={'source': 'data/pdf_files/dummy-word-pdf.docx'}, page_content='üìù Dummy Google Document Content\n\nTitle:\n\nQuarterly Marketing Strategy 2025 ‚Äî Draft\n\n\n\nüìå Executive Summary\n\nThis document outlines the proposed marketing strategy for Q1‚ÄìQ2 2025. The focus is on increasing brand awareness, improving lead conversion rates, and expanding our digital reach through optimized campaigns and data-driven decisions.\n\n\n\nüéØ Objectives\n\nIncrease website traffic by 30% within 6 months.\n\n\nAchieve a 15% conversion rate for inbound leads.\n\n\nLaunch 3 new product-specific campaigns targeting enterprise clients.\n\n\nGrow our social media following by 20% across all platforms.\n\n\n\n\nüìä Key Initiatives\n\nInitiative\n\nDescription\n\nOwner\n\nDeadline\n\nSEO Optimization\n\nImprove keyword ranking for 20+ high-volume terms.\n\nSEO Team\n\nFeb 2025\n\nPaid Campaigns\n\nLaunch targeted Google Ads and LinkedIn Ads.\n\nMarketing Ops\n\nMar

In [8]:
## Method Unstructured Document Loader

from langchain_community.document_loaders import UnstructuredWordDocumentLoader

try:
    doc_path = "data/pdf_files/dummy-word-pdf.docx"
    # takes mode 
    # The mode to use when loading the file. Can be one of "single", "multi", or "all". Default is "single" , {'paged', 'elements', 'single'}
    doc_loader = UnstructuredWordDocumentLoader(doc_path, mode="elements")
    
    docs = doc_loader.load()

    for index,chunk in enumerate(docs):
        print(f"Doc {index} content : {chunk.page_content}")
        print(f"Doc {index} metadata : {chunk.metadata}")
except Exception as e:
    print(f"Execption is : {e}")

Doc 0 content : üìù Dummy Google Document Content
Doc 0 metadata : {'source': 'data/pdf_files/dummy-word-pdf.docx', 'category_depth': 2, 'emphasized_text_contents': ['üìù Dummy Google Document Content'], 'emphasized_text_tags': ['b'], 'file_directory': 'data/pdf_files', 'filename': 'dummy-word-pdf.docx', 'last_modified': '2025-11-09T14:36:15', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 'cf0cfaddbed7a596d89137deadc61c1a'}
Doc 1 content : Title:
Doc 1 metadata : {'source': 'data/pdf_files/dummy-word-pdf.docx', 'category_depth': 0, 'emphasized_text_contents': ['Title:'], 'emphasized_text_tags': ['b'], 'file_directory': 'data/pdf_files', 'filename': 'dummy-word-pdf.docx', 'last_modified': '2025-11-09T14:36:15', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'parent_id': 'cf0cfaddbed7a596d89137deadc61c1a', 'category': 'Uncategor

### Diffrenence

#### Docx2txtLoader

- Simple and fast
- Handles para, line breaks and formatting
- No meta data
- loses structure (headings, lists, tables, and bold/italic info are not preserved)
- does not parse images etc

#### UnstructuredWordDocumentLoader

- Preserves document structure
- optionally provides metadata 
- can output structured JSON
- better suited for RAG
- Slower