## Word Document Processing

In [2]:
from langchain_community.document_loaders import Docx2txtLoader, UnstructuredWordDocumentLoader

## Method 1 Docx2txtLoader

In [11]:
try:
    docx_loader = Docx2txtLoader("data/word_file/proposal.docx")
    docx = docx_loader.load()
    print(f"❤️Loaded {len(docx)} documents")
    print(f"Content preview: {docx[0].page_content[:300]}")
    print(f"Metadeta: {docx[0].metadata}")
except Exception as e:
    print(f"Error {e}")

❤️Loaded 1 documents
Content preview: Project Proposal: RAG Implementation

Executive Summary

This proposal outlines the implementation of a Retrieval-Augmented Generation system for our organization.

Objectives

Key objectives include:

• Improve information retrieval accuracy

• Reduce response time for customer queries

• Integrate
Metadeta: {'source': 'data/word_file/proposal.docx'}


## Method 2 UnstructuredWordDocumentLoader

In [24]:
try:
    unstructured_loader = UnstructuredWordDocumentLoader("data/word_file/proposal.docx", mode='elements')
    unst_docx = unstructured_loader.load()
    print(f"❤️Loaded {len(docx)} documents")
    for i, doc in enumerate(unst_docx):
        print(f"\nElement {i+1}:")
        print(f"Type: {doc.metadata.get('category', 'unknown')}")
        print(f"Content: {doc.page_content[:100]}...")
except Exception as e:
    print(f"Error with {e}")

❤️Loaded 1 documents

Element 1:
Type: Title
Content: Project Proposal: RAG Implementation...

Element 2:
Type: Title
Content: Executive Summary...

Element 3:
Type: NarrativeText
Content: This proposal outlines the implementation of a Retrieval-Augmented Generation system for our organiz...

Element 4:
Type: Title
Content: Objectives...

Element 5:
Type: NarrativeText
Content: Key objectives include:...

Element 6:
Type: ListItem
Content: Improve information retrieval accuracy...

Element 7:
Type: ListItem
Content: Reduce response time for customer queries...

Element 8:
Type: ListItem
Content: Integrate with existing knowledge base...

Element 9:
Type: Title
Content: Budget and Timeline...

Element 10:
Type: UncategorizedText
Content: Budget: $50,000...

Element 11:
Type: UncategorizedText
Content: Timeline: 3 months...

Element 12:
Type: UncategorizedText
Content: Team: 4 developers, 1 project manager...

Element 13:
Type: Title
Content: Technical Requirements...

Element 14:
Type: 

In [26]:
unst_docx[1:10]

[Document(metadata={'source': 'data/word_file/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_file', 'filename': 'proposal.docx', 'last_modified': '2025-09-14T13:15:01', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 'c0f844859abf08d9506856b3aed4a719'}, page_content='Executive Summary'),
 Document(metadata={'source': 'data/word_file/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_file', 'filename': 'proposal.docx', 'last_modified': '2025-09-14T13:15:01', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'parent_id': 'c0f844859abf08d9506856b3aed4a719', 'category': 'NarrativeText', 'element_id': 'bbc04fc71e33a92df30d7fe7c33b6375'}, page_content='This proposal outlines the implementation of a Retrieval-Augmented Generation system for our organization.'),
 Document(metadata={'source': 'data/word_file/