In [4]:
### Document Structure

from langchain_core.documents import Document

# A Document pairs the text itself (page_content) with a metadata dict
# describing where that text came from.
doc = Document(
    metadata=dict(
        source="example.txt",
        pages=1,
        author="Vinay Kumar Sahu",
        date="2025-10-29",
    ),
    page_content="This is the main text content of the document. Im using it to create a RAG system.",
)
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Vinay Kumar Sahu', 'date': '2025-10-29'}, page_content='This is the main text content of the document. Im using it to create a RAG system.')

In [7]:
### TextLoader

from langchain_community.document_loaders import TextLoader

# Read a single text file as UTF-8; load() returns a list of Document objects.
text_loader = TextLoader(
    file_path="../data/text_files/python_intro.txt",
    encoding="utf8",
)
text_document = text_loader.load()
text_document

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Introduction to Python Programming\n\nPython is a versatile, high-level programming language known for its simplicity and readability. Created by Guido van Rossum in 1991, Python has become one of the most popular programming languages worldwide.\n\nKey Features:\n- Easy to learn and understand\n- Extensive library support\n- Cross-platform compatibility\n- Dynamic typing\n- Object-oriented programming support\n\nPython is widely used in:\n* Web Development\n* Data Science\n* Artificial Intelligence\n* Machine Learning\n* Automation\n* Scientific Computing\n\nFor beginners, Python offers a gentle learning curve with its clear syntax and straightforward structure. Unlike other languages, Python uses indentation to define code blocks, making it naturally readable.\n\nGetting Started:\n1. Install Python from python.org\n2. Choose an IDE (like PyCharm, VS Code)\n3. Learn basic syntax\n4. Practice with simpl

In [10]:
### DirectoryLoader

from langchain_community.document_loaders import DirectoryLoader

# Load the files matching *.txt under ../data/text_files, reading each one
# with TextLoader; loader_kwargs supplies the encoding used for TextLoader.
directory_loader = DirectoryLoader(
    path="../data/text_files",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding="utf8"),
)
directory_documents = directory_loader.load()
directory_documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Introduction to Machine Learning\n\nMachine learning is a transformative field of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed.\n\nKey Concepts:\n- Supervised Learning: Training with labeled data\n- Unsupervised Learning: Finding patterns in unlabeled data\n- Reinforcement Learning: Learning through trial and error\n\nMachine learning systems analyze vast amounts of data to identify patterns and make decisions with minimal human intervention. These systems power many everyday applications, from recommendation engines to voice assistants.\n\nCommon Applications:\n• Image and Speech Recognition\n• Natural Language Processing\n• Fraud Detection\n• Medical Diagnosis\n• Autonomous Vehicles\n\nThe learning process typically involves:\n1. Data Collection\n2. Data Preprocessing\n3. Model Selection\n4. Training\n5. Evaluation\n6. D

In [None]:
### PDFDirectoryLoader

from langchain_community.document_loaders import PyMuPDFLoader

# Load the files matching *.pdf under ../data/pdf_files, parsing each one
# with PyMuPDFLoader.
pdf_directory_loader = DirectoryLoader(
    "../data/pdf_files",
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
)

pdf_documents = pdf_directory_loader.load()

# Tag every loaded PDF document with its file type.
# The loop variable is deliberately NOT named `doc`: a module-level for-loop
# rebinds its variable in the notebook namespace, which would silently replace
# the sample `doc` Document created earlier in this notebook.
for pdf_document in pdf_documents:
    pdf_document.metadata["file_type"] = "pdf"
pdf_documents

{'producer': 'Skia/PDF m92 Google Docs Renderer', 'creator': '', 'creationdate': '2021-05-05T13:04:55+01:00', 'source': '..\\data\\pdf_files\\A_Brief_Introduction_To_AI.pdf', 'file_path': '..\\data\\pdf_files\\A_Brief_Introduction_To_AI.pdf', 'total_pages': 9, 'format': 'PDF 1.4', 'title': 'A Brief Introduction to Artificial Intelligence', 'author': '', 'subject': 'Article about artificial intelligence (AI) written by Ryerson Computer Science student, Dibbyo Saha, for Science Rendezvous.', 'keywords': '', 'moddate': '2021-05-05T13:04:55+01:00', 'trapped': '', 'modDate': "D:20210505130455+01'00'", 'creationDate': "D:20210505130455+01'00'", 'page': 0, 'file_type': 'pdf'}
{'producer': 'Skia/PDF m92 Google Docs Renderer', 'creator': '', 'creationdate': '2021-05-05T13:04:55+01:00', 'source': '..\\data\\pdf_files\\A_Brief_Introduction_To_AI.pdf', 'file_path': '..\\data\\pdf_files\\A_Brief_Introduction_To_AI.pdf', 'total_pages': 9, 'format': 'PDF 1.4', 'title': 'A Brief Introduction to Artifi

[Document(metadata={'producer': 'Skia/PDF m92 Google Docs Renderer', 'creator': '', 'creationdate': '2021-05-05T13:04:55+01:00', 'source': '..\\data\\pdf_files\\A_Brief_Introduction_To_AI.pdf', 'file_path': '..\\data\\pdf_files\\A_Brief_Introduction_To_AI.pdf', 'total_pages': 9, 'format': 'PDF 1.4', 'title': 'A Brief Introduction to Artificial Intelligence', 'author': '', 'subject': 'Article about artificial intelligence (AI) written by Ryerson Computer Science student, Dibbyo Saha, for Science Rendezvous.', 'keywords': '', 'moddate': '2021-05-05T13:04:55+01:00', 'trapped': '', 'modDate': "D:20210505130455+01'00'", 'creationDate': "D:20210505130455+01'00'", 'page': 0, 'file_type': 'pdf'}, page_content='A Brief Introduction to Artificial Intelligence\nWhat is AI and how is it going to shape the future\nBy Dibbyo Saha, Undergraduate Student, Computer Science, Ryerson University\nWhat is Artificial Intelligence?\nImage by\xa0Gerd Altmann\xa0from\xa0Pixabay\nGenerally speaking, Artificial 