In [None]:
# Import the Document class from langchain_core for creating document objects
from langchain_core.documents import Document

In [None]:
# Build a sample Document: `page_content` carries the text and `metadata`
# carries descriptive fields (here a source label and a page count).
doc = Document(
    metadata=dict(source="example", pages=1),
    page_content="this is the main text content i am using to create RAG",
)
# A bare trailing expression makes the notebook display the object's repr.
doc

Document(metadata={'source': 'example', 'pages': 1}, page_content='this is the main text content i am using to create RAG')

In [None]:
# Print the document. print() shows the str() form of the object, which lists
# page_content before metadata -- compare with the repr displayed by the bare
# `doc` expression in the previous cell.
print(doc)

page_content='this is the main text content i am using to create RAG' metadata={'source': 'example', 'pages': 1}


In [None]:
# Make sure the folder that will hold the sample text files is present.
import os

# exist_ok=True keeps this cell re-runnable: an existing folder is not an error.
os.makedirs(name="../data/text_files", exist_ok=True)

In [None]:
# Sample record describing a book: bibliographic fields plus a short
# free-text "content" summary. Keys are inserted in the order listed.
sample_text = dict(
    author="John Doe",
    title="Introduction to Machine Learning",
    category="Technology",
    year=2024,
    pages=320,
    publisher="Tech Publications",
    isbn="978-1-23456-789-0",
    language="English",
    # Adjacent string literals are concatenated into one string.
    content=(
        "Machine learning is a subset of artificial intelligence that enables "
        "computers to learn from data without being explicitly programmed. "
        "This book covers supervised learning, unsupervised learning, deep "
        "learning, neural networks, and practical applications in various domains."
    ),
)

In [None]:
# Write the sample text data to a file, one "key:value" pair per line.
# The encoding is passed explicitly because this notebook reads the file back
# with encoding='utf-8'; without it, open() falls back to the platform's
# locale encoding, which is not UTF-8 on every system and would corrupt or
# reject non-ASCII values.
with open("../data/text_files/sample_text.txt", "w", encoding="utf-8") as f:
    for key, value in sample_text.items():
        # Non-string values (e.g. the integer year) are formatted via the f-string.
        f.write(f"{key}:{value}\n")

print("Text file created successfully at ../data/text_files/sample_text.txt")

Text file created successfully at ../data/text_files/sample_text.txt


In [None]:
# Import TextLoader for loading individual text files
from langchain_community.document_loaders import TextLoader

In [None]:
# Point a TextLoader at the single sample file, asking for UTF-8 decoding.
loader = TextLoader(
    "../data/text_files/sample_text.txt",
    encoding="utf-8",
)

In [None]:
# Read the file through the loader. Note that load() hands back a list of
# Document objects (a single entry for this file), not a bare Document.
document = loader.load()

In [None]:
# Display the loaded result: a list of Document objects. Each one has the
# file path under metadata['source'] and the file's full text as page_content.
print(document)

[Document(metadata={'source': '../data/text_files/sample_text.txt'}, page_content='author:John Doe\ntitle:Introduction to Machine Learning\ncategory:Technology\nyear:2024\npages:320\npublisher:Tech Publications\nisbn:978-1-23456-789-0\nlanguage:English\ncontent:Machine learning is a subset of artificial intelligence that enables computers to learn from data without being explicitly programmed. This book covers supervised learning, unsupervised learning, deep learning, neural networks, and practical applications in various domains.\n')]


In [None]:
# Import DirectoryLoader for loading multiple files from a directory
from langchain_community.document_loaders import DirectoryLoader

In [None]:
# Set up a DirectoryLoader over the text_files folder.
dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt",  # select .txt files, including those in subfolders
    loader_cls=TextLoader,  # loader class applied to every matched file
    loader_kwargs=dict(encoding="utf-8"),  # forwarded to each TextLoader
    show_progress=False,
)

In [None]:
# Run the directory loader; the result is a list of Document objects
# collected from the files matched in the folder.
documents = dir_loader.load()

In [None]:
# Display every Document loaded from the text_files directory.
# NOTE(review): this prints the full content of every file; fine for the one
# small sample here, but consider slicing if the folder grows.
print(documents)

[Document(metadata={'source': '..\\data\\text_files\\sample_text.txt'}, page_content='author:John Doe\ntitle:Introduction to Machine Learning\ncategory:Technology\nyear:2024\npages:320\npublisher:Tech Publications\nisbn:978-1-23456-789-0\nlanguage:English\ncontent:Machine learning is a subset of artificial intelligence that enables computers to learn from data without being explicitly programmed. This book covers supervised learning, unsupervised learning, deep learning, neural networks, and practical applications in various domains.\n')]


In [None]:
# Import PyMuPDFLoader for loading PDF files
from langchain_community.document_loaders import PyMuPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Set up a DirectoryLoader over the pdf_files folder.
# NOTE: this rebinds `dir_loader`, replacing the text-file loader created
# earlier in the notebook; re-run that earlier cell before reloading text files.
dir_loader = DirectoryLoader(
    "../data/pdf_files",
    glob="**/*.pdf",  # select .pdf files, including those in subfolders
    loader_cls=PyMuPDFLoader,  # every matched file is parsed with PyMuPDFLoader
    show_progress=False,
)

In [None]:
# Run the PDF directory loader and keep the resulting list of Document objects.
pdf_documents = dir_loader.load()

In [None]:
# Display the first loaded PDF Document (raises IndexError if nothing was loaded).
# NOTE(review): the metadata shown below has 'page': 0 and 'total_pages': 45, so
# each list entry presumably corresponds to one PDF page rather than a whole
# file -- confirm against the loader's documentation.
pdf_documents[0]

Document(metadata={'producer': 'iText 4.2.0 by 1T3XT', 'creator': '', 'creationdate': '2026-01-21T22:02:52-08:00', 'source': '..\\data\\pdf_files\\Constructing and Optimizing Machine.pdf', 'file_path': '..\\data\\pdf_files\\Constructing and Optimizing Machine.pdf', 'total_pages': 45, 'format': 'PDF 1.4', 'title': 'Large Language Models for Constructing and Optimizing Machine Learning Workflows: A Survey', 'author': '', 'subject': 'ACM Trans. Softw. Eng. Methodol. 0.0', 'keywords': '', 'moddate': '2026-01-21T22:02:53-08:00', 'trapped': '', 'modDate': "D:20260121220253-08'00'", 'creationDate': "D:20260121220252-08'00'", 'page': 0}, page_content='.\n.\nLatest updates: h\ue03cps://dl.acm.org/doi/10.1145/3773084\n.\n.\nRESEARCH-ARTICLE\nLarge Language Models for Constructing and Optimizing Machine\nLearning Workflows: A Survey\nYANG GU, Shanghai Jiao Tong University, Shanghai, China\n.\nHENGYU YOU, Shanghai Jiao Tong University, Shanghai, China\n.\nJIAN CAO, Shanghai Jiao Tong University, S