In [None]:
# Lab 14: Document Loading using LangChain for RAG

This lab explores how to use LangChain Document Loaders to prepare external knowledge
for Retrieval-Augmented Generation (RAG).

Objectives:
- Understand the role of document loaders in RAG
- Load data from multiple file formats (PDF, Web, CSV)
- Inspect document metadata and content
- Compare loaders for content and metadata differences

In [1]:
# Install required libraries
!pip install langchain langchain-community pypdf unstructured --quiet


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# =========================
# Lab 14: Document Loading using LangChain for RAG
# =========================
# Template cell: loads a PDF, a web page and a CSV file with three different
# loaders, then prints the content type and metadata keys of the first
# document each loader returned.
# Point DATA_DIR and the two file names below at your own files before running.

import os
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, CSVLoader

# Folder that holds the lab's input files; defined once so the PDF and CSV
# paths cannot drift apart.
DATA_DIR = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment"

# =========================
# 1. Load PDF Data (PyPDFLoader)
# =========================
# NOTE: change 'your_file_name.pdf' to the actual name of your PDF
pdf_path = f"{DATA_DIR}/your_file_name.pdf"

# Report whether the file is there, and stop with an actionable message if it
# is not: the placeholder name above is not expected to exist as shipped.
pdf_exists = os.path.isfile(pdf_path)
print("PDF exists:", pdf_exists)
if not pdf_exists:
    raise FileNotFoundError(
        f"PDF not found: {pdf_path} - set pdf_path to the real file name"
    )

pdf_loader = PyPDFLoader(pdf_path)
pdf_docs = pdf_loader.load()

print("Total pages:", len(pdf_docs))
print("First page content:\n", pdf_docs[0].page_content[:500])
print("Metadata of first page:", pdf_docs[0].metadata)

# =========================
# 2. Load Web Data (WebBaseLoader)
# =========================
web_loader = WebBaseLoader("https://www.example.com")
web_docs = web_loader.load()

print("\nWeb content sample:\n", web_docs[0].page_content[:500])
print("Metadata:", web_docs[0].metadata)

# =========================
# 3. Load Structured Data (CSVLoader)
# =========================
# NOTE: change 'students.csv' to the actual name of your CSV file
csv_path = f"{DATA_DIR}/students.csv"

# Same fail-fast check as for the PDF.
csv_exists = os.path.isfile(csv_path)
print("CSV exists:", csv_exists)
if not csv_exists:
    raise FileNotFoundError(
        f"CSV not found: {csv_path} - set csv_path to the real file name"
    )

csv_loader = CSVLoader(file_path=csv_path)
csv_docs = csv_loader.load()

print("\nNumber of rows:", len(csv_docs))
print("Sample document:\n", csv_docs[0].page_content)
print("Metadata:", csv_docs[0].metadata)

# =========================
# 4. Compare All Loaders
# =========================
# For each loader, show the Python type of the content and which metadata
# keys it attached to its first document.
print("\n=== PDF Loader ===")
print("Content format:", type(pdf_docs[0].page_content))
print("Metadata fields:", pdf_docs[0].metadata.keys())

print("\n=== Web Loader ===")
print("Content format:", type(web_docs[0].page_content))
print("Metadata fields:", web_docs[0].metadata.keys())

print("\n=== CSV Loader ===")
print("Content format:", type(csv_docs[0].page_content))
print("Metadata fields:", csv_docs[0].metadata.keys())

In [6]:
import os

# Quick sanity check: is the lab PDF where we expect it to be?
pdf_to_check = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/lab14.pdf"
print(os.path.isfile(pdf_to_check))

True


In [8]:
# =========================
# Lab 14: Document Loading using LangChain for RAG
# =========================
# Runs the three loaders in order (PDF, web page, CSV) and finishes with a
# side-by-side look at what each one produced.

import os
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, CSVLoader

# -------------------------
# Step 1: PDF -> documents (PyPDFLoader)
# -------------------------
pdf_path = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/lab14.pdf"

pdf_found = os.path.isfile(pdf_path)
print("PDF exists:", pdf_found)

pdf_loader = PyPDFLoader(pdf_path)
pdf_docs = pdf_loader.load()

print("Total pages:", len(pdf_docs))
first_pdf_doc = pdf_docs[0]
print("First page content:\n", first_pdf_doc.page_content[:500])
print("Metadata of first page:", first_pdf_doc.metadata)

# -------------------------
# Step 2: web page -> documents (WebBaseLoader)
# -------------------------
web_loader = WebBaseLoader("https://www.example.com")
web_docs = web_loader.load()

first_web_doc = web_docs[0]
print("\nWeb content sample:\n", first_web_doc.page_content[:500])
print("Metadata:", first_web_doc.metadata)

# -------------------------
# Step 3: CSV -> documents (CSVLoader)
# -------------------------
csv_path = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/students.csv"

csv_found = os.path.isfile(csv_path)
print("CSV exists:", csv_found)

csv_loader = CSVLoader(file_path=csv_path)
csv_docs = csv_loader.load()

print("\nNumber of rows:", len(csv_docs))
first_csv_doc = csv_docs[0]
print("Sample document:\n", first_csv_doc.page_content)
print("Metadata:", first_csv_doc.metadata)

# -------------------------
# Step 4: compare the first document from every loader
# -------------------------
for banner, loaded_docs in (
    ("\n=== PDF Loader ===", pdf_docs),
    ("\n=== Web Loader ===", web_docs),
    ("\n=== CSV Loader ===", csv_docs),
):
    print(banner)
    head_doc = loaded_docs[0]
    print("Content format:", type(head_doc.page_content))
    print("Metadata fields:", head_doc.metadata.keys())

PDF exists: True
Total pages: 63
First page content:
 Unsupervised Machine Learning 
Techniques
Metadata of first page: {'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2025-12-13T18:53:29+05:00', 'title': 'PowerPoint Presentation', 'author': 'Pc', 'moddate': '2025-12-13T18:53:29+05:00', 'source': 'C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/lab14.pdf', 'total_pages': 63, 'page': 0, 'page_label': '1'}

Web content sample:
 Example DomainExample DomainThis domain is for use in documentation examples without needing permission. Avoid use in operations.Learn more

Metadata: {'source': 'https://www.example.com', 'title': 'Example Domain', 'language': 'en'}
CSV exists: False


RuntimeError: Error loading C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/students.csv

In [9]:
import os

# Quick sanity check: is the students CSV where we expect it to be?
csv_to_check = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/students.csv"
print(os.path.isfile(csv_to_check))

True


In [10]:
# =========================
# Lab 14: Document Loading using LangChain for RAG
# =========================
# Full walkthrough in one cell: load a PDF, a web page and a CSV file, preview
# the first document from each, then compare content types and metadata keys.

import os
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, CSVLoader

# ---- Part 1: PDF (PyPDFLoader) ----
pdf_path = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/lab14.pdf"

has_pdf = os.path.isfile(pdf_path)
print("PDF exists:", has_pdf)

pdf_loader = PyPDFLoader(pdf_path)
pdf_docs = pdf_loader.load()

page_count = len(pdf_docs)
print("Total pages:", page_count)
pdf_head = pdf_docs[0]
pdf_preview = pdf_head.page_content[:500]
print("First page content:\n", pdf_preview)
print("Metadata of first page:", pdf_head.metadata)

# ---- Part 2: web page (WebBaseLoader) ----
web_loader = WebBaseLoader("https://www.example.com")
web_docs = web_loader.load()

web_head = web_docs[0]
web_preview = web_head.page_content[:500]
print("\nWeb content sample:\n", web_preview)
print("Metadata:", web_head.metadata)

# ---- Part 3: CSV (CSVLoader) ----
csv_path = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/students.csv"

has_csv = os.path.isfile(csv_path)
print("CSV exists:", has_csv)

csv_loader = CSVLoader(file_path=csv_path)
csv_docs = csv_loader.load()

row_count = len(csv_docs)
print("\nNumber of rows:", row_count)
csv_head = csv_docs[0]
print("Sample document:\n", csv_head.page_content)
print("Metadata:", csv_head.metadata)

# ---- Part 4: compare what each loader produced ----
# Each entry pairs the section banner with that loader's list of documents.
comparison = [
    ("\n=== PDF Loader ===", pdf_docs),
    ("\n=== Web Loader ===", web_docs),
    ("\n=== CSV Loader ===", csv_docs),
]
for heading, docs_for_loader in comparison:
    print(heading)
    leading_doc = docs_for_loader[0]
    print("Content format:", type(leading_doc.page_content))
    print("Metadata fields:", leading_doc.metadata.keys())

PDF exists: True
Total pages: 63
First page content:
 Unsupervised Machine Learning 
Techniques
Metadata of first page: {'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2025-12-13T18:53:29+05:00', 'title': 'PowerPoint Presentation', 'author': 'Pc', 'moddate': '2025-12-13T18:53:29+05:00', 'source': 'C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/lab14.pdf', 'total_pages': 63, 'page': 0, 'page_label': '1'}

Web content sample:
 Example DomainExample DomainThis domain is for use in documentation examples without needing permission. Avoid use in operations.Learn more

Metadata: {'source': 'https://www.example.com', 'title': 'Example Domain', 'language': 'en'}
CSV exists: True

Number of rows: 10
Sample document:
 id: 1
name: Zainab
age: 21
department: Computer Science
grade: A
Metadata: {'source': 'C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/students.csv', 'row': 0}

=== PDF Loade

In [11]:
from langchain_community.document_loaders import PyPDFLoader
import os

# Location of the lab PDF on this machine.
pdf_path = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/lab14.pdf"

# Report whether the file is present before trying to parse it.
pdf_is_present = os.path.isfile(pdf_path)
print("PDF exists:", pdf_is_present)

# Parse the PDF into a list of documents.
pdf_loader = PyPDFLoader(pdf_path)
pdf_docs = pdf_loader.load()

# Summarise the result: document count, then a 300-character preview and the
# metadata of the first document.
print("Total pages:", len(pdf_docs))
opening_page = pdf_docs[0]
print("First page content:\n", opening_page.page_content[:300])
print("Metadata:", opening_page.metadata)

PDF exists: True
Total pages: 63
First page content:
 Unsupervised Machine Learning 
Techniques
Metadata: {'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2025-12-13T18:53:29+05:00', 'title': 'PowerPoint Presentation', 'author': 'Pc', 'moddate': '2025-12-13T18:53:29+05:00', 'source': 'C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/lab14.pdf', 'total_pages': 63, 'page': 0, 'page_label': '1'}


In [12]:
from langchain_community.document_loaders import WebBaseLoader

# Load a single web page into a list of documents.
web_loader = WebBaseLoader("https://example.com")
web_docs = web_loader.load()

# Preview the first 300 characters of text and the metadata of the first document.
landing_doc = web_docs[0]
print("Web content sample:\n", landing_doc.page_content[:300])
print("Metadata:", landing_doc.metadata)

Web content sample:
 Example DomainExample DomainThis domain is for use in documentation examples without needing permission. Avoid use in operations.Learn more

Metadata: {'source': 'https://example.com', 'title': 'Example Domain', 'language': 'en'}


In [13]:
# `os` is imported here because this cell calls os.path.isfile; without it the
# cell only runs if an earlier cell has already imported `os` into the kernel.
import os
from langchain_community.document_loaders import CSVLoader

# Location of the students CSV on this machine.
csv_path = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/students.csv"

# Report whether the file is present before trying to load it.
print("CSV exists:", os.path.isfile(csv_path))

# Load the CSV into a list of documents.
csv_loader = CSVLoader(file_path=csv_path)
csv_docs = csv_loader.load()

# Summarise the result: document count, then the text and metadata of the
# first document.
print("Number of rows:", len(csv_docs))
print("Sample row:\n", csv_docs[0].page_content)
print("Metadata:", csv_docs[0].metadata)

CSV exists: True
Number of rows: 10
Sample row:
 id: 1
name: Zainab
age: 21
department: Computer Science
grade: A
Metadata: {'source': 'C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab14_Assignment/students.csv', 'row': 0}


In [14]:
# Side-by-side comparison of the three loaders: for each one, print the Python
# type of the first document's content and the metadata keys attached to it.
# Relies on pdf_docs, web_docs and csv_docs from the preceding cells.
for section_title, loader_docs in (
    ("\n=== PDF Loader ===", pdf_docs),
    ("\n=== Web Loader ===", web_docs),
    ("\n=== CSV Loader ===", csv_docs),
):
    print(section_title)
    first_doc = loader_docs[0]
    print("Content type:", type(first_doc.page_content))
    print("Metadata fields:", first_doc.metadata.keys())


=== PDF Loader ===
Content type: <class 'str'>
Metadata fields: dict_keys(['producer', 'creator', 'creationdate', 'title', 'author', 'moddate', 'source', 'total_pages', 'page', 'page_label'])

=== Web Loader ===
Content type: <class 'str'>
Metadata fields: dict_keys(['source', 'title', 'language'])

=== CSV Loader ===
Content type: <class 'str'>
Metadata fields: dict_keys(['source', 'row'])
