In [7]:
# ==========================================
# Task 1: Environment Setup
# ==========================================
!pip install -qU langchain langchain-community pypdf unstructured beautifulsoup4 pandas

import os
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, CSVLoader

# --- MOCK DATA CREATION (For Lab Purposes) ---
# Build a tiny student table row by row and persist it as 'students.csv'
# so the CSV loading task always has a file to read.
student_rows = [
    {'ID': 1, 'Name': 'Alice', 'Topic': 'RAG'},
    {'ID': 2, 'Name': 'Bob', 'Topic': 'Embeddings'},
]
df = pd.DataFrame(student_rows)
# index=False keeps the pandas row index out of the file.
df.to_csv("students.csv", index=False)

# NOTE: For Task 3 (PDF), upload a file named 'lecture_notes.pdf' using the
# Colab file sidebar. Task 3 handles a missing file gracefully, so the rest of
# the lab still runs without it.

# ==========================================
# Task 3: Load PDF Data (PyPDFLoader)
# ==========================================
print("--- TASK 3: PDF LOADING ---")
# Load 'lecture_notes.pdf' into one Document per page and show a preview.
# The file is checked for up front so that "not uploaded yet" is reported
# separately from a genuine loading/parsing failure.
pdf_path = "lecture_notes.pdf"
if not os.path.isfile(pdf_path):
    print("PDF Error: Please upload 'lecture_notes.pdf' to the folder icon on the left.")
else:
    try:
        pdf_loader = PyPDFLoader(pdf_path)
        loaded_pages = pdf_loader.load()
    except Exception as e:
        # Deliberately broad so the remaining lab tasks still run, but the
        # real cause is surfaced instead of being hidden behind the upload hint.
        print(f"PDF Error: Could not load '{pdf_path}' ({type(e).__name__}: {e})")
    else:
        if loaded_pages:
            # Bind `pages` only when there is at least one page: the
            # comparison step indexes pages[0] whenever `pages` exists.
            pages = loaded_pages
            first_page = pages[0]
            print(f"Total Pages Found: {len(pages)}")
            print(f"First Page Content (Snippet): {first_page.page_content[:200]}...")
            print(f"Metadata: {first_page.metadata}")
        else:
            print(f"PDF Error: '{pdf_path}' was loaded but no pages were extracted.")

# ==========================================
# Task 4: Load Web Data (WebBaseLoader)
# ==========================================
print("\n--- TASK 4: WEB LOADING ---")
# Fetch the LangChain introduction page and wrap it as Document objects.
web_loader = WebBaseLoader("https://python.langchain.com/docs/introduction/")
web_docs = web_loader.load()

# Preview characters 500-800 of the first document's text, then show the
# source recorded in its metadata.
first_web_doc = web_docs[0]
web_snippet = first_web_doc.page_content[500:800].strip()
print(f"Web Content Snippet: {web_snippet}...")
print(f"Web Source: {first_web_doc.metadata['source']}")

# ==========================================
# Task 5: Load Structured Data (CSVLoader)
# ==========================================
print("\n--- TASK 5: CSV LOADING ---")
# Read 'students.csv' through CSVLoader and inspect the first resulting Document.
csv_loader = CSVLoader(file_path="students.csv")
csv_docs = csv_loader.load()

first_csv_doc = csv_docs[0]
print(f"First CSV Row as Document:\n{first_csv_doc.page_content}")
print(f"CSV Metadata: {first_csv_doc.metadata}")

# ==========================================
# Task 6: Comparison
# ==========================================
print("\n--- TASK 6: COMPARISON SUMMARY ---")
# Compare the metadata keys produced by each loader.
# `pages` is only bound when the PDF task succeeded, and it can also be an
# empty list, so fall back to "no PDF document" in both cases rather than
# indexing blindly.
pdf_pages = locals().get("pages") or []
loaders = [
    ("PDF", pdf_pages[0] if pdf_pages else None),
    ("Web", web_docs[0]),
    ("CSV", csv_docs[0]),
]

for label, doc in loaders:
    # Explicit None check: skip only sources that were never loaded.
    if doc is not None:
        print(f"[{label}] Keys in Metadata: {list(doc.metadata.keys())}")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.6/329.6 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m68.6 MB/s[0m eta [3



--- TASK 3: PDF LOADING ---
PDF Error: Please upload 'lecture_notes.pdf' to the folder icon on the left.

--- TASK 4: WEB LOADING ---
Web Content Snippet: eeringModel Context Protocol (MCP)Human-in-the-loopMulti-agentRetrievalLong-term memoryAgent developmentLangSmith StudioTestAgent Chat UIDeploy with LangSmithDeploymentObservabilityOn this page Create an agent Core benefitsLangChain overviewCopy pageLangChain is an open source framework with a pre-b...
Web Source: https://python.langchain.com/docs/introduction/

--- TASK 5: CSV LOADING ---
First CSV Row as Document:
ID: 1
Name: Alice
Topic: RAG
CSV Metadata: {'source': 'students.csv', 'row': 0}

--- TASK 6: COMPARISON SUMMARY ---
[Web] Keys in Metadata: ['source', 'title', 'description', 'language']
[CSV] Keys in Metadata: ['source', 'row']
