In [None]:
# Install required packages
!pip install -q langchain langchain-community langchain-openai \
    chromadb pypdf pandas unstructured selenium beautifulsoup4

In [None]:
import os
from pathlib import Path
from typing import List, Dict
import pandas as pd

from langchain_community.document_loaders import (
    DirectoryLoader,
    TextLoader,
    PyPDFLoader,
    CSVLoader,
    WebBaseLoader,
    UnstructuredURLLoader
)
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter
)
from langchain.schema import Document

## Configuration

In [None]:
# Base paths: every input source lives in its own sub-folder of BASE_DIR
BASE_DIR = Path(r"c:\Github\GENAI\Learn-GenAI\datasets")
TXT_DIR = BASE_DIR / "txt"    # one sub-folder per category, containing *.txt files
PDF_DIR = BASE_DIR / "pdf"    # *.pdf files
CSV_DIR = BASE_DIR / "csv"    # *.csv files
DB_DIR = BASE_DIR / "db"      # SQLite database files
WEB_FILE = BASE_DIR / "web" / "websites.txt"  # one URL per line

# Output directory
OUTPUT_DIR = Path(r"c:\Github\GENAI\Learn-GenAI\rag_data")
# parents=True so a fresh checkout (where the parent folders do not exist yet)
# does not fail with FileNotFoundError
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Directory in which the Chroma collection is persisted
CHROMA_DIR = OUTPUT_DIR / "langchain_chroma"

## 1. Loading Text Files

Use DirectoryLoader with TextLoader for efficient batch processing.

In [None]:
def load_categorized_texts() -> List[Document]:
    """
    Load text files from categorized folders using DirectoryLoader.

    Every immediate sub-directory of TXT_DIR is treated as one category. All
    ``*.txt`` files below it (recursively) are loaded and tagged with
    'category', 'source_type' and 'file_name' metadata.

    Returns:
        The loaded documents of all categories. Empty when TXT_DIR does not
        exist or contains no category folders.
    """
    all_documents = []

    # Degrade gracefully when the folder is absent instead of letting
    # iterdir() raise FileNotFoundError.
    if not TXT_DIR.is_dir():
        print(f"Text directory not found: {TXT_DIR}")
        return all_documents

    # Sorted so the load order (and therefore the document order) is the same
    # on every run and every file system.
    categories = sorted(d.name for d in TXT_DIR.iterdir() if d.is_dir())

    print(f"Found categories: {categories}")

    for category in categories:
        category_path = TXT_DIR / category

        # Use DirectoryLoader for batch loading
        loader = DirectoryLoader(
            str(category_path),
            glob="**/*.txt",
            loader_cls=TextLoader,
            # Read as UTF-8 rather than the platform default encoding; fall
            # back to encoding detection for files that are not valid UTF-8.
            loader_kwargs={"encoding": "utf-8", "autodetect_encoding": True},
            show_progress=True,
            use_multithreading=True  # Enable parallel loading
        )

        documents = loader.load()

        # Add metadata
        for doc in documents:
            doc.metadata.update({
                'category': category,
                'source_type': 'text',
                'file_name': Path(doc.metadata.get('source', '')).name
            })

        all_documents.extend(documents)
        print(f"Loaded {len(documents)} documents from {category}")

    print(f"\nTotal text documents: {len(all_documents)}")
    return all_documents

# Load text documents
text_documents = load_categorized_texts()

## 2. Loading PDF Files

Use PyPDFLoader for page-by-page extraction.

In [None]:
def load_pdf_documents() -> List[Document]:
    """
    Read every ``*.pdf`` directly inside PDF_DIR with PyPDFLoader.

    PyPDFLoader yields one document per page; each page is tagged with
    'source_type', 'file_name' and 'category' metadata. A file that fails to
    load is reported and skipped so the remaining files are still processed.

    Returns:
        All page documents of all successfully loaded PDFs.
    """
    pdf_paths = list(PDF_DIR.glob('*.pdf'))
    print(f"Found {len(pdf_paths)} PDF files")

    collected = []
    for path in pdf_paths:
        try:
            pages = PyPDFLoader(str(path)).load()

            # Tag every page with where it came from
            for page in pages:
                page.metadata.update(
                    source_type='pdf',
                    file_name=path.name,
                    category='technical_documentation',
                )

            collected.extend(pages)
            print(f"Loaded {len(pages)} pages from {path.name}")

        except Exception as exc:
            print(f"Error loading {path.name}: {exc}")

    print(f"\nTotal PDF documents: {len(collected)}")
    return collected

# Load PDFs
pdf_documents = load_pdf_documents()

## 3. Loading CSV Files

Use CSVLoader to convert rows to documents.

In [None]:
def load_csv_documents() -> List[Document]:
    """
    Read every ``*.csv`` directly inside CSV_DIR with CSVLoader.

    Each CSV row becomes one document, tagged with 'source_type' and
    'file_name' metadata. A file that fails to load is reported and skipped.

    Returns:
        All row documents of all successfully loaded CSV files.
    """
    collected = []

    for path in list(CSV_DIR.glob('*.csv')):
        try:
            row_docs = CSVLoader(
                file_path=str(path),
                encoding='utf-8',
                csv_args={'delimiter': ','}
            ).load()

            # Tag every row with where it came from
            for row_doc in row_docs:
                row_doc.metadata.update(source_type='csv', file_name=path.name)

            collected.extend(row_docs)
            print(f"Loaded {len(row_docs)} rows from {path.name}")

        except Exception as exc:
            print(f"Error loading {path.name}: {exc}")

    print(f"\nTotal CSV documents: {len(collected)}")
    return collected

# Load CSV data
csv_documents = load_csv_documents()

## 4. Loading SQLite Database

Query the database and convert each table row into a document.

In [None]:
import sqlite3

def _quote_identifier(name: str) -> str:
    """Return `name` as a double-quoted SQLite identifier (embedded quotes doubled)."""
    return '"' + name.replace('"', '""') + '"'

def load_database_documents(db_path: Path) -> List[Document]:
    """
    Load SQLite database tables as documents.

    Every row of every table listed in ``sqlite_master`` becomes one document
    whose text is ``column: value`` lines. The row values are also copied into
    the metadata as strings, next to 'source_type', 'table_name' and 'db_name'.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        One document per table row. Empty when `db_path` does not exist.
    """
    documents = []

    if not db_path.exists():
        print(f"Database not found: {db_path}")
        return documents

    conn = sqlite3.connect(str(db_path))
    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = [row[0] for row in cursor.fetchall()]

        print(f"Found tables: {tables}")

        for table in tables:
            # Table names cannot be bound as parameters, so quote them; this
            # keeps names containing spaces, quotes or keywords working.
            cursor.execute(f"SELECT * FROM {_quote_identifier(table)}")

            # Take the column names from the result set itself so they always
            # line up with the values of each fetched row.
            columns = [col[0] for col in cursor.description]
            rows = cursor.fetchall()

            # Convert to documents
            for row in rows:
                row_dict = dict(zip(columns, row))

                # Create text content
                content = "\n".join([f"{k}: {v}" for k, v in row_dict.items()])

                doc = Document(
                    page_content=content,
                    metadata={
                        **{k: str(v) for k, v in row_dict.items()},
                        # Reserved keys come last so a table column with the
                        # same name cannot overwrite them.
                        'source_type': 'database',
                        'table_name': table,
                        'db_name': db_path.name,
                    }
                )
                documents.append(doc)

            print(f"Loaded {len(rows)} rows from table '{table}'")
    finally:
        # Release the file handle even when a query fails half-way
        conn.close()

    print(f"\nTotal database documents: {len(documents)}")
    return documents

# Load database
db_path = DB_DIR / "movies.sqlite"
db_documents = load_database_documents(db_path)

## 5. Loading Web URLs

Fetch content from web pages.

In [None]:
def load_web_documents() -> List[Document]:
    """
    Load web pages from URLs using WebBaseLoader.

    URLs are read from WEB_FILE, one per line; lines that do not start with
    'http' after trimming whitespace are ignored. Each URL is fetched on its
    own so that one unreachable page does not discard the others.

    Returns:
        The successfully loaded page documents, tagged with 'source_type' and
        'category' metadata. Empty when WEB_FILE is missing or lists no URLs.
    """
    documents = []

    if not WEB_FILE.exists():
        print("No websites.txt file found")
        return documents

    # Read URLs. 'utf-8-sig' also tolerates a byte-order mark at the start of
    # the file, which would otherwise hide the first URL.
    with open(WEB_FILE, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Test the *stripped* line so indented URLs are not silently dropped
        urls = [url for url in stripped_lines if url.startswith('http')]

    print(f"Found {len(urls)} URLs")

    for url in urls:
        try:
            pages = WebBaseLoader(url).load()
        except Exception as e:
            print(f"Error loading {url}: {e}")
            print("Note: Web scraping may be blocked or require browser automation")
            continue

        # Add metadata
        for doc in pages:
            doc.metadata.update({
                'source_type': 'web',
                'category': 'documentation'
            })

        documents.extend(pages)

    print(f"Successfully loaded {len(documents)} web pages")
    return documents

# Load web documents
web_documents = load_web_documents()

## 6. Combine All Documents

In [None]:
# Concatenate the per-source lists into one corpus (text, PDF, CSV, DB, web)
all_documents = [
    *text_documents,
    *pdf_documents,
    *csv_documents,
    *db_documents,
    *web_documents,
]

# Build the summary report line by line and emit it in one go
_heavy_rule = "=" * 60
_summary_lines = [
    f"\n{_heavy_rule}",
    "DOCUMENT LOADING SUMMARY",
    _heavy_rule,
    f"Text documents: {len(text_documents)}",
    f"PDF documents: {len(pdf_documents)}",
    f"CSV documents: {len(csv_documents)}",
    f"Database documents: {len(db_documents)}",
    f"Web documents: {len(web_documents)}",
    "-" * 60,
    f"TOTAL DOCUMENTS: {len(all_documents)}",
    _heavy_rule,
]
print("\n".join(_summary_lines))

## 7. Split Documents into Chunks

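Split the loaded documents into overlapping chunks (at most 512 characters, with a 50-character overlap) so that each chunk can be embedded and retrieved on its own.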

In [None]:
# Create text splitter: chunks of at most 512 characters with a 50-character
# overlap, preferring to break at paragraph, then line, then word boundaries
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

# Split documents
splits = text_splitter.split_documents(all_documents)

print(f"\nSplit {len(all_documents)} documents into {len(splits)} chunks")
# Every loader returns an empty list when its data is missing, so guard the
# average against a division by zero.
if all_documents:
    print(f"Average chunks per document: {len(splits)/len(all_documents):.2f}")
else:
    print("No documents were loaded; nothing to split")

## 8. Create Vector Store with ChromaDB

Build a persistent vector store for retrieval.

In [None]:
import hashlib

from langchain_community.embeddings import HuggingFaceEmbeddings


def _stable_chunk_ids(chunks: List[Document]) -> List[str]:
    """
    Derive a deterministic, unique id for every chunk.

    The id is a SHA-256 digest of the chunk's metadata and text, followed by
    an occurrence counter so that identical chunks still get distinct ids.

    Args:
        chunks: The chunks that are about to be stored.

    Returns:
        One id per chunk, in the same order as `chunks`.
    """
    occurrences: Dict[str, int] = {}
    ids = []
    for chunk in chunks:
        # Metadata keys are unique, so sorting the items orders them by key only
        fingerprint = repr(sorted(chunk.metadata.items())) + "\x00" + chunk.page_content
        digest = hashlib.sha256(
            fingerprint.encode("utf-8", errors="backslashreplace")
        ).hexdigest()
        seen = occurrences.get(digest, 0)
        occurrences[digest] = seen + 1
        ids.append(f"{digest}-{seen}")
    return ids


# Initialize embeddings (using free local model)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

print("Creating ChromaDB vector store...")

# Create vector store.
# Explicit, content-derived ids make this cell safe to re-run: without them
# every run stores all chunks again under fresh random ids in the persisted
# collection, so retrieval returns the same passage several times.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    ids=_stable_chunk_ids(splits),
    persist_directory=str(CHROMA_DIR),
    collection_name="rag_collection"
)

print(f"Vector store created with {len(splits)} chunks")
print(f"Persisted to: {CHROMA_DIR}")

## 9. Test Retrieval

Query the vector store.

In [None]:
# Create retriever: plain similarity search returning the 5 nearest chunks
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}
)

# Test queries
test_queries = [
    "What are recent business developments?",
    "Explain machine learning transformers",
    "What's in the crime safety data?"
]

for query in test_queries:
    print(f"\nQuery: {query}")
    print("-" * 80)

    # `invoke` is the Runnable entry point of a retriever; the older
    # `get_relevant_documents` is deprecated and emits a warning.
    results = retriever.invoke(query)

    # Only the top 3 of the retrieved chunks are printed to keep output short
    for i, doc in enumerate(results[:3], 1):
        print(f"\nResult {i}:")
        print(f"Category: {doc.metadata.get('category', 'N/A')}")
        print(f"Source: {doc.metadata.get('source_type', 'N/A')}")
        print(f"Content: {doc.page_content[:200]}...")
    print("\n" + "="*80)

## 10. Metadata Filtering

Filter by source type or category.

In [None]:
# Search with metadata filter - business documents only.
# The filter is matched against chunk metadata, so only chunks whose
# 'category' is exactly "business" are candidates for the similarity search.
business_retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 5,
        "filter": {"category": "business"}
    }
)

# `invoke` replaces the deprecated `get_relevant_documents`
results = business_retriever.invoke(
    "What are the latest business trends?"
)

print("Business-filtered results:")
for i, doc in enumerate(results[:3], 1):
    print(f"\n{i}. {doc.metadata.get('file_name', 'Unknown')}")
    print(f"   {doc.page_content[:150]}...")

## 11. Create QA Chain

Build a complete RAG pipeline with an LLM.

In [None]:
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

# Initialize LLM (using local Ollama)
# Make sure Ollama is running with: ollama serve
# And a model is pulled: ollama pull llama2

try:
    llm = Ollama(model="llama2", temperature=0)

    # Create QA chain: the "stuff" chain type puts all retrieved chunks into a
    # single prompt; the source chunks are returned alongside the answer.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )

    # Test the chain. `invoke` replaces calling the chain object directly,
    # which is deprecated and emits a warning.
    query = "What are the key concepts in machine learning?"
    result = qa_chain.invoke({"query": query})

    print(f"Question: {query}")
    print(f"\nAnswer: {result['result']}")
    print(f"\nSources used: {len(result['source_documents'])}")

except Exception as e:
    # Best-effort demo cell: the notebook should keep running without an LLM
    print(f"LLM not available: {e}")
    print("Install Ollama and run: ollama pull llama2")

## 12. Advanced: Incremental Loading

Add new documents to existing vector store.

In [None]:
def add_documents_incrementally(new_documents: List[Document]):
    """
    Add new documents to existing ChromaDB collection.

    The documents are split with the notebook's `text_splitter` and embedded
    with the notebook's `embeddings` before being added to the persisted
    "rag_collection" collection in CHROMA_DIR.

    Args:
        new_documents: Documents to chunk and add. May be empty.

    Returns:
        The vector store opened on the persisted collection.
    """
    # Load existing vector store
    existing_vectorstore = Chroma(
        persist_directory=str(CHROMA_DIR),
        embedding_function=embeddings,
        collection_name="rag_collection"
    )

    # Split new documents
    new_splits = text_splitter.split_documents(new_documents)

    # Nothing to embed: skip the store call instead of passing it an empty batch
    if not new_splits:
        print("No new chunks to add to vector store")
        return existing_vectorstore

    # Add to vector store
    existing_vectorstore.add_documents(new_splits)

    print(f"Added {len(new_splits)} new chunks to vector store")
    return existing_vectorstore

# Example: Add new documents
# new_docs = [Document(page_content="New content", metadata={"source": "new"})]
# add_documents_incrementally(new_docs)

## Summary

This notebook demonstrated:
1. ✅ Batch loading text files with DirectoryLoader
2. ✅ PDF loading with PyPDFLoader (page-level)
3. ✅ CSV loading with CSVLoader
4. ✅ SQLite database querying
5. ✅ Web scraping with WebBaseLoader
6. ✅ Document chunking with RecursiveCharacterTextSplitter
7. ✅ Vector store creation with ChromaDB
8. ✅ Metadata filtering for targeted retrieval
9. ✅ Complete RAG pipeline with QA chain
10. ✅ Incremental document addition

**LangChain Advantages:**
- Rich ecosystem of loaders for different formats
- Easy integration with LLMs and chains
- Built-in text splitters with smart chunking
- Multiple vector store backends

**Next Steps:**
- Configure OpenAI or Azure OpenAI embeddings
- Implement conversational retrieval chains
- Add memory for multi-turn conversations
- Experiment with different retrievers (MMR, similarity threshold)