In [None]:
# Install required packages
!pip install -q llama-index llama-index-readers-file llama-index-readers-web \
    llama-index-readers-database pypdf pandas sqlite3

In [None]:
import os
from pathlib import Path
from typing import List, Dict
import pandas as pd

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Document,
    StorageContext,
    Settings
)
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser
)
from llama_index.readers.file import PDFReader, CSVReader
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core.schema import MetadataMode

## Configuration

In [None]:
# Base paths.
# The dataset root defaults to the original local checkout, but can be
# overridden with the GENAI_DATASETS_DIR environment variable so the notebook
# runs on other machines without editing this cell.
BASE_DIR = Path(os.environ.get("GENAI_DATASETS_DIR", r"c:\Github\GENAI\Learn-GenAI\datasets"))
TXT_DIR = BASE_DIR / "txt"
PDF_DIR = BASE_DIR / "pdf"
CSV_DIR = BASE_DIR / "csv"
DB_DIR = BASE_DIR / "db"
WEB_FILE = BASE_DIR / "web" / "websites.txt"

# Output directory for processed data (override with GENAI_RAG_OUTPUT_DIR).
OUTPUT_DIR = Path(os.environ.get("GENAI_RAG_OUTPUT_DIR", r"c:\Github\GENAI\Learn-GenAI\rag_data"))
# parents=True also creates missing intermediate folders; without it mkdir
# raises FileNotFoundError when the parent directory does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

## 1. Loading Text Files (Categorized Content)

Efficiently load text files from multiple categories with metadata enrichment.

In [None]:
def load_categorized_texts() -> List[Document]:
    """
    Load ``.txt`` files from each category folder directly under ``TXT_DIR``.

    Every immediate sub-directory of ``TXT_DIR`` is treated as one category.
    Its top-level ``.txt`` files are read with ``SimpleDirectoryReader``
    (``recursive=False``, so nested folders are not descended into) and every
    resulting document is tagged with ``category``, ``source_type`` and
    ``file_name`` metadata.

    Returns:
        All loaded documents, grouped by category in alphabetical order.
        An empty list when ``TXT_DIR`` does not exist.
    """
    all_documents = []

    if not TXT_DIR.is_dir():
        print(f"Text directory not found: {TXT_DIR}")
        return all_documents

    # iterdir() yields entries in arbitrary order; sort so that the load order
    # is reproducible between runs and machines.
    categories = sorted(d.name for d in TXT_DIR.iterdir() if d.is_dir())

    print(f"Found categories: {categories}")

    for category in categories:
        category_path = TXT_DIR / category

        try:
            # Use SimpleDirectoryReader for batch loading of one category
            reader = SimpleDirectoryReader(
                input_dir=str(category_path),
                recursive=False,
                required_exts=['.txt'],
                filename_as_id=True  # Use filename as document ID
            )
        except ValueError as exc:
            # SimpleDirectoryReader raises ValueError when the folder holds no
            # matching files; skip that category instead of aborting the load.
            print(f"Skipping {category}: {exc}")
            continue

        documents = reader.load_data()

        # Enrich with category metadata
        for doc in documents:
            doc.metadata.update({
                'category': category,
                'source_type': 'text',
                'file_name': Path(doc.metadata.get('file_path', '')).name
            })

        all_documents.extend(documents)
        print(f"Loaded {len(documents)} documents from {category}")

    print(f"\nTotal text documents loaded: {len(all_documents)}")
    return all_documents

# Load text documents
text_documents = load_categorized_texts()

## 2. Loading PDF Files (Books)

Load PDF books with page-level metadata for precise retrieval.

In [None]:
def load_pdf_documents() -> List[Document]:
    """
    Load the ``.pdf`` files located directly inside ``PDF_DIR``.

    Files are read with ``SimpleDirectoryReader`` (non-recursive). Each
    returned document is tagged with ``source_type='pdf'``, its ``file_name``
    and the fixed category ``'technical_documentation'``.
    NOTE(review): the default PDF reader is expected to emit one document per
    page - confirm against the installed llama-index version.

    Returns:
        The loaded documents, or an empty list when the directory is missing
        or contains no PDF files.
    """
    try:
        # SimpleDirectoryReader handles PDFs natively
        reader = SimpleDirectoryReader(
            input_dir=str(PDF_DIR),
            recursive=False,
            required_exts=['.pdf'],
            filename_as_id=True
        )
    except ValueError as exc:
        # Raised when the directory does not exist or has no matching files.
        # Degrade to an empty result, like the database and web loaders do,
        # so one missing source does not stop the whole notebook.
        print(f"No PDF documents loaded: {exc}")
        return []

    documents = reader.load_data()

    # Add metadata
    for doc in documents:
        file_name = Path(doc.metadata.get('file_path', '')).name
        doc.metadata.update({
            'source_type': 'pdf',
            'file_name': file_name,
            'category': 'technical_documentation'
        })

    print(f"Loaded {len(documents)} PDF documents")
    return documents

# Load PDFs
pdf_documents = load_pdf_documents()

## 3. Loading CSV Files

Load CSV data with pandas for structured data handling.

In [None]:
def load_csv_documents() -> List[Document]:
    """
    Load every ``*.csv`` file in ``CSV_DIR`` and turn each row into a document.

    The document text is one ``"column: value"`` line per column. The column
    values are also stored (as strings) in the metadata so they can be used in
    metadata filters, next to the provenance keys ``source_type``,
    ``file_name`` and ``row_index``.

    Returns:
        One document per CSV row, files processed in sorted name order.
    """
    documents = []

    # glob() order depends on the filesystem; sort for reproducible output.
    csv_files = sorted(CSV_DIR.glob('*.csv'))

    for csv_file in csv_files:
        df = pd.read_csv(csv_file)

        # Convert each row to a document
        for idx, row in df.iterrows():
            # Create text content from all columns
            text_parts = [f"{col}: {val}" for col, val in row.items()]
            text_content = "\n".join(text_parts)

            provenance = {
                'source_type': 'csv',
                'file_name': csv_file.name,
                'row_index': idx,
            }
            column_metadata = {k: str(v) for k, v in row.items()}

            # The column values are already part of the text. Keeping them out
            # of the embed/LLM metadata avoids embedding every value twice and
            # keeps the metadata short enough for the 512-token chunker, which
            # rejects nodes whose metadata is longer than the chunk size.
            text_only_keys = [k for k in column_metadata if k not in provenance]

            # Create document with metadata
            doc = Document(
                text=text_content,
                # Provenance is merged last so a CSV column that happens to be
                # called e.g. "source_type" cannot overwrite it.
                metadata={**column_metadata, **provenance},
                excluded_embed_metadata_keys=text_only_keys,
                excluded_llm_metadata_keys=text_only_keys,
            )
            documents.append(doc)

        print(f"Loaded {len(df)} rows from {csv_file.name}")

    print(f"\nTotal CSV documents: {len(documents)}")
    return documents

# Load CSV data
csv_documents = load_csv_documents()

## 4. Loading SQLite Database

Query the SQLite database and convert each table row into a document.

In [None]:
import sqlite3

def _quote_identifier(name: str) -> str:
    """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + name.replace('"', '""') + '"'


def load_database_documents(db_path: Path, table_name: str = None) -> List[Document]:
    """
    Load rows from a SQLite database and turn each row into a document.

    Args:
        db_path: Path of the SQLite database file.
        table_name: Table to load. When ``None``, every user table is loaded
            (SQLite's internal ``sqlite_*`` bookkeeping tables are skipped).

    Returns:
        One document per row. The text is one ``"column: value"`` line per
        column; the metadata holds the stringified column values plus
        ``source_type``, ``table_name`` and ``db_name``.

    Raises:
        sqlite3.Error: If the database cannot be read or the table is missing.
    """
    documents = []

    conn = sqlite3.connect(str(db_path))
    try:
        cursor = conn.cursor()

        # Get all tables if not specified
        if table_name is None:
            cursor.execute(
                "SELECT name FROM sqlite_master "
                "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
            )
            tables = [row[0] for row in cursor.fetchall()]
        else:
            tables = [table_name]

        for table in tables:
            # Quote the identifier so table names containing spaces, quotes or
            # reserved words are read correctly instead of breaking the query.
            cursor.execute(f"SELECT * FROM {_quote_identifier(table)}")
            # cursor.description carries the result column names, so no
            # separate PRAGMA round trip is needed.
            columns = [col[0] for col in cursor.description]
            rows = cursor.fetchall()

            # Convert each row to a document
            for row in rows:
                row_dict = dict(zip(columns, row))

                # Create text content
                text_parts = [f"{col}: {val}" for col, val in row_dict.items()]
                text_content = "\n".join(text_parts)

                provenance = {
                    'source_type': 'database',
                    'table_name': table,
                    'db_name': db_path.name,
                }
                column_metadata = {k: str(v) for k, v in row_dict.items()}

                # Column values already appear in the text; excluding them from
                # the embed/LLM metadata avoids duplicating them and keeps the
                # metadata shorter than the chunk size used by the splitter.
                text_only_keys = [k for k in column_metadata if k not in provenance]

                doc = Document(
                    text=text_content,
                    # Provenance goes last so same-named columns cannot
                    # overwrite it.
                    metadata={**column_metadata, **provenance},
                    excluded_embed_metadata_keys=text_only_keys,
                    excluded_llm_metadata_keys=text_only_keys,
                )
                documents.append(doc)

            print(f"Loaded {len(rows)} rows from table '{table}'")
    finally:
        # Always release the connection, even when a query fails.
        conn.close()

    print(f"\nTotal database documents: {len(documents)}")
    return documents

# Load database
db_path = DB_DIR / "movies.sqlite"
if db_path.exists():
    db_documents = load_database_documents(db_path)
else:
    db_documents = []
    print("No database file found")

## 5. Loading Web URLs

Fetch content from web pages listed in the websites file.

In [None]:
def load_web_documents() -> List[Document]:
    """
    Load the web pages whose URLs are listed in ``WEB_FILE``.

    ``WEB_FILE`` is read line by line; surrounding whitespace is removed and
    only lines starting with ``http`` are treated as URLs. Each URL is fetched
    on its own so that a single failing page does not discard the others, and
    so that every document is tagged with the URL it really came from.

    Returns:
        The documents of all pages that could be fetched, tagged with
        ``source_type``, ``url`` and ``category``. Empty when the URL file is
        missing or nothing could be loaded.
    """
    documents = []

    if not WEB_FILE.exists():
        print("No websites.txt file found")
        return documents

    # Read URLs from file. Strip before testing the prefix so that indented
    # URLs are not silently dropped.
    with open(WEB_FILE, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        urls = [line for line in stripped_lines if line.startswith('http')]

    print(f"Found {len(urls)} URLs to process")

    # Use SimpleWebPageReader
    try:
        reader = SimpleWebPageReader(html_to_text=True)
    except Exception as e:
        # Best effort: a missing optional dependency must not stop the notebook.
        print(f"Error loading web pages: {e}")
        print("Note: Web scraping may require additional setup or may be blocked")
        return documents

    for url in urls:
        try:
            page_documents = reader.load_data([url])
        except Exception as e:
            # Best effort per page: report the failure and keep going.
            print(f"Error loading {url}: {e}")
            continue

        # Add metadata
        for doc in page_documents:
            doc.metadata.update({
                'source_type': 'web',
                'url': url,
                'category': 'documentation'
            })
        documents.extend(page_documents)

    print(f"Successfully loaded {len(documents)} web pages")
    if len(documents) < len(urls):
        print("Note: Web scraping may require additional setup or may be blocked")

    return documents

# Load web documents
web_documents = load_web_documents()

## 6. Combine All Documents

Merge all data sources into a unified document collection.

In [None]:
# Pair every source label with the documents loaded for it, in report order
loaded_by_source = [
    ("Text", text_documents),
    ("PDF", pdf_documents),
    ("CSV", csv_documents),
    ("Database", db_documents),
    ("Web", web_documents),
]

# Flatten the per-source lists into one combined collection
all_documents = [doc for _, source_docs in loaded_by_source for doc in source_docs]

heavy_rule = "=" * 60
light_rule = "-" * 60

# Print the per-source counts followed by the grand total
print(f"\n{heavy_rule}")
print("DOCUMENT LOADING SUMMARY")
print(heavy_rule)
for label, source_docs in loaded_by_source:
    print(f"{label} documents: {len(source_docs)}")
print(light_rule)
print(f"TOTAL DOCUMENTS: {len(all_documents)}")
print(heavy_rule)

## 7. Parse Documents into Nodes

Use efficient node parsers to chunk documents for optimal retrieval.

In [None]:
# Sentence-aware splitter: chunks of at most 512 tokens, with 50 tokens of
# overlap between neighbouring chunks.
node_parser = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separator=" "
)

# Parse documents into nodes
nodes = node_parser.get_nodes_from_documents(all_documents)

print(f"\nCreated {len(nodes)} nodes from {len(all_documents)} documents")
# Guard the average: with no loaded documents the division would raise
# ZeroDivisionError and stop the notebook on a purely cosmetic line.
if all_documents:
    print(f"Average nodes per document: {len(nodes)/len(all_documents):.2f}")
else:
    print("Average nodes per document: n/a (no documents loaded)")

## 8. Create Vector Store Index

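Build a vector index for efficient semantic search. Unless `Settings.embed_model` is configured beforehand, LlamaIndex uses its default OpenAI embedding model, which requires the `OPENAI_API_KEY` environment variable to be set.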

In [None]:
# Build the vector index over the parsed nodes.
# The default embedding model is used here; assign Settings.embed_model
# beforehand to embed with a different model.
index_storage_dir = OUTPUT_DIR / "llamaindex_storage"

print("Creating vector store index...")
index = VectorStoreIndex(nodes)

# Write the index to disk so it can be reloaded later
index.storage_context.persist(persist_dir=str(index_storage_dir))

print(f"Index created and saved to {index_storage_dir}")

## 9. Query the Index

Test retrieval with sample queries.

In [None]:
# Query engine: retrieve the 5 most similar nodes and answer in "compact" mode
engine_options = {"similarity_top_k": 5, "response_mode": "compact"}
query_engine = index.as_query_engine(**engine_options)

# Sample questions, one per kind of source
test_queries = [
    "What are the main topics in business articles?",
    "Tell me about machine learning concepts from the PDFs",
    "What information is in the crime safety dataset?",
]

divider = "-" * 80

# Run each question and print it together with its answer
for query in test_queries:
    print(f"\nQuery: {query}")
    print(divider)
    response = query_engine.query(query)
    print(f"Response: {response}\n")

## 10. Metadata Filtering

Demonstrate filtered queries by source type or category.

In [None]:
from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter

def _exact_match_engine(key: str, value: str):
    """Return a top-5 query engine limited to nodes whose metadata ``key`` equals ``value``."""
    only_matching = MetadataFilters(
        filters=[ExactMatchFilter(key=key, value=value)]
    )
    return index.as_query_engine(similarity_top_k=5, filters=only_matching)


# Restrict retrieval to the "business" category
business_query_engine = _exact_match_engine("category", "business")

response = business_query_engine.query("What are the recent business trends?")
print(f"Business-filtered response: {response}")

# Restrict retrieval to documents that came from PDFs
pdf_query_engine = _exact_match_engine("source_type", "pdf")

response = pdf_query_engine.query("Explain transformers in machine learning")
print(f"\nPDF-filtered response: {response}")

## 11. Advanced: Batch Processing for Large Datasets

Process documents in batches to manage memory efficiently.

In [None]:
def load_and_index_in_batches(batch_size: int = 100):
    """
    Index ``all_documents`` in batches, persisting after every batch.

    An index previously persisted in ``llamaindex_storage_batched`` is reloaded
    and extended; otherwise a new empty index is created. Documents whose id
    is already registered in the reloaded index are skipped, so re-running the
    function (or resuming after an interrupted run) does not insert the same
    document twice.
    NOTE(review): this only deduplicates documents with stable ids (e.g. those
    loaded with ``filename_as_id=True``); documents created without an
    explicit id presumably get a fresh id in every kernel session - confirm.

    Args:
        batch_size: Number of documents parsed and inserted per batch.
            Must be a positive integer.

    Returns:
        The populated index.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    from llama_index.core import load_index_from_storage

    # A non-positive step would either raise an opaque range() error (0) or
    # silently index nothing while reporting success (negative).
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Initialize or load existing index
    storage_dir = OUTPUT_DIR / "llamaindex_storage_batched"

    if storage_dir.exists():
        storage_context = StorageContext.from_defaults(persist_dir=str(storage_dir))
        index = load_index_from_storage(storage_context)
        print("Loaded existing index")
        # Ids of the source documents that were already ingested
        already_indexed = set(index.ref_doc_info)
    else:
        index = VectorStoreIndex([])
        print("Created new index")
        already_indexed = set()

    pending = [doc for doc in all_documents if doc.id_ not in already_indexed]
    skipped = len(all_documents) - len(pending)
    if skipped:
        print(f"Skipping {skipped} documents that are already indexed")

    # Process documents in batches
    total_docs = len(pending)

    for i in range(0, total_docs, batch_size):
        batch = pending[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1} ({len(batch)} documents)...")

        # Parse and insert batch
        batch_nodes = node_parser.get_nodes_from_documents(batch)
        index.insert_nodes(batch_nodes)

        # Persist after each batch so an interrupted run keeps its progress
        index.storage_context.persist(persist_dir=str(storage_dir))

    print(f"\nBatch indexing complete. Processed {total_docs} documents.")
    return index

# Uncomment to run batch processing
# batched_index = load_and_index_in_batches(batch_size=100)

## Summary

This notebook demonstrated:
1. ✅ Loading text files from categorized folders
2. ✅ Loading PDF documents with metadata
3. ✅ Converting CSV data to searchable documents
4. ✅ Querying SQLite databases
5. ✅ Fetching web content
6. ✅ Combining all sources into unified index
7. ✅ Efficient node parsing and chunking
8. ✅ Metadata-based filtering
9. ✅ Batch processing for scalability

**Next Steps:**
- Configure custom embedding models (OpenAI, HuggingFace, etc.)
- Experiment with different node parsers (`SemanticSplitterNodeParser`, `SentenceWindowNodeParser`)
- Implement hybrid search (keyword + semantic)
- Add reranking for improved retrieval accuracy