In [41]:
# Install required packages
%pip install -q langchain langchain_community chromadb pymupdf PyPDF2 FlagEmbedding
%pip install -q sentence-transformers transformers torch pydantic


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [42]:
# Core imports
import torch
import os
from pathlib import Path
from typing import List, Dict, Any
from tqdm import tqdm

# Document processing
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

# Embeddings and vector store
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Reranking
from FlagEmbedding import FlagReranker

# LLM
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Data validation
from pydantic import BaseModel, Field, field_validator

import re
import json
import warnings
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Prevent tokenizer warnings


In [43]:
# Configuration
BOOK_PDF_PATH = Path("../Sources/book.pdf")
SECTIONS_JSON_PATH = Path("./working/page_to_section.json")
CHROMA_PATH = Path("./chroma")

# Model settings
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"
RERANKER_MODEL = "BAAI/bge-reranker-v2-m3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Document processing settings
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100
LOGICAL_PAGE_OFFSET = 12  # Difference between physical and logical page numbers

# Verify paths exist
assert BOOK_PDF_PATH.exists(), f"PDF not found: {BOOK_PDF_PATH}"
assert SECTIONS_JSON_PATH.exists(), f"Sections JSON not found: {SECTIONS_JSON_PATH}"
CHROMA_PATH.mkdir(parents=True, exist_ok=True)

print("Device being used:", DEVICE)


Device being used: cpu


In [44]:
# Load section mapping
def load_section_mapping():
    """Load and prepare section mapping"""
    print("Loading sections mapping...")
    with open(SECTIONS_JSON_PATH, "r") as f:
        sections_map = json.load(f)
    
    # Ensure all keys are strings
    sections_map = {str(k): v for k, v in sections_map.items()}
    print(f"Loaded {len(sections_map)} page-to-section mappings")
    return sections_map

# Text cleaning
def clean_text(text: str) -> str:
    """Clean and normalize text, removing supplementary sections"""
    if not text:
        return ""
    
    # Remove supplementary sections
    sections_to_remove = [
        r'Key Terms\s.*?(?=\n\n|\Z)',
        r'Summary\s.*?(?=\n\n|\Z)',
        r'Review Questions\s.*?(?=\n\n|\Z)',
        r'Critical Thinking Questions\s.*?(?=\n\n|\Z)',
        r'Personal Application Questions\s.*?(?=\n\n|\Z)',
        r'Access for free at openstax\.org\.*',
        r'LINK TO LEARNING.*?(?=\n|$)',
        r'Watch a brief video.*?(?=\n|$)',
        r'http[s]?://\S+'
    ]
    
    for pattern in sections_to_remove:
        text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Load sections mapping
sections_map = load_section_mapping()


Loading sections mapping...
Loaded 494 page-to-section mappings


In [45]:
# Pydantic models for validation
class ChunkMetadata(BaseModel):
    """Metadata for a single chunk"""
    chunk_id: str
    physical_page: int
    logical_page: int
    section: str

class RetrievedChunk(BaseModel):
    """A single retrieved chunk with metadata"""
    text: str
    metadata: ChunkMetadata
    score: float

class QueryResults(BaseModel):
    """Results for a single query"""
    query: str
    chunks: List[RetrievedChunk] = Field(
        ...,
        min_items=5,
        max_items=5,
        description="Top 5 retrieved chunks for this query"
    )

class FinalResponse(BaseModel):
    """Final RAG output"""
    query: str
    answer: str
    source_chunks: List[ChunkMetadata]
    sections_referenced: List[str]


In [46]:
# Pydantic models for validation
class ChunkMetadata(BaseModel):
    """Metadata for a single chunk"""
    chunk_id: str
    physical_page: int
    logical_page: int
    section: str

class RetrievedChunk(BaseModel):
    """A single retrieved chunk with metadata"""
    text: str
    metadata: ChunkMetadata
    score: float

class QueryVariation(BaseModel):
    """A single query variation with its results"""
    query: str
    chunks: List[RetrievedChunk] = Field(
        ...,
        min_items=5,
        max_items=5,
        description="Top 5 retrieved chunks for this query"
    )

class QueryExpansionResult(BaseModel):
    """Results from query expansion"""
    original_query: str
    variations: List[QueryVariation] = Field(
        ...,
        min_items=5,
        max_items=5,
        description="Original query plus 4 variations"
    )

class FinalResponse(BaseModel):
    """Final RAG output"""
    query: str
    answer: str
    source_chunks: List[ChunkMetadata]
    sections_referenced: List[str]
    query_variations: List[str]  # Store the variations used

    @field_validator("source_chunks")
    def validate_chunks(cls, v):
        if not v:
            raise ValueError("Must have at least one source chunk")
        return v

    @field_validator("sections_referenced")
    def validate_sections(cls, v):
        if not v:
            raise ValueError("Must have at least one section referenced")
        return v

    @field_validator("query_variations")
    def validate_variations(cls, v):
        if len(v) != 5:
            raise ValueError("Must have exactly 5 query variations")
        if len(set(v)) != len(v):
            raise ValueError("All query variations must be unique")
        return v


In [47]:
# Setup LLM for generation
print("Loading Flan-T5-small model...")
model_name = "google/flan-t5-small"

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float32
)

def generate_response(query: str, context: str) -> str:
    """Generate response using T5 model"""
    prompt = f"""Based on the provided context, answer the question clearly and accurately.

    Context: {context}

    Question: {query}

    Instructions:
    1. Use ONLY information from the context
    2. Be specific and accurate
    3. Include relevant page references [Page X] when possible
    4. Provide a clear, structured explanation
    5. If context doesn't contain enough information, say so

    Format:
    - Start with a clear definition or main point
    - Provide supporting details from the context
    - Include page references in [brackets]
    - End with a brief summary if appropriate

    Answer:"""
    
    # Tokenize
    inputs = tokenizer(
        prompt, 
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(model.device)
    
    # Generate
    outputs = model.generate(
        **inputs,
        max_length=300,  # Increased for more complete answers
        min_length=100,  # Increased for more detailed responses
        num_beams=5,     # More beam search paths
        length_penalty=1.5,  # Favor longer responses
        no_repeat_ngram_size=3
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def generate_query_variations(query: str) -> List[str]:
    """Generate query variations using LLM"""
    # Extract core concept from query
    core_concept = query.lower().replace('what is ', '').replace('?', '').strip()
    
    # Template-based variations that work well for textbook content
    variations = [
        query,  # Original query
        f"How would you define {core_concept} in academic terms?",
        f"What are the main concepts and principles of {core_concept}?",
        f"Can you explain the fundamental aspects of {core_concept}?",
        f"What are the key characteristics that define {core_concept}?"
    ]
    
    return variations
    
    # Generate variations
    response = generate_response(query=query, context=prompt)
    
    # Split into lines and clean
    variations = [
        line.strip().split('.', 1)[-1].strip()  # Remove numbering
        for line in response.split('\n')
        if line.strip() and any(c.isdigit() for c in line)  # Only lines with numbers
    ]
    
    # Ensure variations are unique
    variations = list(dict.fromkeys(variations))  # Remove duplicates
    
    # Add original query at the start
    all_queries = [query]
    
    # Add unique variations
    for var in variations:
        if var not in all_queries and len(all_queries) < 5:
            all_queries.append(var)
    
    # If we still need more variations, add template-based ones
    template_variations = [
        f"Explain the concept of {query.lower().replace('what is ', '')}",
        f"What are the key aspects of {query.lower().replace('what is ', '')}",
        f"How would you define {query.lower().replace('what is ', '')}",
        f"What does {query.lower().replace('what is ', '')} encompass"
    ]
    
    for var in template_variations:
        if len(all_queries) >= 5:
            break
        if var not in all_queries:
            all_queries.append(var)
    
    return all_queries[:5]  # Return exactly 5 unique queries

def generate_answer(query: str, context: List[Document]) -> str:
    """Generate final answer using LLM"""
    # Format context
    formatted_context = "\n\n".join([
        f"[Page {doc.metadata['physical_page']} - {doc.metadata['section']}]:\n{doc.page_content}"
        for doc in context
    ])
    
    prompt = f"""Based on the provided context, answer the question clearly and accurately.
    
    Context:
    {formatted_context}
    
    Question: {query}
    
    Instructions:
    1. Use only information from the provided context
    2. Be specific and accurate
    3. Include relevant page references [Page X] in your answer
    4. If context doesn't contain enough information, say so
    
    Answer:"""
    
    # Generate response
    return generate_response(query=query, context=prompt)

print("LLM setup complete!")


Loading Flan-T5-small model...
LLM setup complete!


In [None]:
# Query generation and retrieval
def generate_query_variations(query: str) -> List[str]:
    """Generate academically-focused query variations"""
    # Extract core concept from query
    core_concept = query.lower().replace('what is ', '').replace('?', '').strip()
    
    # Academic-focused variations
    variations = [
        query,  # Original query
        f"Define {core_concept} as a field of study",
        f"What are the fundamental principles of {core_concept}?",
        f"How is {core_concept} defined in academic terms?",
        f"What are the core concepts and methods of {core_concept}?"
    ]
    
    return variations

def reciprocal_rank_fusion(all_results: List[tuple], k: int = 60) -> List[tuple]:
    """Combine results using Reciprocal Rank Fusion"""
    rrf_scores = {}
    
    # Calculate RRF scores
    for rank, (doc, score) in enumerate(all_results):
        doc_id = doc.metadata['chunk_id']
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1/(rank + k)
    
    # Sort by RRF score
    sorted_docs = sorted(
        [(doc, score) for doc, score in all_results],
        key=lambda x: rrf_scores[x[0].metadata['chunk_id']],
        reverse=True
    )
    
    # Take top 3 unique chunks
    seen_ids = set()
    final_results = []
    for doc, score in sorted_docs:
        if doc.metadata['chunk_id'] not in seen_ids:
            seen_ids.add(doc.metadata['chunk_id'])
            final_results.append((doc, score))
            if len(final_results) == 3:
                break
    
    return final_results

def retrieve_and_rerank(query: str, vectorstore, reranker, top_k: int = 5) -> List[tuple]:
    """Retrieve and rerank chunks for a single query"""
    # Get initial results with more candidates for reranking
    results = vectorstore.similarity_search_with_score(
        query,
        k=top_k * 2  # Get more candidates for better reranking
    )
    
    # Rerank
    pairs = [[query, doc.page_content] for doc, _ in results]
    rerank_scores = reranker.compute_score(pairs)
    
    # Sort by reranker score
    scored_results = [(doc, float(score)) for (doc, _), score in zip(results, rerank_scores)]
    scored_results.sort(key=lambda x: x[1], reverse=True)
    
    return scored_results[:top_k]

# Example usage
test_query = "What is psychology?"
variations = generate_query_variations(test_query)
print("Original:", test_query)
print("\nGenerated variations:")
for i, var in enumerate(variations[1:], 1):
    print(f"{i}. {var}")


In [None]:
# Main RAG pipeline
def process_query(query: str) -> FinalResponse:
    """Process a query through the full RAG pipeline"""
    # 1. Generate query variations
    variations = generate_query_variations(query)
    print(f"Generated {len(variations)} query variations:")
    for i, var in enumerate(variations):
        print(f"{i}. {var}")
    
    # 2. Get results for each variation
    all_results = []
    for var in variations:
        # Get results for this variation
        results = retrieve_and_rerank(var, vectorstore, reranker)
        all_results.extend(results)
    
    # 3. Apply RRF to get final chunks
    final_chunks = reciprocal_rank_fusion(all_results)
    
    # 4. Extract unique sections
    sections = list(set(
        doc.metadata['section'] 
        for doc, _ in final_chunks
    ))
    
    # 5. Generate answer using LLM
    final_docs = [doc for doc, _ in final_chunks]
    answer = generate_answer(query, final_docs)
    
    # 6. Create final response
    response = FinalResponse(
        query=query,
        answer=answer,
        source_chunks=[
            ChunkMetadata(
                chunk_id=doc.metadata['chunk_id'],
                physical_page=doc.metadata['physical_page'],
                logical_page=doc.metadata['logical_page'],
                section=doc.metadata['section']
            )
            for doc, _ in final_chunks
        ],
        sections_referenced=sections,
        query_variations=variations
    )
    
    return response

# Test the pipeline
def test_pipeline():
    """Test the RAG pipeline with a sample query"""
    query = "What is psychology?"
    print(f"\nProcessing query: {query}")
    
    response = process_query(query)
    
    print("\nResults:")
    print(f"Query: {response.query}")
    print("\nQuery variations used:")
    for i, var in enumerate(response.query_variations):
        print(f"{i}. {var}")
    print(f"\nAnswer: {response.answer}")
    print("\nSource chunks:")
    for chunk in response.source_chunks:
        print(f"- Page {chunk.physical_page} (Section: {chunk.section})")
    print("\nSections referenced:")
    for section in response.sections_referenced:
        print(f"- {section}")

# Run test
test_pipeline()


In [48]:
# Main RAG pipeline
def process_query(query: str) -> FinalResponse:
    """Process a query through the full RAG pipeline"""
    # 1. Generate query variations using LLM
    variations = generate_query_variations(query)
    print(f"Generated {len(variations)} query variations:")
    for i, var in enumerate(variations):
        print(f"{i}. {var}")
    
    # 2. Get results for each variation
    all_results = []
    variation_results = []
    
    for var in variations:
        # Get initial results
        results = vectorstore.similarity_search_with_score(var, k=5)
        
        # Rerank results
        pairs = [[var, doc.page_content] for doc, _ in results]
        rerank_scores = reranker.compute_score(pairs)
        
        # Create validated chunks
        chunks = [
            RetrievedChunk(
                text=doc.page_content,
                metadata=ChunkMetadata(
                    chunk_id=doc.metadata["chunk_id"],
                    physical_page=doc.metadata["physical_page"],
                    logical_page=doc.metadata["logical_page"],
                    section=doc.metadata["section"]
                ),
                score=float(score)
            )
            for (doc, _), score in zip(results, rerank_scores)
        ]
        
        # Add to variation results
        variation_results.append(
            QueryVariation(
                query=var,
                chunks=chunks[:5]
            )
        )
        
        # Add to combined results for RRF
        all_results.extend([(doc, score) for (doc, _), score in zip(results, rerank_scores)])
    
    # 3. Apply RRF to get final chunks
    final_chunks = reciprocal_rank_fusion(all_results)
    
    # 4. Extract unique sections
    sections = list(set(
        doc.metadata['section'] 
        for doc, _ in final_chunks
    ))
    
    # 5. Generate answer using LLM
    final_docs = [doc for doc, _ in final_chunks]
    answer = generate_answer(query, final_docs)
    
    # 6. Create and validate final response
    response = FinalResponse(
        query=query,
        answer=answer,
        source_chunks=[
            ChunkMetadata(
                chunk_id=doc.metadata['chunk_id'],
                physical_page=doc.metadata['physical_page'],
                logical_page=doc.metadata['logical_page'],
                section=doc.metadata['section']
            )
            for doc, _ in final_chunks
        ],
        sections_referenced=sections,
        query_variations=variations
    )
    
    return response

# Test the pipeline
def test_pipeline():
    """Test the RAG pipeline with a sample query"""
    query = "What is psychology?"
    print(f"\nProcessing query: {query}")
    
    response = process_query(query)
    
    print("\nResults:")
    print(f"Query: {response.query}")
    print("\nQuery variations used:")
    for i, var in enumerate(response.query_variations):
        print(f"{i}. {var}")
    print(f"\nAnswer: {response.answer}")
    print("\nSource chunks:")
    for chunk in response.source_chunks:
        print(f"- Page {chunk.physical_page} (Section: {chunk.section})")
    print("\nSections referenced:")
    for section in response.sections_referenced:
        print(f"- {section}")

# Run test
test_pipeline()



Processing query: What is psychology?
Generated 5 query variations:
0. What is psychology?
1. How would you define psychology in academic terms?
2. What are the main concepts and principles of psychology?
3. Can you explain the fundamental aspects of psychology?
4. What are the key characteristics that define psychology?

Results:
Query: What is psychology?

Query variations used:
0. What is psychology?
1. How would you define psychology in academic terms?
2. What are the main concepts and principles of psychology?
3. Can you explain the fundamental aspects of psychology?
4. What are the key characteristics that define psychology?

Answer: Cognitive psychology is a relatively young science with its experimental roots in the 19th century, compared, for example, to human physiology, which dates much earlier. As mentioned, anyone interested in [Page 33 - introduction_to_psychology/contemporary_Psychology]: revolution created an impetus for psychologists to focus their attention on better

In [49]:
# Updated RAG pipeline
def retrieve_and_rerank(query: str, vectorstore, reranker, top_k: int = 5) -> List[tuple]:
    """Retrieve and rerank chunks for a single query"""
    # Get initial results with more candidates for reranking
    results = vectorstore.similarity_search_with_score(
        query,
        k=top_k * 2  # Get more candidates for better reranking
    )
    
    # Rerank
    pairs = [[query, doc.page_content] for doc, _ in results]
    rerank_scores = reranker.compute_score(pairs)
    
    # Sort by reranker score
    scored_results = [(doc, float(score)) for (doc, _), score in zip(results, rerank_scores)]
    scored_results.sort(key=lambda x: x[1], reverse=True)
    
    return scored_results[:top_k]

def process_query(query: str) -> FinalResponse:
    """Process a query through the full RAG pipeline"""
    # 1. Generate query variations
    variations = generate_query_variations(query)
    print(f"Generated {len(variations)} query variations:")
    for i, var in enumerate(variations):
        print(f"{i}. {var}")
    
    # 2. Get results for each variation
    all_results = []
    for var in variations:
        results = retrieve_and_rerank(var, vectorstore, reranker)
        all_results.extend(results)
    
    # 3. Apply RRF to get final chunks
    final_chunks = reciprocal_rank_fusion(all_results)
    
    # 4. Extract unique sections
    sections = list(set(
        doc.metadata['section'] 
        for doc, _ in final_chunks
    ))
    
    # 5. Generate answer using LLM
    final_docs = [doc for doc, _ in final_chunks]
    answer = generate_answer(query, final_docs)
    
    # 6. Create final response
    response = FinalResponse(
        query=query,
        answer=answer,
        source_chunks=[
            ChunkMetadata(
                chunk_id=doc.metadata['chunk_id'],
                physical_page=doc.metadata['physical_page'],
                logical_page=doc.metadata['logical_page'],
                section=doc.metadata['section']
            )
            for doc, _ in final_chunks
        ],
        sections_referenced=sections,
        query_variations=variations
    )
    
    return response

# Test the pipeline
def test_pipeline():
    """Test the RAG pipeline with a sample query"""
    query = "What is psychology?"
    print(f"\nProcessing query: {query}")
    
    response = process_query(query)
    
    print("\nResults:")
    print(f"Query: {response.query}")
    print("\nQuery variations used:")
    for i, var in enumerate(response.query_variations):
        print(f"{i}. {var}")
    print(f"\nAnswer: {response.answer}")
    print("\nSource chunks:")
    for chunk in response.source_chunks:
        print(f"- Page {chunk.physical_page} (Section: {chunk.section})")
    print("\nSections referenced:")
    for section in response.sections_referenced:
        print(f"- {section}")

# Run test
test_pipeline()



Processing query: What is psychology?
Generated 5 query variations:
0. What is psychology?
1. How would you define psychology in academic terms?
2. What are the main concepts and principles of psychology?
3. Can you explain the fundamental aspects of psychology?
4. What are the key characteristics that define psychology?

Results:
Query: What is psychology?

Query variations used:
0. What is psychology?
1. How would you define psychology in academic terms?
2. What are the main concepts and principles of psychology?
3. Can you explain the fundamental aspects of psychology?
4. What are the key characteristics that define psychology?

Answer: [Page 20 - introduction_to_psychology/what_is_psycho]: and it cannot arrive at knowledge about values and morality. This is one reason why our scientific understanding of the mind is so limited, since thoughts, at least as we experience them, are neither matter nor energy. The scientific method is also a form of empiricism. An empirical method for a

In [50]:
# Load section mapping
def load_section_mapping():
    """Load and prepare section mapping"""
    print("Loading sections mapping...")
    with open(SECTIONS_JSON_PATH, "r") as f:
        sections_map = json.load(f)
    
    # Ensure all keys are strings
    sections_map = {str(k): v for k, v in sections_map.items()}
    print(f"Loaded {len(sections_map)} page-to-section mappings")
    return sections_map

# Text cleaning
def clean_text(text: str) -> str:
    """Clean and normalize text, removing supplementary sections"""
    if not text:
        return ""
    
    # Remove supplementary sections
    sections_to_remove = [
        r'Key Terms\s.*?(?=\n\n|\Z)',
        r'Summary\s.*?(?=\n\n|\Z)',
        r'Review Questions\s.*?(?=\n\n|\Z)',
        r'Critical Thinking Questions\s.*?(?=\n\n|\Z)',
        r'Personal Application Questions\s.*?(?=\n\n|\Z)',
        r'Access for free at openstax\.org\.*',
        r'LINK TO LEARNING.*?(?=\n|$)',
        r'Watch a brief video.*?(?=\n|$)',
        r'http[s]?://\S+'
    ]
    
    for pattern in sections_to_remove:
        text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Load sections mapping
sections_map = load_section_mapping()


Loading sections mapping...
Loaded 494 page-to-section mappings


In [51]:
# Document processing
def process_documents():
    """Load and process PDF, only including mapped pages"""
    # Load documents
    print("Loading PDF...")
    loader = PyPDFLoader(str(BOOK_PDF_PATH))
    documents = loader.load()
    
    # Get valid pages
    valid_logical_pages = set(int(page) for page in sections_map.keys())
    chunks = []
    
    # Process documents
    print("Processing pages...")
    for doc in tqdm(documents, desc="Processing pages"):
        physical_page = int(doc.metadata.get("page", 0)) + 1
        logical_page = physical_page - LOGICAL_PAGE_OFFSET
        
        # Skip if not in mapping
        if logical_page not in valid_logical_pages:
            continue
        
        # Get section and clean text
        section = sections_map[str(logical_page)]
        cleaned_text = clean_text(doc.page_content)
        
        # Skip if too short
        if len(cleaned_text.split()) < 20:
            continue
            
        # Create chunks
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            length_function=len
        )
        
        doc_chunks = splitter.split_text(cleaned_text)
        
        # Add chunks with metadata
        for i, chunk_text in enumerate(doc_chunks):
            chunk_id = f"p{physical_page}_w{i:03d}"
            chunks.append(Document(
                page_content=chunk_text,
                metadata={
                    "chunk_id": chunk_id,
                    "physical_page": physical_page,
                    "logical_page": logical_page,
                    "section": section
                }
            ))
    
    print(f"\nCreated {len(chunks)} chunks from {len(valid_logical_pages)} valid pages")
    return chunks

# Process documents
chunks = process_documents()


Loading PDF...
Processing pages...


Processing pages: 100%|██████████| 753/753 [00:00<00:00, 2456.22it/s]


Created 2364 chunks from 494 valid pages





In [52]:
# Setup vector store and reranker
def setup_vector_store(chunks: List[Document]):
    """Initialize and populate vector store"""
    print("Setting up embedding model...")
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={'device': DEVICE},
        encode_kwargs={'normalize_embeddings': True}
    )
    
    print("Creating vector store...")
    vectorstore = Chroma(
        persist_directory=str(CHROMA_PATH),
        embedding_function=embeddings
    )
    
    print("Adding documents...")
    vectorstore.add_documents(chunks)
    vectorstore.persist()
    
    return vectorstore, embeddings

# Initialize reranker
def setup_reranker():
    """Initialize the reranker model"""
    print("Initializing reranker...")
    return FlagReranker(
        RERANKER_MODEL,
        use_fp16=True,
        device=DEVICE
    )

# Setup models
vectorstore, embeddings = setup_vector_store(chunks)
reranker = setup_reranker()


Setting up embedding model...
Creating vector store...
Adding documents...
Initializing reranker...


In [53]:
# Setup LLM for generation
print("Loading Flan-T5-small model...")
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float32
)

def generate_response(query: str, context: str) -> str:
    """Generate response using T5 model"""
    prompt = f"""Answer the question based on the given context. Be clear and concise.
    
    Context: {context}
    
    Question: {query}
    
    Instructions:
    1. Use only information from the context
    2. Be specific and accurate
    3. If context doesn't contain enough information, say so
    
    Answer:"""
    
    # Tokenize
    inputs = tokenizer(
        prompt, 
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(model.device)
    
    # Generate
    outputs = model.generate(
        **inputs,
        max_length=150,
        min_length=50,
        temperature=0.7,
        num_beams=3,
        no_repeat_ngram_size=2
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print("LLM setup complete!")


Loading Flan-T5-small model...
LLM setup complete!


In [54]:
# Query expansion and retrieval
def generate_query_variations(query: str) -> List[str]:
    """Generate query variations"""
    variations = [
        query,  # Original query
        f"Define {query.lower().replace('what is ', '')}",
        f"Explain the concept of {query.lower().replace('what is ', '')}",
        f"Describe {query.lower().replace('what is ', '')}",
        f"Tell me about {query.lower().replace('what is ', '')}"
    ]
    return variations

def reciprocal_rank_fusion(all_results: List[tuple], k: int = 60) -> List[tuple]:
    """
    Combine results using Reciprocal Rank Fusion
    
    Args:
        all_results: List of (Document, score) tuples
        k: Constant to prevent division by zero and smooth impact
    """
    rrf_scores = {}
    
    # Calculate RRF scores
    for rank, (doc, score) in enumerate(all_results):
        doc_id = doc.metadata['chunk_id']
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1/(rank + k)
    
    # Sort by RRF score
    sorted_docs = sorted(
        [(doc, score) for doc, score in all_results],
        key=lambda x: rrf_scores[x[0].metadata['chunk_id']],
        reverse=True
    )
    
    # Take top 3 unique chunks
    seen_ids = set()
    final_results = []
    for doc, score in sorted_docs:
        if doc.metadata['chunk_id'] not in seen_ids:
            seen_ids.add(doc.metadata['chunk_id'])
            final_results.append((doc, score))
            if len(final_results) == 3:
                break
    
    return final_results

def retrieve_and_rerank(query: str, vectorstore, reranker, top_k: int = 5) -> QueryResults:
    """Retrieve and rerank chunks for a single query"""
    # Get initial results
    results = vectorstore.similarity_search_with_score(
        query,
        k=top_k
    )
    
    # Rerank
    pairs = [[query, doc.page_content] for doc, _ in results]
    rerank_scores = reranker.compute_score(pairs)
    
    # Create validated results
    chunks = [
        RetrievedChunk(
            text=doc.page_content,
            metadata=ChunkMetadata(
                chunk_id=doc.metadata["chunk_id"],
                physical_page=doc.metadata["physical_page"],
                logical_page=doc.metadata["logical_page"],
                section=doc.metadata["section"]
            ),
            score=float(score)
        )
        for (doc, _), score in zip(results, rerank_scores)
    ]
    
    return QueryResults(
        query=query,
        chunks=chunks[:5]  # Ensure exactly 5 chunks
    )


In [57]:
# Query expansion using LLM
def generate_query_variations(query: str) -> List[str]:
    """Generate query variations using LLM"""
    prompt = f"""Generate 4 different ways to ask this question. The variations should help in retrieving relevant information.
    
    Original question: {query}
    
    Instructions:
    1. Keep the same meaning but use different phrasings
    2. Consider different aspects of the topic
    3. Make variations specific and focused
    4. Return exactly 4 variations
    
    Format each variation on a new line.
    """
    
    # Generate variations
    response = generate_response(query=query, context=prompt)
    
    # Split into lines and clean
    variations = [line.strip() for line in response.split('\n') if line.strip()]
    variations = variations[:4]  # Ensure exactly 4 variations
    
    # Add original query and ensure we have exactly 5 queries
    all_queries = [query] + variations
    
    
    
    return all_queries[:5]  # Return exactly 5 queries

# Example usage
test_query = "What is psychology?"
variations = generate_query_variations(test_query)
print("Original:", test_query)
print("\nGenerated variations:")
for i, var in enumerate(variations[1:], 1):
    print(f"{i}. {var}")


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Original: What is psychology?

Generated variations:
1. 1. Answer the question. 2. Be specific and accurate. 3. if you don't have enough information, say so. 4. If you have a question, you can answer it. 5. If the answer is no, it's not possible to answer.


In [58]:
# Main RAG pipeline
def process_query(query: str) -> FinalResponse:
    """Process a query through the full RAG pipeline"""
    # 1. Generate query variations using LLM
    variations = generate_query_variations(query)
    print(f"Generated {len(variations)} query variations:")
    for i, var in enumerate(variations):
        print(f"{i}. {var}")
    
    # 2. Get results for each variation
    all_results = []
    for var in variations:
        # Get results for this variation
        results = retrieve_and_rerank(var, vectorstore, reranker)
        
        # Keep the Document objects intact
        initial_results = vectorstore.similarity_search_with_score(var, k=5)
        for doc, base_score in initial_results:
            # Find matching reranked score
            for chunk in results.chunks:
                if chunk.metadata.chunk_id == doc.metadata['chunk_id']:
                    all_results.append((doc, chunk.score))
                    break
    
    # 3. Apply RRF to get final chunks
    final_chunks = reciprocal_rank_fusion(all_results)
    
    # 4. Extract unique sections
    sections = list(set(
        doc.metadata['section'] 
        for doc, _ in final_chunks
    ))
    
    # 5. Generate answer using LLM
    final_docs = [doc for doc, _ in final_chunks]
    answer = generate_answer(query, final_docs)
    
    # 6. Create and validate final response
    response = FinalResponse(
        query=query,
        answer=answer,
        source_chunks=[
            ChunkMetadata(
                chunk_id=doc.metadata['chunk_id'],
                physical_page=doc.metadata['physical_page'],
                logical_page=doc.metadata['logical_page'],
                section=doc.metadata['section']
            )
            for doc, _ in final_chunks
        ],
        sections_referenced=sections,
        query_variations=variations
    )
    
    return response

# Test the pipeline
def test_pipeline():
    """Test the RAG pipeline with a sample query"""
    query = "What is psychology?"
    print(f"\nProcessing query: {query}")
    
    response = process_query(query)
    
    print("\nResults:")
    print(f"Query: {response.query}")
    print("\nQuery variations used:")
    for i, var in enumerate(response.query_variations):
        print(f"{i}. {var}")
    print(f"\nAnswer: {response.answer}")
    print("\nSource chunks:")
    for chunk in response.source_chunks:
        print(f"- Page {chunk.physical_page} (Section: {chunk.section})")
    print("\nSections referenced:")
    for section in response.sections_referenced:
        print(f"- {section}")

# Run test
test_pipeline()


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Processing query: What is psychology?
Generated 2 query variations:
0. What is psychology?
1. 1. Answer the question. 2. Be specific and accurate. 3. if you don't have enough information, say so. 4. If you have a question, you can answer it. 5. If the answer is no, it's not possible to answer.


ValidationError: 5 validation errors for QueryResults
chunks.0
  Input should be a valid dictionary or instance of RetrievedChunk [type=model_type, input_value=RetrievedChunk(text='2002...ology'), score=3.703125), input_type=RetrievedChunk]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type
chunks.1
  Input should be a valid dictionary or instance of RetrievedChunk [type=model_type, input_value=RetrievedChunk(text='2002...ology'), score=3.703125), input_type=RetrievedChunk]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type
chunks.2
  Input should be a valid dictionary or instance of RetrievedChunk [type=model_type, input_value=RetrievedChunk(text='2002...ology'), score=3.703125), input_type=RetrievedChunk]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type
chunks.3
  Input should be a valid dictionary or instance of RetrievedChunk [type=model_type, input_value=RetrievedChunk(text='revo...gy'), score=-1.37890625), input_type=RetrievedChunk]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type
chunks.4
  Input should be a valid dictionary or instance of RetrievedChunk [type=model_type, input_value=RetrievedChunk(text='revo...gy'), score=-1.37890625), input_type=RetrievedChunk]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type