# RAG System for IT Support (Local CPU Version)

This notebook implements a complete Retrieval-Augmented Generation (RAG) system using Flan-T5 and FAISS.

In [None]:

# Install Dependencies
!pip install -q transformers sentence-transformers faiss-cpu pandas numpy torch accelerate sentencepiece


## 1. Configuration

In [None]:
"""
Configuration Module for RAG System

This module centralizes all configuration parameters for the RAG system.
"""

import os
from pathlib import Path

# ============================================================================
# PROJECT PATHS
# ============================================================================
# All paths are relative to the current working directory.
PROJECT_ROOT = Path(".")
DATA_DIR = PROJECT_ROOT / "data"
FAISS_INDEX_DIR = DATA_DIR / "faiss_index"
RESULTS_DIR = DATA_DIR / "results"

# Create the data directories as soon as this configuration runs.
# exist_ok=True makes re-running the cell safe.
for dir_path in [DATA_DIR, FAISS_INDEX_DIR, RESULTS_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)

# CSV read by both the knowledge-base loader and the Q&A loader.
DATASET_PATH = PROJECT_ROOT / "rag_sample_qas_from_kis.csv"

# ============================================================================
# EMBEDDING MODEL CONFIGURATION
# ============================================================================
# Small, efficient model (384 dims, ~80MB)
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# ============================================================================
# RETRIEVAL CONFIGURATION
# ============================================================================
# Default number of candidates requested from the FAISS index per query.
RETRIEVAL_TOP_K = 5
# Default minimum similarity score a FAISS hit needs to be kept.
SIMILARITY_THRESHOLD = 0.3

# ============================================================================
# RERANKING CONFIGURATION
# ============================================================================
# Small cross-encoder (~80MB)
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
RERANK_TOP_K = 2  # Increased to 2 (with smaller chunks) to capture more context

# ============================================================================
# LLM CONFIGURATION (LOCAL FLAN-T5)
# ============================================================================
# Changed to local model for CPU compatibility
USE_LOCAL_LLM = True
# NOTE(review): the original note said "~250MB size, perfectly fits in 1.4GB RAM".
# flan-t5-base has roughly 250M parameters, so the in-memory size is likely
# larger than 250MB - confirm the real footprint on the target machine.
LLM_MODEL_NAME = "google/flan-t5-base"

# Generation parameters
LLM_TEMPERATURE = 0.0
LLM_MAX_TOKENS = 512

# ============================================================================
# EVALUATION CONFIGURATION
# ============================================================================
# Target values for the evaluation metrics (fractions in the 0-1 range).
RETRIEVAL_RECALL_TARGET = 0.95
RESPONSE_SIMILARITY_TARGET = 0.90
ROUGE_L_TARGET = 0.70

# ============================================================================
# APPLICATION CONFIGURATION
# ============================================================================
# NOTE(review): the leading characters of APP_TITLE look like a mis-encoded
# emoji - confirm the file encoding before changing the string.
APP_TITLE = "ü§ñ RAG-Based IT Support (Local)"
APP_LAYOUT = "wide"
DEBUG_MODE = True

# ============================================================================
# DATA PROCESSING CONFIGURATION
# ============================================================================
# Column names expected in the dataset CSV.
COL_TOPIC = "ki_topic"
COL_TEXT = "ki_text"
COL_QUESTION = "sample_question"
COL_GROUND_TRUTH = "sample_ground_truth"
# NOTE(review): load_knowledge_base() in this notebook always chunks using its
# own hard-coded chunk size and overlap; it does not read these two settings.
CHUNKING_ENABLED = False
CHUNK_OVERLAP = 50

# ============================================================================
# VALIDATION
# ============================================================================
def validate_config():
    """
    Confirm that the configured dataset file is present.

    Returns:
        True when DATASET_PATH exists.

    Raises:
        ValueError: If DATASET_PATH does not exist.
    """
    dataset_present = DATASET_PATH.exists()
    if dataset_present:
        return True
    raise ValueError(f"Dataset not found: {DATASET_PATH}")


## 2. Data Loading & Chunking

In [None]:
"""
Data Loader Module for RAG System

This module handles loading and preprocessing the KIS Q&A dataset.
Each function is documented with its purpose and reasoning.

WHY THIS MODULE EXISTS:
- Centralizes all data loading logic
- Provides clean, validated data to other components
- Handles edge cases (missing values, malformed data)
- Creates structured objects that are easy to work with

Author: RAG System
"""

import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple
import logging


    DATASET_PATH,
    COL_TOPIC,
    COL_TEXT,
    COL_QUESTION,
    COL_GROUND_TRUTH
)

# Setup logging
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class KnowledgeItem:
    """
    A single knowledge base article (or a piece of one).

    WHY THIS CLASS:
    - Gives callers attribute access instead of raw dictionary lookups
    - Carries a stable ID so retrieved documents can be tracked
    - Offers a dict form for serialization

    Attributes:
        topic: Article title.
        text: Article body.
        metadata: Free-form extra information (empty dict when none is given).
        id: Slug derived from the topic; callers may overwrite it.
    """

    def __init__(self, topic: str, text: str, metadata: Dict = None):
        self.topic = topic
        self.text = text
        # A falsy metadata argument (None or an empty dict) becomes a fresh dict.
        self.metadata = metadata if metadata else {}

        # WHY: the ID is what lets us tell which document was retrieved.
        self.id = self._generate_id()

    def _generate_id(self) -> str:
        """Build the slug: lowercase the topic and turn each space into '_'."""
        lowered_topic = self.topic.lower()
        return "_".join(lowered_topic.split(" "))

    def to_dict(self) -> Dict:
        """Return the item as a plain dictionary for serialization."""
        field_names = ("id", "topic", "text", "metadata")
        return {name: getattr(self, name) for name in field_names}

    def __repr__(self):
        # Only the first 50 characters of the topic are shown.
        topic_preview = self.topic[:50]
        return f"KnowledgeItem(id='{self.id}', topic='{topic_preview}...')"


class QAPair:
    """
    One evaluation example: a question and the answer it should produce.

    WHY THIS CLASS:
    - Keeps evaluation data in one tidy object
    - Ties each question to its reference answer
    - Records which knowledge item is expected to be retrieved

    Attributes:
        question: The question text.
        ground_truth: The reference answer.
        expected_ki_id: ID of the knowledge item that should be retrieved.
    """

    def __init__(self, question: str, ground_truth: str, expected_ki_id: str):
        self.question, self.ground_truth, self.expected_ki_id = (
            question,
            ground_truth,
            expected_ki_id,
        )

    def to_dict(self) -> Dict:
        """Return the pair as a plain dictionary for serialization."""
        field_names = ("question", "ground_truth", "expected_ki_id")
        return {name: getattr(self, name) for name in field_names}

    def __repr__(self):
        # Only the first 50 characters of the question are shown.
        question_preview = self.question[:50]
        return f"QAPair(question='{question_preview}...', expected_ki='{self.expected_ki_id}')"


def clean_text(text: str) -> str:
    """
    Normalize whitespace in a piece of text.

    WHY SO LITTLE CLEANING:
    - Heavy-handed cleaning risks dropping meaningful content
    - Punctuation, numbers and special characters (step numbers, versions,
      file paths, commands) are deliberately left untouched

    Args:
        text: Raw text; anything that is not a str is treated as missing.

    Returns:
        The text with every run of whitespace collapsed to one space and no
        leading or trailing whitespace, or "" for non-string input.
    """
    if isinstance(text, str):
        # split() with no arguments drops leading/trailing whitespace and
        # splits on any whitespace run, so re-joining is all that is needed.
        words = text.split()
        return " ".join(words)

    return ""


def load_knowledge_base() -> List[KnowledgeItem]:
    """
    Load all knowledge base articles from CSV with CHUNKING

    WHY CHUNKING:
    - Small models (Flan-T5) have limited context (512 tokens)
    - Long articles would be truncated if passed whole
    - Articles longer than 600 characters are split into overlapping chunks
      of at most 600 characters so several chunks fit in the prompt

    One KnowledgeItem is produced per chunk. Its ``id`` is the topic slug plus
    a numeric suffix (e.g. "email_setup_0"); the bare topic slug is kept in
    ``metadata["original_id"]``.

    Returns:
        List of KnowledgeItem objects, one per chunk.

    Raises:
        FileNotFoundError: If the dataset file doesn't exist
        ValueError: If the topic or text column is missing
    """
    logger.info(f"Loading knowledge base from {DATASET_PATH}")

    # Check file exists
    if not DATASET_PATH.exists():
        raise FileNotFoundError(f"Dataset not found: {DATASET_PATH}")

    # Load CSV
    df = pd.read_csv(DATASET_PATH)

    # Validate required columns (same error style as load_qa_pairs)
    missing_cols = [col for col in (COL_TOPIC, COL_TEXT) if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Configure chunking
    # 600 chars ~= 150 tokens. Allows retrieving 2-3 chunks.
    # Lower-case locals so the module-level CHUNK_OVERLAP setting is not shadowed.
    chunk_size = 600
    chunk_overlap = 100

    # One row per topic: the CSV repeats the article text for every sample question
    unique_kis = df.groupby(COL_TOPIC).first().reset_index()

    knowledge_items = []

    for idx, row in unique_kis.iterrows():
        topic = str(row[COL_TOPIC])

        # WHY pd.isna first: str(NaN) is the non-empty string "nan", which
        # would slip past the emptiness check below.
        raw_text = row[COL_TEXT]
        text = "" if pd.isna(raw_text) else clean_text(str(raw_text))

        # Skip empty
        if not topic or not text:
            continue

        original_id = topic.lower().replace(" ", "_")
        chunks = _split_into_chunks(text, chunk_size, chunk_overlap)

        # Create KnowledgeItem for each chunk
        for i, chunk in enumerate(chunks):
            # Add context to text so model knows what this chunk is about
            # WHY: Isolated chunks might lose context (e.g., "Step 5: Click OK")
            # Adding title helps: "Email Setup (Part 1): Step 5: Click OK"
            chunk_text_with_context = f"{topic} (Part {i+1}):\n{chunk}"

            ki = KnowledgeItem(
                topic=topic,
                text=chunk_text_with_context,
                metadata={
                    # Position in the de-duplicated topic table, not the CSV row
                    "source_row": int(idx),
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "is_chunk": True,
                    "original_id": original_id
                }
            )
            # Override ID manually so every chunk of a topic is unique,
            # e.g. "email_setup_0", "email_setup_1"
            ki.id = f"{original_id}_{i}"

            knowledge_items.append(ki)

    logger.info(f"Loaded {len(knowledge_items)} chunks from {len(unique_kis)} original documents")
    return knowledge_items


def _split_into_chunks(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """Split text into overlapping windows of at most chunk_size characters, cutting at a space when possible."""
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0
    while True:
        end = min(start + chunk_size, len(text))

        # Attempt to split on space to avoid cutting words, but only when that
        # still leaves the chunk more than half full
        if end < len(text):
            last_space = text.rfind(' ', start, end)
            if last_space != -1 and last_space > start + (chunk_size // 2):
                end = last_space

        chunks.append(text[start:end])

        # Stop once the end of the text is covered; continuing would only emit
        # a tail that is already contained in the chunk just added.
        if end >= len(text):
            break

        # WHY step back from `end` rather than advance by a fixed stride: when
        # `end` was pulled back to a space, a fixed stride could start the next
        # chunk past `end` and silently drop the text in between.
        # max(...) guarantees forward progress even for odd size/overlap values.
        start = max(end - chunk_overlap, start + 1)

    return chunks


def load_qa_pairs() -> List[QAPair]:
    """
    Load question-answer pairs for evaluation

    WHY THIS FUNCTION:
    - Creates test set for accuracy measurement
    - Links each question to its expected KI article

    Rows with a missing topic, question or ground truth are skipped with a
    warning. ``expected_ki_id`` is the topic slug (lowercase, spaces replaced
    by underscores).

    Returns:
        List of QAPair objects

    Raises:
        FileNotFoundError: If dataset doesn't exist
        ValueError: If dataset is malformed
    """
    logger.info(f"Loading Q&A pairs from {DATASET_PATH}")

    if not DATASET_PATH.exists():
        raise FileNotFoundError(f"Dataset not found: {DATASET_PATH}")

    df = pd.read_csv(DATASET_PATH)

    # Validate required columns
    required_cols = [COL_TOPIC, COL_QUESTION, COL_GROUND_TRUTH]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    qa_pairs = []

    for idx, row in df.iterrows():
        topic = row[COL_TOPIC]
        question = row[COL_QUESTION]
        ground_truth = row[COL_GROUND_TRUTH]

        # Clean text
        # WHY pd.isna first: str(NaN) is the non-empty string "nan", so a
        # missing cell would otherwise pass the emptiness check below.
        clean_question = "" if pd.isna(question) else clean_text(str(question))
        clean_ground_truth = "" if pd.isna(ground_truth) else clean_text(str(ground_truth))
        # str() also covers topics that pandas parsed as numbers
        topic_text = "" if pd.isna(topic) else str(topic)

        # Skip if any field is empty
        if not clean_question or not clean_ground_truth or not topic_text:
            logger.warning(f"Skipping incomplete Q&A pair at row {idx}")
            continue

        # Generate expected KI ID from topic
        # WHY: This links the question to which document should be retrieved
        expected_ki_id = topic_text.lower().replace(" ", "_")

        qa_pair = QAPair(
            question=clean_question,
            ground_truth=clean_ground_truth,
            expected_ki_id=expected_ki_id
        )

        qa_pairs.append(qa_pair)

    logger.info(f"Loaded {len(qa_pairs)} Q&A pairs for evaluation")

    return qa_pairs


def load_all_data() -> Tuple[List[KnowledgeItem], List[QAPair]]:
    """
    Load the knowledge base and the evaluation Q&A pairs in one call.

    WHY THIS CONVENIENCE FUNCTION:
    - One entry point for everything the pipeline needs
    - Both loaders always run in the same order

    Returns:
        Tuple of (knowledge_items, qa_pairs)
    """
    # Tuple elements are evaluated left to right: knowledge base first.
    loaded = (load_knowledge_base(), load_qa_pairs())

    logger.info(f"Data loading complete: {len(loaded[0])} KIs, {len(loaded[1])} Q&A pairs")

    return loaded


# ============================================================================
# TESTING & VALIDATION
# ============================================================================

if __name__ == "__main__":
    # Smoke test for the data loader.
    # WHY: Validates that data loading works correctly before using in pipeline
    print("=" * 80)
    print("TESTING DATA LOADER")
    print("=" * 80)

    try:
        # Load data
        knowledge_items, qa_pairs = load_all_data()

        # Display sample knowledge item
        print("\nüìö SAMPLE KNOWLEDGE ITEM:")
        print("-" * 80)
        sample_ki = knowledge_items[0]
        print(f"ID: {sample_ki.id}")
        print(f"Topic: {sample_ki.topic}")
        print(f"Text (first 200 chars): {sample_ki.text[:200]}...")
        print(f"Metadata: {sample_ki.metadata}")

        # Display sample Q&A pair
        print("\n‚ùì SAMPLE Q&A PAIR:")
        print("-" * 80)
        sample_qa = qa_pairs[0]
        print(f"Question: {sample_qa.question}")
        print(f"Expected KI: {sample_qa.expected_ki_id}")
        print(f"Ground Truth (first 200 chars): {sample_qa.ground_truth[:200]}...")

        # Statistics
        print("\nüìä DATASET STATISTICS:")
        print("-" * 80)
        print(f"Total Knowledge Items: {len(knowledge_items)}")
        print(f"Total Q&A Pairs: {len(qa_pairs)}")
        # WHY len(ki.text): the loader never stores a 'character_count' entry
        # in metadata, so the length is measured directly.
        print(f"Average text length: {sum(len(ki.text) for ki in knowledge_items) / len(knowledge_items):.0f} chars")
        print(f"Average question length: {sum(len(qa.question) for qa in qa_pairs) / len(qa_pairs):.0f} chars")

        # Verify all expected KI IDs exist in knowledge base
        # WHY original_id: chunk IDs carry a numeric suffix, while Q&A pairs
        # reference the bare topic slug stored under metadata["original_id"].
        ki_ids = {ki.metadata.get("original_id", ki.id) for ki in knowledge_items}
        missing_kis = [qa.expected_ki_id for qa in qa_pairs if qa.expected_ki_id not in ki_ids]

        if missing_kis:
            print(f"\n‚ö†Ô∏è  WARNING: {len(missing_kis)} Q&A pairs reference missing KIs:")
            # sorted() keeps the report order stable between runs
            for ki_id in sorted(set(missing_kis)):
                print(f"  - {ki_id}")
        else:
            print("\n‚úÖ All Q&A pairs have corresponding knowledge items")

        print("\n‚úÖ DATA LOADER TEST PASSED")

    except Exception as e:
        # Top-level boundary of a smoke test: report any failure with a
        # traceback instead of propagating it.
        print(f"\n‚ùå DATA LOADER TEST FAILED: {e}")
        import traceback
        traceback.print_exc()


## 3. Vector Database (FAISS)

In [None]:
"""
Vector Store Module for RAG System

This module handles:
1. Generating embeddings for knowledge base documents
2. Building and managing the FAISS vector index
3. Performing similarity search

WHY THIS MODULE EXISTS:
- Centralizes all vector database operations
- Provides fast semantic search over knowledge base
- Handles index persistence (save/load)

WHY FAISS:
- Extremely fast similarity search (optimized by Facebook AI)
- Can scale to millions of documents
- No external dependencies (runs locally)
- Industry standard for vector search

Author: RAG System
"""

import numpy as np
import faiss
import pickle
from pathlib import Path
from typing import List, Tuple, Dict
from sentence_transformers import SentenceTransformer
import logging


    EMBEDDING_MODEL_NAME,
    FAISS_INDEX_DIR,
    RETRIEVAL_TOP_K,
    SIMILARITY_THRESHOLD
)


# Setup logging
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class VectorStore:
    """
    Manages vector embeddings and FAISS index for semantic search

    WHY THIS CLASS:
    - Encapsulates all vector database logic
    - Provides clean API for retrieval
    - Handles embedding generation and index persistence

    Typical use: build_index() (or load_index()) first, then search().
    """

    def __init__(self, embedding_model_name: str = EMBEDDING_MODEL_NAME):
        """
        Initialize vector store

        Args:
            embedding_model_name: Name of the sentence-transformers model
        """
        logger.info(f"Initializing VectorStore with model: {embedding_model_name}")

        # Remember which model produces the vectors.
        # WHY: save_index() must record the model actually in use, which can
        # differ from the configured default.
        self.embedding_model_name = embedding_model_name

        # Initialize embedding model
        self.embedding_model = SentenceTransformer(embedding_model_name)

        # Get embedding dimension
        # WHY: Need this to initialize FAISS index with correct dimension
        self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()
        logger.info(f"Embedding dimension: {self.embedding_dim}")

        # FAISS index (created by build_index() or load_index())
        self.index = None

        # Store knowledge items for retrieval
        # WHY: FAISS returns row positions; this list maps them back to documents
        self.knowledge_items: List[KnowledgeItem] = []

        # Document embeddings kept for reuse/analysis; None until built or loaded
        self.embeddings: np.ndarray = None

    def _encode_texts(self, texts: List[str], show_progress: bool = True) -> np.ndarray:
        """
        Generate embeddings for texts

        WHY SEPARATE METHOD:
        - Used for both documents and queries
        - Centralizes encoding options

        Args:
            texts: List of text strings to encode
            show_progress: Show progress bar

        Returns:
            Numpy array of embeddings, one row per input text
        """
        logger.info(f"Encoding {len(texts)} texts...")

        # WHY normalize_embeddings=True:
        # - With unit-length vectors, inner product equals cosine similarity
        # - That lets the FAISS inner-product index rank by cosine similarity
        embeddings = self.embedding_model.encode(
            texts,
            show_progress_bar=show_progress,
            convert_to_numpy=True,
            normalize_embeddings=True  # Critical for cosine similarity
        )

        return embeddings

    def build_index(self, knowledge_items: List[KnowledgeItem]):
        """
        Build FAISS index from knowledge base documents

        WHY THIS METHOD:
        - Creates the searchable vector database
        - Must be called (or load_index()) before search can be performed

        Args:
            knowledge_items: List of KnowledgeItem objects to index

        Raises:
            ValueError: If knowledge_items is empty
        """
        logger.info(f"Building FAISS index for {len(knowledge_items)} documents")

        if not knowledge_items:
            raise ValueError("No knowledge items provided for indexing")

        # Store knowledge items
        self.knowledge_items = knowledge_items

        # Embed the text of each item; row i of the index corresponds to
        # knowledge_items[i]
        texts = [ki.text for ki in knowledge_items]
        self.embeddings = self._encode_texts(texts)

        logger.info(f"Generated embeddings with shape: {self.embeddings.shape}")

        # WHY IndexFlatIP (Inner Product):
        # - "Flat" = exhaustive search (guaranteed to find best matches)
        # - "IP" = inner product (cosine similarity for normalized vectors)
        self.index = faiss.IndexFlatIP(self.embedding_dim)

        # WHY astype: FAISS requires float32 numpy arrays
        self.index.add(self.embeddings.astype('float32'))

        logger.info(f"‚úÖ FAISS index built with {self.index.ntotal} vectors")

    def search(
        self,
        query: str,
        top_k: int = RETRIEVAL_TOP_K,
        score_threshold: float = SIMILARITY_THRESHOLD
    ) -> List[Tuple[KnowledgeItem, float]]:
        """
        Search for most similar documents to query

        Args:
            query: User's question/query text
            top_k: Maximum number of results to return
            score_threshold: Minimum similarity score a result needs to be kept

        Returns:
            List of (KnowledgeItem, similarity_score) tuples, in the order
            FAISS returns them (best match first). Empty for a blank query,
            a non-positive top_k, or an empty index.

        Raises:
            ValueError: If no index has been built or loaded
        """
        if self.index is None:
            raise ValueError("Index not built. Call build_index() first")

        if not query or not query.strip():
            logger.warning("Empty query provided")
            return []

        # Never ask FAISS for more neighbours than it holds, and never for a
        # non-positive count.
        neighbours = min(top_k, self.index.ntotal)
        if neighbours <= 0:
            logger.warning(f"Nothing to search (top_k={top_k}, index size={self.index.ntotal})")
            return []

        # Encode query
        # WHY: Query needs to be in same embedding space as documents
        query_embedding = self._encode_texts([query], show_progress=False)

        scores, indices = self.index.search(query_embedding.astype('float32'), neighbours)

        # Process results
        results = []
        for score, idx in zip(scores[0], indices[0]):
            # Skip positions that do not map to a stored knowledge item
            # (FAISS uses -1 when it has no neighbour to return)
            if idx < 0 or idx >= len(self.knowledge_items):
                continue

            # Filter by score threshold
            # WHY: Removes completely irrelevant results
            if score < score_threshold:
                logger.debug(f"Skipping result with score {score:.3f} < threshold {score_threshold}")
                continue

            ki = self.knowledge_items[idx]
            results.append((ki, float(score)))

        logger.info(f"Found {len(results)} results for query (top_k={top_k}, threshold={score_threshold})")

        if results:
            logger.debug(f"Top result: {results[0][0].topic} (score: {results[0][1]:.3f})")

        return results

    def save_index(self, index_dir: Path = FAISS_INDEX_DIR):
        """
        Save FAISS index and metadata to disk

        WHY THIS METHOD:
        - Avoids rebuilding the index (and re-embedding) on every run

        Files written: faiss_index.bin, knowledge_items.pkl, metadata.pkl and,
        when embeddings are held in memory, embeddings.npy.

        Args:
            index_dir: Directory to save index files

        Raises:
            ValueError: If no index has been built or loaded
        """
        if self.index is None:
            raise ValueError("No index to save. Build index first")

        index_dir = Path(index_dir)
        index_dir.mkdir(parents=True, exist_ok=True)

        # Save FAISS index
        index_path = index_dir / "faiss_index.bin"
        faiss.write_index(self.index, str(index_path))
        logger.info(f"Saved FAISS index to {index_path}")

        # Save knowledge items
        # WHY: Need to map FAISS indices back to original documents
        ki_path = index_dir / "knowledge_items.pkl"
        with open(ki_path, 'wb') as f:
            pickle.dump(self.knowledge_items, f)
        logger.info(f"Saved knowledge items to {ki_path}")

        # Save embeddings (optional, for analysis)
        # WHY the None check: after load_index() without an embeddings file
        # there is nothing to write, and np.save(None) would produce a file
        # that np.load() refuses to read by default.
        if self.embeddings is not None:
            emb_path = index_dir / "embeddings.npy"
            np.save(emb_path, self.embeddings)
            logger.info(f"Saved embeddings to {emb_path}")

        # Save metadata
        metadata = {
            # Model actually used by this instance, not the configured default
            "model_name": getattr(self, "embedding_model_name", EMBEDDING_MODEL_NAME),
            "embedding_dim": self.embedding_dim,
            "num_documents": len(self.knowledge_items)
        }
        metadata_path = index_dir / "metadata.pkl"
        with open(metadata_path, 'wb') as f:
            pickle.dump(metadata, f)
        logger.info(f"Saved metadata to {metadata_path}")

        logger.info("‚úÖ Index saved successfully")

    def load_index(self, index_dir: Path = FAISS_INDEX_DIR):
        """
        Load FAISS index and metadata from disk

        WHY THIS METHOD:
        - Fast loading of pre-built index
        - Skips expensive embedding generation

        The instance is only updated after all checks pass, so a failed load
        leaves any previously built index untouched.

        Args:
            index_dir: Directory containing index files

        Raises:
            FileNotFoundError: If the directory or a required file is missing
            ValueError: If the stored index does not match this store's
                embedding dimension or the stored knowledge items
        """
        index_dir = Path(index_dir)

        if not index_dir.exists():
            raise FileNotFoundError(f"Index directory not found: {index_dir}")

        index_path = index_dir / "faiss_index.bin"
        if not index_path.exists():
            raise FileNotFoundError(f"FAISS index not found: {index_path}")

        ki_path = index_dir / "knowledge_items.pkl"
        if not ki_path.exists():
            raise FileNotFoundError(f"Knowledge items not found: {ki_path}")

        # Load FAISS index
        index = faiss.read_index(str(index_path))
        logger.info(f"Loaded FAISS index with {index.ntotal} vectors")

        # WHY: an index built with a different embedding model cannot be
        # searched with this store's query vectors; fail here with a clear message.
        if self.embedding_dim is not None and index.d != self.embedding_dim:
            raise ValueError(
                f"Index dimension {index.d} does not match embedding model dimension {self.embedding_dim}"
            )

        # Load knowledge items
        # SECURITY NOTE: pickle.load executes code embedded in the file; only
        # load index directories this application wrote itself.
        with open(ki_path, 'rb') as f:
            knowledge_items = pickle.load(f)
        logger.info(f"Loaded {len(knowledge_items)} knowledge items")

        # WHY: FAISS row i must correspond to knowledge_items[i]; a size
        # mismatch would make search() drop or mislabel results.
        if len(knowledge_items) != index.ntotal:
            raise ValueError(
                f"Index holds {index.ntotal} vectors but {len(knowledge_items)} knowledge items were stored"
            )

        self.index = index
        self.knowledge_items = knowledge_items

        # Load embeddings (optional file)
        emb_path = index_dir / "embeddings.npy"
        if emb_path.exists():
            self.embeddings = np.load(emb_path)
            logger.info(f"Loaded embeddings with shape: {self.embeddings.shape}")
        else:
            # Do not keep embeddings that belong to a previously built index
            self.embeddings = None

        logger.info("‚úÖ Index loaded successfully")

    def index_exists(self, index_dir: Path = FAISS_INDEX_DIR) -> bool:
        """
        Check if a saved index exists

        WHY THIS METHOD:
        - Allows conditional loading vs building

        Args:
            index_dir: Directory to check

        Returns:
            True if both faiss_index.bin and knowledge_items.pkl exist
        """
        index_dir = Path(index_dir)
        required_files = ["faiss_index.bin", "knowledge_items.pkl"]
        return all((index_dir / f).exists() for f in required_files)


# ============================================================================
# TESTING & VALIDATION
# ============================================================================

# Smoke test for VectorStore: build an index, run sample queries, then
# save the index and reload it into a second instance.
if __name__ == "__main__":
    """
    Test the vector store
    
    WHY: Validates that embedding and search work correctly
    """
    print("=" * 80)
    print("TESTING VECTOR STORE")
    print("=" * 80)
    
    
    
    try:
        # Load knowledge base
        print("\nüìö Loading knowledge base...")
        knowledge_items = load_knowledge_base()
        
        # Initialize vector store (uses the default embedding model)
        print("\nüîß Initializing vector store...")
        vector_store = VectorStore()
        
        # Build index
        print("\nüèóÔ∏è  Building FAISS index...")
        vector_store.build_index(knowledge_items)
        
        # Test search
        print("\nüîç Testing search...")
        test_queries = [
            "How do I reset my PIN?",
            "Setting up email on my phone",
            "VPN not connecting"
        ]
        
        for query in test_queries:
            print(f"\nQuery: '{query}'")
            # top_k=3 overrides the default; the default score threshold still applies
            results = vector_store.search(query, top_k=3)
            
            if results:
                for i, (ki, score) in enumerate(results, 1):
                    print(f"  {i}. {ki.topic} (score: {score:.3f})")
            else:
                print("  No results found")
        
        # Test save/load (writes to the default index directory)
        print("\nüíæ Testing save/load...")
        vector_store.save_index()
        
        # Create new instance and load the files that were just written
        vector_store2 = VectorStore()
        vector_store2.load_index()
        
        # Verify loaded index works
        print("\nüîç Testing loaded index...")
        results = vector_store2.search("How do I reset my PIN?", top_k=3)
        print(f"Found {len(results)} results with loaded index")
        
        print("\n‚úÖ VECTOR STORE TEST PASSED")
        
    except Exception as e:
        # Top-level boundary of a smoke test: report any failure with a
        # traceback instead of propagating it.
        print(f"\n‚ùå VECTOR STORE TEST FAILED: {e}")
        import traceback
        traceback.print_exc()


## 4. Retriever Logic

In [None]:
"""
Retriever Module for RAG System

This module implements intelligent retrieval with two-stage ranking:
1. Stage 1: Fast FAISS similarity search (retrieves top-K candidates)
2. Stage 2: Cross-encoder reranking (reranks candidates for precision)

WHY TWO-STAGE RETRIEVAL:
- Stage 1 (Bi-encoder/FAISS): Fast but less accurate, high recall
- Stage 2 (Cross-encoder): Slow but very accurate, high precision
- Together: Best of both worlds - fast AND accurate

WHY THIS IMPROVES ACCURACY:
- Bi-encoders (FAISS) encode query and doc separately → misses interaction
- Cross-encoders see query+doc together → captures semantic relationship
- Studies show cross-encoder reranking improves accuracy by 10-15%

Author: RAG System
"""

from typing import List, Tuple, Dict
from sentence_transformers import CrossEncoder
import logging


    RETRIEVAL_TOP_K,
    RERANKER_MODEL_NAME,
    RERANK_TOP_K,
    SIMILARITY_THRESHOLD
)



# Setup logging
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class Retriever:
    """
    Intelligent retrieval system with two-stage ranking

    WHY THIS CLASS:
    - Encapsulates complete retrieval pipeline
    - Combines FAISS (speed) with cross-encoder (accuracy)
    - Provides clean API for RAG system
    """

    def __init__(
        self,
        vector_store: VectorStore,
        reranker_model_name: str = RERANKER_MODEL_NAME
    ):
        """
        Initialize retriever

        Args:
            vector_store: Initialized VectorStore instance; its ``search``
                method is used for stage 1.
            reranker_model_name: Name of cross-encoder model for reranking
        """
        logger.info("Initializing Retriever with reranking")

        self.vector_store = vector_store

        # Initialize cross-encoder for reranking
        # WHY cross-encoder:
        # - Processes (query, document) pair together
        # - Learns interaction between query and doc
        # - More accurate than bi-encoder for final ranking
        # - Slower than bi-encoder, but we only rerank top-K (e.g., 5 docs)
        logger.info(f"Loading cross-encoder: {reranker_model_name}")
        self.reranker = CrossEncoder(reranker_model_name)

        logger.info("‚úÖ Retriever initialized")

    def retrieve(
        self,
        query: str,
        top_k: int = RERANK_TOP_K,
        retrieval_k: int = RETRIEVAL_TOP_K,
        use_reranking: bool = True
    ) -> List[Tuple[KnowledgeItem, float, Dict]]:
        """
        Retrieve most relevant documents for query

        PIPELINE:
        1. Stage 1: vector store search returns up to ``retrieval_k``
           candidates above SIMILARITY_THRESHOLD (fast, high recall)
        2. Stage 2: cross-encoder rescoring, keep the best ``top_k``
           (slow, high precision)

        Args:
            query: User's question
            top_k: Number of final results to return
            retrieval_k: Number of candidates for stage 1 (should be >= top_k)
            use_reranking: Whether to use cross-encoder reranking

        Returns:
            List of (KnowledgeItem, score, metadata) tuples, best first.
            Empty list for a blank query or when stage 1 finds nothing.

            Without reranking, ``score`` is the stage 1 score and metadata
            holds: stage1_score, stage1_rank, reranked=False.

            With reranking, ``score`` is the cross-encoder score and metadata
            holds: stage1_score, stage2_score, final_score, stage1_rank,
            stage2_rank, reranked=True.
        """
        if not query or not query.strip():
            logger.warning("Empty query provided")
            return []

        logger.info(f"Retrieving documents for query: '{query[:100]}...'")

        # =====================================================================
        # STAGE 1: Fast FAISS Similarity Search
        # =====================================================================

        logger.debug(f"Stage 1: FAISS search (top_k={retrieval_k})")

        # Get candidates from vector store
        # WHY retrieval_k > top_k:
        # - Casts wider net to ensure relevant docs are captured
        # - Reranking will filter down to top_k most relevant
        stage1_results = self.vector_store.search(
            query=query,
            top_k=retrieval_k,
            score_threshold=SIMILARITY_THRESHOLD
        )

        if not stage1_results:
            logger.warning("No results from stage 1 (FAISS search)")
            return []

        logger.info(f"Stage 1: Found {len(stage1_results)} candidates")

        # If reranking disabled, return stage 1 results
        if not use_reranking:
            logger.info("Reranking disabled, returning stage 1 results")
            results = []
            for i, (ki, score) in enumerate(stage1_results[:top_k], 1):
                metadata = {
                    "stage1_score": score,
                    "stage1_rank": i,
                    "reranked": False
                }
                results.append((ki, score, metadata))
            return results

        # =====================================================================
        # STAGE 2: Cross-Encoder Reranking
        # =====================================================================

        logger.debug(f"Stage 2: Cross-encoder reranking (top_k={top_k})")

        # Prepare (query, document) pairs for cross-encoder
        # WHY: Cross-encoder requires both query and doc as input
        pairs = [(query, ki.text) for ki, _ in stage1_results]

        # Get reranking scores
        # WHY: Cross-encoder outputs a single score for each (query, doc) pair
        # Higher score = more relevant
        rerank_scores = self.reranker.predict(pairs)

        # Combine results with both stage 1 and stage 2 scores.
        # WHY record stage1_rank here: the position in stage1_results is known
        # at this point. Looking it up afterwards with list.index() depends on
        # KnowledgeItem equality, returns the first match for duplicate items,
        # raises ValueError for NaN scores, and is quadratic in the candidates.
        combined_results = []
        for stage1_rank, ((ki, stage1_score), rerank_score) in enumerate(
            zip(stage1_results, rerank_scores), 1
        ):
            combined_results.append({
                "ki": ki,
                "stage1_score": float(stage1_score),
                "stage1_rank": stage1_rank,
                "stage2_score": float(rerank_score),
                "final_score": float(rerank_score)  # Use stage 2 score as final
            })

        # Sort by reranking score (descending)
        # WHY: Cross-encoder scores are more accurate for final ranking
        # (sort is stable, so ties keep their stage 1 order)
        combined_results.sort(key=lambda x: x["final_score"], reverse=True)

        # Take top-k
        top_results = combined_results[:top_k]

        # Format output
        results = []
        for i, result in enumerate(top_results, 1):
            metadata = {
                "stage1_score": result["stage1_score"],
                "stage2_score": result["stage2_score"],
                "final_score": result["final_score"],
                "stage1_rank": result["stage1_rank"],
                "stage2_rank": i,
                "reranked": True
            }
            results.append((result["ki"], result["final_score"], metadata))

        logger.info(f"Stage 2: Reranked to top {len(results)} results")

        if results:
            top_result = results[0]
            logger.info(
                f"Top result: '{top_result[0].topic}' "
                f"(stage1_score: {top_result[2]['stage1_score']:.3f}, "
                f"stage2_score: {top_result[2]['stage2_score']:.3f})"
            )

        return results

    def get_context_for_generation(
        self,
        query: str,
        top_k: int = RERANK_TOP_K
    ) -> Tuple[str, List[Dict]]:
        """
        Get formatted context for LLM generation

        WHY THIS METHOD:
        - Retrieves relevant docs and formats them for LLM
        - Returns both context string and metadata
        - Makes it easy to feed into generator

        Args:
            query: User's question
            top_k: Number of documents to retrieve

        Returns:
            Tuple of (context_string, sources_metadata)
            - context_string: Formatted text to inject into LLM prompt;
              "" when nothing was retrieved
            - sources_metadata: List of dicts with keys number, id, topic,
              score and metadata (the per-result dict from ``retrieve``);
              [] when nothing was retrieved
        """
        # Retrieve documents (reranking enabled, default retrieval_k)
        results = self.retrieve(query, top_k=top_k)

        if not results:
            logger.warning("No documents retrieved for context")
            return "", []

        # Format context
        # WHY: Clear structure helps LLM understand and cite sources
        context_parts = []
        sources_metadata = []

        for i, (ki, score, metadata) in enumerate(results, 1):
            # Add document to context
            # WHY numbered sections: Makes it easy for LLM to cite sources
            context_parts.append(f"[Document {i}: {ki.topic}]\n{ki.text}\n")

            # Track source metadata
            sources_metadata.append({
                "number": i,
                "id": ki.id,
                "topic": ki.topic,
                "score": score,
                "metadata": metadata
            })

        # Documents are separated by a "---" rule line
        context_string = "\n---\n\n".join(context_parts)

        logger.info(f"Generated context from {len(results)} documents ({len(context_string)} characters)")

        return context_string, sources_metadata


# ============================================================================
# TESTING & VALIDATION
# ============================================================================

# Smoke test for the Retriever: loads (or builds) the index, then prints
# stage-1-only and reranked results plus the formatted context for a few
# sample queries. Any exception is caught and printed with its traceback.
if __name__ == "__main__":
    # NOTE: the bare string below is a no-op expression statement, not a real
    # docstring (an `if` block cannot carry one); it is kept for readers only.
    """
    Test the retriever with reranking
    
    WHY: Validates that two-stage retrieval improves accuracy
    """
    print("=" * 80)
    print("TESTING RETRIEVER WITH RERANKING")
    print("=" * 80)
    
    
    
    try:
        # Load knowledge base
        # (load_knowledge_base is expected to be defined by an earlier cell)
        print("\nüìö Loading knowledge base...")
        knowledge_items = load_knowledge_base()
        
        # Initialize vector store
        print("\nüîß Initializing vector store...")
        vector_store = VectorStore()
        
        # Check if index exists, otherwise build
        # WHY: reuse a previously saved index when one is reported to exist;
        # otherwise build it from the knowledge items and save it for next time
        if vector_store.index_exists():
            print("Loading existing index...")
            vector_store.load_index()
        else:
            print("Building new index...")
            vector_store.build_index(knowledge_items)
            vector_store.save_index()
        
        # Initialize retriever
        print("\nüéØ Initializing retriever...")
        retriever = Retriever(vector_store)
        
        # Test queries
        test_queries = [
            "How do I reset my forgotten PIN?",
            "I need to set up company email on my Android phone",
            "My printer is jammed, what should I do?",
            "How can I configure VPN to work from home?"
        ]
        
        for query in test_queries:
            print("\n" + "="*80)
            print(f"üìù Query: '{query}'")
            print("="*80)
            
            # Test WITHOUT reranking
            # (score printed here is the stage 1 score)
            print("\nüîç Stage 1 Only (FAISS):")
            results_no_rerank = retriever.retrieve(query, top_k=3, use_reranking=False)
            for i, (ki, score, metadata) in enumerate(results_no_rerank, 1):
                print(f"  {i}. {ki.topic}")
                print(f"     Score: {score:.3f}")
            
            # Test WITH reranking
            # (prints both scores and how the rank moved between the stages)
            print("\nüéØ Stage 1 + Stage 2 (FAISS + Reranking):")
            results_rerank = retriever.retrieve(query, top_k=3, use_reranking=True)
            for i, (ki, score, metadata) in enumerate(results_rerank, 1):
                print(f"  {i}. {ki.topic}")
                print(f"     Stage 1 Score: {metadata['stage1_score']:.3f}")
                print(f"     Stage 2 Score: {metadata['stage2_score']:.3f}")
                print(f"     Rank Change: {metadata['stage1_rank']} ‚Üí {metadata['stage2_rank']}")
            
            # Get formatted context
            # (only its length and the source topics are printed)
            print("\nüìÑ Formatted Context:")
            context, sources = retriever.get_context_for_generation(query, top_k=2)
            print(f"  Context length: {len(context)} characters")
            print(f"  Sources: {[s['topic'] for s in sources]}")
        
        print("\n‚úÖ RETRIEVER TEST PASSED")
        
    except Exception as e:
        # Top-level boundary of the smoke test: report the failure and show
        # the traceback instead of letting the exception propagate.
        print(f"\n‚ùå RETRIEVER TEST FAILED: {e}")
        import traceback
        traceback.print_exc()


## 5. Generator (Flan-T5)

In [None]:
"""
Generator Module for RAG System

This module handles response generation using a LOCAL LLM (Flan-T5).

WHY LOCAL LLM:
- User requirement: "No API Key" architecture
- Hardware constraint: Low RAM (<2GB)
- "google/flan-t5-base" is chosen for efficiency and robustness

Author: RAG System
"""

import logging
from typing import List, Dict, Tuple
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



# Setup logging
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers configured, so repeating this call in several cells is harmless;
# only the first call in a session takes effect.
logging.basicConfig(level=logging.INFO)
# Logger used by the Generator code below.
logger = logging.getLogger(__name__)


class Generator:
    """
    Local LLM response generator using HuggingFace Transformers

    WHY THIS IMPLEMENTATION:
    - Runs entirely offline
    - Uses Seq2Seq model (T5) which is great for "text-to-text" tasks like QA
    """

    def __init__(self, model_name: str = LLM_MODEL_NAME):
        """
        Initialize local generator

        Args:
            model_name: HuggingFace model name (e.g., "google/flan-t5-base")

        Raises:
            RuntimeError: If the tokenizer or model cannot be loaded (the
                original exception is chained as the cause).
        """
        logger.info(f"Initializing Local Generator with model: {model_name}")

        # Remember which checkpoint this instance loads so response metadata
        # reports it, rather than the module-level default.
        self.model_name = model_name

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Load model
            # WHY device_map="cpu":
            # - We know system is RAM constrained and lacks NVIDIA GPU
            # - "auto" might try to use GPU and fail
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                device_map="cpu",
                torch_dtype=torch.float32  # FP32 is safer for CPU
            )

            logger.info("‚úÖ Local Generator initialized successfully")

        except Exception as e:
            logger.error(f"Failed to load local model: {e}")
            raise RuntimeError(f"Could not load model {model_name}. Check internet connection or RAM.") from e

    def _create_prompt(self, query: str, context: str) -> str:
        """
        Create prompt formatted for Flan-T5 (Question First Strategy)

        WHY QUESTION FIRST:
        - The tokenizer call in generate_response truncates to 512 tokens.
        - If we put Context first, the Question at the end gets truncated.
        - By putting Question first, the model always knows WHAT to do, even if context is cut.

        Args:
            query: User's question
            context: Retrieved context text

        Returns:
            Prompt string of the form "question: <query> context: <context>"
        """
        # T5 prefers: "question: ... context: ..."
        # Using standard T5 prefix format
        return f"question: {query} context: {context}"

    def generate_response(
        self,
        query: str,
        context: str,
        sources_metadata: List[Dict],
        temperature: float = LLM_TEMPERATURE,
        max_tokens: int = LLM_MAX_TOKENS
    ) -> Tuple[str, Dict]:
        """
        Generate response using local LLM

        Args:
            query: User's question
            context: Retrieved context to ground the answer; an empty or
                whitespace-only context short-circuits to a fallback answer
            sources_metadata: Dicts describing the sources; each must have a
                "topic" key (echoed back in the metadata)
            temperature: Sampling temperature. A value > 0 enables sampling;
                0 (or None) uses deterministic greedy decoding
            max_tokens: Passed to ``generate`` as ``max_length``

        Returns:
            Tuple of (response_text, metadata).
            - Success: metadata has "model" (the model this instance loaded)
              and "sources_used" (list of source topics)
            - No context: fallback text, metadata {"error": "no_context"}
            - Any failure during generation: "Error generating response.",
              metadata {"error": <exception text>} (best effort, never raises)
        """
        if not context or not context.strip():
            return "I don't have enough information to answer that question.", {"error": "no_context"}

        logger.info(f"Generating response for query: '{query[:50]}...'")

        # Prepare prompt
        prompt = self._create_prompt(query, context)

        try:
            # Tokenize (input is cut at 512 tokens; the question comes first
            # in the prompt so it survives truncation)
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=512,
                truncation=True
            )

            # Generate
            # WHY generation parameters:
            # - do_sample: sample only when temperature > 0, otherwise greedy
            # - temperature: only passed when sampling, so no explicit None
            #   is forwarded into the generation config
            # - max_length: Limit response size
            use_sampling = temperature is not None and temperature > 0
            generation_kwargs = {
                "max_length": max_tokens,
                "do_sample": use_sampling,
            }
            if use_sampling:
                generation_kwargs["temperature"] = temperature

            outputs = self.model.generate(**inputs, **generation_kwargs)

            # Decode
            response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Metadata
            metadata = {
                "model": self.model_name,
                "sources_used": [s['topic'] for s in sources_metadata]
            }

            logger.info(f"Generated response: {response_text[:50]}...")
            return response_text, metadata

        except Exception as e:
            # Deliberate best-effort boundary: callers get an error payload
            # instead of an exception.
            logger.error(f"Generation failed: {e}")
            return "Error generating response.", {"error": str(e)}

# Smoke test for the Generator: loads the default model and answers one
# question from a hand-written context, with no retrieval involved.
if __name__ == "__main__":
    print("Testing Local Generator...")
    # Uses the default model_name of Generator.__init__
    gen = Generator()
    ctx = "To reset your password, visit password.corp.com and enter your employee ID."
    q = "Where do I go to reset my password?"
    # Empty sources list: the returned metadata is discarded here
    res, _ = gen.generate_response(q, ctx, [])
    print(f"Query: {q}\nResponse: {res}")


## 6. Execution & Testing

In [None]:

import logging

# Setup Logging
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers configured (e.g. an earlier cell already called it).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize System
# End-to-end run: load data -> build index -> create retriever and generator
# -> answer two example questions.
print("Initializing System...")

# 1. Load Data
# (load_knowledge_base is expected to be defined by an earlier cell;
#  presumably it raises FileNotFoundError when the CSV is missing - confirm)
try:
    knowledge_items = load_knowledge_base()
except FileNotFoundError:
    print("Please upload 'rag_sample_qas_from_kis.csv' to the current directory.")
    knowledge_items = []

# Everything below only runs when at least one knowledge item was loaded.
if knowledge_items:
    # 2. Build Index
    # (always rebuilt here; a previously saved index is not reused)
    vector_store = VectorStore()
    vector_store.build_index(knowledge_items)

    # 3. Init Components
    retriever = Retriever(vector_store)
    generator = Generator()

    print("\n‚úÖ System Initialized!\n")

    # Question-answering helper (called below with example questions)
    def ask(query: str) -> None:
        """
        Answer one question end-to-end and print every step.

        Retrieves context for ``query`` with the module-level ``retriever``,
        prints each source's topic and score, generates an answer with the
        module-level ``generator`` and prints it. Returns nothing.

        Args:
            query: User's question
        """
        print(f"‚ùì Question: {query}")
        print("üîç Retrieving...")
        context, sources = retriever.get_context_for_generation(query, top_k=2)
        
        for s in sources:
            print(f"  - Found: {s['topic']} (Score: {s['score']:.3f})")

        print("üß† Generating...")
        # Response metadata is not needed for display, so it is discarded
        response, _ = generator.generate_response(query, context, sources)
        print(f"ü§ñ Answer: {response}\n")
        print("-" * 50)

    # Examples
    ask("How do I reset my PIN?")
    ask("How do I configure VPN?")
else:
    # Reached when the CSV was missing or yielded no knowledge items;
    # no fallback data is loaded in this branch.
    print("Using dummy data or failed to load.")
