In [1]:
"""
Complete Local RAG Chatbot with Image Understanding
===================================================

✅ No Cloud Dependencies (runs 100% locally)
✅ No RAGatouille (direct Jina ColBERT v2 implementation)
✅ PyMuPDF4LLM for PDF conversion
✅ Image extraction and analysis with a local vision model (gemma3:4b by default, see RAGConfig.vision_model)
✅ Hybrid retrieval (BM25s + Jina ColBERT v2 + RRF + Reranking)
✅ Markdown-aware semantic chunking
✅ SQLite database for storage

Requirements:
- Ollama (for LLMs: llama3.2:3b for chat, gemma3:4b for vision)
- Mac Mini M4 or similar (16GB RAM recommended)
"""



In [2]:
import os
# Suppress tokenizers parallelism warning when forking
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
import re
import io
import time
import warnings
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path

# Suppress deprecation warnings from transformers/sentence-transformers
warnings.filterwarnings('ignore', message='.*torch_dtype.*deprecated.*')

# Core libraries
import numpy as np
import torch
from PIL import Image as PILImage  # Renamed to avoid conflict with database model

# PDF and text processing
import pymupdf4llm
import fitz  # PyMuPDF for image extraction
from transformers import AutoTokenizer

# Retrieval
import bm25s
from bm25s.hf import BM25HF
import Stemmer  # PyStemmer for stemming
from sentence_transformers import SentenceTransformer

# Database
import sqlalchemy
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, Boolean
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import DeclarativeBase

# LLM
import requests  # For Ollama API

In [3]:
# ============================================================================
# CONFIGURATION
# ============================================================================

@dataclass
class RAGConfig:
    """Configuration for the local RAG system.

    Groups storage locations, chunking limits, retrieval sizes, model names and
    Ollama connection settings in one place. The four path fields default to
    ``None`` and are filled in by ``__post_init__`` relative to ``base_dir``;
    a value passed explicitly is kept as given.
    """
    # Base directory: the parent of the current working directory (the project root
    # when the notebook runs from a notebooks folder). Evaluated once, when the
    # class is defined.
    base_dir: str = os.path.abspath(os.path.join(os.getcwd(), '..'))

    # Database file; None -> <base_dir>/rag_local.db
    db_path: Optional[str] = None

    # Chunking - sizes are CHARACTER counts (not token counts)
    # Sized for small models (3B params) with limited context windows
    min_chunk_size: int = 600   # Minimum 600 characters per chunk
    max_chunk_size: int = 800   # Maximum 800 characters per chunk (HARD LIMIT)
    chunk_overlap: int = 200    # 200 character overlap between chunks

    # Retrieval - candidate pool sizes for each first-stage retriever
    bm25_top_k: int = 100       # BM25 initial candidates
    colbert_top_k: int = 100    # ColBERT initial candidates

    # Adaptive top-k: adjust based on query complexity
    final_top_k_min: int = 5    # Minimum chunks (for simple queries)
    final_top_k_max: int = 10   # Maximum chunks (for complex queries)
    final_top_k_default: int = 7  # Default for most queries

    # Context window management
    max_context_chars: int = 6000  # Max chars to send to LLM (7-8 chunks × 800)

    # Models
    chat_model: str = "llama3.2:3b"
    vision_model: str = "gemma3:4b"
    embedding_model: str = "jinaai/jina-colbert-v2"

    # Ollama
    ollama_url: str = "http://localhost:11434"
    ollama_timeout: int = 300  # Seconds; generous because local models can be slow

    # Paths; None -> derived from base_dir in __post_init__
    bm25_index_path: Optional[str] = None
    colbert_index_path: Optional[str] = None
    images_dir: Optional[str] = None

    # Device: Apple "mps" backend when torch reports it available, otherwise CPU.
    # Evaluated once, when the class is defined.
    device: str = "mps" if torch.backends.mps.is_available() else "cpu"

    def __post_init__(self):
        """Fill in every path that was left as ``None`` with a location under ``base_dir``."""
        if self.db_path is None:
            self.db_path = os.path.join(self.base_dir, "rag_local.db")
        if self.bm25_index_path is None:
            self.bm25_index_path = os.path.join(self.base_dir, "indexes", "bm25s")
        if self.colbert_index_path is None:
            self.colbert_index_path = os.path.join(self.base_dir, "indexes", "colbert")
        if self.images_dir is None:
            self.images_dir = os.path.join(self.base_dir, "extracted_images")

In [4]:
# ============================================================================
# DATABASE MODELS
# ============================================================================

class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this file inherit from."""
    pass

class Document(Base):
    """ORM model mapped to the ``documents`` table."""
    __tablename__ = 'documents'

    id = Column(Integer, primary_key=True)
    filename = Column(String(255), nullable=False)
    # Naive UTC timestamp taken when the row is inserted. Produces the same values
    # as `datetime.utcnow`, which is deprecated since Python 3.12.
    upload_date = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    total_pages = Column(Integer)
    # Free-form status label; the allowed values are not defined in this model.
    status = Column(String(50))

class Image(Base):
    """ORM model mapped to the ``images`` table.

    Stores where an image file lives (``image_path``), the page it belongs to, and
    three text fields: ``description``, ``image_type`` and ``ocr_text``.
    """
    __tablename__ = 'images'

    id = Column(Integer, primary_key=True)
    # Plain integer, no ForeignKey declared; presumably holds a documents.id - TODO confirm
    document_id = Column(Integer, nullable=False)
    page_number = Column(Integer, nullable=False)
    image_path = Column(String(500), nullable=False)
    description = Column(Text)
    image_type = Column(String(50))
    ocr_text = Column(Text)

class Chunk(Base):
    """ORM model mapped to the ``chunks`` table: one row per stored text chunk."""
    __tablename__ = 'chunks'

    id = Column(Integer, primary_key=True)
    # Plain integer, no ForeignKey declared; presumably holds a documents.id - TODO confirm
    document_id = Column(Integer, nullable=False)
    chunk_index = Column(Integer, nullable=False)
    text = Column(Text, nullable=False)
    heading_path = Column(String(500))
    token_count = Column(Integer)
    has_images = Column(Boolean, default=False)
    # Stored as text; presumably a serialized (e.g. JSON) blob - verify against the writer
    chunk_metadata = Column(Text)

In [5]:
# ============================================================================
# OLLAMA CLIENT WITH STREAMING SUPPORT
# ============================================================================

class OllamaClient:
    """Client for interacting with Ollama API with streaming support"""
    
    def __init__(self, config: RAGConfig):
        self.config = config
        self.base_url = config.ollama_url
    
    def generate(
        self, 
        model: str, 
        prompt: str, 
        system: str = "",
        images: List[str] = None,
        timeout: int = 300,
        stream: bool = False
    ) -> str:
        """Generate text with Ollama (with optional streaming)"""
        url = f"{self.base_url}/api/generate"
        
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": stream
        }
        
        if system:
            payload["system"] = system
        
        if images:
            payload["images"] = images
        
        try:
            if stream:
                # Streaming mode - print tokens as they arrive
                response = requests.post(url, json=payload, timeout=timeout, stream=True)
                response.raise_for_status()
                
                full_response = ""
                for line in response.iter_lines():
                    if line:
                        chunk = json.loads(line)
                        if "response" in chunk:
                            token = chunk["response"]
                            print(token, end='', flush=True)
                            full_response += token
                        
                        # Check if done
                        if chunk.get("done", False):
                            break
                
                print()  # Newline after streaming
                return full_response
            else:
                # Non-streaming mode - wait for complete response
                response = requests.post(url, json=payload, timeout=timeout)
                response.raise_for_status()
                result = response.json()
                return result.get("response", "")
        
        except requests.exceptions.Timeout:
            return f"Error: Request timed out after {timeout} seconds"
        except requests.exceptions.RequestException as e:
            return f"Error: {str(e)}"
    
    def analyze_image(self, image_path: str) -> Dict:
        """Analyze image using vision model"""
        import base64
        
        # Read and encode image
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode('utf-8')
        
        # Prompt for image analysis
        prompt = """Analyze this image and provide:
1. A detailed description of what you see
2. The type of image (diagram, chart, photo, screenshot, etc.)
3. Any text visible in the image (OCR)

Format your response as:
DESCRIPTION: [your description]
TYPE: [image type]
TEXT: [any visible text]"""
        
        try:
            response = self.generate(
                model=self.config.vision_model,
                prompt=prompt,
                images=[image_data],
                timeout=self.config.ollama_timeout
            )
            
            # Parse response
            description = ""
            image_type = "unknown"
            ocr_text = ""
            
            lines = response.split('\n')
            current_section = None
            
            for line in lines:
                line = line.strip()
                if line.startswith('DESCRIPTION:'):
                    current_section = 'description'
                    description = line.replace('DESCRIPTION:', '').strip()
                elif line.startswith('TYPE:'):
                    current_section = 'type'
                    image_type = line.replace('TYPE:', '').strip()
                elif line.startswith('TEXT:'):
                    current_section = 'text'
                    ocr_text = line.replace('TEXT:', '').strip()
                elif current_section == 'description' and line:
                    description += ' ' + line
                elif current_section == 'text' and line:
                    ocr_text += ' ' + line
            
            # FIXED: Return 'type' key instead of 'image_type' to match what DocumentProcessor expects
            return {
                'description': description.strip() or response[:200],  # Fallback to first 200 chars
                'type': image_type.strip() or 'image',  # Changed from 'image_type' to 'type'
                'ocr_text': ocr_text.strip()
            }
            
        except Exception as e:
            print(f"    ‚ö†Ô∏è  Error analyzing image: {e}")
            return {
                'description': 'Image analysis failed',
                'type': 'unknown',  # Changed from 'image_type' to 'type'
                'ocr_text': ''
            }
    
    def chat(
        self,
        messages: List[Dict[str, str]],
        context: str = None,
        stream: bool = True  # Enable streaming by default!
    ) -> str:
        """Chat with context - ULTRA-STRONG grounding to prevent hallucination"""
        
        # Use /api/chat endpoint for proper message handling
        url = f"{self.base_url}/api/chat"

        # ULTRA-STRONG system message - Maximum grounding for small models
        # UPDATED: Removed citation requirement for natural conversation flow
        if context:
            system_msg = f"""You are a helpful AI assistant that answers questions based on provided documents.

üö´ ABSOLUTE RULES:
- ONLY use information from the documents provided below
- DO NOT use knowledge from your training data
- DO NOT make assumptions beyond what's written
- If the answer is not in the documents, say: "I don't have that information in the provided documents."

‚úÖ HOW TO ANSWER:
- Read the documents carefully
- Provide clear, direct answers
- Use natural language (no need to cite "Source 1" etc.)
- Be concise but complete

üìÑ DOCUMENTS:
{context}

Answer the user's question naturally and helpfully using only the information above."""
        else:
            system_msg = "You are a helpful assistant."

        # CRITICAL FIX: When context is provided (RAG mode), only pass the current query
        # This prevents the LLM from getting confused by previous answers based on different contexts
        if context:
            # For RAG queries: Only send the LATEST user question (not full conversation history)
            # This prevents hallucination from mixing old context with new queries
            chat_messages = [
                {"role": "system", "content": system_msg},
                {"role": "user", "content": messages[-1]["content"]}  # Only current question!
            ]
        else:
            # For non-RAG queries: Include full conversation history
            chat_messages = [
                {"role": "system", "content": system_msg}
            ]
            for msg in messages:
                chat_messages.append({
                    "role": msg["role"],
                    "content": msg["content"]
                })

        # Call Ollama chat API with STRONGER grounding parameters
        payload = {
            "model": self.config.chat_model,
            "messages": chat_messages,
            "stream": stream,
            "options": {
                "temperature": 0.0,  # ZERO temperature for maximum factuality!
                "top_p": 0.8,  # Reduced for more focused responses
                "top_k": 20,  # Limit vocabulary to most likely tokens
                "repeat_penalty": 1.2,  # Increased to prevent repetition
                "num_ctx": 4096  # Ensure enough context window
            }
        }

        try:
            if stream:
                response = requests.post(url, json=payload, timeout=self.config.ollama_timeout, stream=True)
                response.raise_for_status()

                full_response = ""
                for line in response.iter_lines():
                    if line:
                        chunk = json.loads(line)
                        if "message" in chunk and "content" in chunk["message"]:
                            token = chunk["message"]["content"]
                            print(token, end='', flush=True)
                            full_response += token

                        if chunk.get("done", False):
                            break

                print()
                return full_response
            else:
                response = requests.post(url, json=payload, timeout=self.config.ollama_timeout)
                response.raise_for_status()
                return response.json()["message"]["content"]

        except requests.exceptions.Timeout:
            return f"Error: Request timed out after {self.config.ollama_timeout} seconds"
        except requests.exceptions.RequestException as e:
            return f"Error: {str(e)}"

In [6]:
# ============================================================================
# MARKDOWN-AWARE SEMANTIC CHUNKER - REWRITTEN FOR PROPER SIZE ENFORCEMENT
# ============================================================================

class MarkdownSemanticChunker:
    """
    Intelligent markdown chunking that STRICTLY respects size limits while maintaining hierarchy.
    
    Key improvements:
    - Uses CHARACTER counts (not misleading token counts)
    - HARD enforces max_chunk_size (no more 26K char chunks!)
    - Respects markdown hierarchy when possible
    - Splits at sentence boundaries for better semantic coherence
    - NOTE: `chunk_overlap` from the config is not applied by this class; consecutive chunks of a section share only their heading prefix
    """
    
    def __init__(self, config: RAGConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    def chunk_markdown(self, markdown_text: str, doc_context: str = "") -> List[Dict]:
        """Create semantically meaningful chunks with STRICT size enforcement"""
        sections = self._parse_markdown_hierarchy(markdown_text)
        chunks = self._create_chunks_from_sections(sections, doc_context)
        return chunks
    
    def _parse_markdown_hierarchy(self, text: str) -> List[Dict]:
        """Parse markdown into hierarchical sections"""
        lines = text.split('\n')
        sections = []
        current_section = None
        heading_stack = []
        
        for line in lines:
            heading_match = re.match(r'^(#{1,6})\s+(.+)$', line)
            
            if heading_match:
                if current_section:
                    sections.append(current_section)
                
                level = len(heading_match.group(1))
                title = heading_match.group(2).strip()
                
                heading_stack = [(lvl, ttl) for lvl, ttl in heading_stack if lvl < level]
                heading_stack.append((level, title))
                
                parent_path = ' > '.join([ttl for _, ttl in heading_stack[:-1]])
                full_path = ' > '.join([ttl for _, ttl in heading_stack])
                
                current_section = {
                    'level': level,
                    'title': title,
                    'content': '',
                    'parent_path': parent_path,
                    'full_path': full_path
                }
            else:
                if current_section is not None:
                    current_section['content'] += line + '\n'
                else:
                    if not sections or sections[-1]['level'] != 0:
                        sections.append({
                            'level': 0,
                            'title': 'Introduction',
                            'content': line + '\n',
                            'parent_path': '',
                            'full_path': 'Introduction'
                        })
                    else:
                        sections[-1]['content'] += line + '\n'
        
        if current_section:
            sections.append(current_section)
        
        return sections
    
    def _create_chunks_from_sections(self, sections: List[Dict], doc_context: str) -> List[Dict]:
        """
        Create chunks with HARD size limits while respecting markdown hierarchy.
        
        Strategy:
        1. Try to keep sections together if they fit
        2. If section is too large, split at paragraph boundaries
        3. If paragraph is too large, split at sentence boundaries
        4. ALWAYS enforce max_chunk_size as HARD limit
        """
        chunks = []
        
        for section in sections:
            section_chunks = self._process_section(section, doc_context)
            chunks.extend(section_chunks)
        
        return chunks
    
    def _process_section(self, section: Dict, doc_context: str) -> List[Dict]:
        """Process a single section, splitting if necessary"""
        # Format section with heading
        heading_text = self._format_heading(section)
        content = section['content'].strip()
        
        # Calculate sizes
        heading_size = len(heading_text)
        content_size = len(content)
        total_size = heading_size + content_size
        
        # Case 1: Entire section fits within max size
        if total_size <= self.config.max_chunk_size:
            return [{
                'text': heading_text + content,
                'heading_path': section['full_path'],
                'level': section['level'],
                'char_count': total_size,
                'token_count': self._estimate_tokens(total_size),
                'doc_context': doc_context,
                'type': 'complete_section'
            }]
        
        # Case 2: Section is too large - need to split
        # Try splitting at paragraph boundaries first
        paragraphs = re.split(r'\n\n+', content)
        
        if len(paragraphs) > 1:
            return self._split_by_paragraphs(section, heading_text, paragraphs, doc_context)
        else:
            # Single large paragraph - split by sentences
            return self._split_by_sentences(section, heading_text, content, doc_context)
    
    def _split_by_paragraphs(
        self, 
        section: Dict, 
        heading_text: str, 
        paragraphs: List[str], 
        doc_context: str
    ) -> List[Dict]:
        """Split section by paragraphs, respecting max_chunk_size"""
        chunks = []
        current_text = heading_text
        current_size = len(heading_text)
        part_num = 1
        
        for para in paragraphs:
            para = para.strip()
            if not para:
                continue
            
            para_size = len(para) + 2  # +2 for \n\n
            
            # Check if adding this paragraph would exceed max size
            if current_size + para_size > self.config.max_chunk_size:
                # Save current chunk if it has content beyond heading
                if current_size > len(heading_text):
                    chunks.append({
                        'text': current_text.strip(),
                        'heading_path': section['full_path'],
                        'level': section['level'],
                        'char_count': len(current_text.strip()),
                        'token_count': self._estimate_tokens(len(current_text.strip())),
                        'doc_context': doc_context,
                        'type': 'split_section',
                        'part': part_num
                    })
                    part_num += 1
                
                # Check if paragraph itself is too large
                if para_size > self.config.max_chunk_size - len(heading_text):
                    # Paragraph is too large - split by sentences
                    sentence_chunks = self._split_paragraph_by_sentences(
                        section, heading_text, para, doc_context, part_num
                    )
                    chunks.extend(sentence_chunks)
                    part_num += len(sentence_chunks)
                    current_text = heading_text
                    current_size = len(heading_text)
                else:
                    # Start new chunk with this paragraph
                    current_text = heading_text + para + '\n\n'
                    current_size = len(current_text)
            else:
                # Add paragraph to current chunk
                current_text += para + '\n\n'
                current_size += para_size
        
        # Add final chunk if it has content
        if current_size > len(heading_text):
            chunks.append({
                'text': current_text.strip(),
                'heading_path': section['full_path'],
                'level': section['level'],
                'char_count': len(current_text.strip()),
                'token_count': self._estimate_tokens(len(current_text.strip())),
                'doc_context': doc_context,
                'type': 'split_section',
                'part': part_num
            })
        
        return chunks
    
    def _split_by_sentences(
        self, 
        section: Dict, 
        heading_text: str, 
        content: str, 
        doc_context: str
    ) -> List[Dict]:
        """Split content by sentences when paragraphs are too large"""
        # Simple sentence splitting (can be improved with nltk if needed)
        sentences = re.split(r'(?<=[.!?])\s+', content)
        
        chunks = []
        current_text = heading_text
        current_size = len(heading_text)
        part_num = 1
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            
            sentence_size = len(sentence) + 1  # +1 for space
            
            # Check if adding this sentence would exceed max size
            if current_size + sentence_size > self.config.max_chunk_size:
                # Save current chunk if it has content
                if current_size > len(heading_text):
                    chunks.append({
                        'text': current_text.strip(),
                        'heading_path': section['full_path'],
                        'level': section['level'],
                        'char_count': len(current_text.strip()),
                        'token_count': self._estimate_tokens(len(current_text.strip())),
                        'doc_context': doc_context,
                        'type': 'sentence_split',
                        'part': part_num
                    })
                    part_num += 1
                
                # If sentence itself is too large, truncate it (last resort)
                if sentence_size > self.config.max_chunk_size - len(heading_text):
                    truncated = sentence[:self.config.max_chunk_size - len(heading_text) - 3] + "..."
                    chunks.append({
                        'text': heading_text + truncated,
                        'heading_path': section['full_path'],
                        'level': section['level'],
                        'char_count': len(heading_text + truncated),
                        'token_count': self._estimate_tokens(len(heading_text + truncated)),
                        'doc_context': doc_context,
                        'type': 'truncated',
                        'part': part_num
                    })
                    part_num += 1
                    current_text = heading_text
                    current_size = len(heading_text)
                else:
                    # Start new chunk with this sentence
                    current_text = heading_text + sentence + ' '
                    current_size = len(current_text)
            else:
                # Add sentence to current chunk
                current_text += sentence + ' '
                current_size += sentence_size
        
        # Add final chunk if it has content
        if current_size > len(heading_text):
            chunks.append({
                'text': current_text.strip(),
                'heading_path': section['full_path'],
                'level': section['level'],
                'char_count': len(current_text.strip()),
                'token_count': self._estimate_tokens(len(current_text.strip())),
                'doc_context': doc_context,
                'type': 'sentence_split',
                'part': part_num
            })
        
        return chunks
    
    def _split_paragraph_by_sentences(
        self,
        section: Dict,
        heading_text: str,
        paragraph: str,
        doc_context: str,
        start_part_num: int
    ) -> List[Dict]:
        """Helper to split a single large paragraph by sentences"""
        sentences = re.split(r'(?<=[.!?])\s+', paragraph)
        chunks = []
        current_text = heading_text
        current_size = len(heading_text)
        part_num = start_part_num
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            
            sentence_size = len(sentence) + 1
            
            if current_size + sentence_size > self.config.max_chunk_size:
                if current_size > len(heading_text):
                    chunks.append({
                        'text': current_text.strip(),
                        'heading_path': section['full_path'],
                        'level': section['level'],
                        'char_count': len(current_text.strip()),
                        'token_count': self._estimate_tokens(len(current_text.strip())),
                        'doc_context': doc_context,
                        'type': 'sentence_split',
                        'part': part_num
                    })
                    part_num += 1
                
                current_text = heading_text + sentence + ' '
                current_size = len(current_text)
            else:
                current_text += sentence + ' '
                current_size += sentence_size
        
        if current_size > len(heading_text):
            chunks.append({
                'text': current_text.strip(),
                'heading_path': section['full_path'],
                'level': section['level'],
                'char_count': len(current_text.strip()),
                'token_count': self._estimate_tokens(len(current_text.strip())),
                'doc_context': doc_context,
                'type': 'sentence_split',
                'part': part_num
            })
        
        return chunks
    
    def _format_heading(self, section: Dict) -> str:
        """Format section heading with context"""
        parts = []
        
        if section['parent_path']:
            parts.append(f"[Context: {section['parent_path']}]")
        
        if section['title'] and section['title'] != 'Introduction':
            heading_prefix = '#' * section['level']
            parts.append(f"{heading_prefix} {section['title']}")
        
        if parts:
            return '\n\n'.join(parts) + '\n\n'
        return ''
    
    def _estimate_tokens(self, char_count: int) -> int:
        """Estimate token count from character count (rough approximation)"""
        # Rough estimate: 1 token ‚âà 4 characters for English text
        return char_count // 4
    
    def _count_tokens(self, text: str) -> int:
        """
        DEPRECATED: Old method that was broken due to truncation.
        Kept for compatibility but now just returns character count.
        """
        return len(text)

In [7]:
# ============================================================================
# DOCUMENT PROCESSOR WITH IMAGE EXTRACTION
# ============================================================================

class DocumentProcessor:
    """Handles PDF processing with image extraction and analysis"""
    
    def __init__(self, config: RAGConfig, ollama_client: OllamaClient):
        """
        Store collaborators and make sure the image output directory exists.

        Args:
            config: Pipeline configuration; `config.images_dir` is created if missing.
            ollama_client: Client later used for vision analysis of extracted images.
        """
        self.config = config
        self.ollama = ollama_client
        self.chunker = MarkdownSemanticChunker(config)

        # Extracted images are written here; creating it up front is idempotent
        images_dir = config.images_dir
        os.makedirs(images_dir, exist_ok=True)
    
    def _sanitize_utf8(self, text: str) -> str:
        """IMPROVED: Robust UTF-8 sanitization to prevent database corruption and LLM errors"""
        if not text:
            return text
        
        try:
            # Step 1: Remove invalid UTF-8 sequences
            clean_text = text.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
            
            # Step 2: Remove null bytes which cause database issues
            clean_text = clean_text.replace('\x00', '')
            
            # Step 3: Remove problematic control characters (keep newlines, tabs, carriage returns)
            clean_text = ''.join(
                char for char in clean_text
                if char in ['\n', '\t', '\r'] or ord(char) >= 32
            )
            
            # Step 4: Normalize whitespace (optional but helps with consistency)
            # Replace multiple spaces with single space
            import re
            clean_text = re.sub(r' +', ' ', clean_text)
            
            return clean_text
            
        except Exception as e:
            print(f"    ‚ö†Ô∏è  UTF-8 sanitization error: {e}")
            # Last resort: keep only printable ASCII
            return ''.join(char for char in text if 32 <= ord(char) <= 126 or char in ['\n', '\t'])
    
    def pdf_to_markdown(self, pdf_path: str) -> str:
        """
        Convert the PDF at `pdf_path` to Markdown with PyMuPDF4LLM.

        The converter output is passed through `_sanitize_utf8` before it is
        returned.
        """
        raw_markdown = pymupdf4llm.to_markdown(pdf_path)
        sanitized_markdown = self._sanitize_utf8(raw_markdown)
        return sanitized_markdown
    
    def _group_nearby_rectangles(self, rects: List[fitz.Rect], proximity_threshold: float = 20) -> List[List[int]]:
        """
        Cluster rectangles that lie within `proximity_threshold` of each other.

        Proximity is transitive: a rectangle joins a cluster as soon as it
        intersects any current member after that member has been grown by
        `proximity_threshold` on every side.

        Args:
            rects: Rectangles to cluster.
            proximity_threshold: Margin added around a cluster member before
                the intersection test.

        Returns:
            A list of clusters; each cluster is a list of indices into `rects`.
            Every index appears in exactly one cluster. Empty input gives [].
        """
        if not rects:
            return []

        def grown(rect):
            # Same rectangle, enlarged by the threshold on all four sides
            return fitz.Rect(
                rect.x0 - proximity_threshold,
                rect.y0 - proximity_threshold,
                rect.x1 + proximity_threshold,
                rect.y1 + proximity_threshold
            )

        clusters = []
        taken = [False] * len(rects)

        for seed in range(len(rects)):
            if taken[seed]:
                continue

            members = [seed]
            taken[seed] = True

            # Keep sweeping until a full pass adds nothing new to the cluster
            grew = True
            while grew:
                grew = False
                for candidate, candidate_rect in enumerate(rects):
                    if taken[candidate]:
                        continue

                    # any() stops at the first member that is close enough
                    if any(grown(rects[member]).intersects(candidate_rect) for member in members):
                        members.append(candidate)
                        taken[candidate] = True
                        grew = True

            clusters.append(members)

        return clusters

    def extract_images_from_pdf(
        self,
        pdf_path: str,
        document_id: int,
        min_image_size: int = 50,  # Minimum width/height of the image's rectangle on the page
        proximity_threshold: float = 20  # Group images within this distance (points)
    ) -> List[Dict]:
        """
        Extract images from PDF with intelligent grouping.
        Groups nearby images together to capture complete diagrams.

        For every page, the placement rectangles of all embedded images are
        collected (rectangles narrower or shorter than `min_image_size` are
        ignored) and clustered with `_group_nearby_rectangles`. A cluster with
        one member is exported from the embedded image data; a cluster with
        several members is rendered from the page as one composite picture
        covering the union of the members plus a small padding.

        Args:
            pdf_path: Path of the PDF to read.
            document_id: Used only to build the output file names.
            min_image_size: Threshold compared against the width and height of
                the on-page rectangle (not the pixel size of the embedded image).
            proximity_threshold: Passed through to `_group_nearby_rectangles`.

        Returns:
            One dict per saved PNG with 'page_number' (1-based), 'image_path',
            'image_index', 'is_composite' and 'bbox'; composites also carry
            'num_components'. Images that fail to extract are reported on
            stdout and skipped.
        """
        # Modes Pillow can write to PNG unchanged. RGBA is left out on purpose:
        # this method has always flattened alpha to RGB. Anything else
        # (CMYK, YCbCr, ...) cannot be written as PNG and is converted to RGB
        # instead of failing in save().
        png_ready_modes = ('1', 'L', 'LA', 'P', 'I', 'I;16', 'RGB')

        images = []
        doc = fitz.open(pdf_path)

        # try/finally so the document handle is released even when a page-level
        # call outside the inner try blocks raises.
        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                image_list = page.get_images(full=True)

                if not image_list:
                    continue

                # Get bounding boxes for all images on this page
                image_bboxes = []
                for img_info in image_list:
                    xref = img_info[0]
                    # Get all instances of this image on the page
                    rects = page.get_image_rects(xref)
                    if rects:
                        for rect in rects:
                            # Check minimum size
                            width = rect.width
                            height = rect.height
                            if width >= min_image_size and height >= min_image_size:
                                image_bboxes.append({
                                    'rect': rect,
                                    'xref': xref,
                                    'width': width,
                                    'height': height
                                })

                if not image_bboxes:
                    continue

                # Group nearby images
                rects_only = [bbox['rect'] for bbox in image_bboxes]
                groups = self._group_nearby_rectangles(rects_only, proximity_threshold)

                # Process each group
                for group_idx, group in enumerate(groups):
                    if len(group) == 1:
                        # Single image - extract the embedded image data
                        bbox = image_bboxes[group[0]]
                        try:
                            base_image = doc.extract_image(bbox['xref'])
                            image_bytes = base_image["image"]
                            pil_image = PILImage.open(io.BytesIO(image_bytes))

                            # Save image
                            image_filename = f"doc{document_id}_page{page_num+1}_img{len(images)+1}.png"
                            image_path = os.path.join(self.config.images_dir, image_filename)

                            if pil_image.mode not in png_ready_modes:
                                pil_image = pil_image.convert('RGB')

                            pil_image.save(image_path, 'PNG')

                            images.append({
                                'page_number': page_num + 1,
                                'image_path': image_path,
                                'image_index': len(images),
                                'is_composite': False,
                                'bbox': bbox['rect']
                            })
                        except Exception as e:
                            print(f"    ‚ö†Ô∏è  Failed to extract single image on page {page_num+1}: {e}")

                    else:
                        # Multiple images grouped together - capture as screenshot
                        # Calculate bounding box that encompasses all images in group
                        union_rect = image_bboxes[group[0]]['rect']
                        for idx in group[1:]:
                            union_rect = union_rect | image_bboxes[idx]['rect']  # Union of rectangles

                        # Add some padding, clamped to the page
                        padding = 5
                        union_rect = fitz.Rect(
                            max(0, union_rect.x0 - padding),
                            max(0, union_rect.y0 - padding),
                            min(page.rect.width, union_rect.x1 + padding),
                            min(page.rect.height, union_rect.y1 + padding)
                        )

                        try:
                            # Render this region as an image
                            mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
                            pix = page.get_pixmap(matrix=mat, clip=union_rect)

                            # Convert to PIL Image
                            img_data = pix.tobytes("png")
                            pil_image = PILImage.open(io.BytesIO(img_data))

                            # Save composite image
                            image_filename = f"doc{document_id}_page{page_num+1}_composite{group_idx+1}.png"
                            image_path = os.path.join(self.config.images_dir, image_filename)

                            pil_image.save(image_path, 'PNG')

                            images.append({
                                'page_number': page_num + 1,
                                'image_path': image_path,
                                'image_index': len(images),
                                'is_composite': True,
                                'num_components': len(group),
                                'bbox': union_rect
                            })

                            print(f"    üìä Grouped {len(group)} images into composite on page {page_num+1}")

                        except Exception as e:
                            print(f"    ‚ö†Ô∏è  Failed to create composite image on page {page_num+1}: {e}")
        finally:
            doc.close()

        return images
    
    def analyze_images(
        self, 
        images: List[Dict],
        document_id: int,
        db_session
    ) -> List[int]:
        """
        Describe each extracted image with the vision model and persist the result.

        For every entry in `images` the file at 'image_path' is sent to
        `self.ollama.analyze_image`; the returned 'description', 'type' and
        'ocr_text' values are sanitized and stored as an `Image` row. Rows are
        flushed one by one (so their ids can be collected) and committed once
        after the loop.

        Args:
            images: Dicts with at least 'page_number' and 'image_path'.
            document_id: Id of the owning document row.
            db_session: Session used for add/flush/commit.

        Returns:
            Ids of the created `Image` rows, in input order.
        """
        saved_ids = []

        for position, img_info in enumerate(images, start=1):
            print(f"    Analyzing image {position} on page {img_info['page_number']}...", end=' ')
            started_at = time.time()

            # Analyze with vision model
            analysis = self.ollama.analyze_image(img_info['image_path'])

            # Sanitize every text field before it reaches the database
            description = self._sanitize_utf8(analysis['description'])
            image_type = self._sanitize_utf8(analysis['type'])
            ocr_text = self._sanitize_utf8(analysis['ocr_text'])

            image_record = Image(
                document_id=document_id,
                page_number=img_info['page_number'],
                image_path=img_info['image_path'],
                description=description,
                image_type=image_type,
                ocr_text=ocr_text
            )
            db_session.add(image_record)
            db_session.flush()  # id is read right after the flush

            saved_ids.append(image_record.id)

            elapsed = time.time() - started_at
            print(f"‚úì ({elapsed:.1f}s)")

        db_session.commit()
        return saved_ids
    
    def enrich_chunks_with_images(
        self,
        chunks: List[Dict],
        images_data: List[Dict],
        db_session
    ) -> List[Dict]:
        """Add image context (description + OCR text) to relevant chunks for better search accuracy"""
        
        enriched_chunks = []
        
        for chunk in chunks:
            chunk_copy = chunk.copy()
            
            # Find images that might be relevant to this chunk
            # Simple heuristic: chunks that mention visual content keywords
            relevant_images = []
            
            for img in images_data:
                if any(keyword in chunk['text'].lower() for keyword in 
                       ['figure', 'image', 'diagram', 'chart', 'screenshot', 'see below', 'shown in']):
                    relevant_images.append(img)
            
            if relevant_images:
                # Build comprehensive image context including OCR text
                image_context = "\n\n[Images in this section]:\n"
                image_metadata = []
                
                for img in relevant_images:
                    # Add type and description
                    image_context += f"- {img['type'].capitalize()}: {img['description']}\n"
                    
                    # CRITICAL: Add OCR text if available (makes text in images searchable!)
                    if img.get('ocr_text') and img['ocr_text'].strip():
                        image_context += f"  Text visible in image: {img['ocr_text']}\n"
                    
                    image_metadata.append({
                        'path': img['image_path'],
                        'description': img['description'],
                        'type': img['type'],
                        'ocr_text': img.get('ocr_text', '')
                    })
                
                chunk_copy['text'] = self._sanitize_utf8(chunk['text'] + image_context)
                chunk_copy['has_images'] = True
                chunk_copy['image_paths'] = [img['image_path'] for img in relevant_images]
                chunk_copy['image_metadata'] = image_metadata
            else:
                chunk_copy['text'] = self._sanitize_utf8(chunk['text'])
                chunk_copy['has_images'] = False
            
            enriched_chunks.append(chunk_copy)
        
        return enriched_chunks
    
    def process_document(
        self, 
        pdf_path: str,
        db_session
    ) -> Tuple[List[Dict], int]:
        """
        Complete processing pipeline for one PDF.

        Steps (progress and timings are printed to stdout):
          1. Convert the PDF to sanitized Markdown.
          2. Create the Document row (status 'processing'), extract images
             and run vision analysis on them.
          3. Chunk the Markdown with the markdown-aware chunker.
          4. Enrich chunks with image context, or just sanitize them when the
             document has no images.
          5. Store every chunk as a Chunk row and mark the document 'indexed'.

        Args:
            pdf_path: Path of the PDF to ingest.
            db_session: Session used for all inserts, queries and commits.

        Returns:
            (chunks, document_id): the final chunk dicts and the id of the
            created Document row.

        NOTE(review): there is no exception handling in this method; if any
        step raises after the Document row is committed, that row stays at
        status 'processing'.
        """
        print(f"\n{'='*60}")
        print(f"Processing: {pdf_path}")
        print(f"{'='*60}")
        
        # Step 1: Convert to markdown
        print("\n[Step 1/5] Converting PDF to Markdown...", end=' ')
        start_time = time.time()
        markdown_text = self.pdf_to_markdown(pdf_path)
        elapsed = time.time() - start_time
        print(f"‚úì {elapsed:.2f}s")
        print(f"  ‚Ä¢ Extracted {len(markdown_text):,} characters")
        
        # Create document record (committed here; doc.id is used by the steps below)
        doc = Document(
            filename=os.path.basename(pdf_path),
            status='processing'
        )
        db_session.add(doc)
        db_session.commit()
        
        # Step 2: Extract and analyze images
        print("\n[Step 2/5] Extracting and analyzing images...")
        start_time = time.time()
        
        images = self.extract_images_from_pdf(pdf_path, doc.id)
        
        if images:
            image_ids = self.analyze_images(images, doc.id, db_session)
            
            # Get image data for enrichment: re-read the stored (sanitized) rows by id
            images_data = []
            for img_id in image_ids:
                img_record = db_session.query(Image).filter_by(id=img_id).first()
                if img_record:
                    images_data.append({
                        'image_path': img_record.image_path,
                        'description': img_record.description,
                        'type': img_record.image_type,
                        'ocr_text': img_record.ocr_text
                    })
        else:
            images_data = []
        
        elapsed = time.time() - start_time
        print(f"  ‚úì Completed in {elapsed:.2f}s")
        print(f"  ‚Ä¢ Extracted {len(images)} images")
        if images:
            print(f"  ‚Ä¢ Vision analysis: ‚úì")
        
        # Step 3: Markdown-aware semantic chunking
        print("\n[Step 3/5] Markdown-aware semantic chunking...", end=' ')
        start_time = time.time()
        # Document-level context: file name plus the first 500 characters of the markdown
        doc_context = f"Document: {os.path.basename(pdf_path)}\n\n{markdown_text[:500]}"
        chunks = self.chunker.chunk_markdown(markdown_text, doc_context)
        elapsed = time.time() - start_time
        print(f"‚úì {elapsed:.2f}s")
        print(f"  ‚Ä¢ Created {len(chunks)} semantic chunks")
        
        # Step 4: Enrich chunks with image context (INCLUDING OCR TEXT!)
        print("\n[Step 4/5] Enriching chunks with image context...", end=' ')
        start_time = time.time()
        if images_data:
            chunks = self.enrich_chunks_with_images(chunks, images_data, db_session)
            chunks_with_images = sum(1 for c in chunks if c.get('has_images', False))
            elapsed = time.time() - start_time
            print(f"‚úì {elapsed:.2f}s")
            print(f"  ‚Ä¢ {chunks_with_images} chunks enriched with image context + OCR text")
        else:
            # Still sanitize even if no images (done in place on the chunk dicts)
            for chunk in chunks:
                chunk['text'] = self._sanitize_utf8(chunk['text'])
            elapsed = time.time() - start_time
            print(f"‚úì {elapsed:.2f}s")
            print(f"  ‚Ä¢ No images to enrich")
        
        # Step 5: Save to database
        print("\n[Step 5/5] Saving chunks to database...", end=' ')
        start_time = time.time()
        for idx, chunk in enumerate(chunks):
            # chunk_metadata holds every chunk key that has no dedicated column, as JSON
            chunk_record = Chunk(
                document_id=doc.id,
                chunk_index=idx,
                text=self._sanitize_utf8(chunk['text']),  # Sanitize before saving
                heading_path=self._sanitize_utf8(chunk.get('heading_path', '')),  # Sanitize heading too
                token_count=chunk.get('token_count', 0),
                has_images=chunk.get('has_images', False),
                chunk_metadata=self._sanitize_utf8(json.dumps({
                    k: v for k, v in chunk.items() 
                    if k not in ['text', 'heading_path', 'token_count', 'has_images']
                })) if chunk else ''
            )
            db_session.add(chunk_record)
        
        # Status change and all chunk rows are committed together
        doc.status = 'indexed'
        db_session.commit()
        elapsed = time.time() - start_time
        print(f"‚úì {elapsed:.2f}s")
        
        return chunks, doc.id

In [8]:
# ============================================================================
# JINA COLBERT V2 RETRIEVER (NO RAGATOUILLE!)
# ============================================================================

class JinaColBERTRetriever:
    """Direct implementation of Jina ColBERT v2 (no RAGatouille dependency)"""
    
    def __init__(self, config: RAGConfig):
        """
        Load the embedding model named by `config.embedding_model`.

        The model is placed on `config.device`. No corpus is indexed yet:
        `corpus` and `corpus_embeddings` stay None until `index()` or `load()`
        is called.
        """
        self.config = config

        encoder = SentenceTransformer(
            config.embedding_model,
            trust_remote_code=True,
            device=config.device
        )
        # Cap the model's sequence length at 512 (the original note gives
        # "avoid truncation warnings" as the reason)
        encoder.max_seq_length = 512
        self.model = encoder

        self.corpus_embeddings = None
        self.corpus = None
    
    def index(self, corpus: List[str]) -> None:
        """Index corpus with ColBERT embeddings"""
        self.corpus = corpus
        
        print(f"  Encoding {len(corpus)} documents...")
        
        # Encode corpus (this gives us token-level embeddings)
        # Truncate long sequences to avoid errors
        self.corpus_embeddings = self.model.encode(
            corpus,
            show_progress_bar=True,
            convert_to_tensor=True,
            batch_size=8  # Smaller batch size for stability
        )
        
        # Save to disk
        os.makedirs(self.config.colbert_index_path, exist_ok=True)
        torch.save({
            'embeddings': self.corpus_embeddings,
            'corpus': corpus
        }, os.path.join(self.config.colbert_index_path, 'index.pt'))
    
    def load(self) -> None:
        """Load index from disk"""
        index_file = os.path.join(self.config.colbert_index_path, 'index.pt')
        data = torch.load(index_file, map_location=self.config.device)
        self.corpus_embeddings = data['embeddings']
        self.corpus = data['corpus']
    
    def search(self, query: str, k: int = 10) -> List[Dict]:
        """Search using MaxSim scoring"""
        if not self.corpus or len(self.corpus) == 0:
            return []
        
        # Encode query
        query_embedding = self.model.encode(
            query,
            convert_to_tensor=True
        )
        
        # Compute MaxSim scores
        scores = self._maxsim_score(query_embedding, self.corpus_embeddings)
        
        # Handle single item corpus
        if len(self.corpus) == 1:
            return [{
                'document_id': 0,
                'score': float(scores.item() if scores.dim() == 0 else scores[0]),
                'text': self.corpus[0]
            }]
        
        # Get top-k
        k = min(k, len(scores))
        top_k_indices = torch.topk(scores, k=k).indices
        
        results = []
        for idx in top_k_indices:
            results.append({
                'document_id': int(idx),
                'score': float(scores[idx]),
                'text': self.corpus[idx] if self.corpus else None
            })
        
        return results
    
    def rerank(self, query: str, documents: List[str], k: int = 10) -> List[Dict]:
        """Rerank documents with more accurate scoring"""
        if not documents:
            return []
        
        # Encode query and documents
        query_embedding = self.model.encode(query, convert_to_tensor=True)
        doc_embeddings = self.model.encode(
            documents, 
            convert_to_tensor=True,
            batch_size=8  # Smaller batch size for stability
        )
        
        # Compute MaxSim scores
        scores = self._maxsim_score(query_embedding, doc_embeddings)
        
        # Handle single document
        if len(documents) == 1:
            return [{
                'result_index': 0,
                'score': float(scores.item() if scores.dim() == 0 else scores[0]),
                'rank': 1,
                'text': documents[0]
            }]
        
        # Sort by score
        sorted_indices = torch.argsort(scores, descending=True)
        
        results = []
        for rank, idx in enumerate(sorted_indices[:k]):
            results.append({
                'result_index': int(idx),
                'score': float(scores[idx]),
                'rank': rank + 1,
                'text': documents[idx]
            })
        
        return results
    
    def _maxsim_score(
        self, 
        query_embedding: torch.Tensor, 
        doc_embeddings: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute MaxSim score between query and documents
        
        MaxSim: For each query token, find max similarity with all doc tokens,
        then average across query tokens
        """
        # Ensure proper dimensions
        if query_embedding.dim() == 1:
            query_embedding = query_embedding.unsqueeze(0)
        if doc_embeddings.dim() == 1:
            doc_embeddings = doc_embeddings.unsqueeze(0)
        
        # For 2D embeddings (single vector per doc), compute cosine similarity directly
        if query_embedding.dim() == 2 and doc_embeddings.dim() == 2:
            # Normalize embeddings
            query_norm = torch.nn.functional.normalize(query_embedding, p=2, dim=1)
            doc_norm = torch.nn.functional.normalize(doc_embeddings, p=2, dim=1)
            
            # Compute cosine similarity
            scores = torch.mm(query_norm, doc_norm.t())
            
            # Return as 1D tensor
            return scores.squeeze(0) if scores.size(0) == 1 else scores.squeeze()
        
        # For 3D embeddings (token-level), use mean pooling
        if query_embedding.dim() == 3:
            query_vec = query_embedding.mean(dim=1)
        else:
            query_vec = query_embedding
            
        if doc_embeddings.dim() == 3:
            doc_vec = doc_embeddings.mean(dim=1)
        else:
            doc_vec = doc_embeddings
        
        # Normalize
        query_vec = torch.nn.functional.normalize(query_vec, p=2, dim=-1)
        doc_vec = torch.nn.functional.normalize(doc_vec, p=2, dim=-1)
        
        # Compute cosine similarity
        if query_vec.dim() == 1:
            query_vec = query_vec.unsqueeze(0)
        if doc_vec.dim() == 1:
            doc_vec = doc_vec.unsqueeze(0)
            
        scores = torch.mm(query_vec, doc_vec.t())
        
        # Return as 1D tensor
        return scores.squeeze(0) if scores.size(0) == 1 else scores.squeeze()

In [9]:
# ============================================================================
# DUAL INDEXER (BM25s + Jina ColBERT)
# ============================================================================

class DualIndexer:
    """
    Owns the two retrieval indexes: a BM25s lexical index and the
    Jina ColBERT v2 embedding index.
    """

    def __init__(self, config: RAGConfig):
        """Create the ColBERT retriever; the BM25 index is built or loaded later."""
        self.config = config
        self.bm25_retriever = None
        self.colbert_retriever = JinaColBERTRetriever(config)

    def build_bm25_index(self, corpus: List[str]) -> None:
        """
        Build the BM25s index over `corpus` and save it to `config.bm25_index_path`.

        Texts are tokenized with English stop-word removal and an English stemmer.
        """
        print("\n[BM25s] Building lexical search index...", end=' ')
        started_at = time.time()

        english_stemmer = Stemmer.Stemmer("english")
        tokenized_corpus = bm25s.tokenize(
            corpus, 
            stopwords="en",
            stemmer=english_stemmer
        )

        # The attribute is assigned before indexing, as in the original flow
        self.bm25_retriever = retriever = bm25s.BM25()
        retriever.index(tokenized_corpus)

        index_dir = self.config.bm25_index_path
        os.makedirs(index_dir, exist_ok=True)
        retriever.save(index_dir)

        elapsed = time.time() - started_at
        print(f"‚úì {elapsed:.2f}s")

    def build_colbert_index(self, corpus: List[str]) -> None:
        """Build (and persist) the embedding index by delegating to the ColBERT retriever."""
        print("\n[ColBERT] Building semantic search index...")
        started_at = time.time()

        self.colbert_retriever.index(corpus)

        elapsed = time.time() - started_at
        print(f"  ‚úì {elapsed:.2f}s")

    def load_indexes(self) -> None:
        """Load both indexes from the locations given in the config."""
        self.bm25_retriever = bm25s.BM25.load(self.config.bm25_index_path)
        self.colbert_retriever.load()

In [10]:
# ============================================================================
# HYBRID RETRIEVER WITH RRF AND RERANKING
# ============================================================================

class HybridRetriever:
    """Three-stage retrieval: BM25s + ColBERT + ColBERT Reranking"""
    
    def __init__(self, config: RAGConfig, indexer: DualIndexer, db_session, corpus_to_chunk_id: List[int] = None):
        """
        Wire the retriever to its indexes and to the chunk database.

        Args:
            config: Retrieval settings (top-k values).
            indexer: Holder of the BM25 and ColBERT indexes.
            db_session: Session used to fetch chunk rows.
            corpus_to_chunk_id: For each corpus position, the database id of
                the chunk stored there. None (or an empty list) results in an
                empty mapping.
        """
        self.config = config
        self.indexer = indexer
        self.db_session = db_session

        # Query tokens are stemmed with an English stemmer in _bm25_search
        self.stemmer = Stemmer.Stemmer("english")

        # CRITICAL: Mapping from corpus index to database chunk ID
        self.corpus_to_chunk_id = corpus_to_chunk_id if corpus_to_chunk_id else []
    
    def retrieve(self, query: str, top_k_final: int = None) -> List[Dict]:
        """Three-stage hybrid retrieval with detailed scoring"""
        if top_k_final is None:
            top_k_final = self.config.final_top_k
        
        print(f"\nüîç Retrieving relevant chunks...")
        
        # Get corpus size to adjust k values
        corpus_size = len(self.indexer.colbert_retriever.corpus) if self.indexer.colbert_retriever.corpus else 0
        
        # Adjust k values based on corpus size
        bm25_k = min(self.config.bm25_top_k, corpus_size) if corpus_size > 0 else self.config.bm25_top_k
        colbert_k = min(self.config.colbert_top_k, corpus_size) if corpus_size > 0 else self.config.colbert_top_k
        
        print(f"   ‚Ä¢ Corpus size: {corpus_size}, using k={bm25_k} for retrieval")
        
        # Stage 1: BM25s
        start = time.time()
        bm25_results = self._bm25_search(query, k=bm25_k)
        bm25_time = time.time() - start
        print(f"   ‚Ä¢ BM25s: {bm25_time:.3f}s ({len(bm25_results)} results)")
        
        # Stage 2: ColBERT
        start = time.time()
        colbert_results = self._colbert_search(query, k=colbert_k)
        colbert_time = time.time() - start
        print(f"   ‚Ä¢ ColBERT: {colbert_time:.3f}s ({len(colbert_results)} results)")
        
        # Fusion
        start = time.time()
        fused_results = self._reciprocal_rank_fusion(bm25_results, colbert_results)
        candidates = fused_results[:min(50, len(fused_results))]
        fusion_time = time.time() - start
        print(f"   ‚Ä¢ Fusion: {fusion_time:.3f}s ({len(candidates)} candidates)")
        
        # Fetch chunks - USING THE MAPPING!
        start = time.time()
        candidate_corpus_indices = [r['corpus_index'] for r in candidates]
        candidate_chunks = self._fetch_chunks_from_db(candidate_corpus_indices)
        
        # PRESERVE INTERMEDIATE SCORES
        # Map corpus_index to intermediate scores
        score_map = {}
        for bm25_result in bm25_results:
            idx = bm25_result['corpus_index']
            if idx not in score_map:
                score_map[idx] = {}
            score_map[idx]['bm25_score'] = bm25_result['score']
        
        for colbert_result in colbert_results:
            idx = colbert_result['corpus_index']
            if idx not in score_map:
                score_map[idx] = {}
            score_map[idx]['colbert_score'] = colbert_result['score']
        
        for fused_result in candidates:
            idx = fused_result['corpus_index']
            if idx in score_map:
                score_map[idx]['rrf_score'] = fused_result['rrf_score']
        
        # Add intermediate scores to chunks
        for i, chunk in enumerate(candidate_chunks):
            corpus_idx = candidate_corpus_indices[i]
            if corpus_idx in score_map:
                chunk['intermediate_scores'] = score_map[corpus_idx]
        
        fetch_time = time.time() - start
        print(f"   ‚Ä¢ Fetch: {fetch_time:.3f}s ({len(candidate_chunks)} chunks)")
        
        # Stage 3: Rerank
        start = time.time()
        final_k = min(top_k_final, len(candidate_chunks))
        reranked_results = self._colbert_rerank(query, candidate_chunks, top_k=final_k)
        rerank_time = time.time() - start
        print(f"   ‚Ä¢ Rerank: {rerank_time:.3f}s (top {len(reranked_results)})")
        
        total_time = bm25_time + colbert_time + fusion_time + fetch_time + rerank_time
        print(f"   ‚úì Total retrieval: {total_time:.3f}s")
        
        return reranked_results
    
    def _bm25_search(self, query: str, k: int) -> List[Dict]:
        """
        Stage 1: BM25s lexical search.

        The query is tokenized like the corpus (English stop words removed,
        stemmed with `self.stemmer`) and the `k` best hits are requested from
        the BM25 index.

        Returns:
            Dicts with 'corpus_index', 'score' and 'source' ('bm25'), in the
            order the index returned them.
        """
        query_tokens = bm25s.tokenize(
            query, 
            stopwords="en",
            stemmer=self.stemmer
        )

        results, scores = self.indexer.bm25_retriever.retrieve(query_tokens, k=k)

        # Only one query is submitted, so only the first result row is read
        hit_ids = results[0]
        hits = []
        for position in range(len(hit_ids)):
            hits.append({
                'corpus_index': int(hit_ids[position]),
                'score': float(scores[0][position]),
                'source': 'bm25'
            })
        return hits
    
    def _colbert_search(self, query: str, k: int) -> List[Dict]:
        """Stage 2: ColBERT semantic search"""
        results = self.indexer.colbert_retriever.search(query=query, k=k)
        return [
            {'corpus_index': r['document_id'], 'score': r['score'], 'source': 'colbert'}
            for r in results
        ]
    
    def _reciprocal_rank_fusion(
        self, 
        bm25_results: List[Dict], 
        colbert_results: List[Dict],
        k: int = 60
    ) -> List[Dict]:
        """RRF fusion"""
        scores = {}
        
        for rank, result in enumerate(bm25_results, 1):
            corpus_idx = result['corpus_index']
            scores[corpus_idx] = scores.get(corpus_idx, 0) + (1 / (k + rank))
        
        for rank, result in enumerate(colbert_results, 1):
            corpus_idx = result['corpus_index']
            scores[corpus_idx] = scores.get(corpus_idx, 0) + (1 / (k + rank))
        
        sorted_results = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [{'corpus_index': idx, 'rrf_score': score} for idx, score in sorted_results]
    
    def _fetch_chunks_from_db(self, corpus_indices: List[int]) -> List[Dict]:
        """Fetch chunks from database using corpus index -> chunk ID mapping"""
        chunks = []
        
        for corpus_idx in corpus_indices:
            # Convert corpus index to database chunk ID
            if corpus_idx < len(self.corpus_to_chunk_id):
                chunk_id = self.corpus_to_chunk_id[corpus_idx]
                
                # Fetch from database using the actual chunk ID
                chunk = self.db_session.query(Chunk).filter_by(id=chunk_id).first()
                if chunk:
                    chunks.append({
                        'chunk_id': chunk.id,
                        'text': chunk.text,
                        'document_id': chunk.document_id,
                        'heading_path': chunk.heading_path,
                        'has_images': chunk.has_images,
                        'metadata': json.loads(chunk.chunk_metadata) if chunk.chunk_metadata else {}
                    })
                else:
                    print(f"  ‚ö†Ô∏è Chunk ID {chunk_id} not found in database")
            else:
                print(f"  ‚ö†Ô∏è Corpus index {corpus_idx} out of range (max: {len(self.corpus_to_chunk_id)-1})")
        
        return chunks
    
    def _colbert_rerank(self, query: str, chunks: List[Dict], top_k: int) -> List[Dict]:
        """Stage 3: ColBERT reranking with score preservation"""
        if not chunks:
            return []
        
        documents = [chunk['text'] for chunk in chunks]
        reranked_results = self.indexer.colbert_retriever.rerank(query=query, documents=documents, k=top_k)
        
        final_results = []
        for result in reranked_results:
            original_chunk = chunks[result['result_index']]
            intermediate_scores = original_chunk.get('intermediate_scores', {})
            
            final_results.append({
                'chunk_id': original_chunk['chunk_id'],
                'text': original_chunk['text'],
                'document_id': original_chunk['document_id'],
                'heading_path': original_chunk.get('heading_path', ''),
                'has_images': original_chunk.get('has_images', False),
                'metadata': original_chunk['metadata'],
                'score': result['score'],  # Final ColBERT rerank score (cosine similarity)
                'rank': result['rank'],
                'bm25_score': intermediate_scores.get('bm25_score', 0.0),
                'colbert_score': intermediate_scores.get('colbert_score', 0.0),
                'rrf_score': intermediate_scores.get('rrf_score', 0.0)
            })
        return final_results

In [11]:
# ============================================================================
# RAG CHATBOT WITH ADAPTIVE CHUNKING
# ============================================================================

class RAGChatbot:
    """Complete RAG chatbot with adaptive chunk selection.

    For each query the bot decides how many chunks to request from the
    retriever, packs as many of them as fit into ``config.max_context_chars``,
    sends the conversation plus that context to the Ollama client, and keeps
    a running conversation history.
    """

    # Words/phrases that signal the user wants a broad, multi-part answer.
    _COMPREHENSIVE_KEYWORDS = (
        'all', 'list', 'different', 'various', 'types of', 'kinds of',
        'compare', 'contrast', 'difference', 'similarities',
        'explain', 'describe in detail', 'comprehensive',
        'multiple', 'several', 'many',
    )

    # Whole-word match with an optional simple inflection suffix, so that
    # "lists"/"explained"/"differences" still count while 'all' no longer
    # fires on "install"/"call" and 'many' no longer fires on "Germany".
    _COMPREHENSIVE_PATTERN = re.compile(
        r"\b(?:"
        + "|".join(re.escape(kw) for kw in _COMPREHENSIVE_KEYWORDS)
        + r")(?:s|es|d|ed|ing)?\b"
    )

    def __init__(self, config: RAGConfig, retriever: HybridRetriever,
                 ollama_client: OllamaClient, debug_mode: bool = True):
        """
        Args:
            config: Supplies final_top_k_max, final_top_k_default and
                max_context_chars.
            retriever: Object whose ``retrieve(query, top_k_final=...)``
                returns the ranked chunk dicts.
            ollama_client: Object whose ``chat(messages=, context=, stream=)``
                produces the answer.
            debug_mode: When True (the default, matching previous behaviour),
                a preview of the context sent to the LLM is printed.
        """
        self.config = config
        self.retriever = retriever
        self.ollama = ollama_client
        self.conversation_history = []
        self.debug_mode = debug_mode

    def _determine_top_k(self, query: str) -> int:
        """
        Determine the number of chunks to retrieve based on query wording.

        Queries containing a "comprehensive" keyword (e.g. "list all ...",
        "compare ...") get ``config.final_top_k_max``; everything else gets
        ``config.final_top_k_default``. Keywords are matched as whole words
        (case-insensitive), not as arbitrary substrings.
        """
        needs_comprehensive = self._COMPREHENSIVE_PATTERN.search(query.lower()) is not None

        if needs_comprehensive:
            return self.config.final_top_k_max
        return self.config.final_top_k_default

    def chat(self, query: str, stream: bool = True, top_k: Optional[int] = None) -> Dict:
        """
        Process user query with adaptive chunk selection.

        Args:
            query: User's question
            stream: Enable streaming response
            top_k: Override automatic top_k selection (optional)

        Returns:
            Dict with 'response', 'sources' (formatted chunks that were sent
            to the LLM), 'retrieved_chunks', 'used_chunks', 'context_length'
            and 'top_k'.

        Raises:
            Whatever the retriever or the Ollama client raises. If the LLM
            call fails, the pending user turn is removed from the history
            before the exception propagates.
        """
        # Determine optimal number of chunks
        if top_k is None:
            top_k = self._determine_top_k(query)

        print(f"\nüí° Query complexity analysis: Using {top_k} chunks")

        # Retrieve relevant chunks (use top_k_final parameter name)
        retrieved_chunks = self.retriever.retrieve(query, top_k_final=top_k)

        # Build context with smart truncation; the chunks used are always a
        # prefix of the retrieved list because packing stops at the first overflow.
        context, actual_chunks_used = self._build_context_adaptive(retrieved_chunks)
        used_chunks = retrieved_chunks[:actual_chunks_used]

        # DEBUG: Show what's being sent to LLM
        if self.debug_mode:
            print(f"\n{'='*60}")
            print("üêõ DEBUG: Context being sent to LLM")
            print(f"{'='*60}")
            print(f"Context length: {len(context)} characters")
            print(f"Chunks retrieved: {len(retrieved_chunks)}")
            print(f"Chunks actually used: {actual_chunks_used}")
            print(f"\nFirst 800 characters of context:")
            print(context[:800])
            # Only claim truncation when the preview really was cut short.
            if len(context) > 800:
                print(f"\n... [truncated, full context is {len(context)} chars]")
            print(f"{'='*60}\n")

        # Clean display header
        print(f"\n{'='*70}")
        print(f"üí¨ Question: {query}")
        print(f"{'='*70}")
        print(f"\nü§ñ Answer:\n")

        start_time = time.time()

        self.conversation_history.append({
            'role': 'user',
            'content': query
        })

        # Generate response with streaming
        try:
            response = self.ollama.chat(
                messages=self.conversation_history,
                context=context,
                stream=stream
            )
        except BaseException:
            # Covers errors and Ctrl-C mid-stream: drop the unanswered user
            # turn so the history never holds two consecutive user messages.
            self.conversation_history.pop()
            raise

        elapsed = time.time() - start_time

        # Clean footer with metadata
        print(f"\n{'‚îÄ'*70}")
        print(f"‚è±Ô∏è  {elapsed:.1f}s | üìö {actual_chunks_used} chunks | üìù {len(context)} chars | üéØ top_k={top_k}")
        print(f"{'‚îÄ'*70}")

        self.conversation_history.append({
            'role': 'assistant',
            'content': response
        })

        # Display source information
        print(f"\nüìñ Sources Used:")
        print(f"{'‚îÄ'*70}")
        for i, chunk in enumerate(used_chunks):
            heading = chunk.get('heading_path', 'No heading')
            score = chunk.get('score', 0.0)
            char_count = len(chunk.get('text', ''))
            print(f"  {i+1}. [{score:.4f}] {heading[:50]}... ({char_count} chars)")

        if len(retrieved_chunks) > actual_chunks_used:
            print(f"\n  ‚ö†Ô∏è  Note: {len(retrieved_chunks) - actual_chunks_used} additional chunks retrieved but not sent to LLM")
            print(f"     (exceeded max_context_chars limit of {self.config.max_context_chars})")

        print(f"{'‚îÄ'*70}\n")

        return {
            'response': response,
            'sources': self._format_sources(used_chunks),
            'retrieved_chunks': len(retrieved_chunks),
            'used_chunks': actual_chunks_used,
            'context_length': len(context),
            'top_k': top_k
        }

    def _build_context_adaptive(self, chunks: List[Dict]) -> Tuple[str, int]:
        """
        Build context with adaptive truncation to stay within max_context_chars.

        Chunks are added in order until the next one would push the joined
        context past ``config.max_context_chars``; packing stops there, so
        the chunks used are always a prefix of ``chunks``. Chunk texts longer
        than 1000 characters are cut to 800 characters plus "..." (with a
        printed warning).

        Returns:
            (context_string, number_of_chunks_used)
        """
        context_parts = []
        total_chars = 0
        chunks_used = 0

        for i, chunk in enumerate(chunks, 1):
            chunk_text = chunk['text']

            # Sanity check for old chunks
            if len(chunk_text) > 1000:
                print(f"‚ö†Ô∏è  Warning: Source {i} is {len(chunk_text)} chars (expected max 800)")
                print(f"   This suggests you need to re-index with the new chunker!")
                chunk_text = chunk_text[:800] + "..."

            source_header = f"=== SOURCE {i} ==="
            source_footer = f"=== END SOURCE {i} ==="
            heading = chunk.get('heading_path', '')

            if heading:
                chunk_formatted = f"{source_header}\nSection: {heading}\n\n{chunk_text}\n{source_footer}"
            else:
                chunk_formatted = f"{source_header}\n{chunk_text}\n{source_footer}"

            # The "\n\n" separator only exists between sources, so the first
            # source carries no separator cost. This keeps total_chars equal
            # to the length of the final joined string.
            separator_cost = 2 if context_parts else 0
            chunk_size = len(chunk_formatted) + separator_cost

            # Check if adding this chunk would exceed max context
            if total_chars + chunk_size > self.config.max_context_chars:
                print(f"\n‚ö†Ô∏è  Stopping at {chunks_used} chunks (would exceed {self.config.max_context_chars} char limit)")
                break

            context_parts.append(chunk_formatted)
            total_chars += chunk_size
            chunks_used += 1

        return "\n\n".join(context_parts), chunks_used

    def _format_sources(self, chunks: List[Dict]) -> List[Dict]:
        """Format source citations with full text, image paths, and ALL scores.

        Each chunk must provide 'chunk_id', 'document_id', 'score' and 'text';
        the remaining fields fall back to defaults. 'image_paths' is added
        only for chunks flagged 'has_images' that carry non-empty 'metadata'.
        """
        sources = []

        for i, chunk in enumerate(chunks):
            text = chunk['text']
            source = {
                'source_id': i + 1,
                'chunk_id': chunk['chunk_id'],
                'document_id': chunk['document_id'],
                'heading': chunk.get('heading_path', ''),
                'score': chunk['score'],  # Final ColBERT rerank score
                'bm25_score': chunk.get('bm25_score', 0.0),
                'colbert_score': chunk.get('colbert_score', 0.0),
                'rrf_score': chunk.get('rrf_score', 0.0),
                'has_images': chunk.get('has_images', False),
                'text': text,  # Include full text
                'preview': text[:200] + "..." if len(text) > 200 else text
            }

            # Add image paths if available
            if chunk.get('has_images') and chunk.get('metadata'):
                source['image_paths'] = chunk['metadata'].get('image_paths', [])

            sources.append(source)

        return sources

    def clear_history(self):
        """Clear conversation history"""
        self.conversation_history = []
        print("üóëÔ∏è  Conversation history cleared")

In [12]:
class RAGApplication:
    """Main application orchestrator.

    Owns the SQLite session, the Ollama client, the document processor and
    the indexer, and wires them into a HybridRetriever / RAGChatbot once
    indexes are available.
    """

    def __init__(self, config: RAGConfig):
        """Create the database schema/session and the processing components.

        The retriever and chatbot stay ``None`` until initialize_chatbot()
        succeeds.
        """
        self.config = config

        # Database setup
        db_url = f"sqlite:///{config.db_path}"
        self.engine = create_engine(db_url)
        Base.metadata.create_all(self.engine)
        Session = sessionmaker(bind=self.engine)
        self.db_session = Session()

        # Initialize Ollama client
        self.ollama = OllamaClient(config)

        # Initialize components
        self.processor = DocumentProcessor(config, self.ollama)
        self.indexer = DualIndexer(config)
        self.retriever = None
        self.chatbot = None

        # CRITICAL: Store mapping between corpus index and chunk IDs
        self.corpus_to_chunk_id = []  # Maps corpus index -> database chunk ID

    def _mapping_path(self) -> str:
        """Return the on-disk location of the corpus-index -> chunk-ID mapping."""
        return os.path.join(self.config.base_dir, "indexes", "corpus_mapping.pkl")

    def check_ollama(self) -> bool:
        """Check if Ollama is running.

        Returns True only when ``GET {ollama_url}/api/tags`` answers with
        HTTP 200 within 5 seconds; any requests-level failure yields False.
        """
        try:
            response = requests.get(f"{self.config.ollama_url}/api/tags", timeout=5)
        except requests.RequestException:
            # Only network/HTTP-level failures mean "not running"; a bare
            # except here would also swallow KeyboardInterrupt/SystemExit.
            return False
        return response.status_code == 200

    def index_documents(self, pdf_paths: List[str]) -> None:
        """Index PDF documents.

        Processes every PDF through the document processor, rebuilds the
        BM25 and ColBERT indexes over ALL chunks currently in the database
        (ordered by chunk ID), and pickles the corpus-index -> chunk-ID
        mapping next to the indexes. Does nothing but print a hint when
        Ollama is unreachable.
        """

        if not self.check_ollama():
            print("‚ùå Ollama is not running!")
            print("Please start Ollama: ollama serve")
            return

        for pdf_path in pdf_paths:
            # The returned (chunks, doc_id) are not needed here: the corpus
            # below is rebuilt from the database.
            self.processor.process_document(pdf_path, self.db_session)

        print(f"\n{'='*60}")
        print("Building Indexes")
        print(f"{'='*60}")

        # Build corpus and mapping
        # CRITICAL FIX: Store the mapping between corpus index and database chunk IDs
        all_db_chunks = self.db_session.query(Chunk).order_by(Chunk.id).all()
        corpus = [chunk.text for chunk in all_db_chunks]
        self.corpus_to_chunk_id = [chunk.id for chunk in all_db_chunks]

        print(f"  ‚Ä¢ Corpus: {len(corpus)} chunks")
        print(f"  ‚Ä¢ Chunk ID mapping: {len(self.corpus_to_chunk_id)} entries")

        # Build indexes
        self.indexer.build_bm25_index(corpus)
        self.indexer.build_colbert_index(corpus)

        # Save the mapping to disk for later use
        import pickle
        mapping_path = self._mapping_path()
        os.makedirs(os.path.dirname(mapping_path), exist_ok=True)
        with open(mapping_path, 'wb') as f:
            pickle.dump(self.corpus_to_chunk_id, f)

        print(f"\n‚úÖ Document indexed successfully!")

    def initialize_chatbot(self) -> None:
        """Initialize chatbot with existing indexes.

        Returns without creating the chatbot (``self.chatbot`` keeps its
        previous value) when Ollama is unreachable. A missing mapping file
        is tolerated with a warning and an empty mapping.
        """

        if not self.check_ollama():
            print("‚ùå Ollama is not running!")
            print("Please start Ollama: ollama serve")
            return

        print("Loading indexes...")
        self.indexer.load_indexes()

        # Load the corpus-to-chunk-id mapping
        import pickle
        try:
            with open(self._mapping_path(), 'rb') as f:
                # SECURITY NOTE: pickle.load executes arbitrary code from the
                # file. Acceptable only because this file is written by
                # index_documents() itself; never point it at untrusted data.
                self.corpus_to_chunk_id = pickle.load(f)
            print(f"  ‚Ä¢ Loaded {len(self.corpus_to_chunk_id)} chunk ID mappings")
        except FileNotFoundError:
            print("  ‚ö†Ô∏è  Warning: No corpus mapping found. Please re-index your documents.")
            self.corpus_to_chunk_id = []

        self.retriever = HybridRetriever(self.config, self.indexer, self.db_session, self.corpus_to_chunk_id)
        self.chatbot = RAGChatbot(self.config, self.retriever, self.ollama)

        print("‚úÖ Chatbot initialized and ready!")

    def chat(self, query: str) -> Dict:
        """Chat interface.

        Raises:
            RuntimeError: if initialize_chatbot() has not succeeded yet.
        """
        if not self.chatbot:
            raise RuntimeError("Chatbot not initialized. Call initialize_chatbot() first.")

        return self.chatbot.chat(query)

    def _filter_relevant_images(self, query: str, image_paths: List[str], chunk_text: str,
                                min_overlap: int = 3) -> List[str]:
        """Filter images to only those DIRECTLY relevant to the user's query.

        An image is kept when at least ``min_overlap`` distinct meaningful
        words (longer than 2 characters, not stop words) occur both in the
        query and in the image's stored description, type and OCR text.
        Words are extracted with a regex, so trailing punctuation
        ("timesheet?") no longer prevents a match.

        Args:
            query: The user's question.
            image_paths: Candidate image paths attached to a chunk.
            chunk_text: Unused; kept for call compatibility.
            min_overlap: Minimum number of shared meaningful words (default 3).
                NOTE(review): a query with fewer meaningful words than this
                can never match any image - confirm that is intended.

        Returns:
            The subset of ``image_paths`` that passed, in input order.
        """
        if not image_paths:
            return []

        # Extract meaningful query keywords (remove stop words)
        stop_words = {'what', 'is', 'are', 'the', 'a', 'an', 'how', 'why', 'when', 'where',
                      'can', 'could', 'would', 'should', 'do', 'does', 'did', 'of', 'in', 'on',
                      'for', 'to', 'with', 'by', 'from', 'at', 'about', 'as', 'into', 'through',
                      'diagram', 'chart', 'figure', 'image', 'screenshot', 'show', 'me', 'please'}

        def meaningful_words(text: str) -> set:
            # \w+ strips punctuation that str.split() would leave glued to words.
            return {w for w in re.findall(r"\w+", text.lower())
                    if w not in stop_words and len(w) > 2}

        query_set = meaningful_words(query)
        if not query_set:
            return []  # No meaningful query words, don't show images

        relevant_images = []

        # Get image metadata from database
        for img_path in image_paths:
            # Extract just the filename for DB lookup
            img_filename = os.path.basename(img_path)

            # Suffix match on the stored path. autoescape=True makes '_' and
            # '%' in the filename literal instead of LIKE wildcards.
            img_record = self.db_session.query(Image).filter(
                Image.image_path.endswith(img_filename, autoescape=True)
            ).first()

            if not img_record:
                continue

            # Combine all image metadata into one searchable text
            image_text = " ".join([
                img_record.description or "",
                img_record.image_type or "",
                img_record.ocr_text or "",
            ])

            # STRICT CRITERIA: require enough shared meaningful words that the
            # image is actually about what the user asked.
            overlap = query_set & meaningful_words(image_text)
            if len(overlap) >= min_overlap:
                relevant_images.append(img_path)

        return relevant_images

    def _display_chunk_with_images(self, chunk_text: str, image_paths: Optional[List[str]] = None) -> None:
        """Display chunk text and associated images.

        Prints ``chunk_text`` when non-empty, then renders each existing
        image inline via IPython (width 400); missing or undisplayable
        files are reported instead of raising.
        """
        from IPython.display import display, Image as IPImage

        # Display chunk text
        if chunk_text:
            print(f"{chunk_text}\n")

        # Display images if available
        if image_paths:
            print(f"  üì∑ Relevant Images ({len(image_paths)}):")
            for img_path in image_paths:
                if os.path.exists(img_path):
                    try:
                        display(IPImage(filename=img_path, width=400))
                        print(f"  ‚îî‚îÄ {os.path.basename(img_path)}\n")
                    except Exception as e:
                        print(f"  ‚îî‚îÄ ‚ö†Ô∏è Could not display {os.path.basename(img_path)}: {e}\n")
                else:
                    print(f"  ‚îî‚îÄ ‚ö†Ô∏è Image not found: {os.path.basename(img_path)}\n")

    def _print_source(self, idx: int, src: Dict, query: str) -> None:
        """Print one retrieved source: scores, heading, text preview, relevant images."""
        print(f"‚îå‚îÄ Chunk {idx} {'‚îÄ'*50}")

        # Show ALL retrieval scores
        print(f"‚îÇ üéØ Final Score (ColBERT Rerank): {src['score']:.4f}")
        print(f"‚îÇ üìà Intermediate Scores:")
        print(f"‚îÇ    ‚Ä¢ BM25 (lexical):      {src.get('bm25_score', 0.0):.4f}")
        print(f"‚îÇ    ‚Ä¢ ColBERT (semantic):  {src.get('colbert_score', 0.0):.4f}")
        print(f"‚îÇ    ‚Ä¢ RRF (fusion):        {src.get('rrf_score', 0.0):.4f}")

        if src['heading']:
            print(f"‚îÇ üìç Section: {src['heading']}")

        if src['has_images']:
            print(f"‚îÇ üñºÔ∏è  Contains Images: Yes")

        print(f"‚îÇ")
        print(f"‚îÇ üìÑ Text:")

        # Display chunk text (show first 300 chars as preview)
        chunk_text = src.get('text', src.get('preview', ''))

        if len(chunk_text) > 300:
            print(f"‚îÇ {chunk_text[:300]}...")
            print(f"‚îÇ [Truncated - {len(chunk_text)} total characters]")
        else:
            print(f"‚îÇ {chunk_text}")

        # Filter and display only STRICTLY RELEVANT images
        if src['has_images'] and src.get('image_paths'):
            relevant_images = self._filter_relevant_images(
                query,
                src['image_paths'],
                chunk_text
            )

            if relevant_images:
                print(f"‚îÇ")
                print(f"‚îÇ [Showing {len(relevant_images)}/{len(src['image_paths'])} images matching your query]")
                self._display_chunk_with_images("", relevant_images)
            else:
                print(f"‚îÇ")
                print(f"‚îÇ [This chunk has images, but none directly match your specific query]")

        print(f"‚îî{'‚îÄ'*60}\n")

    def interactive_chat(self) -> None:
        """Interactive chat loop.

        Reads questions from stdin until 'exit'/'quit', Ctrl-C, or end of
        input. 'clear' resets the conversation history. Returns immediately
        with a message when the chatbot has not been initialized. Errors
        raised while answering a question are printed and the loop continues.
        """
        if not self.chatbot:
            # Without this guard every turn would just raise from self.chat().
            print("‚ùå Chatbot not initialized. Call initialize_chatbot() first.")
            return

        print("\n" + "="*60)
        print("RAG Chatbot - Interactive Mode")
        print("="*60)
        print("Type your questions (or 'exit' to quit, 'clear' to clear history)\n")

        while True:
            try:
                user_input = input("You: ").strip()

                if not user_input:
                    continue

                command = user_input.lower()

                if command in ['exit', 'quit']:
                    print("\nGoodbye! üëã")
                    break

                if command == 'clear':
                    self.chatbot.clear_history()
                    continue

                result = self.chat(user_input)
                # NOTE(review): if the Ollama client already prints the answer
                # while streaming, this line shows it a second time - confirm.
                print(f"\nAssistant: {result['response']}\n")

                # Show retrieved chunks with ALL SCORES
                if result['sources']:
                    print(f"\n{'='*60}")
                    print(f"üìä Retrieved Chunks with Similarity Scores ({len(result['sources'])})")
                    print(f"{'='*60}\n")

                    for idx, src in enumerate(result['sources'], 1):
                        self._print_source(idx, src, user_input)

                    print()

            except (KeyboardInterrupt, EOFError):
                # EOFError (stdin closed) must end the session: caught by the
                # generic handler below it would make input() fail forever.
                print("\n\nGoodbye! üëã")
                break
            except Exception as e:
                print(f"\n‚ùå Error: {e}\n")
                import traceback
                traceback.print_exc()

    def print_stats(self) -> None:
        """Print database statistics (row counts of documents, chunks and images)."""
        doc_count = self.db_session.query(Document).count()
        chunk_count = self.db_session.query(Chunk).count()
        image_count = self.db_session.query(Image).count()

        print(f"\nüìä Database Statistics:")
        print(f"   ‚Ä¢ Documents: {doc_count}")
        print(f"   ‚Ä¢ Chunks: {chunk_count}")
        print(f"   ‚Ä¢ Images: {image_count}")

In [13]:
def _build_context(self, chunks: List[Dict]) -> str:
    """
    Build context from retrieved chunks.
    
    No truncation needed since chunks are now properly sized (600-800 chars)
    at indexing time by the improved MarkdownSemanticChunker.
    """
    context_parts = []
    
    for i, chunk in enumerate(chunks, 1):
        chunk_text = chunk['text']
        
        # Sanity check: warn if chunk is unexpectedly large (shouldn't happen with new chunker)
        if len(chunk_text) > 1000:
            print(f"‚ö†Ô∏è  Warning: Source {i} is {len(chunk_text)} chars (expected max 800)")
            print(f"   This suggests you need to re-index with the new chunker!")
            # Truncate as fallback for old chunks
            chunk_text = chunk_text[:800] + "..."
        
        # Clear source boundaries help model understand context
        source_header = f"=== SOURCE {i} ==="
        source_footer = f"=== END SOURCE {i} ==="
        
        # Include heading path for better context
        heading = chunk.get('heading_path', '')
        if heading:
            context_parts.append(f"{source_header}\nSection: {heading}\n\n{chunk_text}\n{source_footer}")
        else:
            context_parts.append(f"{source_header}\n{chunk_text}\n{source_footer}")
    
    return "\n\n".join(context_parts)

In [None]:
# Driver cell: build the application and run a small text menu.
#
# Only the chat model is overridden here; every other setting comes from the
# RAGConfig defaults. Author's notes on chat-model choices:
#   - gemma3:4b   : faster BUT prone to hallucinations
#   - llama3.2:3b : faster and streaming - recommended for 16GB RAM Mac Mini M4 -> CURRENT
#   - gpt-oss:20b : available but VERY slow; use only if you need maximum quality
config = RAGConfig(chat_model='llama3.2:3b')
app = RAGApplication(config)

# Check Ollama
if not app.check_ollama():
    print("‚ùå Ollama is not running!")
    print("\nTo start Ollama:")
    print("  1. Open a terminal")
    print("  2. Run: ollama serve")
    print("  3. Keep that terminal open")
    print("\nThen run this cell again.")
else:
    # Simple menu with proper exit handling
    exit_program = False

    while not exit_program:
        print("\n" + "="*50)
        print("RAG Chatbot - Choose an option:")
        print("1. Upload and index a PDF")
        print("2. Start interactive chat")
        print("3. Show database statistics")
        print("4. Exit")

        choice = input("\nEnter your choice (1-4): ").strip()

        if choice == '1':
            file_path = input("Enter the path to your PDF file: ").strip()
            if os.path.exists(file_path):
                app.index_documents([file_path])
            else:
                print(f"Error: File not found at {file_path}")

        elif choice == '2':
            app.initialize_chatbot()
            # initialize_chatbot() returns early, leaving app.chatbot unset,
            # when Ollama has become unreachable; starting the chat loop then
            # would only produce an error on every question.
            if app.chatbot is not None:
                app.interactive_chat()
                # Back to main menu after chat exits
                print("\n[Returned to main menu]")
            else:
                print("Chatbot could not be initialized; returning to main menu.")

        elif choice == '3':
            app.print_stats()

        elif choice == '4':
            print("\n" + "="*50)
            print("Goodbye! üëã")
            print("="*50)
            exit_program = True

        else:
            print("Invalid choice. Please enter a number between 1-4.")

    print("\n‚úÖ Program exited successfully.")

No sentence-transformers model found with name jinaai/jina-colbert-v2. Creating a new one with mean pooling.
`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!



RAG Chatbot - Choose an option:
1. Upload and index a PDF
2. Start interactive chat
3. Show database statistics
4. Exit

Processing: /Users/airees/Python/hybrid-rag-ColBERTv2/PDFs/MPO Timesheet FAQ.pdf

[Step 1/5] Converting PDF to Markdown... ‚úì 0.75s
  ‚Ä¢ Extracted 15,526 characters

[Step 2/5] Extracting and analyzing images...
    Analyzing image 1 on page 4... ‚úì (6.7s)
    Analyzing image 2 on page 4... ‚úì (5.1s)
    Analyzing image 3 on page 5... ‚úì (6.3s)
    Analyzing image 4 on page 5... ‚úì (4.7s)
    Analyzing image 5 on page 5... ‚úì (4.8s)
    Analyzing image 6 on page 6... ‚úì (4.3s)
    Analyzing image 7 on page 6... ‚úì (5.2s)
    Analyzing image 8 on page 7... ‚úì (4.9s)
    Analyzing image 9 on page 7... ‚úì (5.2s)
    Analyzing image 10 on page 7... ‚úì (5.0s)
    Analyzing image 11 on page 8... ‚úì (5.7s)
    Analyzing image 12 on page 8... ‚úì (4.7s)
    Analyzing image 13 on page 8... ‚úì (5.8s)
  ‚úì Completed in 68.44s
  ‚Ä¢ Extracted 13 images
  ‚Ä¢ Visi

Split strings:   0%|          | 0/28 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/28 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/28 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/28 [00:00<?, ?it/s]

‚úì 0.06s

[ColBERT] Building semantic search index...
  Encoding 28 documents...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

  ‚úì 1.93s

‚úÖ Document indexed successfully!

RAG Chatbot - Choose an option:
1. Upload and index a PDF
2. Start interactive chat
3. Show database statistics
4. Exit
Loading indexes...
  ‚Ä¢ Loaded 28 chunk ID mappings
‚úÖ Chatbot initialized and ready!

RAG Chatbot - Interactive Mode
Type your questions (or 'exit' to quit, 'clear' to clear history)


üí° Query complexity analysis: Using 7 chunks

üîç Retrieving relevant chunks...
   ‚Ä¢ Corpus size: 28, using k=28 for retrieval


Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.039s (28 results)
   ‚Ä¢ ColBERT: 0.118s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.004s (28 chunks)
   ‚Ä¢ Rerank: 1.356s (top 7)
   ‚úì Total retrieval: 1.517s

üêõ DEBUG: Context being sent to LLM
Context length: 5110 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A5. All timesheets are due for submission every Friday by 5:00pm.
Please Note: The only exception is if you are on a support team and work weekends or have been involved
with a weekend rollout, then you have until Mondays no later than 10:00am to submit your timesheet for
approval.

A6. Yes, it is recommended that all users going on vacation submit timesheets before they leave.

A7. You can submit your timesheet up to **8 weeks** in advance.

A8. Yes, if you have worked on a Holiday, Saturday and Sunday

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.013s (28 results)
   ‚Ä¢ ColBERT: 0.426s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.005s (28 chunks)
   ‚Ä¢ Rerank: 1.359s (top 7)
   ‚úì Total retrieval: 1.803s

üêõ DEBUG: Context being sent to LLM
Context length: 4365 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

What happens to the hours I entered on my timesheet if I delete it? 15. What is the recommended setting for my timesheet view? 16. What is the recommended setting for my timesheet summary view? 17. Who can submit my timesheet if I am not available due to emergency?
=== END SOURCE 1 ===

=== SOURCE 2 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A5. All timesheets are due for submission every Friday by 5:00pm.
Please Note: 

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.012s (28 results)
   ‚Ä¢ ColBERT: 0.159s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.363s (top 10)
   ‚úì Total retrieval: 1.537s

‚ö†Ô∏è  Stopping at 8 chunks (would exceed 6000 char limit)

üêõ DEBUG: Context being sent to LLM
Context length: 5709 characters
Chunks retrieved: 10
Chunks actually used: 8

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A3. Validate that the timesheet workweek status is NOT ‚ÄòSubmitted‚Äô, ‚ÄòApproved‚Äô, ‚ÄòRejected‚Äô or ‚ÄòPeriod Closed‚Äô.
Only ‚Äò **In Progress** ‚Äô or ‚Äò **Not Yet Created** ‚Äô timesheets can be updated. To validate the status, go to My
Timesheets, look for the week and check the ‚ÄòStatus‚Äô column.

A4. Timesheet periods are open **8 weeks ahead of** the current period and **1 period** before the current period.
You cannot update

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.013s (28 results)
   ‚Ä¢ ColBERT: 0.185s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.356s (top 7)
   ‚úì Total retrieval: 1.557s

üêõ DEBUG: Context being sent to LLM
Context length: 5110 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A5. All timesheets are due for submission every Friday by 5:00pm.
Please Note: The only exception is if you are on a support team and work weekends or have been involved
with a weekend rollout, then you have until Mondays no later than 10:00am to submit your timesheet for
approval.

A6. Yes, it is recommended that all users going on vacation submit timesheets before they leave.

A7. You can submit your timesheet up to **8 weeks** in advance.

A8. Yes, if you have worked on a Holiday, Saturday and Sunday

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.013s (28 results)
   ‚Ä¢ ColBERT: 0.343s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.353s (top 7)
   ‚úì Total retrieval: 1.712s

üêõ DEBUG: Context being sent to LLM
Context length: 5005 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A3. Validate that the timesheet workweek status is NOT ‚ÄòSubmitted‚Äô, ‚ÄòApproved‚Äô, ‚ÄòRejected‚Äô or ‚ÄòPeriod Closed‚Äô.
Only ‚Äò **In Progress** ‚Äô or ‚Äò **Not Yet Created** ‚Äô timesheets can be updated. To validate the status, go to My
Timesheets, look for the week and check the ‚ÄòStatus‚Äô column.

A4. Timesheet periods are open **8 weeks ahead of** the current period and **1 period** before the current period.
You cannot update work weeks with a status ‚ÄòPeriod Close‚Äô.
=== END SOURCE 1 

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.014s (28 results)
   ‚Ä¢ ColBERT: 0.090s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.370s (top 7)
   ‚úì Total retrieval: 1.477s

üêõ DEBUG: Context being sent to LLM
Context length: 4857 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A5. All timesheets are due for submission every Friday by 5:00pm.
Please Note: The only exception is if you are on a support team and work weekends or have been involved
with a weekend rollout, then you have until Mondays no later than 10:00am to submit your timesheet for
approval.

A6. Yes, it is recommended that all users going on vacation submit timesheets before they leave.

A7. You can submit your timesheet up to **8 weeks** in advance.

A8. Yes, if you have worked on a Holiday, Saturday and Sunday

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.019s (28 results)
   ‚Ä¢ ColBERT: 0.096s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.363s (top 7)
   ‚úì Total retrieval: 1.482s

üêõ DEBUG: Context being sent to LLM
Context length: 5280 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

**Time Tracking Resources Answers**
A1. Contact the Project Manager of the project you have been assigned to. Ask the PM if you have been
assigned a task. The PM may need to update and successfully published the plan. The resource should delete
then recreate the timesheet week(s) with the missing task(s). If the task is still not showing on your timesheet,
kindly submit a ticket using [this link.](https://jira.safeway.com/projects/PMOCOE/issues/PMOCOE-2406?filter=allissues)

A2. Contact the Project Mana

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.014s (28 results)
   ‚Ä¢ ColBERT: 0.100s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.005s (28 chunks)
   ‚Ä¢ Rerank: 1.384s (top 7)
   ‚úì Total retrieval: 1.502s

üêõ DEBUG: Context being sent to LLM
Context length: 4190 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A3. Validate that the timesheet workweek status is NOT ‘Submitted’, ‘Approved’, ‘Rejected’ or ‘Period Closed’.
Only ‘ **In Progress** ’ or ‘ **Not Yet Created** ’ timesheets can be updated. To validate the status, go to My
Timesheets, look for the week and check the ‘Status’ column.

A4. Timesheet periods are open **8 weeks ahead of** the current period and **1 period** before the current period.
You cannot update work weeks with a status ‘Period Close’.
=== END SOURCE 1 

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.014s (28 results)
   ‚Ä¢ ColBERT: 0.103s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.354s (top 7)
   ‚úì Total retrieval: 1.473s
   This suggests you need to re-index with the new chunker!
   This suggests you need to re-index with the new chunker!

üêõ DEBUG: Context being sent to LLM
Context length: 5191 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A2. A PM does not have access to set a Resource Manager as a Status Manager. It is a onetime set up done
by the Resource Manager. Once the RM has completed the one time set up, the PM will be able to select the
[RM‚Äôs name from the status manager field for future tasks. Refer to this page Status Manager Change Process.](https://rxsafeway.sharepoint.com/sites/pwa/SitePages/Status%20Man

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.014s (28 results)
   ‚Ä¢ ColBERT: 0.102s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.349s (top 7)
   ‚úì Total retrieval: 1.468s

üêõ DEBUG: Context being sent to LLM
Context length: 4799 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

If the task to approved is still not showing on your approval page, kindly submit a ticket using [this link.](https://jira.safeway.com/projects/PMOCOE/issues/PMOCOE-2406?filter=allissues)

A9. Status manager is the timesheet approver that is set on the project schedule.

A10. Below are the steps to become a status manager or timesheet approver.
=== END SOURCE 1 ===

=== SOURCE 2 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.013s (28 results)
   ‚Ä¢ ColBERT: 0.090s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.362s (top 7)
   ‚úì Total retrieval: 1.468s

üêõ DEBUG: Context being sent to LLM
Context length: 5222 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A10. Go to My Timesheets. Highlight the week of the timesheet that needs to be recalled then click the ‘Recall’
button in the upper right corner, click ‘Ok’ then go to that timesheet week and click ‘Click to create’, adjust
hours and submit for approval once again.

A11. Yes, you can Recall and even Delete your timesheet at any time within a window of 6 weeks if you have
submitted it already. Do not ask the Project Manager to reject a timesheet because it was already submitted
and needs to b

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.014s (28 results)
   ‚Ä¢ ColBERT: 0.105s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.379s (top 7)
   ‚úì Total retrieval: 1.501s

üêõ DEBUG: Context being sent to LLM
Context length: 4805 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A14. All hours on the project and tasks will still display as is, the only hours that disappear are the hours
posted to the Admin tasks.
Suggested Practice: Make a copy of the timesheet before you delete, use Snag IT or Snipping Tool.

A15. See the settings below. This setting will make sure that you can see all assigned tasks with Actual.
View: **My Work**
Filter: **No Filter**
Group By: **Project Name**

A16. The recommended timesheet summary view setting is ‘ **Current + Last 3 Months** ’.
=== EN

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.013s (28 results)
   ‚Ä¢ ColBERT: 0.093s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.368s (top 7)
   ‚úì Total retrieval: 1.478s

üêõ DEBUG: Context being sent to LLM
Context length: 5110 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A5. All timesheets are due for submission every Friday by 5:00pm.
Please Note: The only exception is if you are on a support team and work weekends or have been involved
with a weekend rollout, then you have until Mondays no later than 10:00am to submit your timesheet for
approval.

A6. Yes, it is recommended that all users going on vacation submit timesheets before they leave.

A7. You can submit your timesheet up to **8 weeks** in advance.

A8. Yes, if you have worked on a Holiday, Saturday and Sunday

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.014s (28 results)
   ‚Ä¢ ColBERT: 0.090s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.366s (top 7)
   ‚úì Total retrieval: 1.473s

üêõ DEBUG: Context being sent to LLM
Context length: 4696 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A3. Validate that the timesheet workweek status is NOT ‘Submitted’, ‘Approved’, ‘Rejected’ or ‘Period Closed’.
Only ‘ **In Progress** ’ or ‘ **Not Yet Created** ’ timesheets can be updated. To validate the status, go to My
Timesheets, look for the week and check the ‘Status’ column.

A4. Timesheet periods are open **8 weeks ahead of** the current period and **1 period** before the current period.
You cannot update work weeks with a status ‘Period Close’.
=== END SOURCE 1 

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.013s (28 results)
   ‚Ä¢ ColBERT: 0.100s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.371s (top 7)
   ‚úì Total retrieval: 1.488s

üêõ DEBUG: Context being sent to LLM
Context length: 5029 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

**Time Tracking Resources Answers**
A1. Contact the Project Manager of the project you have been assigned to. Ask the PM if you have been
assigned a task. The PM may need to update and successfully published the plan. The resource should delete
then recreate the timesheet week(s) with the missing task(s). If the task is still not showing on your timesheet,
kindly submit a ticket using [this link.](https://jira.safeway.com/projects/PMOCOE/issues/PMOCOE-2406?filter=allissues)

A2. Contact the Project Mana

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

   ‚Ä¢ BM25s: 0.013s (28 results)
   ‚Ä¢ ColBERT: 0.077s (28 results)
   ‚Ä¢ Fusion: 0.000s (28 candidates)
   ‚Ä¢ Fetch: 0.003s (28 chunks)
   ‚Ä¢ Rerank: 1.377s (top 7)
   ‚úì Total retrieval: 1.470s

üêõ DEBUG: Context being sent to LLM
Context length: 5438 characters
Chunks retrieved: 7
Chunks actually used: 7

First 800 characters of context:
=== SOURCE 1 ===
Section: **MPO Timesheet FAQ** > **MPO Timesheet FAQ‚Äôs**

[Context: **MPO Timesheet FAQ**]

## **MPO Timesheet FAQ‚Äôs**

A9. Go to My Timesheets. From the list of weekly timesheets, highlight the week that is Rejected. Once you
highlight the row, on the upper left corner of the page click the ‘Recall’ or the ‘Delete’ button (only one or the
other will be available) and click ‘OK’.

Click on ‘My Timesheet’, look for the rejected line by referring to the column process status, see sample below.

MPO Timesheet FAQ Version 5.0 4

Zero out all hours posted on the rejected line, click save and then enter the hou

# NOTES:
- check if we can change to a bigger model
- review the system prompt used for the LLM response (and the temperature setting as well)
- check the token limit for the LLM response (max 6000); it should be longer
- check the token limit for ColBERT (max 512); it should be longer
- check out the PDF on different chunking strategies from Weaviate
- check the RRF (fusion) scoring and see how it works