#!/usr/bin/env python3
"""
Document categorization engine using LEANN for semantic search and an Ollama-served LLM for classification.
"""

import os
from pathlib import Path
import sys

# Add this file's directory to sys.path so the bare local imports further down
# (config_manager, load_env, indexer) can presumably resolve when the file is
# run directly as a script.
sys.path.append(str(Path(__file__).parent))
# Add the directory three levels above this file (treated as the project root)
# so the `examples` package import below can resolve.
sys.path.append(str(Path(__file__).parent.parent.parent))
# Add the in-repo leann-core sources so `leann` can be imported from the checkout.
sys.path.append(str(Path(__file__).parent.parent.parent / "packages" / "leann-core" / "src"))

# NOTE: every import below relies on the sys.path entries added above, so these
# imports must stay after those statements rather than move to the top of the file.
# Import logging configuration to suppress info messages
from examples.logging_config import setup_suppressed_logging

# Configure logging to suppress info messages. This runs before the remaining
# imports, presumably so that anything they log at import time is also filtered.
setup_suppressed_logging("WARNING")

from config_manager import PromptConfig
from load_env import load_env_file

from indexer import Indexer

from leann import LeannChat
# Set unconditionally: the import above is not guarded, so a missing `leann`
# package raises ImportError before this line is reached. The flag is not read
# anywhere in the visible part of this file.
LEANN_AVAILABLE = True


class DocumentCategorizer:
    """Classify documents by prompting an LLM through a LEANN index.

    The ``LeannChat`` backend is created lazily on first use. While the index
    metadata file is missing, or while constructing the backend fails,
    ``categorize`` returns the ``"uncategorized"`` label instead of raising.
    """

    def __init__(self, index_path: str) -> None:
        """Store the index location and build the LLM configuration.

        Args:
            index_path: Path of the LEANN index. The backend is only created
                once ``<index_path>.meta.json`` exists on disk.
        """
        self.index_path = index_path
        # Called before the os.getenv() lookups below, presumably so values
        # from an env file are visible to them.
        load_env_file()

        # Configure LLM; both settings can be overridden via environment variables.
        self.llm_config = {
            "type": "ollama",
            "model": os.getenv("OLLAMA_MODEL", "gpt-oss:120b"),
            "host": os.getenv("OLLAMA_HOST", "http://localhost:11434"),
        }
        self.system_prompt = PromptConfig.master_prompt
        self.document_analyze_prompt = PromptConfig.file_analyze_prompt
        # LeannChat is created lazily - only when needed and the index exists.
        self.leann_chat = None
        self._leann_chat_initialized = False

    def _initialize_leann_chat(self) -> None:
        """Create ``self.leann_chat`` if the index exists and it is not set up yet.

        The initialized flag is only set on success, so a missing index or a
        failed construction is retried on the next call.
        """
        if self._leann_chat_initialized:
            return

        # The metadata file next to the index is used as the existence check.
        index_meta_path = f"{self.index_path}.meta.json"
        if not os.path.exists(index_meta_path):
            print(f"⚠️  Index file not found: {index_meta_path}, LeannChat not initialized")
            return

        try:
            self.leann_chat = LeannChat(
                index_path=self.index_path,
                llm_config=self.llm_config,
            )
        except Exception as e:
            # Best effort: report the failure and leave the categorizer unavailable.
            print(f"⚠️  Failed to initialize LeannChat: {e}")
            self.leann_chat = None
            return

        self._leann_chat_initialized = True
        print(f"✅ LeannChat initialized with index: {self.index_path}")

    def categorize(self, text: str, metadata: dict) -> tuple[str, dict]:
        """
        Categorize a document by sending it to the LLM with the configured prompts.

        Args:
            text: The document text to categorize
            metadata: The metadata of the document; passed through unchanged

        Returns:
            A ``(category, metadata)`` tuple. ``category`` is whatever
            ``LeannChat.ask`` returns (presumably a string), or
            ``"uncategorized"`` when the backend is unavailable or the
            request fails. ``metadata`` is the same object that was passed in.
        """
        # Initialize LeannChat if not already done
        self._initialize_leann_chat()

        # If LeannChat is not available, return a default response
        if not self.leann_chat:
            print("⚠️  LeannChat not available, returning default categorization")
            return "uncategorized", metadata

        # Create prompt for classification
        prompt = f"""
            {self.system_prompt}
            {self.document_analyze_prompt}
            Document: {text}
        """

        try:
            # Get response from LLM
            response = self.leann_chat.ask(prompt)
        except Exception as e:
            # Best effort: any backend failure degrades to the default label.
            print(f"⚠️  Error during categorization: {e}")
            return "uncategorized", metadata
        return response, metadata

    def is_available(self) -> bool:
        """Check if the categorizer is available (has a working LeannChat).

        Triggers lazy initialization, so this may construct the backend.
        """
        self._initialize_leann_chat()
        return self.leann_chat is not None


def test_categorizer():
    """Smoke-test the document categorizer end to end.

    Builds a small index at ``test_index.leann`` in the current working
    directory, categorizes one long sample document, and prints the result.
    Nothing is asserted. If the LLM backend cannot be reached,
    ``DocumentCategorizer.categorize`` reports ``"uncategorized"`` rather
    than raising.

    The index file and its ``.meta.json`` companion are removed afterwards,
    even when indexing or categorization raises.
    """
    index_path = "test_index.leann"

    test_docs = [
        ("Machine learning helps computers learn from data", {"category": "technology"}),
        ("Breaking news about local elections", {"category": "politics"}),
        ("New movie releases this weekend", {"category": "entertainment"})
    ]

    # Sample document to classify; kept verbatim because it is the test input.
    test_text = """
    Texas Strategic Bitcoin Reserve RFI
Executive Summary
The Texas Comptroller has issued an RFI seeking information for establishing the nation's first state-sponsored Strategic Bitcoin Reserve, with $10M in initial funding. This represents a groundbreaking initiative positioning Texas as a leader in cryptocurrency adoption at the government level.
Key Dates:
RFI Issued: September 8, 2025
Question Deadline: September 19, 2025
Response Deadline: October 30, 2025
Strategic Significance
Market Context
Institutional investors are increasing digital asset allocations (75% of surveyed institutions plan increases in 2025)
Nearly 60% of large institutional investors plan to allocate at least 5% of AUM to digital assets.
Texas is positioning itself as the first state to commit public funds to cryptocurrency.
Innovation Leadership
First-of-its-kind initiative in the United States
Demonstrates state support for responsible cryptocurrency adoption
Aligns with Texas's reputation for economic innovation
Financial Structure
Initial Funding: $10 million appropriated via GAA Article IX, Section 18.38 
Operating Model: Self-funded through asset sales (no separate operating appropriation) 
Fund Structure: Special fund outside the state treasury with Treasury Pool investment capability
Technical Requirements Analysis
Custody Requirements
The RFI emphasizes secure custody as paramount:
Security Standards:
Cold storage or other secure technologies are required
Private key generation and storage protocols
Multi-signature controls and access management
Insurance coverage for custody and transfer risks
Third-party security audits (SOC 2 Type II preferred)
Key Security Questions:
Private key generation processes
Storage locations (cold vs. hot wallets)
Access controls and authorization procedures
Cryptographic key management
Fraud mitigation procedures
Regulatory Compliance
Qualified Custodian Requirements:
State or federally chartered financial institution
Texas regulatory compliance or equivalent
Qualified Liquidity Provider Requirements:
Federal/state licensing
Audited financial statements
Minimum 5 years of digital assets experience
Texas office and registered principal
Self-certification capability
Operational Considerations
Service Scope
The RFI seeks comprehensive solutions, including:
Secure acquisition and custody
Exchange capabilities
Investment advisory services
Risk management and derivatives usage
Reporting and audit capabilities
Geographic Requirements
Texas Presence Strongly Preferred:
Offices or employees in Texas
Server locations within acceptable jurisdictions
Exclusion of "countries of concern" (North Korea, China, Russia, Iran)
Cost Management
Critical Constraint: No operating appropriations provided
Service providers must incorporate transparent expense netting
Costs covered through strategic asset sales
Efficiency and cost-effectiveness are paramount
Risk Assessment
Technical Risks
Cybersecurity threats and breach prevention
Private key security and access management
Hot wallet vulnerabilities
Third-party vendor dependencies
Regulatory Risks
Evolving digital asset regulatory landscape
AML/KYC compliance requirements
Multi-jurisdiction regulatory coordination
Audit and examination requirements
Operational Risks
Vendor performance and compliance monitoring
Service disruption protocols
Data protection and privacy
Incident response capabilities
Our Positioning
Relevant Capabilities
Strong Alignments:
Cybersecurity Services: Critical for digital asset protection
Data Engineering: Essential for blockchain data management
Cloud Consulting: Modern infrastructure requirements
Managed IT Services: Ongoing operational support
Database Services: Secure data management expertise
Government Experience:
Extensive state and local government client base
Texas DIR contract holder
Proven track record with sensitive government data
SOC 2 compliance and security certifications
Potential Partnership Strategy
Direct Response Options:
Prime Contractor: Lead comprehensive solution
Strategic Partner: Join with a qualified custodian/liquidity provider
Specialized Services: Focus on cybersecurity and infrastructure components
Recommended Approach: Partner with an established cryptocurrency custodian while providing:
Cybersecurity architecture and monitoring
Infrastructure design and management
Data protection and compliance support
Government liaison and project management
Market Opportunity Assessment
Immediate Opportunity
$10M initial reserve value
First-mover advantage in government crypto services
Reference opportunity for other states
Strategic Value
Positions for the broader digital asset government market
Demonstrates capability in emerging technology sectors
Enhances reputation in financial services
Recommendation
Partner with an established cryptocurrency custodian and position as the cybersecurity and infrastructure specialist.
Differentiators:
Government Expertise: Extensive state/local government experience
Texas Presence: Established operations and DIR contract status
Security Focus: Comprehensive cybersecurity capabilities
Compliance Track Record: SOC 2, CMMI Level 3 certifications
Cost Efficiency: Proven ability to deliver cost-effective solutions
"""

    try:
        # Initialize indexer and add some test documents
        indexer = Indexer(index_path)
        for doc_text, doc_metadata in test_docs:
            indexer.index(doc_text, doc_metadata)

        # Test categorization
        categorizer = DocumentCategorizer(index_path)
        category, _returned_metadata = categorizer.categorize(
            test_text, {"id": "doc1", "category": "news"}
        )
        print(f"Test document categorized as: {category}")
    finally:
        # Clean up even on failure. The categorizer looks for
        # "<index_path>.meta.json" next to the index, so remove that too.
        # NOTE(review): the indexer may write further artifacts - confirm.
        for artifact in (index_path, f"{index_path}.meta.json"):
            if os.path.exists(artifact):
                os.remove(artifact)

# Run the smoke test when this file is executed directly as a script.
if __name__ == "__main__":
    test_categorizer()
