Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions config/intelligent-routing/in-tree/embedding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
bert_model:
model_id: models/all-MiniLM-L12-v2
threshold: 0.6
use_cpu: true

semantic_cache:
enabled: true
backend_type: "memory" # Options: "memory", "milvus", or "hybrid"
similarity_threshold: 0.8
max_entries: 1000 # Only applies to memory backend
ttl_seconds: 3600
eviction_policy: "fifo"
# HNSW index configuration (for memory backend only)
use_hnsw: true # Enable HNSW index for faster similarity search
hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory)
hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build)

# Hybrid cache configuration (when backend_type: "hybrid")
# Combines in-memory HNSW for fast search with Milvus for scalable storage
# max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000)
# backend_config_path: "config/milvus.yaml" # Path to Milvus config

# Embedding model for semantic similarity matching
# Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
# Default: "bert" (fastest, lowest memory)
embedding_model: "bert"

tools:
enabled: true
top_k: 3
similarity_threshold: 0.2
tools_db_path: "config/tools_db.json"
fallback_to_empty: true

prompt_guard:
enabled: true # Global default - can be overridden per category with jailbreak_enabled
use_modernbert: true
model_id: "models/jailbreak_classifier_modernbert-base_model"
threshold: 0.7
use_cpu: true
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# vLLM Endpoints Configuration
# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6)
# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
vllm_endpoints:
- name: "endpoint1"
address: "172.28.0.20" # Static IPv4 of llm-katan within docker compose network
port: 8002
weight: 1

model_config:
"qwen3":
reasoning_family: "qwen3" # This model uses Qwen-3 reasoning syntax
preferred_endpoints: ["endpoint1"]
pii_policy:
allow_by_default: true

# Classifier configuration
classifier:
category_model:
model_id: "models/category_classifier_modernbert-base_model"
use_modernbert: true
threshold: 0.6
use_cpu: true
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
pii_model:
model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
use_modernbert: true
threshold: 0.7
use_cpu: true
pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

# Embedding-based classification rules
# These rules use semantic similarity between query text and keywords
embedding_rules:
- category: "technical_support"
threshold: 0.75
keywords:
- "how to configure the system"
- "installation guide"
- "troubleshooting steps"
- "error message explanation"
- "setup instructions"
aggregation_method: "max" # Options: "max", "avg", "any"
model: "auto" # Options: "auto", "qwen3", "gemma"
dimension: 768 # Options: 128, 256, 512, 768, 1024
quality_priority: 0.7 # 0.0-1.0, only for "auto" model
latency_priority: 0.3 # 0.0-1.0, only for "auto" model

- category: "product_inquiry"
threshold: 0.70
keywords:
- "product features and specifications"
- "pricing information"
- "availability and stock"
- "product comparison"
- "warranty details"
aggregation_method: "avg"
model: "gemma"
dimension: 768

- category: "account_management"
threshold: 0.72
keywords:
- "password reset"
- "account settings"
- "profile update"
- "subscription management"
- "billing information"
aggregation_method: "max"
model: "qwen3"
dimension: 1024

- category: "general_inquiry"
threshold: 0.65
keywords:
- "general question"
- "information request"
- "help needed"
- "customer service"
aggregation_method: "any"
model: "auto"
dimension: 512
quality_priority: 0.5
latency_priority: 0.5

# Categories with model scores
categories:
# Embedding-based categories
- name: technical_support
system_prompt: "You are a technical support specialist. Provide detailed, step-by-step guidance for technical issues. Use clear explanations and include relevant troubleshooting steps."
model_scores:
- model: qwen3
score: 0.9
use_reasoning: true
jailbreak_enabled: true
pii_detection_enabled: true

- name: product_inquiry
system_prompt: "You are a product specialist. Provide accurate information about products, features, pricing, and availability. Be helpful and informative."
model_scores:
- model: qwen3
score: 0.85
use_reasoning: false
jailbreak_enabled: true
pii_detection_enabled: false

- name: account_management
system_prompt: "You are an account management assistant. Help users with account-related tasks such as password resets, profile updates, and subscription management. Prioritize security and privacy."
model_scores:
- model: qwen3
score: 0.88
use_reasoning: false
jailbreak_enabled: true
pii_detection_enabled: true

- name: general_inquiry
system_prompt: "You are a helpful general assistant. Answer questions clearly and concisely. If you need more information, ask clarifying questions."
model_scores:
- model: qwen3
score: 0.75
use_reasoning: false
jailbreak_enabled: true
pii_detection_enabled: false

# Embedding Models Configuration
# These models provide intelligent embedding generation with automatic routing:
# - Qwen3-Embedding-0.6B: Up to 32K context, high quality, 1024-dim embeddings
# - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
embedding_models:
qwen3_model_path: "models/Qwen3-Embedding-0.6B"
gemma_model_path: "models/embeddinggemma-300m"
use_cpu: true # Set to false for GPU acceleration (requires CUDA)

# Default model for fallback
default_model: "qwen3"

# Entropy-based reasoning configuration
entropy_threshold: 0.5 # Threshold for entropy-based reasoning decision
high_entropy_threshold: 0.8 # High entropy threshold for complex queries

121 changes: 13 additions & 108 deletions src/semantic-router/pkg/classification/classifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -382,91 +382,6 @@ func (c *Classifier) initializeCategoryClassifier() error {
return c.categoryInitializer.Init(c.Config.CategoryModel.ModelID, c.Config.CategoryModel.UseCPU, numClasses)
}

// ClassifyCategory performs category classification on the given text.
//
// Classification methods are attempted in a fixed priority order:
//  1. keyword classifier
//  2. keyword-embedding similarity classifier
//  3. in-tree model (when properly configured)
//  4. MCP-based classifier
//
// The first method that yields a non-empty category wins. An empty category
// with a nil error means no method matched with sufficient confidence.
func (c *Classifier) ClassifyCategory(text string) (string, float64, error) {
	// TODO: more sophiscated fusion engine needs to be designed and implemented to combine classifiers' results
	if c.keywordClassifier != nil {
		category, confidence, err := c.keywordClassifier.Classify(text)
		switch {
		case err != nil:
			return "", 0.0, err
		case category != "":
			return category, confidence, nil
		}
	}

	if c.keywordEmbeddingClassifier != nil {
		category, confidence, err := c.keywordEmbeddingClassifier.Classify(text)
		switch {
		case err != nil:
			return "", 0.0, err
		case category != "":
			return category, confidence, nil
		}
	}

	// In-tree model: use it when configured; if the inference engine was
	// initialized but the config is now invalid, surface a specific error.
	if c.categoryInference != nil {
		if c.IsCategoryEnabled() {
			return c.classifyCategoryInTree(text)
		}
		return "", 0.0, fmt.Errorf("category classification is not properly configured")
	}

	// Fall back to MCP-based classification.
	if c.IsMCPCategoryEnabled() && c.mcpCategoryInference != nil {
		return c.classifyCategoryMCP(text)
	}

	return "", 0.0, fmt.Errorf("no category classification method available")
}

// classifyCategoryInTree performs category classification using the in-tree model.
//
// Returns the generic category name (translated from the MMLU-Pro label when a
// mapping is configured), the model confidence, and an error on inference
// failure. An empty category with a nil error means the confidence fell below
// the configured threshold or the predicted class index was unknown.
func (c *Classifier) classifyCategoryInTree(text string) (string, float64, error) {
	if !c.IsCategoryEnabled() {
		return "", 0.0, fmt.Errorf("category classification is not properly configured")
	}

	// Time only the inference call itself for the latency metric.
	start := time.Now()
	result, err := c.categoryInference.Classify(text)
	metrics.RecordClassifierLatency("category", time.Since(start).Seconds())
	if err != nil {
		return "", 0.0, fmt.Errorf("classification error: %w", err)
	}

	logging.Infof("Classification result: class=%d, confidence=%.4f", result.Class, result.Confidence)

	// Below-threshold predictions are reported as "no category", not as errors,
	// so callers can fall back to their default handling.
	if result.Confidence < c.Config.CategoryModel.Threshold {
		logging.Infof("Classification confidence (%.4f) below threshold (%.4f)",
			result.Confidence, c.Config.CategoryModel.Threshold)
		return "", float64(result.Confidence), nil
	}

	// Convert class index to category name (MMLU-Pro).
	categoryName, ok := c.CategoryMapping.GetCategoryFromIndex(result.Class)
	if !ok {
		logging.Warnf("Class index %d not found in category mapping", result.Class)
		return "", float64(result.Confidence), nil
	}

	// Translate to generic category if a mapping is configured.
	genericCategory := c.translateMMLUToGeneric(categoryName)

	// Record the category classification metric using the generic name when available.
	metrics.RecordCategoryClassification(genericCategory)

	logging.Infof("Classified as category: %s (mmlu=%s)", genericCategory, categoryName)
	return genericCategory, float64(result.Confidence), nil
}

// IsJailbreakEnabled checks if jailbreak detection is enabled and properly configured
func (c *Classifier) IsJailbreakEnabled() bool {
return c.Config.PromptGuard.Enabled && c.Config.PromptGuard.ModelID != "" && c.Config.PromptGuard.JailbreakMappingPath != "" && c.JailbreakMapping != nil
Expand Down Expand Up @@ -611,6 +526,19 @@ func (c *Classifier) ClassifyCategoryWithEntropy(text string) (string, float64,
}
}

// Try embedding based similarity classification if properly configured
if c.keywordEmbeddingClassifier != nil {
category, confidence, err := c.keywordEmbeddingClassifier.Classify(text)
if err != nil {
return "", 0.0, entropy.ReasoningDecision{}, err
}
if category != "" {
// Keyword embedding matched - determine reasoning mode from category configuration
reasoningDecision := c.makeReasoningDecisionForKeywordCategory(category)
return category, confidence, reasoningDecision, nil
}
}

// Try in-tree first if properly configured
if c.IsCategoryEnabled() && c.categoryInference != nil {
return c.classifyCategoryWithEntropyInTree(text)
Expand Down Expand Up @@ -926,29 +854,6 @@ func (c *Classifier) AnalyzeContentForPIIWithThreshold(contentList []string, thr
return hasPII, analysisResults, nil
}

// ClassifyAndSelectBestModel performs classification and selects the best
// model for the query. When no categories are configured, classification
// fails, or the classification confidence is below threshold, the configured
// default model is returned instead.
func (c *Classifier) ClassifyAndSelectBestModel(query string) string {
	// Nothing to classify against: fall back to the default model immediately.
	if len(c.Config.Categories) == 0 {
		return c.Config.DefaultModel
	}

	category, confidence, err := c.ClassifyCategory(query)
	switch {
	case err != nil:
		logging.Errorf("Classification error: %v, falling back to default model", err)
		return c.Config.DefaultModel
	case category == "":
		// An empty category signals a below-threshold classification.
		logging.Infof("Classification confidence (%.4f) below threshold, using default model", confidence)
		return c.Config.DefaultModel
	}

	// Select the best model from the determined category based on score and TTFT.
	return c.SelectBestModelForCategory(category)
}

// SelectBestModelForCategory selects the best model from a category based on score and TTFT
func (c *Classifier) SelectBestModelForCategory(categoryName string) string {
cat := c.findCategory(categoryName)
Expand Down
Loading
Loading