Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/config.development.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ classifier:
categories:
- name: test
system_prompt: "You are a test assistant."
# Example: Category-level cache settings
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.85
model_scores:
- model: test-model
score: 1.0
Expand Down
9 changes: 9 additions & 0 deletions config/config.e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ categories:
score: 0.4
use_reasoning: false
- name: psychology
# Example: Strict cache threshold for psychology - clinical nuances matter
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.92
model_scores:
- model: "Model-A"
score: 0.6
Expand Down Expand Up @@ -156,6 +159,9 @@ categories:
score: 0.4
use_reasoning: false
- name: other
# Example: Lower threshold for general queries - better cache hit rate
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.75
model_scores:
- model: "Model-B"
score: 0.8
Expand All @@ -168,6 +174,9 @@ categories:
score: 0.6
use_reasoning: false
- name: health
# Example: Very strict cache threshold for health - word changes matter medically
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.95
model_scores:
- model: "Model-B"
score: 0.8
Expand Down
6 changes: 6 additions & 0 deletions config/config.production.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,18 @@ classifier:
categories:
- name: math
system_prompt: "You are a mathematics expert. Provide step-by-step solutions."
# Example: High threshold for math - precision matters
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.92
model_scores:
- model: openai/gpt-oss-20b
score: 1.0
use_reasoning: true
- name: other
system_prompt: "You are a helpful assistant."
# Example: Lower threshold for general queries - more cache hits
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.75
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
Expand Down
6 changes: 6 additions & 0 deletions config/config.recipe-accuracy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ categories:
use_reasoning: true # Enable reasoning for legal analysis
- name: psychology
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
# Category-level cache override (if global cache is enabled)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.92 # Strict for clinical nuances
model_scores:
- model: openai/gpt-oss-20b
score: 1.0
Expand Down Expand Up @@ -117,6 +120,9 @@ categories:
use_reasoning: false # Default queries don't need reasoning
- name: health
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
# Category-level cache override (if global cache is enabled)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.95 # Very strict - medical accuracy critical
model_scores:
- model: openai/gpt-oss-20b
score: 1.0
Expand Down
3 changes: 3 additions & 0 deletions config/config.recipe-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ categories:
use_reasoning: false
- name: other
system_prompt: "Provide helpful responses."
# Category-level cache (optional, already enabled globally with low threshold)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.65 # Even lower for general queries
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
Expand Down
3 changes: 3 additions & 0 deletions config/config.recipe-token-efficiency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ categories:
use_reasoning: false
- name: other
system_prompt: "You are a helpful assistant. Provide concise, accurate responses."
# Category-level cache (optional, already enabled globally)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.7 # Match global or slightly lower
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
Expand Down
3 changes: 3 additions & 0 deletions config/config.testing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ model_config:

categories:
- name: other
# Category-level cache settings (optional - falls back to global if not set)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.8
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
Expand Down
6 changes: 6 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ categories:
use_reasoning: false
- name: psychology
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances
model_scores:
- model: qwen3
score: 0.6
Expand All @@ -98,12 +100,16 @@ categories:
use_reasoning: false
- name: other
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive
model_scores:
- model: qwen3
score: 0.7
use_reasoning: false
- name: health
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes
model_scores:
- model: qwen3
score: 0.5
Expand Down
5 changes: 5 additions & 0 deletions src/semantic-router/pkg/cache/cache_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ type CacheBackend interface {
// Returns the cached response, match status, and any error
FindSimilar(model string, query string) ([]byte, bool, error)

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
// This allows category-specific similarity thresholds
// Returns the cached response, match status, and any error
FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error)

// Close releases all resources held by the cache backend
Close() error

Expand Down
31 changes: 18 additions & 13 deletions src/semantic-router/pkg/cache/inmemory_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,20 +207,25 @@ func (c *InMemoryCache) AddEntry(requestID string, model string, query string, r
return nil
}

// FindSimilar searches for semantically similar cached requests using the
// cache's default similarity threshold.
//
// It is a thin wrapper: the lookup itself is performed by
// FindSimilarWithThreshold, called with c.similarityThreshold, and that
// method's results (cached response, match status, error) are returned as-is.
func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, error) {
return c.FindSimilarWithThreshold(model, query, c.similarityThreshold)
}

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
func (c *InMemoryCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
Comment on lines +215 to +216
Copy link

Copilot AI Oct 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The FindSimilarWithThreshold method duplicates significant logic from the existing FindSimilar method. Consider refactoring FindSimilar to call FindSimilarWithThreshold with the default threshold to eliminate code duplication and ensure consistent behavior between both methods.

Copilot uses AI. Check for mistakes.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot please address this and other duplicate functions

start := time.Now()

if !c.enabled {
observability.Debugf("InMemoryCache.FindSimilar: cache disabled")
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: cache disabled")
return nil, false, nil
}
queryPreview := query
if len(query) > 50 {
queryPreview = query[:50] + "..."
}
observability.Debugf("InMemoryCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)",
model, queryPreview, len(query))
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
model, queryPreview, len(query), threshold)

// Generate semantic embedding for similarity comparison
queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension
Expand All @@ -237,7 +242,7 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
entriesChecked int
expiredCount int
)
// Capture the lookup time after acquiring the read lock so TTL checks arent skewed by embedding work or lock wait
// Capture the lookup time after acquiring the read lock so TTL checks aren't skewed by embedding work or lock wait
now := time.Now()

// Compare with completed entries for the same model, tracking only the best match
Expand Down Expand Up @@ -292,26 +297,26 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
// Handle case where no suitable entries exist
if bestIndex < 0 {
atomic.AddInt64(&c.missCount, 1)
observability.Debugf("InMemoryCache.FindSimilar: no entries found with responses")
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: no entries found with responses")
metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

// Check if the best match meets the similarity threshold
if bestSimilarity >= c.similarityThreshold {
if bestSimilarity >= threshold {
atomic.AddInt64(&c.hitCount, 1)

c.mu.Lock()
c.updateAccessInfo(bestIndex, bestEntry)
c.mu.Unlock()

observability.Debugf("InMemoryCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
bestSimilarity, c.similarityThreshold, len(bestEntry.ResponseBody))
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
bestSimilarity, threshold, len(bestEntry.ResponseBody))
observability.LogEvent("cache_hit", map[string]interface{}{
"backend": "memory",
"similarity": bestSimilarity,
"threshold": c.similarityThreshold,
"threshold": threshold,
"model": model,
})
metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds())
Expand All @@ -320,12 +325,12 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
}

atomic.AddInt64(&c.missCount, 1)
observability.Debugf("InMemoryCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
bestSimilarity, c.similarityThreshold, entriesChecked)
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
bestSimilarity, threshold, entriesChecked)
observability.LogEvent("cache_miss", map[string]interface{}{
"backend": "memory",
"best_similarity": bestSimilarity,
"threshold": c.similarityThreshold,
"threshold": threshold,
"model": model,
"entries_checked": entriesChecked,
})
Expand Down
31 changes: 18 additions & 13 deletions src/semantic-router/pkg/cache/milvus_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -487,18 +487,23 @@ func (c *MilvusCache) addEntry(id string, requestID string, model string, query

// FindSimilar searches for semantically similar cached requests using the
// cache's default similarity threshold.
//
// It is a thin wrapper: the lookup itself is performed by
// FindSimilarWithThreshold, called with c.similarityThreshold, and that
// method's results (cached response, match status, error) are returned as-is.
func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, error) {
return c.FindSimilarWithThreshold(model, query, c.similarityThreshold)
}

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
func (c *MilvusCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
Comment on lines +493 to +494
Copy link

Copilot AI Oct 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The FindSimilarWithThreshold method duplicates significant logic from the existing FindSimilar method. Consider refactoring FindSimilar to call FindSimilarWithThreshold with the default threshold to eliminate code duplication and ensure consistent behavior between both methods.

Copilot uses AI. Check for mistakes.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot please address this and other duplicate functions

start := time.Now()

if !c.enabled {
observability.Debugf("MilvusCache.FindSimilar: cache disabled")
observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache disabled")
return nil, false, nil
}
queryPreview := query
if len(query) > 50 {
queryPreview = query[:50] + "..."
}
observability.Debugf("MilvusCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)",
model, queryPreview, len(query))
observability.Debugf("MilvusCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
model, queryPreview, len(query), threshold)

// Generate semantic embedding for similarity comparison
queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension
Expand Down Expand Up @@ -529,7 +534,7 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err
searchParam,
)
if err != nil {
observability.Debugf("MilvusCache.FindSimilar: search failed: %v", err)
observability.Debugf("MilvusCache.FindSimilarWithThreshold: search failed: %v", err)
atomic.AddInt64(&c.missCount, 1)
metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
metrics.RecordCacheMiss()
Expand All @@ -538,21 +543,21 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err

if len(searchResult) == 0 || searchResult[0].ResultCount == 0 {
atomic.AddInt64(&c.missCount, 1)
observability.Debugf("MilvusCache.FindSimilar: no entries found")
observability.Debugf("MilvusCache.FindSimilarWithThreshold: no entries found")
metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

bestScore := searchResult[0].Scores[0]
if bestScore < c.similarityThreshold {
if bestScore < threshold {
atomic.AddInt64(&c.missCount, 1)
observability.Debugf("MilvusCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f",
bestScore, c.similarityThreshold)
observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f",
bestScore, threshold)
observability.LogEvent("cache_miss", map[string]interface{}{
"backend": "milvus",
"best_similarity": bestScore,
"threshold": c.similarityThreshold,
"threshold": threshold,
"model": model,
"collection": c.collectionName,
})
Expand All @@ -569,20 +574,20 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err
}

if responseBody == nil {
observability.Debugf("MilvusCache.FindSimilar: cache hit but response_body is missing or not a string")
observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache hit but response_body is missing or not a string")
atomic.AddInt64(&c.missCount, 1)
metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

atomic.AddInt64(&c.hitCount, 1)
observability.Debugf("MilvusCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
bestScore, c.similarityThreshold, len(responseBody))
observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
bestScore, threshold, len(responseBody))
observability.LogEvent("cache_hit", map[string]interface{}{
"backend": "milvus",
"similarity": bestScore,
"threshold": c.similarityThreshold,
"threshold": threshold,
"model": model,
"collection": c.collectionName,
})
Expand Down
33 changes: 33 additions & 0 deletions src/semantic-router/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,12 @@ type Category struct {
// "replace": Replace any existing system message with the category-specific prompt
// "insert": Prepend the category-specific prompt to the existing system message content
SystemPromptMode string `yaml:"system_prompt_mode,omitempty"`
// SemanticCacheEnabled controls whether semantic caching is enabled for this category
// If nil, inherits from global SemanticCache.Enabled setting
SemanticCacheEnabled *bool `yaml:"semantic_cache_enabled,omitempty"`
// SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0)
// If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold
SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"`
}

// GetModelReasoningFamily returns the reasoning family configuration for a given model name
Expand Down Expand Up @@ -419,6 +425,11 @@ func BoolPtr(b bool) *bool {
return &b
}

// Float32Ptr copies f into freshly allocated storage and returns its address.
// It exists so optional *float32 fields can be populated from a literal or
// value in tests and config code; each call yields a distinct pointer.
func Float32Ptr(f float32) *float32 {
	ptr := new(float32)
	*ptr = f
	return ptr
}

// validateConfigStructure performs additional validation on the parsed config
func validateConfigStructure(cfg *RouterConfig) error {
// Ensure all categories have at least one model with scores
Expand Down Expand Up @@ -782,3 +793,25 @@ func (c *RouterConfig) GetCategoryByName(name string) *Category {
}
return nil
}

// IsCacheEnabledForCategory reports whether semantic caching applies to the
// named category.
//
// An explicit per-category SemanticCacheEnabled value wins. When the category
// is not found, or it has no explicit value, the global SemanticCache.Enabled
// setting is returned instead.
func (c *RouterConfig) IsCacheEnabledForCategory(categoryName string) bool {
	if cat := c.GetCategoryByName(categoryName); cat != nil {
		if override := cat.SemanticCacheEnabled; override != nil {
			return *override
		}
	}
	// No category-level override: inherit the global switch.
	return c.SemanticCache.Enabled
}

// GetCacheSimilarityThresholdForCategory resolves the cache similarity
// threshold that applies to the named category.
//
// A per-category SemanticCacheSimilarityThreshold takes precedence. When the
// category is not found, or it sets no threshold, the result of
// GetCacheSimilarityThreshold is returned (the global cache threshold or the
// bert threshold, as resolved by that method).
func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName string) float32 {
	if cat := c.GetCategoryByName(categoryName); cat != nil {
		if override := cat.SemanticCacheSimilarityThreshold; override != nil {
			return *override
		}
	}
	// No category-level override: defer to the global resolution.
	return c.GetCacheSimilarityThreshold()
}
Loading
Loading