diff --git a/config/config.development.yaml b/config/config.development.yaml index 86458928..9c03ecdc 100644 --- a/config/config.development.yaml +++ b/config/config.development.yaml @@ -47,6 +47,9 @@ classifier: categories: - name: test system_prompt: "You are a test assistant." + # Example: Category-level cache settings + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.85 model_scores: - model: test-model score: 1.0 diff --git a/config/config.e2e.yaml b/config/config.e2e.yaml index b588849f..60362cc7 100644 --- a/config/config.e2e.yaml +++ b/config/config.e2e.yaml @@ -107,6 +107,9 @@ categories: score: 0.4 use_reasoning: false - name: psychology + # Example: Strict cache threshold for psychology - clinical nuances matter + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.92 model_scores: - model: "Model-A" score: 0.6 @@ -156,6 +159,9 @@ categories: score: 0.4 use_reasoning: false - name: other + # Example: Lower threshold for general queries - better cache hit rate + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.75 model_scores: - model: "Model-B" score: 0.8 @@ -168,6 +174,9 @@ categories: score: 0.6 use_reasoning: false - name: health + # Example: Very strict cache threshold for health - word changes matter medically + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.95 model_scores: - model: "Model-B" score: 0.8 diff --git a/config/config.production.yaml b/config/config.production.yaml index 9c4dd4f8..2651a4a7 100644 --- a/config/config.production.yaml +++ b/config/config.production.yaml @@ -60,12 +60,18 @@ classifier: categories: - name: math system_prompt: "You are a mathematics expert. Provide step-by-step solutions." 
+ # Example: High threshold for math - precision matters + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.92 model_scores: - model: openai/gpt-oss-20b score: 1.0 use_reasoning: true - name: other system_prompt: "You are a helpful assistant." + # Example: Lower threshold for general queries - more cache hits + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.75 model_scores: - model: openai/gpt-oss-20b score: 0.7 diff --git a/config/config.recipe-accuracy.yaml b/config/config.recipe-accuracy.yaml index 584b0291..96bd258b 100644 --- a/config/config.recipe-accuracy.yaml +++ b/config/config.recipe-accuracy.yaml @@ -87,6 +87,9 @@ categories: use_reasoning: true # Enable reasoning for legal analysis - name: psychology system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice." + # Category-level cache override (if global cache is enabled) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.92 # Strict for clinical nuances model_scores: - model: openai/gpt-oss-20b score: 1.0 @@ -117,6 +120,9 @@ categories: use_reasoning: false # Default queries don't need reasoning - name: health system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies." 
+ # Category-level cache override (if global cache is enabled) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.95 # Very strict - medical accuracy critical model_scores: - model: openai/gpt-oss-20b score: 1.0 diff --git a/config/config.recipe-latency.yaml b/config/config.recipe-latency.yaml index ce31a36f..56a4bf29 100644 --- a/config/config.recipe-latency.yaml +++ b/config/config.recipe-latency.yaml @@ -105,6 +105,9 @@ categories: use_reasoning: false - name: other system_prompt: "Provide helpful responses." + # Category-level cache (optional, already enabled globally with low threshold) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.65 # Even lower for general queries model_scores: - model: openai/gpt-oss-20b score: 0.7 diff --git a/config/config.recipe-token-efficiency.yaml b/config/config.recipe-token-efficiency.yaml index 49008db5..16a71f53 100644 --- a/config/config.recipe-token-efficiency.yaml +++ b/config/config.recipe-token-efficiency.yaml @@ -110,6 +110,9 @@ categories: use_reasoning: false - name: other system_prompt: "You are a helpful assistant. Provide concise, accurate responses." 
+ # Category-level cache (optional, already enabled globally) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.7 # Match global or slightly lower model_scores: - model: openai/gpt-oss-20b score: 0.7 diff --git a/config/config.testing.yaml b/config/config.testing.yaml index 91722f56..8e4b631f 100644 --- a/config/config.testing.yaml +++ b/config/config.testing.yaml @@ -42,6 +42,9 @@ model_config: categories: - name: other + # Category-level cache settings (optional - falls back to global if not set) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.8 model_scores: - model: openai/gpt-oss-20b score: 0.7 diff --git a/config/config.yaml b/config/config.yaml index 667e41f8..279feb67 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -74,6 +74,8 @@ categories: use_reasoning: false - name: psychology system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances model_scores: - model: qwen3 score: 0.6 @@ -98,12 +100,16 @@ categories: use_reasoning: false - name: other system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics." 
+ semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive model_scores: - model: qwen3 score: 0.7 use_reasoning: false - name: health system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes model_scores: - model: qwen3 score: 0.5 diff --git a/src/semantic-router/pkg/cache/cache_interface.go b/src/semantic-router/pkg/cache/cache_interface.go index f35e165c..fcdf0073 100644 --- a/src/semantic-router/pkg/cache/cache_interface.go +++ b/src/semantic-router/pkg/cache/cache_interface.go @@ -33,6 +33,11 @@ type CacheBackend interface { // Returns the cached response, match status, and any error FindSimilar(model string, query string) ([]byte, bool, error) + // FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold + // This allows category-specific similarity thresholds + // Returns the cached response, match status, and any error + FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) + // Close releases all resources held by the cache backend Close() error diff --git a/src/semantic-router/pkg/cache/inmemory_cache.go b/src/semantic-router/pkg/cache/inmemory_cache.go index 10386420..5820c5f8 100644 --- a/src/semantic-router/pkg/cache/inmemory_cache.go +++ b/src/semantic-router/pkg/cache/inmemory_cache.go @@ -207,20 +207,25 @@ func (c *InMemoryCache) AddEntry(requestID string, model 
string, query string, r return nil } -// FindSimilar searches for semantically similar cached requests +// FindSimilar searches for semantically similar cached requests using the default threshold func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, error) { + return c.FindSimilarWithThreshold(model, query, c.similarityThreshold) +} + +// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold +func (c *InMemoryCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) { start := time.Now() if !c.enabled { - observability.Debugf("InMemoryCache.FindSimilar: cache disabled") + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: cache disabled") return nil, false, nil } queryPreview := query if len(query) > 50 { queryPreview = query[:50] + "..." } - observability.Debugf("InMemoryCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)", - model, queryPreview, len(query)) + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f", + model, queryPreview, len(query), threshold) // Generate semantic embedding for similarity comparison queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension @@ -237,7 +242,7 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e entriesChecked int expiredCount int ) - // Capture the lookup time after acquiring the read lock so TTL checks aren’t skewed by embedding work or lock wait + // Capture the lookup time after acquiring the read lock so TTL checks aren't skewed by embedding work or lock wait now := time.Now() // Compare with completed entries for the same model, tracking only the best match @@ -292,26 +297,26 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e // Handle case where no suitable entries exist if bestIndex < 0 { 
atomic.AddInt64(&c.missCount, 1) - observability.Debugf("InMemoryCache.FindSimilar: no entries found with responses") + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: no entries found with responses") metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds()) metrics.RecordCacheMiss() return nil, false, nil } // Check if the best match meets the similarity threshold - if bestSimilarity >= c.similarityThreshold { + if bestSimilarity >= threshold { atomic.AddInt64(&c.hitCount, 1) c.mu.Lock() c.updateAccessInfo(bestIndex, bestEntry) c.mu.Unlock() - observability.Debugf("InMemoryCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", - bestSimilarity, c.similarityThreshold, len(bestEntry.ResponseBody)) + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", + bestSimilarity, threshold, len(bestEntry.ResponseBody)) observability.LogEvent("cache_hit", map[string]interface{}{ "backend": "memory", "similarity": bestSimilarity, - "threshold": c.similarityThreshold, + "threshold": threshold, "model": model, }) metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds()) @@ -320,12 +325,12 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e } atomic.AddInt64(&c.missCount, 1) - observability.Debugf("InMemoryCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)", - bestSimilarity, c.similarityThreshold, entriesChecked) + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)", + bestSimilarity, threshold, entriesChecked) observability.LogEvent("cache_miss", map[string]interface{}{ "backend": "memory", "best_similarity": bestSimilarity, - "threshold": c.similarityThreshold, + "threshold": threshold, "model": model, "entries_checked": 
entriesChecked, }) diff --git a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go index 4af89184..372c0656 100644 --- a/src/semantic-router/pkg/cache/milvus_cache.go +++ b/src/semantic-router/pkg/cache/milvus_cache.go @@ -487,18 +487,23 @@ func (c *MilvusCache) addEntry(id string, requestID string, model string, query // FindSimilar searches for semantically similar cached requests func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, error) { + return c.FindSimilarWithThreshold(model, query, c.similarityThreshold) +} + +// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold +func (c *MilvusCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) { start := time.Now() if !c.enabled { - observability.Debugf("MilvusCache.FindSimilar: cache disabled") + observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache disabled") return nil, false, nil } queryPreview := query if len(query) > 50 { queryPreview = query[:50] + "..." 
} - observability.Debugf("MilvusCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)", - model, queryPreview, len(query)) + observability.Debugf("MilvusCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f", + model, queryPreview, len(query), threshold) // Generate semantic embedding for similarity comparison queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension @@ -529,7 +534,7 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err searchParam, ) if err != nil { - observability.Debugf("MilvusCache.FindSimilar: search failed: %v", err) + observability.Debugf("MilvusCache.FindSimilarWithThreshold: search failed: %v", err) atomic.AddInt64(&c.missCount, 1) metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) metrics.RecordCacheMiss() @@ -538,21 +543,21 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err if len(searchResult) == 0 || searchResult[0].ResultCount == 0 { atomic.AddInt64(&c.missCount, 1) - observability.Debugf("MilvusCache.FindSimilar: no entries found") + observability.Debugf("MilvusCache.FindSimilarWithThreshold: no entries found") metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds()) metrics.RecordCacheMiss() return nil, false, nil } bestScore := searchResult[0].Scores[0] - if bestScore < c.similarityThreshold { + if bestScore < threshold { atomic.AddInt64(&c.missCount, 1) - observability.Debugf("MilvusCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f", - bestScore, c.similarityThreshold) + observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f", + bestScore, threshold) observability.LogEvent("cache_miss", map[string]interface{}{ "backend": "milvus", "best_similarity": bestScore, - "threshold": c.similarityThreshold, + "threshold": threshold, 
"model": model, "collection": c.collectionName, }) @@ -569,7 +574,7 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err } if responseBody == nil { - observability.Debugf("MilvusCache.FindSimilar: cache hit but response_body is missing or not a string") + observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache hit but response_body is missing or not a string") atomic.AddInt64(&c.missCount, 1) metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) metrics.RecordCacheMiss() @@ -577,12 +582,12 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err } atomic.AddInt64(&c.hitCount, 1) - observability.Debugf("MilvusCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", - bestScore, c.similarityThreshold, len(responseBody)) + observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", + bestScore, threshold, len(responseBody)) observability.LogEvent("cache_hit", map[string]interface{}{ "backend": "milvus", "similarity": bestScore, - "threshold": c.similarityThreshold, + "threshold": threshold, "model": model, "collection": c.collectionName, }) diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index e8c09e7d..9766d473 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -364,6 +364,12 @@ type Category struct { // "replace": Replace any existing system message with the category-specific prompt // "insert": Prepend the category-specific prompt to the existing system message content SystemPromptMode string `yaml:"system_prompt_mode,omitempty"` + // SemanticCacheEnabled controls whether semantic caching is enabled for this category + // If nil, inherits from global SemanticCache.Enabled setting + SemanticCacheEnabled *bool `yaml:"semantic_cache_enabled,omitempty"` + // 
SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0) + // If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold + SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"` } // GetModelReasoningFamily returns the reasoning family configuration for a given model name @@ -419,6 +425,11 @@ func BoolPtr(b bool) *bool { return &b } +// Float32Ptr returns a pointer to a float32 value (helper for tests and config) +func Float32Ptr(f float32) *float32 { + return &f +} + // validateConfigStructure performs additional validation on the parsed config func validateConfigStructure(cfg *RouterConfig) error { // Ensure all categories have at least one model with scores @@ -782,3 +793,25 @@ func (c *RouterConfig) GetCategoryByName(name string) *Category { } return nil } + +// IsCacheEnabledForCategory returns whether semantic caching is enabled for a specific category +// If the category has an explicit SemanticCacheEnabled setting, it takes precedence; otherwise (including when no category matches categoryName), the global SemanticCache.Enabled setting is used +func (c *RouterConfig) IsCacheEnabledForCategory(categoryName string) bool { + category := c.GetCategoryByName(categoryName) + if category != nil && category.SemanticCacheEnabled != nil { + return *category.SemanticCacheEnabled + } + // Fall back to global setting + return c.SemanticCache.Enabled +} + +// GetCacheSimilarityThresholdForCategory returns the effective cache similarity threshold for a category +// Priority: category-specific > global semantic_cache > bert_model threshold; when no category matches categoryName or it sets no threshold, the result of GetCacheSimilarityThreshold() is returned unchanged +func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName string) float32 { + category := c.GetCategoryByName(categoryName) + if category != nil && category.SemanticCacheSimilarityThreshold != nil { + return *category.SemanticCacheSimilarityThreshold + } + // Fall back to global cache threshold or bert threshold + return c.GetCacheSimilarityThreshold() +} diff --git
a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 657d18f1..8a34f399 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -1753,4 +1753,161 @@ default_model: "test-model" }) }) }) + + Describe("Category-Level Cache Settings", func() { + Context("with category-specific cache configuration", func() { + It("should use category-specific cache enabled setting", func() { + yamlContent := ` +bert_model: + model_id: "test-model" + threshold: 0.7 + +semantic_cache: + enabled: true + similarity_threshold: 0.8 + +categories: + - name: health + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false + - name: general + semantic_cache_enabled: false + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false + - name: other + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false +` + var cfg config.RouterConfig + err := yaml.Unmarshal([]byte(yamlContent), &cfg) + Expect(err).NotTo(HaveOccurred()) + + // Test category-specific enabled settings + Expect(cfg.IsCacheEnabledForCategory("health")).To(BeTrue()) + Expect(cfg.IsCacheEnabledForCategory("general")).To(BeFalse()) + // "other" should fall back to global setting + Expect(cfg.IsCacheEnabledForCategory("other")).To(BeTrue()) + // Unknown category should also fall back to global + Expect(cfg.IsCacheEnabledForCategory("unknown")).To(BeTrue()) + }) + + It("should use category-specific similarity thresholds", func() { + yamlContent := ` +bert_model: + model_id: "test-model" + threshold: 0.7 + +semantic_cache: + enabled: true + similarity_threshold: 0.8 + +categories: + - name: health + semantic_cache_similarity_threshold: 0.95 + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false + - name: psychology + semantic_cache_similarity_threshold: 0.92 + model_scores: + - model: 
test-model + score: 1.0 + use_reasoning: false + - name: other + semantic_cache_similarity_threshold: 0.75 + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false + - name: general + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false +` + var cfg config.RouterConfig + err := yaml.Unmarshal([]byte(yamlContent), &cfg) + Expect(err).NotTo(HaveOccurred()) + + // Test category-specific thresholds + Expect(cfg.GetCacheSimilarityThresholdForCategory("health")).To(Equal(float32(0.95))) + Expect(cfg.GetCacheSimilarityThresholdForCategory("psychology")).To(Equal(float32(0.92))) + Expect(cfg.GetCacheSimilarityThresholdForCategory("other")).To(Equal(float32(0.75))) + // "general" should fall back to global semantic_cache threshold + Expect(cfg.GetCacheSimilarityThresholdForCategory("general")).To(Equal(float32(0.8))) + // Unknown category should also fall back + Expect(cfg.GetCacheSimilarityThresholdForCategory("unknown")).To(Equal(float32(0.8))) + }) + + It("should fall back to bert threshold when semantic_cache threshold is not set", func() { + yamlContent := ` +bert_model: + model_id: "test-model" + threshold: 0.6 + +semantic_cache: + enabled: true + +categories: + - name: test + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false +` + var cfg config.RouterConfig + err := yaml.Unmarshal([]byte(yamlContent), &cfg) + Expect(err).NotTo(HaveOccurred()) + + // Should fall back to bert_model.threshold + Expect(cfg.GetCacheSimilarityThresholdForCategory("test")).To(Equal(float32(0.6))) + Expect(cfg.GetCacheSimilarityThreshold()).To(Equal(float32(0.6))) + }) + + It("should handle nil pointers for optional cache settings", func() { + category := config.Category{ + Name: "test", + ModelScores: []config.ModelScore{ + {Model: "test", Score: 1.0, UseReasoning: config.BoolPtr(false)}, + }, + } + + cfg := &config.RouterConfig{ + SemanticCache: struct { + BackendType string `yaml:"backend_type,omitempty"` + Enabled bool 
`yaml:"enabled"` + SimilarityThreshold *float32 `yaml:"similarity_threshold,omitempty"` + MaxEntries int `yaml:"max_entries,omitempty"` + TTLSeconds int `yaml:"ttl_seconds,omitempty"` + EvictionPolicy string `yaml:"eviction_policy,omitempty"` + BackendConfigPath string `yaml:"backend_config_path,omitempty"` + }{ + Enabled: true, + SimilarityThreshold: config.Float32Ptr(0.8), + }, + BertModel: struct { + ModelID string `yaml:"model_id"` + Threshold float32 `yaml:"threshold"` + UseCPU bool `yaml:"use_cpu"` + }{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + // Nil values should use defaults + Expect(cfg.IsCacheEnabledForCategory("test")).To(BeTrue()) + Expect(cfg.GetCacheSimilarityThresholdForCategory("test")).To(Equal(float32(0.8))) + }) + }) + }) }) diff --git a/src/semantic-router/pkg/extproc/caching_test.go b/src/semantic-router/pkg/extproc/caching_test.go index fcd4dc79..5f345b79 100644 --- a/src/semantic-router/pkg/extproc/caching_test.go +++ b/src/semantic-router/pkg/extproc/caching_test.go @@ -250,4 +250,59 @@ var _ = Describe("Caching Functionality", func() { Expect(response.GetRequestBody().Response.Status).To(Equal(ext_proc.CommonResponse_CONTINUE)) }) }) + + Describe("Category-Specific Caching", func() { + It("should use category-specific cache settings", func() { + // Create a config with category-specific cache settings + cfg := CreateTestConfig() + cfg.SemanticCache.Enabled = true + cfg.SemanticCache.SimilarityThreshold = config.Float32Ptr(0.8) + + // Add categories with different cache settings + cfg.Categories = []config.Category{ + { + Name: "health", + ModelScores: []config.ModelScore{ + {Model: "model-a", Score: 1.0, UseReasoning: config.BoolPtr(false)}, + }, + SemanticCacheEnabled: config.BoolPtr(true), + SemanticCacheSimilarityThreshold: config.Float32Ptr(0.95), + }, + { + Name: "general", + ModelScores: []config.ModelScore{ + {Model: "model-a", Score: 1.0, UseReasoning: config.BoolPtr(false)}, + }, + 
SemanticCacheEnabled: config.BoolPtr(false), + SemanticCacheSimilarityThreshold: config.Float32Ptr(0.7), + }, + } + + // Verify category cache settings are correct + Expect(cfg.IsCacheEnabledForCategory("health")).To(BeTrue()) + Expect(cfg.IsCacheEnabledForCategory("general")).To(BeFalse()) + Expect(cfg.GetCacheSimilarityThresholdForCategory("health")).To(Equal(float32(0.95))) + Expect(cfg.GetCacheSimilarityThresholdForCategory("general")).To(Equal(float32(0.7))) + }) + + It("should fall back to global settings when category doesn't specify", func() { + cfg := CreateTestConfig() + cfg.SemanticCache.Enabled = true + cfg.SemanticCache.SimilarityThreshold = config.Float32Ptr(0.8) + + // Add category without cache settings + cfg.Categories = []config.Category{ + { + Name: "test", + ModelScores: []config.ModelScore{ + {Model: "model-a", Score: 1.0, UseReasoning: config.BoolPtr(false)}, + }, + }, + } + + // Should use global settings + Expect(cfg.IsCacheEnabledForCategory("test")).To(BeTrue()) + Expect(cfg.GetCacheSimilarityThresholdForCategory("test")).To(Equal(float32(0.8))) + }) + }) }) diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 8d375ba6..e90f4745 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -401,8 +401,24 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo return response, nil } - // Handle caching - if response, shouldReturn := r.handleCaching(ctx); shouldReturn { + // Classify the request early to determine category for cache settings + var categoryName string + if r.Config != nil && r.Config.IsAutoModelName(originalModel) && (len(nonUserMessages) > 0 || userContent != "") { + // Determine text to use for classification + var classificationText string + if len(userContent) > 0 { + classificationText = userContent + } else if len(nonUserMessages) > 0 { + classificationText = 
strings.Join(nonUserMessages, " ") + } + if classificationText != "" { + categoryName = r.findCategoryForClassification(classificationText) + observability.Debugf("Classified request to category: %s", categoryName) + } + } + + // Handle caching with category-specific settings + if response, shouldReturn := r.handleCaching(ctx, categoryName); shouldReturn { return response, nil } @@ -476,8 +492,8 @@ func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent st return nil, false } -// handleCaching handles cache lookup and storage -func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingResponse, bool) { +// handleCaching handles cache lookup and storage with category-specific settings +func (r *OpenAIRouter) handleCaching(ctx *RequestContext, categoryName string) (*ext_proc.ProcessingResponse, bool) { // Extract the model and query for cache lookup requestModel, requestQuery, err := cache.ExtractQueryFromOpenAIRequest(ctx.OriginalRequestBody) if err != nil { @@ -489,20 +505,34 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingR ctx.RequestModel = requestModel ctx.RequestQuery = requestQuery - if requestQuery != "" && r.Cache.IsEnabled() { + // Check if caching is enabled for this category + cacheEnabled := r.Config.SemanticCache.Enabled + if categoryName != "" { + cacheEnabled = r.Config.IsCacheEnabledForCategory(categoryName) + } + + if requestQuery != "" && r.Cache.IsEnabled() && cacheEnabled { + // Get category-specific threshold + threshold := r.Config.GetCacheSimilarityThreshold() + if categoryName != "" { + threshold = r.Config.GetCacheSimilarityThresholdForCategory(categoryName) + } + // Start cache lookup span spanCtx, span := observability.StartSpan(ctx.TraceContext, observability.SpanCacheLookup) defer span.End() startTime := time.Now() - // Try to find a similar cached response - cachedResponse, found, cacheErr := r.Cache.FindSimilar(requestModel, requestQuery) + // Try to find a 
similar cached response using category-specific threshold + cachedResponse, found, cacheErr := r.Cache.FindSimilarWithThreshold(requestModel, requestQuery, threshold) lookupTime := time.Since(startTime).Milliseconds() observability.SetSpanAttributes(span, attribute.String(observability.AttrCacheKey, requestQuery), attribute.Bool(observability.AttrCacheHit, found), - attribute.Int64(observability.AttrCacheLookupTimeMs, lookupTime)) + attribute.Int64(observability.AttrCacheLookupTimeMs, lookupTime), + attribute.String(observability.AttrCategoryName, categoryName), + attribute.Float64("cache.threshold", float64(threshold))) if cacheErr != nil { observability.Errorf("Error searching cache: %v", cacheErr) @@ -515,6 +545,8 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingR "request_id": ctx.RequestID, "model": requestModel, "query": requestQuery, + "category": categoryName, + "threshold": threshold, }) // Return immediate response from cache response := http.CreateCacheHitResponse(cachedResponse, ctx.ExpectStreamingResponse) diff --git a/website/docs/installation/configuration.md b/website/docs/installation/configuration.md index 340ad847..37d0f8e0 100644 --- a/website/docs/installation/configuration.md +++ b/website/docs/installation/configuration.md @@ -23,7 +23,7 @@ bert_model: semantic_cache: backend_type: "memory" # Options: "memory" or "milvus" enabled: false - similarity_threshold: 0.8 + similarity_threshold: 0.8 # Global default threshold max_entries: 1000 ttl_seconds: 3600 eviction_policy: "fifo" # Options: "fifo", "lru", "lfu" @@ -81,6 +81,9 @@ categories: - model: your-model score: 1.0 use_reasoning: true # Enable reasoning for math problems + # Optional: Category-level cache settings + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.9 # Higher threshold for math - name: computer science model_scores: - model: your-model @@ -91,6 +94,7 @@ categories: - model: your-model score: 0.8 use_reasoning: false 
# No reasoning for general queries + # semantic_cache_similarity_threshold: 0.75 # Lower threshold for general queries default_model: your-model @@ -457,13 +461,38 @@ Configure additional features: ```yaml # Semantic Caching semantic_cache: - enabled: true # Enable semantic caching + enabled: true # Enable semantic caching globally backend_type: "memory" # Options: "memory" or "milvus" - similarity_threshold: 0.8 # Cache hit threshold + similarity_threshold: 0.8 # Global default cache hit threshold max_entries: 1000 # Maximum cache entries ttl_seconds: 3600 # Cache expiration time eviction_policy: "fifo" # Options: "fifo", "lru", "lfu" +# Category-Level Cache Configuration (New) +# Override global cache settings for specific categories +categories: + - name: health + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 # Very strict - medical accuracy critical + model_scores: + - model: your-model + score: 0.5 + use_reasoning: false + + - name: general_chat + semantic_cache_similarity_threshold: 0.75 # Relaxed for better cache hits + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false + + - name: troubleshooting + # No cache settings - uses global default (0.8) + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false + # Tool Auto-Selection tools: enabled: true # Enable automatic tool selection @@ -604,6 +633,112 @@ batch_size_ranges: Access metrics at: `http://localhost:9190/metrics` +## Category-Level Cache Configuration + +**NEW**: Configure semantic cache settings at the category level for fine-grained control over caching behavior. + +### Why Use Category-Level Cache Settings? + +Different categories have different tolerance for semantic variations: + +- **Sensitive categories** (health, psychology, law): Small word changes can have significant meaning differences. Require high similarity thresholds (0.92-0.95). +- **General categories** (chat, troubleshooting): Less sensitive to minor wording changes. 
Can use lower thresholds (0.75-0.82) for better cache hit rates. +- **Privacy categories**: May need caching disabled entirely for compliance or security reasons. + +### Configuration Examples + +#### Example 1: Mixed Thresholds for Different Categories + +```yaml +semantic_cache: + enabled: true + backend_type: "memory" + similarity_threshold: 0.8 # Global default + +categories: + - name: health + system_prompt: "You are a health expert..." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 # Very strict - "headache" vs "severe headache" = different + model_scores: + - model: your-model + score: 0.5 + use_reasoning: false + + - name: psychology + system_prompt: "You are a psychology expert..." + semantic_cache_similarity_threshold: 0.92 # Strict - clinical nuances matter + model_scores: + - model: your-model + score: 0.6 + use_reasoning: false + + - name: general_chat + system_prompt: "You are a helpful assistant..." + semantic_cache_similarity_threshold: 0.75 # Relaxed - "how's the weather" = "what's the weather" + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false + + - name: troubleshooting + system_prompt: "You are a tech support expert..." + # No cache settings - uses global threshold of 0.8 + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false +``` + +#### Example 2: Disable Cache for Sensitive Data + +```yaml +categories: + - name: personal_data + system_prompt: "Handle personal information..." + semantic_cache_enabled: false # Disable cache entirely for privacy + model_scores: + - model: your-model + score: 0.8 + use_reasoning: false +``` + +### Configuration Options + +**Category-Level Fields:** + +- `semantic_cache_enabled` (optional, boolean): Enable/disable caching for this category. If not specified, inherits from global `semantic_cache.enabled`. +- `semantic_cache_similarity_threshold` (optional, float 0.0-1.0): Minimum similarity score for cache hits in this category. 
If not specified, inherits from global `semantic_cache.similarity_threshold`. + +**Fallback Hierarchy:** + +1. Category-specific `semantic_cache_similarity_threshold` (if set) +2. Global `semantic_cache.similarity_threshold` (if set) +3. `bert_model.threshold` (final fallback) + +### Best Practices + +**Threshold Selection:** + +- **High precision (0.92-0.95)**: health, psychology, law, finance +- **Medium precision (0.85-0.90)**: technical documentation, education +- **Lower precision (0.75-0.82)**: general chat, FAQs, troubleshooting + +**Privacy and Compliance:** + +- Disable caching (`semantic_cache_enabled: false`) for categories handling: + - Personally identifiable information (PII) + - Financial data + - Health records + - Sensitive business information + +**Performance Tuning:** + +- Start with conservative (higher) thresholds +- Monitor cache hit rates per category +- Lower thresholds for categories with low hit rates +- Raise thresholds for categories with incorrect cache hits + ## Common Configuration Examples ### Enable All Security Features diff --git a/website/docs/tutorials/semantic-cache/in-memory-cache.md b/website/docs/tutorials/semantic-cache/in-memory-cache.md index 56214c87..4ba99a8a 100644 --- a/website/docs/tutorials/semantic-cache/in-memory-cache.md +++ b/website/docs/tutorials/semantic-cache/in-memory-cache.md @@ -44,23 +44,71 @@ graph TB semantic_cache: enabled: true backend_type: "memory" - similarity_threshold: 0.8 + similarity_threshold: 0.8 # Global default threshold max_entries: 1000 ttl_seconds: 3600 eviction_policy: "fifo" ``` +### Category-Level Configuration (New) + +Configure cache settings per category for fine-grained control: + +```yaml +semantic_cache: + enabled: true + backend_type: "memory" + similarity_threshold: 0.8 # Global default + max_entries: 1000 + ttl_seconds: 3600 + eviction_policy: "fifo" + +categories: + - name: health + system_prompt: "You are a health expert..."
+ semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 # Very strict for medical accuracy + model_scores: + - model: your-model + score: 0.5 + use_reasoning: false + + - name: general_chat + system_prompt: "You are a helpful assistant..." + semantic_cache_similarity_threshold: 0.75 # Relaxed for better hit rate + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false + + - name: troubleshooting + # No cache settings - uses global default (0.8) + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false +``` + ### Configuration Options | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `enabled` | boolean | `false` | Enable/disable semantic caching | +| `enabled` | boolean | `false` | Enable/disable semantic caching globally | | `backend_type` | string | `"memory"` | Cache backend type (must be "memory") | -| `similarity_threshold` | float | `0.8` | Minimum similarity for cache hits (0.0-1.0) | +| `similarity_threshold` | float | `0.8` | Global minimum similarity for cache hits (0.0-1.0) | | `max_entries` | integer | `1000` | Maximum number of cached entries | | `ttl_seconds` | integer | `3600` | Time-to-live for cache entries (seconds, 0 = no expiration) | | `eviction_policy` | string | `"fifo"` | Eviction policy: `"fifo"`, `"lru"`, `"lfu"` | +### Category-Level Configuration Options + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `semantic_cache_enabled` | boolean | (inherits global) | Enable/disable caching for this category | +| `semantic_cache_similarity_threshold` | float | (inherits global) | Category-specific similarity threshold (0.0-1.0) | + +Category-level settings override global settings. If not specified, the category uses the global cache configuration. 
+ ### Environment Examples #### Development Environment diff --git a/website/docs/tutorials/semantic-cache/overview.md b/website/docs/tutorials/semantic-cache/overview.md index 93460c87..8a259f32 100644 --- a/website/docs/tutorials/semantic-cache/overview.md +++ b/website/docs/tutorials/semantic-cache/overview.md @@ -10,7 +10,15 @@ Uses embeddings and cosine similarity to match queries by meaning rather than ex ### Configurable Thresholds -Adjustable similarity thresholds balance cache hit rates with response quality. +Adjustable similarity thresholds balance cache hit rates with response quality. Thresholds can be set globally or per-category for fine-grained control. + +### Category-Level Control + +**NEW**: Configure cache settings at the category level for precise control over sensitive and general content: + +- **Sensitive categories** (health, psychology, law): Use high thresholds (0.92-0.95) to prevent incorrect cache hits where word nuances matter +- **General categories** (chat, troubleshooting): Use lower thresholds (0.75-0.82) for better cache hit rates +- **Privacy categories**: Disable caching entirely for specific categories ### Multiple Backends