From a6f5607be711b1221910735b1487d0319c1ed1b0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Oct 2025 23:07:20 +0000 Subject: [PATCH 1/7] Initial plan From 79127290c6cb705f55579fca95114325c0d98663 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Oct 2025 23:22:20 +0000 Subject: [PATCH 2/7] Add category-level cache settings: enabled and similarity_threshold Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com> --- config/config.yaml | 6 + .../pkg/cache/cache_interface.go | 5 + .../pkg/cache/inmemory_cache.go | 127 ++++++++++++++++++ src/semantic-router/pkg/cache/milvus_cache.go | 106 +++++++++++++++ src/semantic-router/pkg/config/config.go | 28 ++++ .../pkg/extproc/request_handler.go | 48 +++++-- 6 files changed, 312 insertions(+), 8 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 667e41f8..279feb67 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -74,6 +74,8 @@ categories: use_reasoning: false - name: psychology system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances model_scores: - model: qwen3 score: 0.6 @@ -98,12 +100,16 @@ categories: use_reasoning: false - name: other system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics." 
+ semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive model_scores: - model: qwen3 score: 0.7 use_reasoning: false - name: health system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes model_scores: - model: qwen3 score: 0.5 diff --git a/src/semantic-router/pkg/cache/cache_interface.go b/src/semantic-router/pkg/cache/cache_interface.go index f35e165c..fcdf0073 100644 --- a/src/semantic-router/pkg/cache/cache_interface.go +++ b/src/semantic-router/pkg/cache/cache_interface.go @@ -33,6 +33,11 @@ type CacheBackend interface { // Returns the cached response, match status, and any error FindSimilar(model string, query string) ([]byte, bool, error) + // FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold + // This allows category-specific similarity thresholds + // Returns the cached response, match status, and any error + FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) + // Close releases all resources held by the cache backend Close() error diff --git a/src/semantic-router/pkg/cache/inmemory_cache.go b/src/semantic-router/pkg/cache/inmemory_cache.go index 10386420..34e1f4aa 100644 --- a/src/semantic-router/pkg/cache/inmemory_cache.go +++ b/src/semantic-router/pkg/cache/inmemory_cache.go @@ -334,6 +334,133 @@ func (c *InMemoryCache) FindSimilar(model string, query 
string) ([]byte, bool, e return nil, false, nil } +// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold +func (c *InMemoryCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) { + start := time.Now() + + if !c.enabled { + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: cache disabled") + return nil, false, nil + } + queryPreview := query + if len(query) > 50 { + queryPreview = query[:50] + "..." + } + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f", + model, queryPreview, len(query), threshold) + + // Generate semantic embedding for similarity comparison + queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension + if err != nil { + metrics.RecordCacheOperation("memory", "find_similar", "error", time.Since(start).Seconds()) + return nil, false, fmt.Errorf("failed to generate embedding: %w", err) + } + + c.mu.RLock() + var ( + bestIndex = -1 + bestEntry CacheEntry + bestSimilarity float32 + entriesChecked int + expiredCount int + ) + // Capture the lookup time after acquiring the read lock so TTL checks aren't skewed by embedding work or lock wait + now := time.Now() + + // Compare with completed entries for the same model, tracking only the best match + for entryIndex, entry := range c.entries { + // Skip incomplete entries + if entry.ResponseBody == nil { + continue + } + + // Only consider entries for the same model + if entry.Model != model { + continue + } + + // Skip entries that have expired before considering them + if c.isExpired(entry, now) { + expiredCount++ + continue + } + + // Compute semantic similarity using dot product + var dotProduct float32 + for i := 0; i < len(queryEmbedding) && i < len(entry.Embedding); i++ { + dotProduct += queryEmbedding[i] * entry.Embedding[i] + } + + entriesChecked++ + if bestIndex == -1 || dotProduct > 
bestSimilarity { + bestSimilarity = dotProduct + bestIndex = entryIndex + } + } + // Snapshot the best entry before releasing the read lock + if bestIndex >= 0 { + bestEntry = c.entries[bestIndex] + } + + // Unlock the read lock since we need the write lock to update the access info + c.mu.RUnlock() + + // Log if any expired entries were skipped + if expiredCount > 0 { + observability.Debugf("InMemoryCache: excluded %d expired entries during search (TTL: %ds)", + expiredCount, c.ttlSeconds) + observability.LogEvent("cache_expired_entries_found", map[string]interface{}{ + "backend": "memory", + "expired_count": expiredCount, + "ttl_seconds": c.ttlSeconds, + }) + } + + // Handle case where no suitable entries exist + if bestIndex < 0 { + atomic.AddInt64(&c.missCount, 1) + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: no entries found with responses") + metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil + } + + // Check if the best match meets the similarity threshold + if bestSimilarity >= threshold { + atomic.AddInt64(&c.hitCount, 1) + + c.mu.Lock() + c.updateAccessInfo(bestIndex, bestEntry) + c.mu.Unlock() + + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", + bestSimilarity, threshold, len(bestEntry.ResponseBody)) + observability.LogEvent("cache_hit", map[string]interface{}{ + "backend": "memory", + "similarity": bestSimilarity, + "threshold": threshold, + "model": model, + }) + metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds()) + metrics.RecordCacheHit() + return bestEntry.ResponseBody, true, nil + } + + atomic.AddInt64(&c.missCount, 1) + observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)", + bestSimilarity, threshold, entriesChecked) + 
observability.LogEvent("cache_miss", map[string]interface{}{ + "backend": "memory", + "best_similarity": bestSimilarity, + "threshold": threshold, + "model": model, + "entries_checked": entriesChecked, + }) + metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil +} + // Close releases all resources held by the cache func (c *InMemoryCache) Close() error { c.mu.Lock() diff --git a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go index 4af89184..11053a09 100644 --- a/src/semantic-router/pkg/cache/milvus_cache.go +++ b/src/semantic-router/pkg/cache/milvus_cache.go @@ -591,6 +591,112 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err return responseBody, true, nil } +// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold +func (c *MilvusCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) { + start := time.Now() + + if !c.enabled { + observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache disabled") + return nil, false, nil + } + queryPreview := query + if len(query) > 50 { + queryPreview = query[:50] + "..." 
+ } + observability.Debugf("MilvusCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f", + model, queryPreview, len(query), threshold) + + // Generate semantic embedding for similarity comparison + queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension + if err != nil { + metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) + return nil, false, fmt.Errorf("failed to generate embedding: %w", err) + } + + ctx := context.Background() + + // Define search parameters + searchParam, err := entity.NewIndexHNSWSearchParam(c.config.Search.Params.Ef) + if err != nil { + return nil, false, fmt.Errorf("failed to create search parameters: %w", err) + } + + // Use Milvus Search for efficient similarity search + searchResult, err := c.client.Search( + ctx, + c.collectionName, + []string{}, + fmt.Sprintf("model == \"%s\" && response_body != \"\"", model), + []string{"response_body"}, + []entity.Vector{entity.FloatVector(queryEmbedding)}, + c.config.Collection.VectorField.Name, + entity.MetricType(c.config.Collection.VectorField.MetricType), + c.config.Search.TopK, + searchParam, + ) + if err != nil { + observability.Debugf("MilvusCache.FindSimilarWithThreshold: search failed: %v", err) + atomic.AddInt64(&c.missCount, 1) + metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil + } + + if len(searchResult) == 0 || searchResult[0].ResultCount == 0 { + atomic.AddInt64(&c.missCount, 1) + observability.Debugf("MilvusCache.FindSimilarWithThreshold: no entries found") + metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil + } + + bestScore := searchResult[0].Scores[0] + if bestScore < threshold { + atomic.AddInt64(&c.missCount, 1) + observability.Debugf("MilvusCache.FindSimilarWithThreshold: 
CACHE MISS - best_similarity=%.4f < threshold=%.4f", + bestScore, threshold) + observability.LogEvent("cache_miss", map[string]interface{}{ + "backend": "milvus", + "best_similarity": bestScore, + "threshold": threshold, + "model": model, + "collection": c.collectionName, + }) + metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil + } + + // Cache Hit + var responseBody []byte + responseBodyColumn, ok := searchResult[0].Fields[0].(*entity.ColumnVarChar) + if ok && responseBodyColumn.Len() > 0 { + responseBody = []byte(responseBodyColumn.Data()[0]) + } + + if responseBody == nil { + observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache hit but response_body is missing or not a string") + atomic.AddInt64(&c.missCount, 1) + metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil + } + + atomic.AddInt64(&c.hitCount, 1) + observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", + bestScore, threshold, len(responseBody)) + observability.LogEvent("cache_hit", map[string]interface{}{ + "backend": "milvus", + "similarity": bestScore, + "threshold": threshold, + "model": model, + "collection": c.collectionName, + }) + metrics.RecordCacheOperation("milvus", "find_similar", "hit", time.Since(start).Seconds()) + metrics.RecordCacheHit() + return responseBody, true, nil +} + // Close releases all resources held by the cache func (c *MilvusCache) Close() error { if c.client != nil { diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index e8c09e7d..8c47d071 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -364,6 +364,12 @@ type Category struct { // "replace": Replace any existing system message with the 
category-specific prompt // "insert": Prepend the category-specific prompt to the existing system message content SystemPromptMode string `yaml:"system_prompt_mode,omitempty"` + // SemanticCacheEnabled controls whether semantic caching is enabled for this category + // If nil, inherits from global SemanticCache.Enabled setting + SemanticCacheEnabled *bool `yaml:"semantic_cache_enabled,omitempty"` + // SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0) + // If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold + SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"` } // GetModelReasoningFamily returns the reasoning family configuration for a given model name @@ -782,3 +788,25 @@ func (c *RouterConfig) GetCategoryByName(name string) *Category { } return nil } + +// IsCacheEnabledForCategory returns whether semantic caching is enabled for a specific category +// If the category has an explicit setting, it takes precedence; otherwise, uses global setting +func (c *RouterConfig) IsCacheEnabledForCategory(categoryName string) bool { + category := c.GetCategoryByName(categoryName) + if category != nil && category.SemanticCacheEnabled != nil { + return *category.SemanticCacheEnabled + } + // Fall back to global setting + return c.SemanticCache.Enabled +} + +// GetCacheSimilarityThresholdForCategory returns the effective cache similarity threshold for a category +// Priority: category-specific > global semantic_cache > bert_model threshold +func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName string) float32 { + category := c.GetCategoryByName(categoryName) + if category != nil && category.SemanticCacheSimilarityThreshold != nil { + return *category.SemanticCacheSimilarityThreshold + } + // Fall back to global cache threshold or bert threshold + return c.GetCacheSimilarityThreshold() +} diff --git 
a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 8d375ba6..e90f4745 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -401,8 +401,24 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo return response, nil } - // Handle caching - if response, shouldReturn := r.handleCaching(ctx); shouldReturn { + // Classify the request early to determine category for cache settings + var categoryName string + if r.Config != nil && r.Config.IsAutoModelName(originalModel) && (len(nonUserMessages) > 0 || userContent != "") { + // Determine text to use for classification + var classificationText string + if len(userContent) > 0 { + classificationText = userContent + } else if len(nonUserMessages) > 0 { + classificationText = strings.Join(nonUserMessages, " ") + } + if classificationText != "" { + categoryName = r.findCategoryForClassification(classificationText) + observability.Debugf("Classified request to category: %s", categoryName) + } + } + + // Handle caching with category-specific settings + if response, shouldReturn := r.handleCaching(ctx, categoryName); shouldReturn { return response, nil } @@ -476,8 +492,8 @@ func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent st return nil, false } -// handleCaching handles cache lookup and storage -func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingResponse, bool) { +// handleCaching handles cache lookup and storage with category-specific settings +func (r *OpenAIRouter) handleCaching(ctx *RequestContext, categoryName string) (*ext_proc.ProcessingResponse, bool) { // Extract the model and query for cache lookup requestModel, requestQuery, err := cache.ExtractQueryFromOpenAIRequest(ctx.OriginalRequestBody) if err != nil { @@ -489,20 +505,34 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext) 
(*ext_proc.ProcessingR ctx.RequestModel = requestModel ctx.RequestQuery = requestQuery - if requestQuery != "" && r.Cache.IsEnabled() { + // Check if caching is enabled for this category + cacheEnabled := r.Config.SemanticCache.Enabled + if categoryName != "" { + cacheEnabled = r.Config.IsCacheEnabledForCategory(categoryName) + } + + if requestQuery != "" && r.Cache.IsEnabled() && cacheEnabled { + // Get category-specific threshold + threshold := r.Config.GetCacheSimilarityThreshold() + if categoryName != "" { + threshold = r.Config.GetCacheSimilarityThresholdForCategory(categoryName) + } + // Start cache lookup span spanCtx, span := observability.StartSpan(ctx.TraceContext, observability.SpanCacheLookup) defer span.End() startTime := time.Now() - // Try to find a similar cached response - cachedResponse, found, cacheErr := r.Cache.FindSimilar(requestModel, requestQuery) + // Try to find a similar cached response using category-specific threshold + cachedResponse, found, cacheErr := r.Cache.FindSimilarWithThreshold(requestModel, requestQuery, threshold) lookupTime := time.Since(startTime).Milliseconds() observability.SetSpanAttributes(span, attribute.String(observability.AttrCacheKey, requestQuery), attribute.Bool(observability.AttrCacheHit, found), - attribute.Int64(observability.AttrCacheLookupTimeMs, lookupTime)) + attribute.Int64(observability.AttrCacheLookupTimeMs, lookupTime), + attribute.String(observability.AttrCategoryName, categoryName), + attribute.Float64("cache.threshold", float64(threshold))) if cacheErr != nil { observability.Errorf("Error searching cache: %v", cacheErr) @@ -515,6 +545,8 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingR "request_id": ctx.RequestID, "model": requestModel, "query": requestQuery, + "category": categoryName, + "threshold": threshold, }) // Return immediate response from cache response := http.CreateCacheHitResponse(cachedResponse, ctx.ExpectStreamingResponse) From 
9bff917c36b91fc2493eab940775a54817b3bccb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Oct 2025 23:26:16 +0000 Subject: [PATCH 3/7] Add comprehensive tests for category-level cache settings Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com> --- src/semantic-router/pkg/config/config.go | 5 + src/semantic-router/pkg/config/config_test.go | 157 ++++++++++++++++++ .../pkg/extproc/caching_test.go | 55 ++++++ 3 files changed, 217 insertions(+) diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index 8c47d071..9766d473 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -425,6 +425,11 @@ func BoolPtr(b bool) *bool { return &b } +// Float32Ptr returns a pointer to a float32 value (helper for tests and config) +func Float32Ptr(f float32) *float32 { + return &f +} + // validateConfigStructure performs additional validation on the parsed config func validateConfigStructure(cfg *RouterConfig) error { // Ensure all categories have at least one model with scores diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 657d18f1..8a34f399 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -1753,4 +1753,161 @@ default_model: "test-model" }) }) }) + + Describe("Category-Level Cache Settings", func() { + Context("with category-specific cache configuration", func() { + It("should use category-specific cache enabled setting", func() { + yamlContent := ` +bert_model: + model_id: "test-model" + threshold: 0.7 + +semantic_cache: + enabled: true + similarity_threshold: 0.8 + +categories: + - name: health + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false + - name: general + semantic_cache_enabled: 
false + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false + - name: other + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false +` + var cfg config.RouterConfig + err := yaml.Unmarshal([]byte(yamlContent), &cfg) + Expect(err).NotTo(HaveOccurred()) + + // Test category-specific enabled settings + Expect(cfg.IsCacheEnabledForCategory("health")).To(BeTrue()) + Expect(cfg.IsCacheEnabledForCategory("general")).To(BeFalse()) + // "other" should fall back to global setting + Expect(cfg.IsCacheEnabledForCategory("other")).To(BeTrue()) + // Unknown category should also fall back to global + Expect(cfg.IsCacheEnabledForCategory("unknown")).To(BeTrue()) + }) + + It("should use category-specific similarity thresholds", func() { + yamlContent := ` +bert_model: + model_id: "test-model" + threshold: 0.7 + +semantic_cache: + enabled: true + similarity_threshold: 0.8 + +categories: + - name: health + semantic_cache_similarity_threshold: 0.95 + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false + - name: psychology + semantic_cache_similarity_threshold: 0.92 + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false + - name: other + semantic_cache_similarity_threshold: 0.75 + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false + - name: general + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false +` + var cfg config.RouterConfig + err := yaml.Unmarshal([]byte(yamlContent), &cfg) + Expect(err).NotTo(HaveOccurred()) + + // Test category-specific thresholds + Expect(cfg.GetCacheSimilarityThresholdForCategory("health")).To(Equal(float32(0.95))) + Expect(cfg.GetCacheSimilarityThresholdForCategory("psychology")).To(Equal(float32(0.92))) + Expect(cfg.GetCacheSimilarityThresholdForCategory("other")).To(Equal(float32(0.75))) + // "general" should fall back to global semantic_cache threshold + 
Expect(cfg.GetCacheSimilarityThresholdForCategory("general")).To(Equal(float32(0.8))) + // Unknown category should also fall back + Expect(cfg.GetCacheSimilarityThresholdForCategory("unknown")).To(Equal(float32(0.8))) + }) + + It("should fall back to bert threshold when semantic_cache threshold is not set", func() { + yamlContent := ` +bert_model: + model_id: "test-model" + threshold: 0.6 + +semantic_cache: + enabled: true + +categories: + - name: test + model_scores: + - model: test-model + score: 1.0 + use_reasoning: false +` + var cfg config.RouterConfig + err := yaml.Unmarshal([]byte(yamlContent), &cfg) + Expect(err).NotTo(HaveOccurred()) + + // Should fall back to bert_model.threshold + Expect(cfg.GetCacheSimilarityThresholdForCategory("test")).To(Equal(float32(0.6))) + Expect(cfg.GetCacheSimilarityThreshold()).To(Equal(float32(0.6))) + }) + + It("should handle nil pointers for optional cache settings", func() { + category := config.Category{ + Name: "test", + ModelScores: []config.ModelScore{ + {Model: "test", Score: 1.0, UseReasoning: config.BoolPtr(false)}, + }, + } + + cfg := &config.RouterConfig{ + SemanticCache: struct { + BackendType string `yaml:"backend_type,omitempty"` + Enabled bool `yaml:"enabled"` + SimilarityThreshold *float32 `yaml:"similarity_threshold,omitempty"` + MaxEntries int `yaml:"max_entries,omitempty"` + TTLSeconds int `yaml:"ttl_seconds,omitempty"` + EvictionPolicy string `yaml:"eviction_policy,omitempty"` + BackendConfigPath string `yaml:"backend_config_path,omitempty"` + }{ + Enabled: true, + SimilarityThreshold: config.Float32Ptr(0.8), + }, + BertModel: struct { + ModelID string `yaml:"model_id"` + Threshold float32 `yaml:"threshold"` + UseCPU bool `yaml:"use_cpu"` + }{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + // Nil values should use defaults + Expect(cfg.IsCacheEnabledForCategory("test")).To(BeTrue()) + Expect(cfg.GetCacheSimilarityThresholdForCategory("test")).To(Equal(float32(0.8))) + }) + }) + 
}) }) diff --git a/src/semantic-router/pkg/extproc/caching_test.go b/src/semantic-router/pkg/extproc/caching_test.go index fcd4dc79..5f345b79 100644 --- a/src/semantic-router/pkg/extproc/caching_test.go +++ b/src/semantic-router/pkg/extproc/caching_test.go @@ -250,4 +250,59 @@ var _ = Describe("Caching Functionality", func() { Expect(response.GetRequestBody().Response.Status).To(Equal(ext_proc.CommonResponse_CONTINUE)) }) }) + + Describe("Category-Specific Caching", func() { + It("should use category-specific cache settings", func() { + // Create a config with category-specific cache settings + cfg := CreateTestConfig() + cfg.SemanticCache.Enabled = true + cfg.SemanticCache.SimilarityThreshold = config.Float32Ptr(0.8) + + // Add categories with different cache settings + cfg.Categories = []config.Category{ + { + Name: "health", + ModelScores: []config.ModelScore{ + {Model: "model-a", Score: 1.0, UseReasoning: config.BoolPtr(false)}, + }, + SemanticCacheEnabled: config.BoolPtr(true), + SemanticCacheSimilarityThreshold: config.Float32Ptr(0.95), + }, + { + Name: "general", + ModelScores: []config.ModelScore{ + {Model: "model-a", Score: 1.0, UseReasoning: config.BoolPtr(false)}, + }, + SemanticCacheEnabled: config.BoolPtr(false), + SemanticCacheSimilarityThreshold: config.Float32Ptr(0.7), + }, + } + + // Verify category cache settings are correct + Expect(cfg.IsCacheEnabledForCategory("health")).To(BeTrue()) + Expect(cfg.IsCacheEnabledForCategory("general")).To(BeFalse()) + Expect(cfg.GetCacheSimilarityThresholdForCategory("health")).To(Equal(float32(0.95))) + Expect(cfg.GetCacheSimilarityThresholdForCategory("general")).To(Equal(float32(0.7))) + }) + + It("should fall back to global settings when category doesn't specify", func() { + cfg := CreateTestConfig() + cfg.SemanticCache.Enabled = true + cfg.SemanticCache.SimilarityThreshold = config.Float32Ptr(0.8) + + // Add category without cache settings + cfg.Categories = []config.Category{ + { + Name: "test", + 
ModelScores: []config.ModelScore{ + {Model: "model-a", Score: 1.0, UseReasoning: config.BoolPtr(false)}, + }, + }, + } + + // Should use global settings + Expect(cfg.IsCacheEnabledForCategory("test")).To(BeTrue()) + Expect(cfg.GetCacheSimilarityThresholdForCategory("test")).To(Equal(float32(0.8))) + }) + }) }) From 11324ddc2d47f6e26ea41f69769a9281a0015ed4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Oct 2025 23:45:48 +0000 Subject: [PATCH 4/7] Update config files and documentation for category-level cache settings - Updated 7 config YAML files (development, production, testing, e2e, and 3 recipes) with commented examples of category-level cache settings - Added comprehensive documentation section explaining category-level cache configuration - Updated semantic cache overview and in-memory cache docs with category-level examples - Added best practices for threshold selection and privacy considerations Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com> --- config/config.development.yaml | 3 + config/config.e2e.yaml | 9 ++ config/config.production.yaml | 6 + config/config.recipe-accuracy.yaml | 6 + config/config.recipe-latency.yaml | 3 + config/config.recipe-token-efficiency.yaml | 3 + config/config.testing.yaml | 3 + website/docs/installation/configuration.md | 141 +++++++++++++++++- .../semantic-cache/in-memory-cache.md | 54 ++++++- .../docs/tutorials/semantic-cache/overview.md | 10 +- 10 files changed, 231 insertions(+), 7 deletions(-) diff --git a/config/config.development.yaml b/config/config.development.yaml index 86458928..9c03ecdc 100644 --- a/config/config.development.yaml +++ b/config/config.development.yaml @@ -47,6 +47,9 @@ classifier: categories: - name: test system_prompt: "You are a test assistant." 
+ # Example: Category-level cache settings + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.85 model_scores: - model: test-model score: 1.0 diff --git a/config/config.e2e.yaml b/config/config.e2e.yaml index b588849f..60362cc7 100644 --- a/config/config.e2e.yaml +++ b/config/config.e2e.yaml @@ -107,6 +107,9 @@ categories: score: 0.4 use_reasoning: false - name: psychology + # Example: Strict cache threshold for psychology - clinical nuances matter + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.92 model_scores: - model: "Model-A" score: 0.6 @@ -156,6 +159,9 @@ categories: score: 0.4 use_reasoning: false - name: other + # Example: Lower threshold for general queries - better cache hit rate + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.75 model_scores: - model: "Model-B" score: 0.8 @@ -168,6 +174,9 @@ categories: score: 0.6 use_reasoning: false - name: health + # Example: Very strict cache threshold for health - word changes matter medically + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.95 model_scores: - model: "Model-B" score: 0.8 diff --git a/config/config.production.yaml b/config/config.production.yaml index 9c4dd4f8..2651a4a7 100644 --- a/config/config.production.yaml +++ b/config/config.production.yaml @@ -60,12 +60,18 @@ classifier: categories: - name: math system_prompt: "You are a mathematics expert. Provide step-by-step solutions." + # Example: High threshold for math - precision matters + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.92 model_scores: - model: openai/gpt-oss-20b score: 1.0 use_reasoning: true - name: other system_prompt: "You are a helpful assistant." 
+ # Example: Lower threshold for general queries - more cache hits + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.75 model_scores: - model: openai/gpt-oss-20b score: 0.7 diff --git a/config/config.recipe-accuracy.yaml b/config/config.recipe-accuracy.yaml index 584b0291..96bd258b 100644 --- a/config/config.recipe-accuracy.yaml +++ b/config/config.recipe-accuracy.yaml @@ -87,6 +87,9 @@ categories: use_reasoning: true # Enable reasoning for legal analysis - name: psychology system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice." + # Category-level cache override (if global cache is enabled) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.92 # Strict for clinical nuances model_scores: - model: openai/gpt-oss-20b score: 1.0 @@ -117,6 +120,9 @@ categories: use_reasoning: false # Default queries don't need reasoning - name: health system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies." 
+ # Category-level cache override (if global cache is enabled) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.95 # Very strict - medical accuracy critical model_scores: - model: openai/gpt-oss-20b score: 1.0 diff --git a/config/config.recipe-latency.yaml b/config/config.recipe-latency.yaml index ce31a36f..56a4bf29 100644 --- a/config/config.recipe-latency.yaml +++ b/config/config.recipe-latency.yaml @@ -105,6 +105,9 @@ categories: use_reasoning: false - name: other system_prompt: "Provide helpful responses." + # Category-level cache (optional, already enabled globally with low threshold) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.65 # Even lower for general queries model_scores: - model: openai/gpt-oss-20b score: 0.7 diff --git a/config/config.recipe-token-efficiency.yaml b/config/config.recipe-token-efficiency.yaml index 49008db5..16a71f53 100644 --- a/config/config.recipe-token-efficiency.yaml +++ b/config/config.recipe-token-efficiency.yaml @@ -110,6 +110,9 @@ categories: use_reasoning: false - name: other system_prompt: "You are a helpful assistant. Provide concise, accurate responses." 
+ # Category-level cache (optional, already enabled globally) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.7 # Match global or slightly lower model_scores: - model: openai/gpt-oss-20b score: 0.7 diff --git a/config/config.testing.yaml b/config/config.testing.yaml index 91722f56..8e4b631f 100644 --- a/config/config.testing.yaml +++ b/config/config.testing.yaml @@ -42,6 +42,9 @@ model_config: categories: - name: other + # Category-level cache settings (optional - falls back to global if not set) + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.8 model_scores: - model: openai/gpt-oss-20b score: 0.7 diff --git a/website/docs/installation/configuration.md b/website/docs/installation/configuration.md index 340ad847..37d0f8e0 100644 --- a/website/docs/installation/configuration.md +++ b/website/docs/installation/configuration.md @@ -23,7 +23,7 @@ bert_model: semantic_cache: backend_type: "memory" # Options: "memory" or "milvus" enabled: false - similarity_threshold: 0.8 + similarity_threshold: 0.8 # Global default threshold max_entries: 1000 ttl_seconds: 3600 eviction_policy: "fifo" # Options: "fifo", "lru", "lfu" @@ -81,6 +81,9 @@ categories: - model: your-model score: 1.0 use_reasoning: true # Enable reasoning for math problems + # Optional: Category-level cache settings + # semantic_cache_enabled: true + # semantic_cache_similarity_threshold: 0.9 # Higher threshold for math - name: computer science model_scores: - model: your-model @@ -91,6 +94,7 @@ categories: - model: your-model score: 0.8 use_reasoning: false # No reasoning for general queries + # semantic_cache_similarity_threshold: 0.75 # Lower threshold for general queries default_model: your-model @@ -457,13 +461,38 @@ Configure additional features: ```yaml # Semantic Caching semantic_cache: - enabled: true # Enable semantic caching + enabled: true # Enable semantic caching globally backend_type: "memory" # Options: "memory" or "milvus" - 
similarity_threshold: 0.8 # Cache hit threshold + similarity_threshold: 0.8 # Global default cache hit threshold max_entries: 1000 # Maximum cache entries ttl_seconds: 3600 # Cache expiration time eviction_policy: "fifo" # Options: "fifo", "lru", "lfu" +# Category-Level Cache Configuration (New) +# Override global cache settings for specific categories +categories: + - name: health + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 # Very strict - medical accuracy critical + model_scores: + - model: your-model + score: 0.5 + use_reasoning: false + + - name: general_chat + semantic_cache_similarity_threshold: 0.75 # Relaxed for better cache hits + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false + + - name: troubleshooting + # No cache settings - uses global default (0.8) + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false + # Tool Auto-Selection tools: enabled: true # Enable automatic tool selection @@ -604,6 +633,112 @@ batch_size_ranges: Access metrics at: `http://localhost:9190/metrics` +## Category-Level Cache Configuration + +**NEW**: Configure semantic cache settings at the category level for fine-grained control over caching behavior. + +### Why Use Category-Level Cache Settings? + +Different categories have different tolerance for semantic variations: + +- **Sensitive categories** (health, psychology, law): Small word changes can have significant meaning differences. Require high similarity thresholds (0.92-0.95). +- **General categories** (chat, troubleshooting): Less sensitive to minor wording changes. Can use lower thresholds (0.75-0.82) for better cache hit rates. +- **Privacy categories**: May need caching disabled entirely for compliance or security reasons. 
+ +### Configuration Examples + +#### Example 1: Mixed Thresholds for Different Categories + +```yaml +semantic_cache: + enabled: true + backend_type: "memory" + similarity_threshold: 0.8 # Global default + +categories: + - name: health + system_prompt: "You are a health expert..." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 # Very strict - "headache" vs "severe headache" = different + model_scores: + - model: your-model + score: 0.5 + use_reasoning: false + + - name: psychology + system_prompt: "You are a psychology expert..." + semantic_cache_similarity_threshold: 0.92 # Strict - clinical nuances matter + model_scores: + - model: your-model + score: 0.6 + use_reasoning: false + + - name: general_chat + system_prompt: "You are a helpful assistant..." + semantic_cache_similarity_threshold: 0.75 # Relaxed - "how's the weather" = "what's the weather" + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false + + - name: troubleshooting + system_prompt: "You are a tech support expert..." + # No cache settings - uses global threshold of 0.8 + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false +``` + +#### Example 2: Disable Cache for Sensitive Data + +```yaml +categories: + - name: personal_data + system_prompt: "Handle personal information..." + semantic_cache_enabled: false # Disable cache entirely for privacy + model_scores: + - model: your-model + score: 0.8 + use_reasoning: false +``` + +### Configuration Options + +**Category-Level Fields:** + +- `semantic_cache_enabled` (optional, boolean): Enable/disable caching for this category. If not specified, inherits from global `semantic_cache.enabled`. +- `semantic_cache_similarity_threshold` (optional, float 0.0-1.0): Minimum similarity score for cache hits in this category. If not specified, inherits from global `semantic_cache.similarity_threshold`. + +**Fallback Hierarchy:** + +1. Category-specific `semantic_cache_similarity_threshold` (if set) +2. 
Global `semantic_cache.similarity_threshold` (if set) +3. `bert_model.threshold` (final fallback) + +### Best Practices + +**Threshold Selection:** + +- **High precision (0.92-0.95)**: health, psychology, law, finance +- **Medium precision (0.85-0.90)**: technical documentation, education +- **Lower precision (0.75-0.82)**: general chat, FAQs, troubleshooting + +**Privacy and Compliance:** + +- Disable caching (`semantic_cache_enabled: false`) for categories handling: + - Personally identifiable information (PII) + - Financial data + - Health records + - Sensitive business information + +**Performance Tuning:** + +- Start with conservative (higher) thresholds +- Monitor cache hit rates per category +- Lower thresholds for categories with low hit rates +- Raise thresholds for categories with incorrect cache hits + ## Common Configuration Examples ### Enable All Security Features diff --git a/website/docs/tutorials/semantic-cache/in-memory-cache.md b/website/docs/tutorials/semantic-cache/in-memory-cache.md index 56214c87..4ba99a8a 100644 --- a/website/docs/tutorials/semantic-cache/in-memory-cache.md +++ b/website/docs/tutorials/semantic-cache/in-memory-cache.md @@ -44,23 +44,71 @@ graph TB semantic_cache: enabled: true backend_type: "memory" - similarity_threshold: 0.8 + similarity_threshold: 0.8 # Global default threshold max_entries: 1000 ttl_seconds: 3600 eviction_policy: "fifo" ``` +### Category-Level Configuration (New) + +Configure cache settings per category for fine-grained control: + +```yaml +semantic_cache: + enabled: true + backend_type: "memory" + similarity_threshold: 0.8 # Global default + max_entries: 1000 + ttl_seconds: 3600 + eviction_policy: "fifo" + +categories: + - name: health + system_prompt: "You are a health expert..." 
+ semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 # Very strict for medical accuracy + model_scores: + - model: your-model + score: 0.5 + use_reasoning: false + + - name: general_chat + system_prompt: "You are a helpful assistant..." + semantic_cache_similarity_threshold: 0.75 # Relaxed for better hit rate + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false + + - name: troubleshooting + # No cache settings - uses global default (0.8) + model_scores: + - model: your-model + score: 0.7 + use_reasoning: false +``` + ### Configuration Options | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `enabled` | boolean | `false` | Enable/disable semantic caching | +| `enabled` | boolean | `false` | Enable/disable semantic caching globally | | `backend_type` | string | `"memory"` | Cache backend type (must be "memory") | -| `similarity_threshold` | float | `0.8` | Minimum similarity for cache hits (0.0-1.0) | +| `similarity_threshold` | float | `0.8` | Global minimum similarity for cache hits (0.0-1.0) | | `max_entries` | integer | `1000` | Maximum number of cached entries | | `ttl_seconds` | integer | `3600` | Time-to-live for cache entries (seconds, 0 = no expiration) | | `eviction_policy` | string | `"fifo"` | Eviction policy: `"fifo"`, `"lru"`, `"lfu"` | +### Category-Level Configuration Options + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `semantic_cache_enabled` | boolean | (inherits global) | Enable/disable caching for this category | +| `semantic_cache_similarity_threshold` | float | (inherits global) | Category-specific similarity threshold (0.0-1.0) | + +Category-level settings override global settings. If not specified, the category uses the global cache configuration. 
+ ### Environment Examples #### Development Environment diff --git a/website/docs/tutorials/semantic-cache/overview.md b/website/docs/tutorials/semantic-cache/overview.md index 93460c87..8a259f32 100644 --- a/website/docs/tutorials/semantic-cache/overview.md +++ b/website/docs/tutorials/semantic-cache/overview.md @@ -10,7 +10,15 @@ Uses embeddings and cosine similarity to match queries by meaning rather than ex ### Configurable Thresholds -Adjustable similarity thresholds balance cache hit rates with response quality. +Adjustable similarity thresholds balance cache hit rates with response quality. Thresholds can be set globally or per-category for fine-grained control. + +### Category-Level Control + +**NEW**: Configure cache settings at the category level for precise control over sensitive and general content: + +- **Sensitive categories** (health, psychology, law): Use high thresholds (0.92-0.95) to prevent incorrect cache hits where word nuances matter +- **General categories** (chat, troubleshooting): Use lower thresholds (0.75-0.82) for better cache hit rates +- **Privacy categories**: Disable caching entirely for specific categories ### Multiple Backends From 1b7e9ae232650a2f1048fc28ec688bdf04be5fb4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 21 Oct 2025 00:13:11 +0000 Subject: [PATCH 5/7] Remove duplicate code in FindSimilar functions Refactored FindSimilar() to delegate to FindSimilarWithThreshold() with default threshold instead of duplicating the entire implementation. This eliminates 226 lines of duplicate code across inmemory_cache.go and milvus_cache.go. 
Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com> --- .../pkg/cache/inmemory_cache.go | 126 +----------------- src/semantic-router/pkg/cache/milvus_cache.go | 103 +------------- 2 files changed, 3 insertions(+), 226 deletions(-) diff --git a/src/semantic-router/pkg/cache/inmemory_cache.go b/src/semantic-router/pkg/cache/inmemory_cache.go index 34e1f4aa..5820c5f8 100644 --- a/src/semantic-router/pkg/cache/inmemory_cache.go +++ b/src/semantic-router/pkg/cache/inmemory_cache.go @@ -207,131 +207,9 @@ func (c *InMemoryCache) AddEntry(requestID string, model string, query string, r return nil } -// FindSimilar searches for semantically similar cached requests +// FindSimilar searches for semantically similar cached requests using the default threshold func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, error) { - start := time.Now() - - if !c.enabled { - observability.Debugf("InMemoryCache.FindSimilar: cache disabled") - return nil, false, nil - } - queryPreview := query - if len(query) > 50 { - queryPreview = query[:50] + "..." 
- } - observability.Debugf("InMemoryCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)", - model, queryPreview, len(query)) - - // Generate semantic embedding for similarity comparison - queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension - if err != nil { - metrics.RecordCacheOperation("memory", "find_similar", "error", time.Since(start).Seconds()) - return nil, false, fmt.Errorf("failed to generate embedding: %w", err) - } - - c.mu.RLock() - var ( - bestIndex = -1 - bestEntry CacheEntry - bestSimilarity float32 - entriesChecked int - expiredCount int - ) - // Capture the lookup time after acquiring the read lock so TTL checks aren’t skewed by embedding work or lock wait - now := time.Now() - - // Compare with completed entries for the same model, tracking only the best match - for entryIndex, entry := range c.entries { - // Skip incomplete entries - if entry.ResponseBody == nil { - continue - } - - // Only consider entries for the same model - if entry.Model != model { - continue - } - - // Skip entries that have expired before considering them - if c.isExpired(entry, now) { - expiredCount++ - continue - } - - // Compute semantic similarity using dot product - var dotProduct float32 - for i := 0; i < len(queryEmbedding) && i < len(entry.Embedding); i++ { - dotProduct += queryEmbedding[i] * entry.Embedding[i] - } - - entriesChecked++ - if bestIndex == -1 || dotProduct > bestSimilarity { - bestSimilarity = dotProduct - bestIndex = entryIndex - } - } - // Snapshot the best entry before releasing the read lock - if bestIndex >= 0 { - bestEntry = c.entries[bestIndex] - } - - // Unlock the read lock since we need the write lock to update the access info - c.mu.RUnlock() - - // Log if any expired entries were skipped - if expiredCount > 0 { - observability.Debugf("InMemoryCache: excluded %d expired entries during search (TTL: %ds)", - expiredCount, c.ttlSeconds) - observability.LogEvent("cache_expired_entries_found", 
map[string]interface{}{ - "backend": "memory", - "expired_count": expiredCount, - "ttl_seconds": c.ttlSeconds, - }) - } - - // Handle case where no suitable entries exist - if bestIndex < 0 { - atomic.AddInt64(&c.missCount, 1) - observability.Debugf("InMemoryCache.FindSimilar: no entries found with responses") - metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds()) - metrics.RecordCacheMiss() - return nil, false, nil - } - - // Check if the best match meets the similarity threshold - if bestSimilarity >= c.similarityThreshold { - atomic.AddInt64(&c.hitCount, 1) - - c.mu.Lock() - c.updateAccessInfo(bestIndex, bestEntry) - c.mu.Unlock() - - observability.Debugf("InMemoryCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", - bestSimilarity, c.similarityThreshold, len(bestEntry.ResponseBody)) - observability.LogEvent("cache_hit", map[string]interface{}{ - "backend": "memory", - "similarity": bestSimilarity, - "threshold": c.similarityThreshold, - "model": model, - }) - metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds()) - metrics.RecordCacheHit() - return bestEntry.ResponseBody, true, nil - } - - atomic.AddInt64(&c.missCount, 1) - observability.Debugf("InMemoryCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)", - bestSimilarity, c.similarityThreshold, entriesChecked) - observability.LogEvent("cache_miss", map[string]interface{}{ - "backend": "memory", - "best_similarity": bestSimilarity, - "threshold": c.similarityThreshold, - "model": model, - "entries_checked": entriesChecked, - }) - metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds()) - metrics.RecordCacheMiss() - return nil, false, nil + return c.FindSimilarWithThreshold(model, query, c.similarityThreshold) } // FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold diff --git 
a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go index 11053a09..372c0656 100644 --- a/src/semantic-router/pkg/cache/milvus_cache.go +++ b/src/semantic-router/pkg/cache/milvus_cache.go @@ -487,108 +487,7 @@ func (c *MilvusCache) addEntry(id string, requestID string, model string, query // FindSimilar searches for semantically similar cached requests func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, error) { - start := time.Now() - - if !c.enabled { - observability.Debugf("MilvusCache.FindSimilar: cache disabled") - return nil, false, nil - } - queryPreview := query - if len(query) > 50 { - queryPreview = query[:50] + "..." - } - observability.Debugf("MilvusCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)", - model, queryPreview, len(query)) - - // Generate semantic embedding for similarity comparison - queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension - if err != nil { - metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) - return nil, false, fmt.Errorf("failed to generate embedding: %w", err) - } - - ctx := context.Background() - - // Define search parameters - searchParam, err := entity.NewIndexHNSWSearchParam(c.config.Search.Params.Ef) - if err != nil { - return nil, false, fmt.Errorf("failed to create search parameters: %w", err) - } - - // Use Milvus Search for efficient similarity search - searchResult, err := c.client.Search( - ctx, - c.collectionName, - []string{}, - fmt.Sprintf("model == \"%s\" && response_body != \"\"", model), - []string{"response_body"}, - []entity.Vector{entity.FloatVector(queryEmbedding)}, - c.config.Collection.VectorField.Name, - entity.MetricType(c.config.Collection.VectorField.MetricType), - c.config.Search.TopK, - searchParam, - ) - if err != nil { - observability.Debugf("MilvusCache.FindSimilar: search failed: %v", err) - atomic.AddInt64(&c.missCount, 1) - 
metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) - metrics.RecordCacheMiss() - return nil, false, nil - } - - if len(searchResult) == 0 || searchResult[0].ResultCount == 0 { - atomic.AddInt64(&c.missCount, 1) - observability.Debugf("MilvusCache.FindSimilar: no entries found") - metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds()) - metrics.RecordCacheMiss() - return nil, false, nil - } - - bestScore := searchResult[0].Scores[0] - if bestScore < c.similarityThreshold { - atomic.AddInt64(&c.missCount, 1) - observability.Debugf("MilvusCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f", - bestScore, c.similarityThreshold) - observability.LogEvent("cache_miss", map[string]interface{}{ - "backend": "milvus", - "best_similarity": bestScore, - "threshold": c.similarityThreshold, - "model": model, - "collection": c.collectionName, - }) - metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds()) - metrics.RecordCacheMiss() - return nil, false, nil - } - - // Cache Hit - var responseBody []byte - responseBodyColumn, ok := searchResult[0].Fields[0].(*entity.ColumnVarChar) - if ok && responseBodyColumn.Len() > 0 { - responseBody = []byte(responseBodyColumn.Data()[0]) - } - - if responseBody == nil { - observability.Debugf("MilvusCache.FindSimilar: cache hit but response_body is missing or not a string") - atomic.AddInt64(&c.missCount, 1) - metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) - metrics.RecordCacheMiss() - return nil, false, nil - } - - atomic.AddInt64(&c.hitCount, 1) - observability.Debugf("MilvusCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", - bestScore, c.similarityThreshold, len(responseBody)) - observability.LogEvent("cache_hit", map[string]interface{}{ - "backend": "milvus", - "similarity": bestScore, - "threshold": c.similarityThreshold, - 
"model": model, - "collection": c.collectionName, - }) - metrics.RecordCacheOperation("milvus", "find_similar", "hit", time.Since(start).Seconds()) - metrics.RecordCacheHit() - return responseBody, true, nil + return c.FindSimilarWithThreshold(model, query, c.similarityThreshold) } // FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold From ae39fe2dcf07a5eedbd77f62c6e010ebeabb7d10 Mon Sep 17 00:00:00 2001 From: Huamin Chen Date: Mon, 20 Oct 2025 20:52:34 -0400 Subject: [PATCH 6/7] Update src/semantic-router/pkg/extproc/request_handler.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/semantic-router/pkg/extproc/request_handler.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index e90f4745..1203c29c 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -411,10 +411,8 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo } else if len(nonUserMessages) > 0 { classificationText = strings.Join(nonUserMessages, " ") } - if classificationText != "" { - categoryName = r.findCategoryForClassification(classificationText) - observability.Debugf("Classified request to category: %s", categoryName) - } + categoryName = r.findCategoryForClassification(classificationText) + observability.Debugf("Classified request to category: %s", categoryName) } // Handle caching with category-specific settings From 06a32e3806c5d13ed783f04858b4e98f77ac6259 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 21 Oct 2025 01:04:14 +0000 Subject: [PATCH 7/7] Revert changes from unsigned commit ae39fe2 Restored the classificationText empty check that was removed in the previous commit. 
Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com> --- src/semantic-router/pkg/extproc/request_handler.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 1203c29c..e90f4745 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -411,8 +411,10 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo } else if len(nonUserMessages) > 0 { classificationText = strings.Join(nonUserMessages, " ") } - categoryName = r.findCategoryForClassification(classificationText) - observability.Debugf("Classified request to category: %s", categoryName) + if classificationText != "" { + categoryName = r.findCategoryForClassification(classificationText) + observability.Debugf("Classified request to category: %s", categoryName) + } } // Handle caching with category-specific settings