diff --git a/Makefile b/Makefile index 52aa5e506..b53ca7e76 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,7 @@ _run: -f tools/make/docs.mk \ -f tools/make/linter.mk \ -f tools/make/milvus.mk \ + -f tools/make/redis.mk \ -f tools/make/models.mk \ -f tools/make/pre-commit.mk \ -f tools/make/docker.mk \ diff --git a/candle-binding/src/model_architectures/traditional/deberta_v3_test.rs b/candle-binding/src/model_architectures/traditional/deberta_v3_test.rs index 61d148e6e..a6e81efd1 100644 --- a/candle-binding/src/model_architectures/traditional/deberta_v3_test.rs +++ b/candle-binding/src/model_architectures/traditional/deberta_v3_test.rs @@ -29,7 +29,7 @@ fn test_deberta_v3_invalid_path() { #[test] fn test_deberta_v3_debug_format() { // Test that the Debug trait exists - let _type_check: Option> = None::>; + let _type_check: Option> = None; } #[cfg(test)] diff --git a/config/config.redis.yaml b/config/config.redis.yaml new file mode 100644 index 000000000..6b2863d99 --- /dev/null +++ b/config/config.redis.yaml @@ -0,0 +1,341 @@ +bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + +semantic_cache: + enabled: true # Global cache enabled (applies to all requests) + backend_type: "redis" # Using Redis vector database for semantic cache + similarity_threshold: 0.80 # Global threshold (lowered for better matching) + ttl_seconds: 3600 + backend_config_path: "config/semantic-cache/redis.yaml" + # Embedding model for semantic similarity matching + # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context) + # Default: "bert" (fastest, lowest memory) + embedding_model: "bert" + +tools: + enabled: true + top_k: 3 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true + +prompt_guard: + enabled: true # Global default - can be overridden per category with jailbreak_enabled + use_modernbert: true + model_id: 
"models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +# vLLM Endpoints Configuration +# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6) +# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1 +# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field) +vllm_endpoints: + - name: "local_vllm" + address: "127.0.0.1" # Local vLLM instance + port: 8000 + weight: 1 + +model_config: + "openai/gpt-oss-20b": + reasoning_family: "gpt-oss" # GPT-OSS uses reasoning_effort parameter + preferred_endpoints: ["local_vllm"] + +# Classifier configuration +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + +# Categories define domain metadata only (no routing logic) +categories: + - name: business + description: "Business and management related queries" + mmlu_categories: ["business"] + - name: law + description: "Legal questions and law-related topics" + mmlu_categories: ["law"] + - name: psychology + description: "Psychology and mental health topics" + mmlu_categories: ["psychology"] + - name: biology + description: "Biology and life sciences questions" + mmlu_categories: ["biology"] + - name: chemistry + description: "Chemistry and chemical sciences questions" + mmlu_categories: ["chemistry"] + - name: history + description: "Historical questions and cultural topics" + mmlu_categories: ["history"] + - name: other + 
description: "General knowledge and miscellaneous topics" + mmlu_categories: ["other"] + - name: health + description: "Health and medical information queries" + mmlu_categories: ["health"] + - name: economics + description: "Economics and financial topics" + mmlu_categories: ["economics"] + - name: math + description: "Mathematics and quantitative reasoning" + mmlu_categories: ["math"] + - name: physics + description: "Physics and physical sciences" + mmlu_categories: ["physics"] + - name: computer_science + description: "Computer science and programming" + mmlu_categories: ["computer_science"] + - name: philosophy + description: "Philosophy and ethical questions" + mmlu_categories: ["philosophy"] + - name: engineering + description: "Engineering and technical problem-solving" + mmlu_categories: ["engineering"] + +# Decisions define routing logic with domain-based conditions +# Redis semantic cache is enabled for selected high-value categories +strategy: "priority" + +decisions: + - name: "psychology_decision" + description: "Psychology and mental health topics - with Redis semantic cache" + priority: 100 + rules: + operator: "AND" + conditions: + - type: "domain" + name: "psychology" + modelRefs: + - model: "openai/gpt-oss-20b" + use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice." 
+ - type: "semantic-cache" + configuration: + enabled: true + similarity_threshold: 0.92 + - type: "pii" + configuration: + enabled: true + pii_types_allowed: [] + + - name: "health_decision" + description: "Health and medical information queries - with Redis semantic cache" + priority: 100 + rules: + operator: "AND" + conditions: + - type: "domain" + name: "health" + modelRefs: + - model: "openai/gpt-oss-20b" + use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies." + - type: "semantic-cache" + configuration: + enabled: true + similarity_threshold: 0.95 + - type: "pii" + configuration: + enabled: true + pii_types_allowed: [] + + - name: "general_decision" + description: "General knowledge and miscellaneous topics - with Redis semantic cache" + priority: 50 + rules: + operator: "AND" + conditions: + - type: "domain" + name: "other" + modelRefs: + - model: "openai/gpt-oss-20b" + use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics." 
+ - type: "semantic-cache" + configuration: + enabled: true + similarity_threshold: 0.75 + - type: "pii" + configuration: + enabled: true + pii_types_allowed: [] + + # Other categories without semantic-cache for comparison + - name: "business_decision" + description: "Business and management queries" + priority: 100 + rules: + operator: "AND" + conditions: + - type: "domain" + name: "business" + modelRefs: + - model: "openai/gpt-oss-20b" + use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development." + - type: "pii" + configuration: + enabled: true + pii_types_allowed: [] + + - name: "math_decision" + description: "Mathematics and quantitative reasoning" + priority: 100 + rules: + operator: "AND" + conditions: + - type: "domain" + name: "math" + modelRefs: + - model: "openai/gpt-oss-20b" + use_reasoning: true + plugins: + - type: "system_prompt" + configuration: + system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way." + - type: "pii" + configuration: + enabled: true + pii_types_allowed: [] + + - name: "computer_science_decision" + description: "Computer science and programming" + priority: 100 + rules: + operator: "AND" + conditions: + - type: "domain" + name: "computer_science" + modelRefs: + - model: "openai/gpt-oss-20b" + use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering." 
+ - type: "pii" + configuration: + enabled: true + pii_types_allowed: [] + +# Router Configuration for Dual-Path Selection +router: + high_confidence_threshold: 0.99 + low_latency_threshold_ms: 2000 + lora_baseline_score: 0.8 + traditional_baseline_score: 0.7 + embedding_baseline_score: 0.75 + success_confidence_threshold: 0.8 + large_batch_threshold: 4 + lora_default_execution_time_ms: 1345 + traditional_default_execution_time_ms: 4567 + default_confidence_threshold: 0.95 + default_max_latency_ms: 5000 + default_batch_size: 4 + default_avg_execution_time_ms: 3000 + lora_default_confidence: 0.99 + traditional_default_confidence: 0.95 + lora_default_success_rate: 0.98 + traditional_default_success_rate: 0.95 + multi_task_lora_weight: 0.30 + single_task_traditional_weight: 0.30 + large_batch_lora_weight: 0.25 + small_batch_traditional_weight: 0.25 + medium_batch_weight: 0.10 + high_confidence_lora_weight: 0.25 + low_confidence_traditional_weight: 0.25 + low_latency_lora_weight: 0.30 + high_latency_traditional_weight: 0.10 + performance_history_weight: 0.20 + traditional_bert_confidence_threshold: 0.95 + traditional_modernbert_confidence_threshold: 0.8 + traditional_pii_detection_threshold: 0.5 + traditional_token_classification_threshold: 0.9 + traditional_dropout_prob: 0.1 + traditional_attention_dropout_prob: 0.1 + tie_break_confidence: 0.5 + +default_model: openai/gpt-oss-20b + +# Reasoning family configurations +reasoning_families: + deepseek: + type: "chat_template_kwargs" + parameter: "thinking" + + qwen3: + type: "chat_template_kwargs" + parameter: "enable_thinking" + + gpt-oss: + type: "reasoning_effort" + parameter: "reasoning_effort" + gpt: + type: "reasoning_effort" + parameter: "reasoning_effort" + +# Global default reasoning effort level +default_reasoning_effort: high + +# API Configuration +api: + batch_classification: + max_batch_size: 100 + concurrency_threshold: 5 + max_concurrency: 8 + metrics: + enabled: true + detailed_goroutine_tracking: true + 
high_resolution_timing: false + sample_rate: 1.0 + duration_buckets: + [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] + size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] + +# Embedding Models Configuration +embedding_models: + qwen3_model_path: "models/Qwen3-Embedding-0.6B" + gemma_model_path: "models/embeddinggemma-300m" + use_cpu: true # Set to false for GPU acceleration (requires CUDA) + +# Observability Configuration +observability: + tracing: + enabled: true # Enable distributed tracing for docker-compose stack + provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry + exporter: + type: "otlp" # Export spans to Jaeger (via OTLP gRPC) + endpoint: "jaeger:4317" # Jaeger collector inside compose network + insecure: true # Use insecure connection (no TLS) + sampling: + type: "always_on" # Sampling: always_on, always_off, probabilistic + rate: 1.0 # Sampling rate for probabilistic (0.0-1.0) + resource: + service_name: "vllm-semantic-router" + service_version: "v0.1.0" + deployment_environment: "development" + diff --git a/config/semantic-cache/redis.yaml b/config/semantic-cache/redis.yaml new file mode 100644 index 000000000..2fc52a037 --- /dev/null +++ b/config/semantic-cache/redis.yaml @@ -0,0 +1,131 @@ +# Redis Vector Database Configuration for Semantic Cache +# This configuration file contains settings for using Redis with vector search as the semantic cache backend. +# To use this configuration: +# 1. Set backend_type: "redis" in your main config.yaml +# 2. Set backend_config_path: "config/semantic-cache/redis.yaml" in your main config.yaml +# 3. Ensure Redis server with RediSearch module is running and accessible +# 4. 
Redis Stack or Redis Enterprise with vector search capability is required + +# Redis connection settings +connection: + # Redis server host (change for production deployment) + host: "localhost" # For production: use your Redis cluster endpoint + + # Redis server port + port: 6379 # Standard Redis port + + # Database number (0-15 for standard Redis) + database: 0 + + # Password for authentication (leave empty if no auth required) + password: "" + + # Connection timeout in seconds + timeout: 30 + + # TLS/SSL configuration (recommended for production) + tls: + enabled: false # Set to true for secure connections + cert_file: "" # Path to client certificate + key_file: "" # Path to client private key + ca_file: "" # Path to CA certificate + +# Index settings for vector search +index: + # Name of the search index + name: "semantic_cache_idx" + + # Key prefix for documents in this index + prefix: "doc:" + + # Vector field configuration + vector_field: + # Name of the vector field + name: "embedding" + + # Dimension of the embeddings (auto-detected from model at runtime) + dimension: 384 # This value is ignored - dimension is auto-detected from the embedding model + + # Distance metric for similarity calculation + # Options: COSINE (cosine similarity), L2 (Euclidean distance), IP (inner product) + metric_type: "COSINE" # COSINE is recommended for semantic similarity + + # Index type and parameters + # Options: HNSW (Hierarchical Navigable Small World) or FLAT (brute force) + index_type: "HNSW" # HNSW is recommended for performance + + # Index parameters (only used when index_type is HNSW) + params: + M: 16 # Number of bi-directional links per node (default: 16) + efConstruction: 64 # Size of dynamic candidate list during construction (default: 64) + +# Search configuration +search: + # Number of top results to retrieve for similarity comparison + topk: 1 # We only need the most similar entry for cache lookup + +# Logging and monitoring +logging: + # Log level for Redis 
client operations (debug, info, warn, error) + level: "info" + + # Enable query/search logging for debugging + enable_query_log: false + + # Enable performance metrics collection + enable_metrics: true + +# Development and debugging settings +development: + # Drop index on startup (WARNING: This will delete all cached data) + drop_index_on_startup: true # Enable for development to test dynamic dimensions + + # Create index if it doesn't exist + auto_create_index: true + + # Print detailed error messages + verbose_errors: true + +# Example configurations for different environments: +# +# Local Development (Docker - Redis Stack): +# connection: +# host: "localhost" +# port: 6379 +# password: "" +# database: 0 +# development: +# drop_index_on_startup: true # Clean start for development +# auto_create_index: true +# +# Production (Redis Enterprise Cloud): +# connection: +# host: "redis-12345.c123.us-east-1-1.ec2.cloud.redislabs.com" +# port: 12345 +# password: "your-secure-password" +# database: 0 +# tls: +# enabled: true +# development: +# drop_index_on_startup: false +# auto_create_index: false # Pre-create indexes in production +# +# Kubernetes Deployment: +# connection: +# host: "redis-service.redis-system.svc.cluster.local" +# port: 6379 +# timeout: 60 # Longer timeout for cluster environments +# password: "${REDIS_PASSWORD}" # Use environment variable +# +# Performance Tuning Notes: +# - For HNSW index: +# * M (16-64): Higher M = better recall, more memory, slower indexing +# * efConstruction (64-512): Higher ef = better index quality, slower indexing +# - For metric_type: +# * COSINE: Best for semantic similarity (normalized vectors) +# * IP: Fast but requires normalized vectors +# * L2: Euclidean distance, good for non-normalized vectors +# - For index_type: +# * HNSW: Fast approximate search, recommended for >10k entries +# * FLAT: Exact search, better for <10k entries or when recall must be 100% + diff --git a/examples/redis-cache-example.go 
b/examples/redis-cache-example.go new file mode 100644 index 000000000..ba72d0322 --- /dev/null +++ b/examples/redis-cache-example.go @@ -0,0 +1,145 @@ +package main + +import ( + "fmt" + "log" + "time" + + candle_binding "github.com/vllm-project/semantic-router/candle-binding" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/cache" +) + +func main() { + // Example: Setting up Redis cache backend + fmt.Println("Redis Cache Backend Example") + fmt.Println("===========================") + + // Initialize the embedding model + fmt.Println("\n0. Initializing embedding model...") + err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true) + if err != nil { + log.Fatalf("Failed to initialize embedding model: %v", err) + } + fmt.Println("✓ Embedding model initialized") + + // Configuration for Redis cache + config := cache.CacheConfig{ + BackendType: cache.RedisCacheType, + Enabled: true, + SimilarityThreshold: 0.85, + TTLSeconds: 3600, // Entries expire after 1 hour + BackendConfigPath: "../../config/semantic-cache/redis.yaml", + } + + // Create cache backend + fmt.Println("\n1. Creating Redis cache backend...") + cacheBackend, err := cache.NewCacheBackend(config) + if err != nil { + log.Fatalf("Failed to create cache backend: %v", err) + } + defer cacheBackend.Close() + + fmt.Println("✓ Redis cache backend created successfully") + + // Example cache operations + model := "gpt-4" + query := "What is the capital of France?" + requestID := "req-12345" + requestBody := []byte(`{"model":"gpt-4","messages":[{"role":"user","content":"What is the capital of France?"}]}`) + responseBody := []byte(`{"choices":[{"message":{"content":"The capital of France is Paris."}}]}`) + + // Add entry to cache + fmt.Println("\n2. 
Adding entry to cache...") + err = cacheBackend.AddEntry(requestID, model, query, requestBody, responseBody) + if err != nil { + log.Fatalf("Failed to add entry: %v", err) + } + fmt.Println("✓ Entry added to cache") + + // Wait a moment for Redis to index the entry + time.Sleep(100 * time.Millisecond) + + // Search for similar entry + fmt.Println("\n3. Searching for similar query...") + similarQuery := "What's the capital city of France?" + cachedResponse, found, err := cacheBackend.FindSimilar(model, similarQuery) + if err != nil { + log.Fatalf("Failed to search cache: %v", err) + } + + if found { + fmt.Println("✓ Cache HIT! Found similar query") + fmt.Printf(" Cached response: %s\n", string(cachedResponse)) + } else { + fmt.Println("✗ Cache MISS - no similar query found") + } + + // Get cache statistics + fmt.Println("\n4. Cache Statistics:") + stats := cacheBackend.GetStats() + fmt.Printf(" Total Entries: %d\n", stats.TotalEntries) + fmt.Printf(" Hits: %d\n", stats.HitCount) + fmt.Printf(" Misses: %d\n", stats.MissCount) + fmt.Printf(" Hit Ratio: %.2f%%\n", stats.HitRatio*100) + + // Example with custom threshold + fmt.Println("\n5. Searching with custom threshold...") + strictQuery := "Paris is the capital of which country?" + cachedResponse, found, err = cacheBackend.FindSimilarWithThreshold(model, strictQuery, 0.75) + if err != nil { + log.Fatalf("Failed to search cache: %v", err) + } + + if found { + fmt.Println("✓ Cache HIT with threshold 0.75") + fmt.Printf(" Cached response: %s\n", string(cachedResponse)) + } else { + fmt.Println("✗ Cache MISS with threshold 0.75") + } + + // Example: Pending request workflow + fmt.Println("\n6. Pending Request Workflow:") + newRequestID := "req-67890" + newQuery := "What is machine learning?" 
+ newRequestBody := []byte(`{"model":"gpt-4","messages":[{"role":"user","content":"What is machine learning?"}]}`) + + fmt.Println(" Adding pending request...") + err = cacheBackend.AddPendingRequest(newRequestID, model, newQuery, newRequestBody) + if err != nil { + log.Fatalf("Failed to add pending request: %v", err) + } + fmt.Println(" ✓ Pending request added") + + // Wait a moment for Redis to index the entry + time.Sleep(100 * time.Millisecond) + + // Simulate getting response from LLM + newResponseBody := []byte(`{"choices":[{"message":{"content":"Machine learning is a subset of AI..."}}]}`) + + fmt.Println(" Updating with response...") + err = cacheBackend.UpdateWithResponse(newRequestID, newResponseBody) + if err != nil { + log.Fatalf("Failed to update response: %v", err) + } + fmt.Println(" ✓ Response updated") + + // Verify the entry is now cached + cachedResponse, found, err = cacheBackend.FindSimilar(model, newQuery) + if err != nil { + log.Fatalf("Failed to search cache: %v", err) + } + + if found { + fmt.Println(" ✓ Entry is now in cache and searchable") + } + + // Final statistics + fmt.Println("\n7. 
Final Statistics:") + stats = cacheBackend.GetStats() + fmt.Printf(" Total Entries: %d\n", stats.TotalEntries) + fmt.Printf(" Hits: %d\n", stats.HitCount) + fmt.Printf(" Misses: %d\n", stats.MissCount) + fmt.Printf(" Hit Ratio: %.2f%%\n", stats.HitRatio*100) + + fmt.Println("\n✓ Example completed successfully!") +} diff --git a/src/semantic-router/go.mod b/src/semantic-router/go.mod index 28d296e0f..5fa0217ae 100644 --- a/src/semantic-router/go.mod +++ b/src/semantic-router/go.mod @@ -21,6 +21,7 @@ require ( github.com/openai/openai-go v1.12.0 github.com/prometheus/client_golang v1.23.0 github.com/prometheus/client_model v0.6.2 + github.com/redis/go-redis/v9 v9.17.0 github.com/stretchr/testify v1.11.1 github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 go.opentelemetry.io/otel v1.38.0 @@ -50,6 +51,7 @@ require ( github.com/cockroachdb/logtags v0.0.0-20211118104740-dabe8e521a4f // indirect github.com/cockroachdb/redact v1.1.3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect github.com/evanphx/json-patch/v5 v5.9.0 // indirect diff --git a/src/semantic-router/go.sum b/src/semantic-router/go.sum index 7ab363ef9..c41e06bfd 100644 --- a/src/semantic-router/go.sum +++ b/src/semantic-router/go.sum @@ -12,6 +12,10 @@ github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPn github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod 
h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs= github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= @@ -43,6 +47,8 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgraph-io/badger v1.6.0/go.mod h1:zwt7syl517jmP8s94KqSxTlM6IMsdhYy6psNgSztDR4= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/eknkc/amber v0.0.0-20171010120322-cdade1c07385/go.mod h1:0vRUJqYpeSZifjYj7uP3BG/gKcuzL9xWVV/Y+cK33KM= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= @@ -272,6 +278,8 @@ github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2 github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/redis/go-redis/v9 v9.17.0 h1:K6E+ZlYN95KSMmZeEQPbU/c++wfmEvfFB17yEAq/VhM= +github.com/redis/go-redis/v9 v9.17.0/go.mod h1:u410H11HMLoB+TP67dz8rL9s6QW2j76l0//kSOd3370= 
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/rogpeppe/go-internal v1.8.1/go.mod h1:JeRgkft04UBgHMgCIwADu4Pn6Mtm5d4nPKWu0nJ5d+o= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= diff --git a/src/semantic-router/pkg/cache/cache_factory.go b/src/semantic-router/pkg/cache/cache_factory.go index aedf56bd4..36cc47a1c 100644 --- a/src/semantic-router/pkg/cache/cache_factory.go +++ b/src/semantic-router/pkg/cache/cache_factory.go @@ -51,6 +51,17 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) { } return NewMilvusCache(options) + case RedisCacheType: + logging.Debugf("Creating Redis cache backend - ConfigPath: %s, TTL: %ds, Threshold: %.3f", + config.BackendConfigPath, config.TTLSeconds, config.SimilarityThreshold) + options := RedisCacheOptions{ + Enabled: config.Enabled, + SimilarityThreshold: config.SimilarityThreshold, + TTLSeconds: config.TTLSeconds, + ConfigPath: config.BackendConfigPath, + } + return NewRedisCache(options) + case HybridCacheType: logging.Debugf("Creating Hybrid cache backend - MaxMemory: %d, TTL: %ds, Threshold: %.3f", config.MaxMemoryEntries, config.TTLSeconds, config.SimilarityThreshold) @@ -110,6 +121,16 @@ func ValidateCacheConfig(config CacheConfig) error { return fmt.Errorf("milvus config file not found: %s", config.BackendConfigPath) } logging.Debugf("Milvus config file found: %s", config.BackendConfigPath) + case RedisCacheType: + if config.BackendConfigPath == "" { + return fmt.Errorf("backend_config_path is required for Redis cache backend") + } + // Ensure the Redis configuration file exists + if _, err := os.Stat(config.BackendConfigPath); os.IsNotExist(err) { + logging.Debugf("Redis config file not found: %s", config.BackendConfigPath) + return fmt.Errorf("redis config file not found: %s", config.BackendConfigPath) + } + logging.Debugf("Redis config file found: %s", config.BackendConfigPath) } return nil @@ -162,5 
+183,18 @@ func GetAvailableCacheBackends() []CacheBackendInfo { "TTL support", }, }, + { + Type: RedisCacheType, + Name: "Redis Vector Database", + Description: "High-performance semantic cache powered by Redis with vector search", + Features: []string{ + "Fast in-memory performance", + "Persistent storage with AOF/RDB", + "Scalable with Redis Cluster", + "HNSW and FLAT indexing", + "Wide ecosystem support", + "TTL support", + }, + }, } } diff --git a/src/semantic-router/pkg/cache/cache_interface.go b/src/semantic-router/pkg/cache/cache_interface.go index c79f8cb4c..b00e6eddd 100644 --- a/src/semantic-router/pkg/cache/cache_interface.go +++ b/src/semantic-router/pkg/cache/cache_interface.go @@ -64,6 +64,9 @@ const ( // MilvusCacheType specifies the Milvus vector database backend MilvusCacheType CacheBackendType = "milvus" + // RedisCacheType specifies the Redis vector database backend + RedisCacheType CacheBackendType = "redis" + // HybridCacheType specifies the hybrid HNSW + Milvus backend HybridCacheType CacheBackendType = "hybrid" ) diff --git a/src/semantic-router/pkg/cache/cache_test.go b/src/semantic-router/pkg/cache/cache_test.go index d1f361733..be215dee5 100644 --- a/src/semantic-router/pkg/cache/cache_test.go +++ b/src/semantic-router/pkg/cache/cache_test.go @@ -275,7 +275,7 @@ development: Context("with unsupported backend type", func() { It("should return error for unsupported backend type", func() { config := CacheConfig{ - BackendType: "redis", // Unsupported + BackendType: "unsupported_type", // Unsupported Enabled: true, SimilarityThreshold: 0.8, TTLSeconds: 3600, @@ -492,7 +492,7 @@ development: It("should return information about available backends", func() { backends := GetAvailableCacheBackends() - Expect(backends).To(HaveLen(2)) // Memory and Milvus + Expect(backends).To(HaveLen(3)) // Memory, Milvus, and Redis // Check memory backend info memoryBackend := backends[0] @@ -509,6 +509,14 @@ development: 
Expect(milvusBackend.Description).To(ContainSubstring("Milvus vector database")) Expect(milvusBackend.Features).To(ContainElement("Highly scalable")) Expect(milvusBackend.Features).To(ContainElement("Persistent storage")) + + // Check Redis backend info + redisBackend := backends[2] + Expect(redisBackend.Type).To(Equal(RedisCacheType)) + Expect(redisBackend.Name).To(Equal("Redis Vector Database")) + Expect(redisBackend.Description).To(ContainSubstring("Redis with vector search")) + Expect(redisBackend.Features).To(ContainElement("Fast in-memory performance")) + Expect(redisBackend.Features).To(ContainElement("TTL support")) }) }) }) diff --git a/src/semantic-router/pkg/cache/redis_cache.go b/src/semantic-router/pkg/cache/redis_cache.go new file mode 100644 index 000000000..aac25a450 --- /dev/null +++ b/src/semantic-router/pkg/cache/redis_cache.go @@ -0,0 +1,770 @@ +package cache + +import ( + "context" + "crypto/md5" + "encoding/binary" + "fmt" + "math" + "os" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/redis/go-redis/v9" + "sigs.k8s.io/yaml" + + candle_binding "github.com/vllm-project/semantic-router/candle-binding" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability/logging" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability/metrics" +) + +// RedisConfig defines the complete configuration structure for Redis cache backend. 
// RedisConfig mirrors the YAML layout of the backend config file referenced by
// semantic_cache.backend_config_path (e.g. config/semantic-cache/redis.yaml).
// Field tags define the on-disk schema; unset fields fall back to the defaults
// applied in loadRedisConfig.
type RedisConfig struct {
	// Connection describes how to reach the Redis server.
	Connection struct {
		Host     string `json:"host" yaml:"host"`
		Port     int    `json:"port" yaml:"port"`
		Database int    `json:"database" yaml:"database"`
		Password string `json:"password" yaml:"password"`
		// Timeout, in seconds, bounds the initial Ping during startup (0 = no timeout).
		Timeout int `json:"timeout" yaml:"timeout"`
		// TLS settings are parsed but not yet applied when building the client
		// in NewRedisCache — NOTE(review): confirm whether TLS support is pending.
		TLS struct {
			Enabled  bool   `json:"enabled" yaml:"enabled"`
			CertFile string `json:"cert_file" yaml:"cert_file"`
			KeyFile  string `json:"key_file" yaml:"key_file"`
			CAFile   string `json:"ca_file" yaml:"ca_file"`
		} `json:"tls" yaml:"tls"`
	} `json:"connection" yaml:"connection"`
	// Index describes the RediSearch index used for vector similarity lookup.
	Index struct {
		Name   string `json:"name" yaml:"name"`
		Prefix string `json:"prefix" yaml:"prefix"`
		// VectorField configures the vector attribute of the index. Dimension is
		// informational only; the actual dimension is auto-detected in createIndex.
		VectorField struct {
			Name       string `json:"name" yaml:"name"`
			Dimension  int    `json:"dimension" yaml:"dimension"`
			MetricType string `json:"metric_type" yaml:"metric_type"` // L2, IP, COSINE
		} `json:"vector_field" yaml:"vector_field"`
		IndexType string `json:"index_type" yaml:"index_type"` // HNSW or FLAT
		// Params are HNSW construction parameters (ignored for FLAT indexes).
		Params struct {
			M              int `json:"M" yaml:"M"`
			EfConstruction int `json:"efConstruction" yaml:"efConstruction"`
		} `json:"params" yaml:"params"`
	} `json:"index" yaml:"index"`
	// Search holds query-time settings.
	Search struct {
		TopK int `json:"topk" yaml:"topk"`
	} `json:"search" yaml:"search"`
	// Development holds flags intended for local/dev deployments only.
	Development struct {
		DropIndexOnStartup bool `json:"drop_index_on_startup" yaml:"drop_index_on_startup"`
		AutoCreateIndex    bool `json:"auto_create_index" yaml:"auto_create_index"`
		VerboseErrors      bool `json:"verbose_errors" yaml:"verbose_errors"`
	} `json:"development" yaml:"development"`
	// Logging is parsed from the config file; NOTE(review): these fields are not
	// read anywhere in this file — confirm intended consumers.
	Logging struct {
		Level          string `json:"level" yaml:"level"`
		EnableQueryLog bool   `json:"enable_query_log" yaml:"enable_query_log"`
		EnableMetrics  bool   `json:"enable_metrics" yaml:"enable_metrics"`
	} `json:"logging" yaml:"logging"`
}

// RedisCache provides a scalable semantic cache implementation using Redis with vector search
type RedisCache struct {
	client              *redis.Client // nil when the cache is disabled (stub instance)
	config              *RedisConfig
	indexName           string
	similarityThreshold float32 // default threshold used by FindSimilar
	ttlSeconds          int     // per-entry TTL; 0 disables expiry
	enabled             bool
	hitCount            int64 // updated atomically; read under mu in GetStats
	missCount           int64 // updated atomically; read under mu in GetStats
	lastCleanupTime     *time.Time
	mu                  sync.RWMutex // guards lastCleanupTime and stats snapshotting
}

// RedisCacheOptions contains configuration parameters for Redis cache initialization
type RedisCacheOptions struct {
	SimilarityThreshold float32
	TTLSeconds          int
	Enabled             bool
	ConfigPath string // path to the RedisConfig YAML file
}

// NewRedisCache initializes a new Redis-backed semantic cache instance.
// When options.Enabled is false it returns a disabled stub whose methods are
// no-ops. Otherwise it loads the YAML config, connects (optionally bounded by
// Connection.Timeout), and ensures the search index exists.
func NewRedisCache(options RedisCacheOptions) (*RedisCache, error) {
	if !options.Enabled {
		logging.Debugf("RedisCache: disabled, returning stub")
		return &RedisCache{
			enabled: false,
		}, nil
	}

	// Load Redis configuration from file
	logging.Debugf("RedisCache: loading config from %s", options.ConfigPath)
	config, err := loadRedisConfig(options.ConfigPath)
	if err != nil {
		logging.Debugf("RedisCache: failed to load config: %v", err)
		return nil, fmt.Errorf("failed to load Redis config: %w", err)
	}
	logging.Debugf("RedisCache: config loaded - host=%s:%d, index=%s, dimension=auto-detect",
		config.Connection.Host, config.Connection.Port, config.Index.Name)

	// Establish connection to Redis server
	logging.Debugf("RedisCache: connecting to Redis at %s:%d", config.Connection.Host, config.Connection.Port)

	redisClient := redis.NewClient(&redis.Options{
		Addr:     fmt.Sprintf("%s:%d", config.Connection.Host, config.Connection.Port),
		Password: config.Connection.Password,
		DB:       config.Connection.Database,
		Protocol: 2, // Use RESP2 protocol for compatibility
	})

	// Test connection
	ctx := context.Background()
	if config.Connection.Timeout > 0 {
		timeout := time.Duration(config.Connection.Timeout) * time.Second
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, timeout)
		defer cancel()
		logging.Debugf("RedisCache: connection timeout set to %s", timeout)
	}

	if err := redisClient.Ping(ctx).Err(); err != nil {
		logging.Debugf("RedisCache: failed to connect: %v", err)
		return nil, fmt.Errorf("failed to connect to Redis: %w", err)
	}
	logging.Debugf("RedisCache: successfully connected to Redis")

	cache := &RedisCache{
		client:              redisClient,
		config:              config,
		indexName:           config.Index.Name,
		similarityThreshold: options.SimilarityThreshold,
		ttlSeconds:          options.TTLSeconds,
		enabled:             options.Enabled,
	}

	// Set up the index for vector search
	logging.Debugf("RedisCache: initializing index '%s'", config.Index.Name)
	if err := cache.initializeIndex(); err != nil {
		logging.Debugf("RedisCache: failed to initialize index: %v", err)
		redisClient.Close()
		return nil, fmt.Errorf("failed to initialize index: %w", err)
	}
	logging.Debugf("RedisCache: initialization complete")

	return cache, nil
}

// loadRedisConfig reads and parses the Redis configuration from file.
// It applies defaults for any unset index/search fields (vector field name,
// metric, index type, key prefix, HNSW params, top-k).
func loadRedisConfig(configPath string) (*RedisConfig, error) {
	if configPath == "" {
		return nil, fmt.Errorf("redis config path is required")
	}

	logging.Debugf("Loading Redis config from: %s", configPath)

	data, err := os.ReadFile(configPath)
	if err != nil {
		return nil, fmt.Errorf("failed to read config file: %w", err)
	}

	var config RedisConfig
	if err := yaml.Unmarshal(data, &config); err != nil {
		return nil, fmt.Errorf("failed to parse config file: %w", err)
	}

	logging.Debugf("Redis config loaded: index=%s, dimension=%d, metric=%s",
		config.Index.Name, config.Index.VectorField.Dimension, config.Index.VectorField.MetricType)

	// Apply defaults
	if config.Index.VectorField.Name == "" {
		config.Index.VectorField.Name = "embedding"
		logging.Warnf("VectorField.Name not specified, using default: embedding")
	}
	if config.Index.VectorField.MetricType == "" {
		config.Index.VectorField.MetricType = "COSINE"
	}
	if config.Index.IndexType == "" {
		config.Index.IndexType = "HNSW"
	}
	if config.Index.Prefix == "" {
		config.Index.Prefix = "doc:"
	}
	// Validate index params for HNSW
	if config.Index.IndexType == "HNSW" {
		if config.Index.Params.M == 0 {
			config.Index.Params.M = 16
		}
		if config.Index.Params.EfConstruction == 0 {
			config.Index.Params.EfConstruction = 64
		}
	}
	if config.Search.TopK == 0 {
		config.Search.TopK = 1
	}

	return &config, nil
}

// initializeIndex sets up the Redis index for vector search.
// It optionally drops an existing index (Development.DropIndexOnStartup) and
// creates a new one unless auto-creation is disabled.
func (c *RedisCache) initializeIndex() error {
	ctx := context.Background()

	// Check if index exists
	_, err := c.client.FTInfo(ctx, c.indexName).Result()
	indexExists := err == nil

	// Handle development mode index reset
	if c.config.Development.DropIndexOnStartup && indexExists {
		if err := c.client.FTDropIndexWithArgs(ctx, c.indexName, &redis.FTDropIndexOptions{
			DeleteDocs: true,
		}).Err(); err != nil {
			logging.Debugf("RedisCache: failed to drop index: %v", err)
			return fmt.Errorf("failed to drop index: %w", err)
		}
		indexExists = false
		logging.Debugf("RedisCache: dropped existing index '%s' for development", c.indexName)
		logging.LogEvent("index_dropped", map[string]interface{}{
			"backend": "redis",
			"index":   c.indexName,
			"reason":  "development_mode",
		})
	}

	// Create index if it doesn't exist
	if !indexExists {
		if !c.config.Development.AutoCreateIndex {
			return fmt.Errorf("index %s does not exist and auto-creation is disabled", c.indexName)
		}

		if err := c.createIndex(); err != nil {
			logging.Debugf("RedisCache: failed to create index: %v", err)
			return fmt.Errorf("failed to create index: %w", err)
		}
		logging.Debugf("RedisCache: created new index '%s' with dimension %d",
			c.indexName, c.config.Index.VectorField.Dimension)
		logging.LogEvent("index_created", map[string]interface{}{
			"backend":   "redis",
			"index":     c.indexName,
			"dimension": c.config.Index.VectorField.Dimension,
		})
	}

	return nil
}

// createIndex builds the Redis index with the appropriate schema.
// The embedding dimension is auto-detected by embedding a probe string, so the
// index always matches the active embedding model regardless of config.
func (c *RedisCache) createIndex() error {
	ctx := context.Background()

	// Determine embedding dimension automatically
	testEmbedding, err := candle_binding.GetEmbedding("test", 0)
	if err != nil {
		return fmt.Errorf("failed to detect embedding dimension: %w", err)
	}
	actualDimension := len(testEmbedding)

	logging.Debugf("RedisCache.createIndex: auto-detected embedding dimension: %d", actualDimension)

	// Determine distance metric for Redis
	var distanceMetric string
	switch c.config.Index.VectorField.MetricType {
	case "L2":
		distanceMetric = "L2"
	case "IP":
		distanceMetric = "IP"
	case "COSINE":
		distanceMetric = "COSINE"
	default:
		logging.Warnf("RedisCache: unknown metric type '%s', defaulting to COSINE", c.config.Index.VectorField.MetricType)
		distanceMetric = "COSINE"
	}

	// Create vector field arguments based on index type
	var vectorArgs *redis.FTVectorArgs
	if c.config.Index.IndexType == "HNSW" {
		vectorArgs = &redis.FTVectorArgs{
			HNSWOptions: &redis.FTHNSWOptions{
				Type:           "FLOAT32",
				Dim:            actualDimension,
				DistanceMetric: distanceMetric,
				// go-redis naming: MaxEdgesPerNode maps to HNSW "M",
				// MaxAllowedEdgesPerNode maps to "EF_CONSTRUCTION".
				MaxEdgesPerNode:        c.config.Index.Params.M,
				MaxAllowedEdgesPerNode: c.config.Index.Params.EfConstruction,
			},
		}
	} else {
		vectorArgs = &redis.FTVectorArgs{
			FlatOptions: &redis.FTFlatOptions{
				Type:           "FLOAT32",
				Dim:            actualDimension,
				DistanceMetric: distanceMetric,
			},
		}
	}

	// Create the index with proper schema
	_, err = c.client.FTCreate(ctx,
		c.indexName,
		&redis.FTCreateOptions{
			OnHash: true,
			Prefix: []interface{}{c.config.Index.Prefix},
		},
		&redis.FieldSchema{
			FieldName: "request_id",
			FieldType: redis.SearchFieldTypeText,
		},
		&redis.FieldSchema{
			FieldName: "model",
			FieldType: redis.SearchFieldTypeTag,
		},
		&redis.FieldSchema{
			FieldName: "query",
			FieldType: redis.SearchFieldTypeText,
		},
		&redis.FieldSchema{
			FieldName: "request_body",
			FieldType: redis.SearchFieldTypeText,
			NoIndex:   true, // Don't index large text fields
		},
		&redis.FieldSchema{
			FieldName: "response_body",
			FieldType: redis.SearchFieldTypeText,
			NoIndex:   true, // Don't index large text fields
		},
		&redis.FieldSchema{
			FieldName:  c.config.Index.VectorField.Name,
			FieldType:  redis.SearchFieldTypeVector,
			VectorArgs: vectorArgs,
		},
		&redis.FieldSchema{
			FieldName: "timestamp",
			FieldType: redis.SearchFieldTypeNumeric,
		},
	).Result()
	if err != nil {
		return fmt.Errorf("failed to create Redis index: %w", err)
	}

	return nil
}

// IsEnabled returns the current cache activation status
func (c *RedisCache) IsEnabled() bool {
	return c.enabled
}

// AddPendingRequest stores a request that is awaiting its response.
// The entry is written with an empty response body and is completed later by
// UpdateWithResponse keyed on requestID.
func (c *RedisCache) AddPendingRequest(requestID string, model string, query string, requestBody []byte) error {
	start := time.Now()

	if !c.enabled {
		return nil
	}

	// Store incomplete entry for later completion with response
	err := c.addEntry("", requestID, model, query, requestBody, nil)

	if err != nil {
		metrics.RecordCacheOperation("redis", "add_pending", "error", time.Since(start).Seconds())
	} else {
		metrics.RecordCacheOperation("redis", "add_pending", "success", time.Since(start).Seconds())
	}

	return err
}

// UpdateWithResponse completes a pending request by adding the response.
// It looks up the pending entry by request_id via FT.SEARCH and rewrites the
// same document (same key) with the response body attached. Returns an error
// if no pending entry exists for requestID.
func (c *RedisCache) UpdateWithResponse(requestID string, responseBody []byte) error {
	start := time.Now()

	if !c.enabled {
		return nil
	}

	logging.Debugf("RedisCache.UpdateWithResponse: updating pending entry (request_id: %s, response_size: %d)",
		requestID, len(responseBody))

	// Find the pending entry by request_id
	ctx := context.Background()

	// Search for documents with matching request_id using TEXT field syntax (exact match with quotes)
	// TAG syntax with {} doesn't work well with UUIDs containing hyphens
	query := fmt.Sprintf("@request_id:\"%s\"", requestID)
	logging.Infof("UpdateWithResponse: searching with query: %s", query)

	results, err := c.client.FTSearchWithArgs(ctx,
		c.indexName,
		query,
		&redis.FTSearchOptions{
			Return: []redis.FTSearchReturn{
				{FieldName: "model"},
				{FieldName: "query"},
				{FieldName: "request_body"},
			},
			LimitOffset: 0,
			Limit:       1,
		},
	).Result()
	if err != nil {
		logging.Infof("RedisCache.UpdateWithResponse: search failed with query '%s': %v", query, err)
		metrics.RecordCacheOperation("redis", "update_response", "error", time.Since(start).Seconds())
		return fmt.Errorf("failed to search pending entry: %w", err)
	}

	if results.Total == 0 {
		logging.Infof("RedisCache.UpdateWithResponse: no pending entry found with request_id=%s", requestID)
		metrics.RecordCacheOperation("redis", "update_response", "error", time.Since(start).Seconds())
		return fmt.Errorf("no pending entry found")
	}

	logging.Infof("UpdateWithResponse: found %d result(s) for request_id=%s", results.Total, requestID)

	doc := results.Docs[0]
	model := fmt.Sprint(doc.Fields["model"])
	queryStr := fmt.Sprint(doc.Fields["query"])
	requestBodyStr := fmt.Sprint(doc.Fields["request_body"])

	// Extract document ID from the result
	docID := doc.ID

	logging.Debugf("RedisCache.UpdateWithResponse: found pending entry, updating (id: %s, model: %s)", docID, model)

	// Update the document with response body
	err = c.addEntry(docID, requestID, model, queryStr, []byte(requestBodyStr), responseBody)
	if err != nil {
		metrics.RecordCacheOperation("redis", "update_response", "error", time.Since(start).Seconds())
		return fmt.Errorf("failed to update entry: %w", err)
	}

	logging.Debugf("RedisCache.UpdateWithResponse: successfully updated entry with response")
	metrics.RecordCacheOperation("redis", "update_response", "success", time.Since(start).Seconds())

	return nil
}

// AddEntry stores a complete request-response pair in the cache
func (c *RedisCache) AddEntry(requestID string, model string, query string, requestBody, responseBody []byte) error {
	start := time.Now()

	if !c.enabled {
		return nil
	}

	err := c.addEntry("", requestID, model, query, requestBody, responseBody)

	if err != nil {
		metrics.RecordCacheOperation("redis", "add_entry", "error", time.Since(start).Seconds())
	} else {
		metrics.RecordCacheOperation("redis", "add_entry", "success", time.Since(start).Seconds())
	}

	return err
}

// floatsToBytes converts float32 slice to byte array for Redis vector storage.
// Little-endian FLOAT32 layout, matching the "FLOAT32" vector type declared in
// createIndex.
func floatsToBytes(fs []float32) []byte {
	buf := make([]byte, len(fs)*4)
	for i, f := range fs {
		u := math.Float32bits(f)
		binary.LittleEndian.PutUint32(buf[i*4:], u)
	}
	return buf
}

// escapeRedisTagValue escapes special characters (,.-/ and space) in TAG field values for Redis queries.
// NOTE(review): other RediSearch query specials (e.g. ':', '{', '}', '|') are not
// escaped here — confirm model names can never contain them.
func escapeRedisTagValue(value string) string {
	replacer := strings.NewReplacer(
		",", "\\,",
		".", "\\.",
		"-", "\\-",
		"/", "\\/",
		" ", "\\ ",
	)
	return replacer.Replace(value)
}

// addEntry handles the internal logic for storing entries in Redis.
// If id is empty a new key is derived from md5(model, query, nanotime) — md5 is
// used only for key uniqueness, not for security. Passing an existing document
// ID overwrites that document in place (used by UpdateWithResponse). A TTL is
// applied when ttlSeconds > 0.
func (c *RedisCache) addEntry(id string, requestID string, model string, query string, requestBody, responseBody []byte) error {
	logging.Infof("addEntry called: id='%s', requestID='%s', requestBody_len=%d, responseBody_len=%d",
		id, requestID, len(requestBody), len(responseBody))

	// Generate semantic embedding for the query
	embedding, err := candle_binding.GetEmbedding(query, 0)
	if err != nil {
		return fmt.Errorf("failed to generate embedding: %w", err)
	}

	// Generate unique ID if not provided
	if id == "" {
		id = fmt.Sprintf("%x", md5.Sum(fmt.Appendf(nil, "%s_%s_%d", model, query, time.Now().UnixNano())))
	}

	ctx := context.Background()

	// Convert embedding to bytes
	embeddingBytes := floatsToBytes(embedding)

	// Prepare document key with prefix (check if already prefixed to avoid double prefix)
	var docKey string
	if strings.HasPrefix(id, c.config.Index.Prefix) {
		docKey = id // Already has prefix, use as-is
		logging.Infof("ID already has prefix, using as-is: %s", docKey)
	} else {
		docKey = c.config.Index.Prefix + id // Add prefix
		logging.Infof("Adding prefix to ID: %s -> %s", id, docKey)
	}

	responseBodyStr := string(responseBody)
	logging.Infof("Setting response_body field: len=%d, isEmpty=%v", len(responseBodyStr), responseBodyStr == "")

	// Store as Redis hash
	err = c.client.HSet(ctx,
		docKey,
		map[string]interface{}{
			"request_id":                    requestID,
			"model":                         model,
			"query":                         query,
			"request_body":                  string(requestBody),
			"response_body":                 responseBodyStr,
			c.config.Index.VectorField.Name: embeddingBytes,
			"timestamp":                     time.Now().Unix(),
		},
	).Err()
	if err != nil {
		logging.Debugf("RedisCache.addEntry: HSet failed: %v", err)
		return fmt.Errorf("failed to store cache entry: %w", err)
	}

	// Set TTL if configured
	if c.ttlSeconds > 0 {
		c.client.Expire(ctx, docKey, time.Duration(c.ttlSeconds)*time.Second)
	}

	logging.Debugf("RedisCache.addEntry: successfully added entry to Redis (key: %s, embedding_dim: %d, request_size: %d, response_size: %d)",
		docKey, len(embedding), len(requestBody), len(responseBody))
	logging.LogEvent("cache_entry_added", map[string]interface{}{
		"backend":             "redis",
		"index":               c.indexName,
		"request_id":          requestID,
		"query":               query,
		"model":               model,
		"embedding_dimension": len(embedding),
	})
	return nil
}

// FindSimilar searches for semantically similar cached requests
func (c *RedisCache) FindSimilar(model string, query string) ([]byte, bool, error) {
	return c.FindSimilarWithThreshold(model, query, c.similarityThreshold)
}

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold.
// It runs a KNN vector search filtered by model, converts the returned distance
// to a similarity score, and returns (responseBody, true, nil) only when the
// score meets the threshold and the stored response body is non-empty. Search
// failures are reported as misses (nil error) rather than propagated.
func (c *RedisCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
	start := time.Now()

	logging.Infof("FindSimilarWithThreshold ENTERED: model=%s, query='%s', threshold=%.2f", model, query, threshold)

	if !c.enabled {
		logging.Infof("FindSimilarWithThreshold: cache disabled, returning early")
		return nil, false, nil
	}

	logging.Infof("FindSimilarWithThreshold: cache enabled, generating embedding for query")

	// Generate semantic embedding for similarity comparison
	queryEmbedding, err := candle_binding.GetEmbedding(query, 0)
	if err != nil {
		metrics.RecordCacheOperation("redis", "find_similar", "error", time.Since(start).Seconds())
		return nil, false, fmt.Errorf("failed to generate embedding: %w", err)
	}

	ctx := context.Background()

	// Convert embedding to bytes for Redis query
	embeddingBytes := floatsToBytes(queryEmbedding)

	// Build KNN query with model filter (TAG fields require escaped values)
	escapedModel := escapeRedisTagValue(model)
	knnQuery := fmt.Sprintf("(@model:{%s})=>[KNN %d @%s $vec AS vector_distance]",
		escapedModel, c.config.Search.TopK, c.config.Index.VectorField.Name)

	// Execute vector search
	searchResult, err := c.client.FTSearchWithArgs(ctx,
		c.indexName,
		knnQuery,
		&redis.FTSearchOptions{
			Return: []redis.FTSearchReturn{
				{FieldName: "vector_distance"},
				{FieldName: "response_body"},
			},
			DialectVersion: 2,
			Params: map[string]interface{}{
				"vec": embeddingBytes,
			},
		},
	).Result()
	if err != nil {
		logging.Infof("RedisCache.FindSimilarWithThreshold: search failed: %v", err)
		atomic.AddInt64(&c.missCount, 1)
		metrics.RecordCacheOperation("redis", "find_similar", "error", time.Since(start).Seconds())
		metrics.RecordCacheMiss()
		return nil, false, nil
	}

	logging.Infof("RedisCache.FindSimilarWithThreshold: search returned %d results", searchResult.Total)

	if searchResult.Total == 0 {
		atomic.AddInt64(&c.missCount, 1)
		logging.Infof("RedisCache.FindSimilarWithThreshold: no entries found - cache miss")
		metrics.RecordCacheOperation("redis", "find_similar", "miss", time.Since(start).Seconds())
		metrics.RecordCacheMiss()
		return nil, false, nil
	}

	// Get best match
	bestDoc := searchResult.Docs[0]

	logging.Infof("Extracting fields from best match document...")

	// Extract distance and convert to similarity score
	// Redis returns distance, we need to convert based on metric type
	distanceVal, ok := bestDoc.Fields["vector_distance"]
	if !ok {
		logging.Infof("RedisCache.FindSimilarWithThreshold: vector_distance field not found in result")
		atomic.AddInt64(&c.missCount, 1)
		metrics.RecordCacheOperation("redis", "find_similar", "error", time.Since(start).Seconds())
		metrics.RecordCacheMiss()
		return nil, false, nil
	}

	var distance float64
	if _, err := fmt.Sscanf(fmt.Sprint(distanceVal), "%f", &distance); err != nil {
		logging.Infof("RedisCache.FindSimilarWithThreshold: failed to parse distance value: %v", err)
		atomic.AddInt64(&c.missCount, 1)
		metrics.RecordCacheOperation("redis", "find_similar", "error", time.Since(start).Seconds())
		metrics.RecordCacheMiss()
		return nil, false, nil
	}

	// Convert distance to similarity score based on metric type
	// NOTE(review): for COSINE, Redis distance d = 1 - cos_sim (range [0,2]), so
	// 1 - d/2 yields (1 + cos_sim)/2 — a rescaled score, not raw cosine
	// similarity. Configured thresholds are interpreted on this rescaled scale
	// (e.g. threshold 0.8 ≈ cosine similarity 0.6); confirm this is intended.
	var similarity float32
	switch c.config.Index.VectorField.MetricType {
	case "COSINE":
		// COSINE distance in range [0, 2], convert to similarity [0, 1]
		similarity = 1.0 - float32(distance)/2.0
	case "IP":
		// Inner product: higher is more similar, convert appropriately
		similarity = float32(distance)
	case "L2":
		// L2 distance: lower is more similar, convert to similarity
		// Assume max distance for normalization (this is dataset dependent)
		similarity = 1.0 / (1.0 + float32(distance))
	default:
		similarity = 1.0 - float32(distance)
	}

	logging.Infof("Calculated similarity=%.4f, threshold=%.4f, distance=%.4f (metric=%s)",
		similarity, threshold, distance, c.config.Index.VectorField.MetricType)

	if similarity < threshold {
		atomic.AddInt64(&c.missCount, 1)
		logging.Debugf("RedisCache.FindSimilarWithThreshold: cache miss - similarity %.4f below threshold %.4f",
			similarity, threshold)
		logging.LogEvent("cache_miss", map[string]interface{}{
			"backend":         "redis",
			"best_similarity": similarity,
			"threshold":       threshold,
			"model":           model,
			"index":           c.indexName,
		})
		metrics.RecordCacheOperation("redis", "find_similar", "miss", time.Since(start).Seconds())
		metrics.RecordCacheMiss()
		return nil, false, nil
	}

	// Extract response body from cache hit
	logging.Infof("Attempting to extract response_body field...")
	responseBodyVal, ok := bestDoc.Fields["response_body"]
	if !ok {
		logging.Infof("RedisCache.FindSimilarWithThreshold: cache hit BUT response_body field is MISSING - treating as miss")
		atomic.AddInt64(&c.missCount, 1)
		metrics.RecordCacheOperation("redis", "find_similar", "error", time.Since(start).Seconds())
		metrics.RecordCacheMiss()
		return nil, false, nil
	}

	responseBodyStr := fmt.Sprint(responseBodyVal)
	if responseBodyStr == "" {
		// Pending entries (AddPendingRequest) have empty response bodies; they
		// must not be served as hits.
		logging.Infof("RedisCache.FindSimilarWithThreshold: cache hit BUT response_body is EMPTY - treating as miss")
		atomic.AddInt64(&c.missCount, 1)
		metrics.RecordCacheOperation("redis", "find_similar", "error", time.Since(start).Seconds())
		metrics.RecordCacheMiss()
		return nil, false, nil
	}

	logging.Infof("CACHE HIT: Found cached response, similarity=%.4f, response_size=%d bytes", similarity, len(responseBodyStr))

	responseBody := []byte(responseBodyStr)

	atomic.AddInt64(&c.hitCount, 1)
	logging.Debugf("RedisCache.FindSimilarWithThreshold: cache hit - similarity=%.4f, response_size=%d bytes",
		similarity, len(responseBody))
	logging.LogEvent("cache_hit", map[string]interface{}{
		"backend":    "redis",
		"similarity": similarity,
		"threshold":  threshold,
		"model":      model,
		"index":      c.indexName,
	})
	metrics.RecordCacheOperation("redis", "find_similar", "hit", time.Since(start).Seconds())
	metrics.RecordCacheHit()
	return responseBody, true, nil
}

// Close releases all resources held by the cache
func (c *RedisCache) Close() error {
	if c.client != nil {
		return c.client.Close()
	}
	return nil
}

// GetStats provides current cache performance metrics.
// Hit/miss counters are read atomically; the total entry count is fetched live
// from FT.INFO and defaults to 0 when the cache is disabled or the query fails.
func (c *RedisCache) GetStats() CacheStats {
	c.mu.RLock()
	defer c.mu.RUnlock()

	hits := atomic.LoadInt64(&c.hitCount)
	misses := atomic.LoadInt64(&c.missCount)
	total := hits + misses

	var hitRatio float64
	if total > 0 {
		hitRatio = float64(hits) / float64(total)
	}

	// Retrieve index statistics from Redis
	totalEntries := 0
	if c.enabled && c.client != nil {
		ctx := context.Background()
		info, err := c.client.FTInfo(ctx, c.indexName).Result()
		if err == nil {
			// Extract document count from FTInfoResult
			totalEntries = info.NumDocs
			logging.Debugf("RedisCache.GetStats: index '%s' contains %d entries",
				c.indexName, totalEntries)
		} else {
			logging.Debugf("RedisCache.GetStats: failed to get index stats: %v", err)
		}
	}

	cacheStats := CacheStats{
		TotalEntries: totalEntries,
		HitCount:     hits,
		MissCount:    misses,
		HitRatio:     hitRatio,
	}

	if c.lastCleanupTime != nil {
		cacheStats.LastCleanupTime = c.lastCleanupTime
	}

	return cacheStats
}
diff --git a/src/semantic-router/pkg/extproc/processor_req_body.go b/src/semantic-router/pkg/extproc/processor_req_body.go
index 35de4bfd1..14c1160a4 100644
--- a/src/semantic-router/pkg/extproc/processor_req_body.go
+++ b/src/semantic-router/pkg/extproc/processor_req_body.go
@@ -81,9 +81,12 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
 	}
 
 	// Handle caching with decision-specific settings
+	logging.Infof("About to call handleCaching - decisionName=%s, cacheEnabled=%v", decisionName, r.Config.SemanticCache.Enabled)
 	if response, shouldReturn := r.handleCaching(ctx, decisionName); shouldReturn {
+		logging.Infof("handleCaching returned a response, returning immediately")
 		return response, nil
 	}
+	logging.Infof("handleCaching returned no cached response, continuing to model routing")
 
 	// Handle model selection and routing with pre-computed classification results and selected model
 	return r.handleModelRouting(openAIRequest, originalModel, decisionName, classificationConfidence, reasoningDecision, selectedModel, ctx)
diff --git a/src/semantic-router/pkg/extproc/req_filter_cache.go 
b/src/semantic-router/pkg/extproc/req_filter_cache.go index 7caed3144..76329d661 100644 --- a/src/semantic-router/pkg/extproc/req_filter_cache.go +++ b/src/semantic-router/pkg/extproc/req_filter_cache.go @@ -31,6 +31,9 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext, categoryName string) ( cacheEnabled = r.Config.IsCacheEnabledForDecision(categoryName) } + logging.Infof("handleCaching: requestQuery='%s' (len=%d), cacheEnabled=%v, r.Cache.IsEnabled()=%v", + requestQuery, len(requestQuery), cacheEnabled, r.Cache.IsEnabled()) + if requestQuery != "" && r.Cache.IsEnabled() && cacheEnabled { // Get decision-specific threshold threshold := r.Config.GetCacheSimilarityThreshold() @@ -38,6 +41,9 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext, categoryName string) ( threshold = r.Config.GetCacheSimilarityThresholdForDecision(categoryName) } + logging.Infof("handleCaching: Performing cache lookup - model=%s, query='%s', threshold=%.2f", + requestModel, requestQuery, threshold) + // Start cache lookup span spanCtx, span := tracing.StartSpan(ctx.TraceContext, tracing.SpanCacheLookup) defer span.End() @@ -47,6 +53,8 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext, categoryName string) ( cachedResponse, found, cacheErr := r.Cache.FindSimilarWithThreshold(requestModel, requestQuery, threshold) lookupTime := time.Since(startTime).Milliseconds() + logging.Infof("FindSimilarWithThreshold returned: found=%v, error=%v, lookupTime=%dms", found, cacheErr, lookupTime) + tracing.SetSpanAttributes(span, attribute.String(tracing.AttrCacheKey, requestQuery), attribute.Bool(tracing.AttrCacheHit, found), diff --git a/tools/make/redis.mk b/tools/make/redis.mk new file mode 100644 index 000000000..1aa382373 --- /dev/null +++ b/tools/make/redis.mk @@ -0,0 +1,178 @@ +# ======== redis.mk ======== +# = Everything For Redis = +# ======== redis.mk ======== + +##@ Redis + +# Redis container management +start-redis: ## Start Redis Stack container for testing + 
@$(LOG_TARGET) + @if $(CONTAINER_RUNTIME) ps --filter "name=redis-semantic-cache" --format "{{.Names}}" | grep -q redis-semantic-cache; then \ + echo "Redis container is already running"; \ + else \ + mkdir -p /tmp/redis-data; \ + $(CONTAINER_RUNTIME) run -d \ + --name redis-semantic-cache \ + -p 6379:6379 \ + -p 8001:8001 \ + -v /tmp/redis-data:/data \ + -e REDIS_ARGS="--save 60 1 --appendonly yes" \ + redis/redis-stack:latest; \ + echo "Waiting for Redis to be ready..."; \ + sleep 5; \ + echo "Redis should be available at localhost:6379"; \ + echo "RedisInsight UI available at http://localhost:8001"; \ + fi + +stop-redis: ## Stop and remove Redis container + @$(LOG_TARGET) + @$(CONTAINER_RUNTIME) stop redis-semantic-cache || true + @$(CONTAINER_RUNTIME) rm redis-semantic-cache || true + @echo "Redis container stopped and removed" + +restart-redis: stop-redis start-redis ## Restart Redis container + +redis-status: ## Show status of Redis container + @$(LOG_TARGET) + @if $(CONTAINER_RUNTIME) ps --filter "name=redis-semantic-cache" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | grep -q redis-semantic-cache; then \ + echo "Redis container is running:"; \ + $(CONTAINER_RUNTIME) ps --filter "name=redis-semantic-cache" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"; \ + else \ + echo "Redis container is not running"; \ + echo "Run 'make start-redis' to start it"; \ + fi + +clean-redis: stop-redis ## Clean up Redis data + @$(LOG_TARGET) + @echo "Cleaning up Redis data..." + @sudo rm -rf /tmp/redis-data || rm -rf /tmp/redis-data + @echo "Redis data directory cleaned" + +# Test semantic cache with Redis backend +test-redis-cache: start-redis rust ## Test semantic cache with Redis backend + @$(LOG_TARGET) + @echo "Testing semantic cache with Redis backend..." 
+ @export LD_LIBRARY_PATH=$${PWD}/candle-binding/target/release && \ + export SR_TEST_MODE=true && \ + cd src/semantic-router && CGO_ENABLED=1 go test -v ./pkg/cache/ -run TestRedisCache + @echo "Consider running 'make stop-redis' when done testing" + +# Test semantic-router with Redis enabled +test-semantic-router-redis: build-router start-redis ## Test semantic-router with Redis cache backend + @$(LOG_TARGET) + @echo "Testing semantic-router with Redis cache backend..." + @export LD_LIBRARY_PATH=$${PWD}/candle-binding/target/release && \ + export SR_TEST_MODE=true && \ + cd src/semantic-router && CGO_ENABLED=1 go test -v ./... + @echo "Consider running 'make stop-redis' when done testing" + +# Run Redis cache example +run-redis-example: start-redis rust ## Run the Redis cache example + @$(LOG_TARGET) + @echo "Running Redis cache example..." + @cd src/semantic-router && \ + export LD_LIBRARY_PATH=$${PWD}/../../candle-binding/target/release && \ + go run ../../examples/redis-cache-example.go + @echo "" + @echo "Example complete! Check Redis using:" + @echo " • redis-cli (command line)" + @echo " • http://localhost:8001 (RedisInsight UI)" + +# Verify Redis installation +verify-redis: start-redis ## Verify Redis installation and vector search capability + @$(LOG_TARGET) + @echo "Verifying Redis installation..." + @echo "" + @echo "1. Testing basic connectivity..." + @$(CONTAINER_RUNTIME) exec redis-semantic-cache redis-cli PING || \ + (echo "❌ Redis connectivity failed" && exit 1) + @echo "✓ Redis is responding" + @echo "" + @echo "2. Checking RediSearch module..." + @$(CONTAINER_RUNTIME) exec redis-semantic-cache redis-cli MODULE LIST | grep -q search || \ + (echo "❌ RediSearch module not found" && exit 1) + @echo "✓ RediSearch module is loaded" + @echo "" + @echo "3. Checking RedisJSON module..." 
+ @$(CONTAINER_RUNTIME) exec redis-semantic-cache redis-cli MODULE LIST | grep -q ReJSON || \ + echo "⚠ RedisJSON module not found (optional)" + @echo "" + @echo "✓ Redis Stack is ready for semantic caching!" + @echo "" + @echo "Access Redis:" + @echo " • CLI: docker exec -it redis-semantic-cache redis-cli" + @echo " • UI: http://localhost:8001" + +# Check Redis data +redis-info: ## Show Redis information and cache statistics + @$(LOG_TARGET) + @echo "Redis Server Information:" + @echo "════════════════════════════════════════" + @$(CONTAINER_RUNTIME) exec redis-semantic-cache redis-cli INFO server | grep -E "redis_version|os|process_id|uptime" + @echo "" + @echo "Memory Usage:" + @echo "════════════════════════════════════════" + @$(CONTAINER_RUNTIME) exec redis-semantic-cache redis-cli INFO memory | grep -E "used_memory_human|used_memory_peak_human" + @echo "" + @echo "Cache Statistics:" + @echo "════════════════════════════════════════" + @$(CONTAINER_RUNTIME) exec redis-semantic-cache redis-cli DBSIZE + @echo "" + @echo "Check for semantic cache index:" + @$(CONTAINER_RUNTIME) exec redis-semantic-cache redis-cli FT._LIST || echo "No indexes found" + +# Redis CLI access +redis-cli: ## Open Redis CLI for interactive commands + @$(LOG_TARGET) + @echo "Opening Redis CLI (type 'exit' to quit)..." 
+ @echo "" + @echo "Useful commands:" + @echo " KEYS doc:* - List all cached documents" + @echo " FT.INFO semantic_cache_idx - Show index info" + @echo " DBSIZE - Count total keys" + @echo " FLUSHDB - Clear all data (careful!)" + @echo "" + @$(CONTAINER_RUNTIME) exec -it redis-semantic-cache redis-cli + +# Benchmark Redis cache performance +benchmark-redis: rust start-redis ## Run Redis cache performance benchmark + @$(LOG_TARGET) + @echo "═══════════════════════════════════════════════════════════" + @echo " Redis Cache Performance Benchmark" + @echo " Testing cache operations with 1000 entries" + @echo "═══════════════════════════════════════════════════════════" + @echo "" + @mkdir -p benchmark_results/redis + @export LD_LIBRARY_PATH=$${PWD}/candle-binding/target/release && \ + export USE_CPU=$${USE_CPU:-false} && \ + export SR_BENCHMARK_MODE=true && \ + cd src/semantic-router/pkg/cache && \ + CGO_ENABLED=1 go test -v -timeout 30m \ + -run='^$$' -bench=BenchmarkRedisCache \ + -benchtime=100x -benchmem . | tee ../../../../benchmark_results/redis/results.txt + @echo "" + @echo "Benchmark complete! Results in: benchmark_results/redis/results.txt" + +# Compare Redis vs Milvus vs In-Memory +benchmark-cache-comparison: rust start-redis start-milvus ## Compare all cache backends + @$(LOG_TARGET) + @echo "═══════════════════════════════════════════════════════════" + @echo " Cache Backend Comparison Benchmark" + @echo " Testing: In-Memory, Redis, Milvus" + @echo "═══════════════════════════════════════════════════════════" + @echo "" + @mkdir -p benchmark_results/comparison + @export LD_LIBRARY_PATH=$${PWD}/candle-binding/target/release && \ + export USE_CPU=$${USE_CPU:-false} && \ + export SR_BENCHMARK_MODE=true && \ + cd src/semantic-router/pkg/cache && \ + CGO_ENABLED=1 go test -v -timeout 60m -tags=milvus \ + -run='^$$' -bench='BenchmarkCacheComparison' \ + -benchtime=50x -benchmem . 
| tee ../../../../benchmark_results/comparison/results.txt + @echo "" + @echo "Comparison complete! Results in: benchmark_results/comparison/results.txt" + @echo "" + @echo "To clean up:" + @echo " make stop-redis" + @echo " make stop-milvus" + diff --git a/website/docs/tutorials/semantic-cache/redis-cache.md b/website/docs/tutorials/semantic-cache/redis-cache.md new file mode 100644 index 000000000..89f286d08 --- /dev/null +++ b/website/docs/tutorials/semantic-cache/redis-cache.md @@ -0,0 +1,154 @@ +# Redis Semantic Cache + +The Redis cache backend provides persistent, high-performance semantic caching using Redis Stack with RediSearch. This solution offers excellent performance with lower operational complexity compared to specialized vector databases. + +## Overview + +Redis cache is ideal for: + +- **Production environments** requiring fast response times +- **Single-instance or clustered** Redis deployments +- **Medium to large-scale applications** with efficient memory usage +- **Persistent storage** with optional TTL expiration +- **Simplified operations** with familiar Redis tooling + +## Architecture + +```mermaid +graph TB + A[Client Request] --> B[Semantic Cache Instance 1] + A --> C[Semantic Cache Instance 2] + A --> D[Semantic Cache Instance N] + + B --> E[Generate Query Embedding] + C --> E + D --> E + + E --> F[Redis Stack + RediSearch] + F --> G{Similar Vector Found?} + + G -->|Hit| H[Return Cached Response] + G -->|Miss| I[Forward to LLM] + + I --> J[LLM Processing] + J --> K[Store Vector + Response in Redis] + J --> L[Return Response] + + K --> M[Persistent Storage with TTL] + H --> N[Update Hit Metrics] + + style H fill:#90EE90 + style K fill:#FFB6C1 + style M fill:#DDA0DD +``` + +## Configuration + +### Redis Backend Configuration + +Configure in `config/semantic-cache/redis.yaml`: + +```yaml +# config/semantic-cache/redis.yaml +connection: + address: "localhost:6379" + password: "" + db: 0 + pool_size: 10 + max_retries: 3 + dial_timeout_ms: 
+ 5000 + read_timeout_ms: 3000 + write_timeout_ms: 3000 + tls: + enabled: false + +index: + name: "semantic_cache_idx" + prefix: "doc:" + vector_field: + name: "embedding" + dimension: 384 # Must match embedding model dimension + algorithm: "HNSW" + metric_type: "COSINE" + hnsw: + m: 16 + ef_construction: 200 + ef_runtime: 10 + +search: + top_k: 5 + +development: + drop_index_on_startup: false + log_level: "info" +``` + +## Setup and Deployment + +### 1. Start Redis Stack + +```bash +# Using Docker +make start-redis + +# Verify Redis is running +docker exec redis-semantic-cache redis-cli PING +``` + +### 2. Configure Semantic Router + +Basic Redis Configuration: + +- Set `backend_type: "redis"` in `config/config.yaml` +- Set `backend_config_path: "config/semantic-cache/redis.yaml"` in `config/config.yaml` + +```yaml +# config/config.yaml +semantic_cache: + enabled: true + backend_type: "redis" + backend_config_path: "config/semantic-cache/redis.yaml" + similarity_threshold: 0.8 + ttl_seconds: 3600 +``` + +### 3. Run Semantic Router + +```bash +# Start router +make run-router +``` + +Run EnvoyProxy: + +```bash +# Start Envoy proxy +make run-envoy +``` + +### 4. 
Test Redis Cache + +```bash +# Send identical requests to see cache hits +curl -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "MoM", + "messages": [{"role": "user", "content": "What is machine learning?"}] + }' + +# Send similar request (should hit cache due to semantic similarity) +curl -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "MoM", + "messages": [{"role": "user", "content": "Explain machine learning"}] + }' +``` + +## Next Steps + +- **[Milvus Cache](./milvus-cache.md)** - Compare with Milvus vector database +- **[In-Memory Cache](./in-memory-cache.md)** - Compare with in-memory caching +- **[Cache Overview](./overview.md)** - Learn semantic caching concepts +- **[Observability](../observability/overview.md)** - Monitor Redis performance