Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ _run:
-f tools/make/docs.mk \
-f tools/make/linter.mk \
-f tools/make/milvus.mk \
-f tools/make/redis.mk \
-f tools/make/models.mk \
-f tools/make/pre-commit.mk \
-f tools/make/docker.mk \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ fn test_deberta_v3_invalid_path() {
#[test]
fn test_deberta_v3_debug_format() {
// Test that the Debug trait exists
let _type_check: Option<Box<dyn std::fmt::Debug>> = None::<Box<DebertaV3Classifier>>;
let _type_check: Option<Box<dyn std::fmt::Debug>> = None;
}

#[cfg(test)]
Expand Down
341 changes: 341 additions & 0 deletions config/config.redis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,341 @@
# Sentence-embedding model used for semantic similarity scoring
# (selected via semantic_cache.embedding_model: "bert")
bert_model:
  model_id: models/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true  # Global cache enabled (applies to all requests)
  backend_type: "redis"  # Using Redis vector database for semantic cache
  similarity_threshold: 0.80  # Global threshold (lowered for better matching)
  ttl_seconds: 3600
  backend_config_path: "config/semantic-cache/redis.yaml"
  # Embedding model for semantic similarity matching
  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
  # Default: "bert" (fastest, lowest memory)
  embedding_model: "bert"

# Tool-selection settings backed by a local tools database
tools:
  enabled: true
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

# Jailbreak / prompt-injection classifier
prompt_guard:
  enabled: true  # Global default - can be overridden per category with jailbreak_enabled
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# vLLM Endpoints Configuration
# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6)
# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
vllm_endpoints:
  - name: "local_vllm"
    address: "127.0.0.1"  # Local vLLM instance
    port: 8000
    weight: 1

# Per-model settings; key is the model identifier (quoted: contains '/')
model_config:
  "openai/gpt-oss-20b":
    reasoning_family: "gpt-oss"  # GPT-OSS uses reasoning_effort parameter
    preferred_endpoints: ["local_vllm"]

# Classifier configuration
classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
  pii_model:
    model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
    use_modernbert: true
    threshold: 0.7
    use_cpu: true
    pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

# Categories define domain metadata only (no routing logic)
categories:
  - name: business
    description: "Business and management related queries"
    mmlu_categories: ["business"]
  - name: law
    description: "Legal questions and law-related topics"
    mmlu_categories: ["law"]
  - name: psychology
    description: "Psychology and mental health topics"
    mmlu_categories: ["psychology"]
  - name: biology
    description: "Biology and life sciences questions"
    mmlu_categories: ["biology"]
  - name: chemistry
    description: "Chemistry and chemical sciences questions"
    mmlu_categories: ["chemistry"]
  - name: history
    description: "Historical questions and cultural topics"
    mmlu_categories: ["history"]
  - name: other
    description: "General knowledge and miscellaneous topics"
    mmlu_categories: ["other"]
  - name: health
    description: "Health and medical information queries"
    mmlu_categories: ["health"]
  - name: economics
    description: "Economics and financial topics"
    mmlu_categories: ["economics"]
  - name: math
    description: "Mathematics and quantitative reasoning"
    mmlu_categories: ["math"]
  - name: physics
    description: "Physics and physical sciences"
    mmlu_categories: ["physics"]
  - name: computer_science
    description: "Computer science and programming"
    mmlu_categories: ["computer_science"]
  - name: philosophy
    description: "Philosophy and ethical questions"
    mmlu_categories: ["philosophy"]
  - name: engineering
    description: "Engineering and technical problem-solving"
    mmlu_categories: ["engineering"]

# Decisions define routing logic with domain-based conditions
# Redis semantic cache is enabled for selected high-value categories
strategy: "priority"

decisions:
  - name: "psychology_decision"
    description: "Psychology and mental health topics - with Redis semantic cache"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "psychology"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
      - type: "semantic-cache"
        configuration:
          enabled: true
          similarity_threshold: 0.92
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  - name: "health_decision"
    description: "Health and medical information queries - with Redis semantic cache"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "health"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
      - type: "semantic-cache"
        configuration:
          enabled: true
          similarity_threshold: 0.95
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  - name: "general_decision"
    description: "General knowledge and miscellaneous topics - with Redis semantic cache"
    priority: 50
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "other"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
      - type: "semantic-cache"
        configuration:
          enabled: true
          similarity_threshold: 0.75
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  # Other categories without semantic-cache for comparison
  - name: "business_decision"
    description: "Business and management queries"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "business"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development."
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  - name: "math_decision"
    description: "Mathematics and quantitative reasoning"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "math"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: true
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  - name: "computer_science_decision"
    description: "Computer science and programming"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "computer_science"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering."
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

# Router Configuration for Dual-Path Selection
router:
  high_confidence_threshold: 0.99
  low_latency_threshold_ms: 2000
  lora_baseline_score: 0.8
  traditional_baseline_score: 0.7
  embedding_baseline_score: 0.75
  success_confidence_threshold: 0.8
  large_batch_threshold: 4
  lora_default_execution_time_ms: 1345
  traditional_default_execution_time_ms: 4567
  default_confidence_threshold: 0.95
  default_max_latency_ms: 5000
  default_batch_size: 4
  default_avg_execution_time_ms: 3000
  lora_default_confidence: 0.99
  traditional_default_confidence: 0.95
  lora_default_success_rate: 0.98
  traditional_default_success_rate: 0.95
  multi_task_lora_weight: 0.30
  single_task_traditional_weight: 0.30
  large_batch_lora_weight: 0.25
  small_batch_traditional_weight: 0.25
  medium_batch_weight: 0.10
  high_confidence_lora_weight: 0.25
  low_confidence_traditional_weight: 0.25
  low_latency_lora_weight: 0.30
  high_latency_traditional_weight: 0.10
  performance_history_weight: 0.20
  traditional_bert_confidence_threshold: 0.95
  traditional_modernbert_confidence_threshold: 0.8
  traditional_pii_detection_threshold: 0.5
  traditional_token_classification_threshold: 0.9
  traditional_dropout_prob: 0.1
  traditional_attention_dropout_prob: 0.1
  tie_break_confidence: 0.5

default_model: openai/gpt-oss-20b

# Reasoning family configurations
reasoning_families:
  deepseek:
    type: "chat_template_kwargs"
    parameter: "thinking"

  qwen3:
    type: "chat_template_kwargs"
    parameter: "enable_thinking"

  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

  gpt:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

# Global default reasoning effort level
default_reasoning_effort: high

# API Configuration
api:
  batch_classification:
    max_batch_size: 100
    concurrency_threshold: 5
    max_concurrency: 8
    # NOTE(review): metrics nested under batch_classification per upstream
    # semantic-router config layout — confirm against the consumer's schema
    metrics:
      enabled: true
      detailed_goroutine_tracking: true
      high_resolution_timing: false
      sample_rate: 1.0
      duration_buckets:
        [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

# Embedding Models Configuration
embedding_models:
  qwen3_model_path: "models/Qwen3-Embedding-0.6B"
  gemma_model_path: "models/embeddinggemma-300m"
  use_cpu: true  # Set to false for GPU acceleration (requires CUDA)

# Observability Configuration
observability:
  tracing:
    enabled: true  # Enable distributed tracing for docker-compose stack
    provider: "opentelemetry"  # Provider: opentelemetry, openinference, openllmetry
    exporter:
      type: "otlp"  # Export spans to Jaeger (via OTLP gRPC)
      endpoint: "jaeger:4317"  # Jaeger collector inside compose network
      insecure: true  # Use insecure connection (no TLS)
    sampling:
      type: "always_on"  # Sampling: always_on, always_off, probabilistic
      rate: 1.0  # Sampling rate for probabilistic (0.0-1.0)
    resource:
      service_name: "vllm-semantic-router"
      service_version: "v0.1.0"
      deployment_environment: "development"

Loading
Loading