Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ _run:
-f tools/make/docs.mk \
-f tools/make/linter.mk \
-f tools/make/milvus.mk \
-f tools/make/redis.mk \
-f tools/make/models.mk \
-f tools/make/pre-commit.mk \
-f tools/make/docker.mk \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ fn test_deberta_v3_invalid_path() {
#[test]
fn test_deberta_v3_debug_format() {
// Test that the Debug trait exists
let _type_check: Option<Box<dyn std::fmt::Debug>> = None::<Box<DebertaV3Classifier>>;
let _type_check: Option<Box<dyn std::fmt::Debug>> = None;
}

#[cfg(test)]
Expand Down
341 changes: 341 additions & 0 deletions config/config.redis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,341 @@
# Sentence-embedding model used for semantic similarity scoring
# (selected via semantic_cache.embedding_model: "bert")
bert_model:
  model_id: models/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true  # Global cache enabled (applies to all requests)
  backend_type: "redis"  # Using Redis vector database for semantic cache
  similarity_threshold: 0.80  # Global threshold (lowered for better matching)
  ttl_seconds: 3600
  backend_config_path: "config/semantic-cache/redis.yaml"
  # Embedding model for semantic similarity matching
  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
  # Default: "bert" (fastest, lowest memory)
  embedding_model: "bert"

# Tool-selection settings backed by a local tools database
tools:
  enabled: true
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

# Jailbreak / prompt-injection classifier
prompt_guard:
  enabled: true  # Global default - can be overridden per category with jailbreak_enabled
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# vLLM Endpoints Configuration
# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6)
# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
vllm_endpoints:
  - name: "local_vllm"
    address: "127.0.0.1"  # Local vLLM instance
    port: 8000
    weight: 1

# Per-model settings; key is the model identifier (quoted: contains '/')
model_config:
  "openai/gpt-oss-20b":
    reasoning_family: "gpt-oss"  # GPT-OSS uses reasoning_effort parameter
    preferred_endpoints: ["local_vllm"]

# Classifier configuration
classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
  pii_model:
    model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
    use_modernbert: true
    threshold: 0.7
    use_cpu: true
    pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

# Categories define domain metadata only (no routing logic)
categories:
  - name: business
    description: "Business and management related queries"
    mmlu_categories: ["business"]
  - name: law
    description: "Legal questions and law-related topics"
    mmlu_categories: ["law"]
  - name: psychology
    description: "Psychology and mental health topics"
    mmlu_categories: ["psychology"]
  - name: biology
    description: "Biology and life sciences questions"
    mmlu_categories: ["biology"]
  - name: chemistry
    description: "Chemistry and chemical sciences questions"
    mmlu_categories: ["chemistry"]
  - name: history
    description: "Historical questions and cultural topics"
    mmlu_categories: ["history"]
  - name: other
    description: "General knowledge and miscellaneous topics"
    mmlu_categories: ["other"]
  - name: health
    description: "Health and medical information queries"
    mmlu_categories: ["health"]
  - name: economics
    description: "Economics and financial topics"
    mmlu_categories: ["economics"]
  - name: math
    description: "Mathematics and quantitative reasoning"
    mmlu_categories: ["math"]
  - name: physics
    description: "Physics and physical sciences"
    mmlu_categories: ["physics"]
  - name: computer_science
    description: "Computer science and programming"
    mmlu_categories: ["computer_science"]
  - name: philosophy
    description: "Philosophy and ethical questions"
    mmlu_categories: ["philosophy"]
  - name: engineering
    description: "Engineering and technical problem-solving"
    mmlu_categories: ["engineering"]

# Decisions define routing logic with domain-based conditions
# Redis semantic cache is enabled for selected high-value categories
strategy: "priority"

decisions:
  - name: "psychology_decision"
    description: "Psychology and mental health topics - with Redis semantic cache"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "psychology"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
      - type: "semantic-cache"
        configuration:
          enabled: true
          similarity_threshold: 0.92
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  - name: "health_decision"
    description: "Health and medical information queries - with Redis semantic cache"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "health"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
      - type: "semantic-cache"
        configuration:
          enabled: true
          similarity_threshold: 0.95
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  - name: "general_decision"
    description: "General knowledge and miscellaneous topics - with Redis semantic cache"
    priority: 50
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "other"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
      - type: "semantic-cache"
        configuration:
          enabled: true
          similarity_threshold: 0.75
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  # Other categories without semantic-cache for comparison
  - name: "business_decision"
    description: "Business and management queries"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "business"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development."
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  - name: "math_decision"
    description: "Mathematics and quantitative reasoning"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "math"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: true
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

  - name: "computer_science_decision"
    description: "Computer science and programming"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "computer_science"
    modelRefs:
      - model: "openai/gpt-oss-20b"
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering."
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []

# Router Configuration for Dual-Path Selection
router:
  high_confidence_threshold: 0.99
  low_latency_threshold_ms: 2000
  lora_baseline_score: 0.8
  traditional_baseline_score: 0.7
  embedding_baseline_score: 0.75
  success_confidence_threshold: 0.8
  large_batch_threshold: 4
  lora_default_execution_time_ms: 1345
  traditional_default_execution_time_ms: 4567
  default_confidence_threshold: 0.95
  default_max_latency_ms: 5000
  default_batch_size: 4
  default_avg_execution_time_ms: 3000
  lora_default_confidence: 0.99
  traditional_default_confidence: 0.95
  lora_default_success_rate: 0.98
  traditional_default_success_rate: 0.95
  multi_task_lora_weight: 0.30
  single_task_traditional_weight: 0.30
  large_batch_lora_weight: 0.25
  small_batch_traditional_weight: 0.25
  medium_batch_weight: 0.10
  high_confidence_lora_weight: 0.25
  low_confidence_traditional_weight: 0.25
  low_latency_lora_weight: 0.30
  high_latency_traditional_weight: 0.10
  performance_history_weight: 0.20
  traditional_bert_confidence_threshold: 0.95
  traditional_modernbert_confidence_threshold: 0.8
  traditional_pii_detection_threshold: 0.5
  traditional_token_classification_threshold: 0.9
  traditional_dropout_prob: 0.1
  traditional_attention_dropout_prob: 0.1
  tie_break_confidence: 0.5

default_model: openai/gpt-oss-20b

# Reasoning family configurations
reasoning_families:
  deepseek:
    type: "chat_template_kwargs"
    parameter: "thinking"

  qwen3:
    type: "chat_template_kwargs"
    parameter: "enable_thinking"

  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

  gpt:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

# Global default reasoning effort level
default_reasoning_effort: high

# API Configuration
api:
  batch_classification:
    max_batch_size: 100
    concurrency_threshold: 5
    max_concurrency: 8
    # NOTE(review): metrics nested under batch_classification per upstream
    # semantic-router config layout — confirm against the consumer's schema
    metrics:
      enabled: true
      detailed_goroutine_tracking: true
      high_resolution_timing: false
      sample_rate: 1.0
      duration_buckets:
        [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

# Embedding Models Configuration
embedding_models:
  qwen3_model_path: "models/Qwen3-Embedding-0.6B"
  gemma_model_path: "models/embeddinggemma-300m"
  use_cpu: true  # Set to false for GPU acceleration (requires CUDA)

# Observability Configuration
observability:
  tracing:
    enabled: true  # Enable distributed tracing for docker-compose stack
    provider: "opentelemetry"  # Provider: opentelemetry, openinference, openllmetry
    exporter:
      type: "otlp"  # Export spans to Jaeger (via OTLP gRPC)
      endpoint: "jaeger:4317"  # Jaeger collector inside compose network
      insecure: true  # Use insecure connection (no TLS)
    sampling:
      type: "always_on"  # Sampling: always_on, always_off, probabilistic
      rate: 1.0  # Sampling rate for probabilistic (0.0-1.0)
    resource:
      service_name: "vllm-semantic-router"
      service_version: "v0.1.0"
      deployment_environment: "development"

Loading
Loading