benchmarks/_plotter_helper.py: 2 changes (1 addition & 1 deletion)
@@ -19,7 +19,7 @@ def convert_to_dataframe_from_benchmark(benchmark: "Benchmark") -> tuple:
"tn_list": benchmark.tn_list,
"fn_list": benchmark.fn_list,
"latency_direct_list": benchmark.latency_direct_list,
"latency_vectorq_list": benchmark.latency_vcach_list,
"latency_vectorq_list": benchmark.latency_vcache_list,
}
df = pd.DataFrame(data)

benchmarks/benchmark.py: 89 changes (40 additions & 49 deletions)
@@ -33,6 +33,7 @@
HNSWLibVectorDB,
SimilarityMetricType,
)
from vcache.vcache_core.similarity_evaluator import SimilarityEvaluator
from vcache.vcache_core.similarity_evaluator.strategies.llm_comparison import (
LLMComparisonSimilarityEvaluator,
)
@@ -101,7 +102,7 @@ class Dataset(Enum):
ECOMMERCE_DATASET = "ecommerce_dataset"


class GenerateResultsOnly(Enum):
class GeneratePlotsOnly(Enum):
YES = True
NO = False

@@ -112,62 +113,37 @@ class GenerateResultsOnly(Enum):


MAX_SAMPLES: int = 60000
CONFIDENCE_INTERVALS_ITERATIONS: int = 5
IS_LLM_JUDGE_BENCHMARK: bool = False
DISABLE_PROGRESS_BAR: bool = True
CONFIDENCE_INTERVALS_ITERATIONS: int = 2
DISABLE_PROGRESS_BAR: bool = False
KEEP_SPLIT: int = 100

RUN_COMBINATIONS: List[
Tuple[EmbeddingModel, LargeLanguageModel, Dataset, GenerateResultsOnly]
Tuple[EmbeddingModel, LargeLanguageModel, Dataset, GeneratePlotsOnly, SimilarityEvaluator]
] = [
(
EmbeddingModel.GTE,
LargeLanguageModel.LLAMA_3_8B,
Dataset.SEM_BENCHMARK_SEARCH_QUERIES,
GenerateResultsOnly.YES,
Dataset.SEM_BENCHMARK_CLASSIFICATION,
GeneratePlotsOnly.NO,
StringComparisonSimilarityEvaluator(),
),
(
EmbeddingModel.GTE,
LargeLanguageModel.GPT_4O_MINI,
Dataset.SEM_BENCHMARK_ARENA,
GenerateResultsOnly.YES,
),
(
EmbeddingModel.E5_LARGE_V2,
LargeLanguageModel.GPT_4O_MINI,
Dataset.SEM_BENCHMARK_ARENA,
GenerateResultsOnly.YES,
),
(
EmbeddingModel.E5_LARGE_V2,
LargeLanguageModel.LLAMA_3_8B,
Dataset.SEM_BENCHMARK_CLASSIFICATION,
GenerateResultsOnly.YES,
),
(
EmbeddingModel.GTE,
LargeLanguageModel.LLAMA_3_8B,
Dataset.SEM_BENCHMARK_CLASSIFICATION,
GenerateResultsOnly.YES,
),
(
EmbeddingModel.GTE,
LargeLanguageModel.LLAMA_3_70B,
Dataset.SEM_BENCHMARK_CLASSIFICATION,
GenerateResultsOnly.YES,
GeneratePlotsOnly.NO,
LLMComparisonSimilarityEvaluator(),
),
]

BASELINES_TO_RUN: List[Baseline] = [
# Baseline.IID,
# Baseline.GPTCache,
# Baseline.VCacheLocal,
Baseline.VCacheLocal,
# Baseline.BerkeleyEmbedding,
# Baseline.VCacheBerkeleyEmbedding,
]

DATASETS_TO_RUN: List[str] = [Dataset.SEM_BENCHMARK_SEARCH_QUERIES]

STATIC_THRESHOLDS: List[float] = [
0.80,
0.81,
@@ -220,7 +196,7 @@ def stats_set_up(self):
self.tn_list: List[int] = []
self.fn_list: List[int] = []
self.latency_direct_list: List[float] = []
self.latency_vcach_list: List[float] = []
self.latency_vcache_list: List[float] = []
self.observations_dict: Dict[str, Dict[str, float]] = {}
self.gammas_dict: Dict[str, float] = {}
self.t_hats_dict: Dict[str, float] = {}
@@ -465,7 +441,7 @@ def dump_results_to_json(self):
"tn_list": self.tn_list,
"fn_list": self.fn_list,
"latency_direct_list": self.latency_direct_list,
"latency_vectorq_list": self.latency_vcach_list,
"latency_vectorq_list": self.latency_vcache_list,
"observations_dict": self.observations_dict,
"gammas_dict": self.gammas_dict,
"t_hats_dict": self.t_hats_dict,
@@ -498,12 +474,8 @@ def __run_baseline(
timestamp: str,
delta: float,
threshold: float,
similarity_evaluator: SimilarityEvaluator,
):
if IS_LLM_JUDGE_BENCHMARK:
similarity_evaluator = LLMComparisonSimilarityEvaluator()
else:
similarity_evaluator = StringComparisonSimilarityEvaluator()

vcache_config: VCacheConfig = VCacheConfig(
inference_engine=BenchmarkInferenceEngine(),
embedding_engine=BenchmarkEmbeddingEngine(),
@@ -547,8 +519,15 @@ def main():

timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")

for embedding_model, llm_model, dataset, generate_results_only in RUN_COMBINATIONS:
for (
embedding_model,
llm_model,
dataset,
generate_plots_only,
similarity_evaluator,
) in RUN_COMBINATIONS:
try:
print(f"DatasetPath: {datasets_dir}, Dataset: {dataset.value}")
dataset_file = os.path.join(datasets_dir, f"{dataset.value}.json")
logging.info(
f"Running benchmark for dataset: {dataset}, embedding model: {embedding_model.value[1]}, LLM model: {llm_model.value[1]}\n"
@@ -557,7 +536,10 @@

#####################################################
### Baseline: vCache Local
if Baseline.VCacheLocal in BASELINES_TO_RUN and not generate_results_only:
if (
Baseline.VCacheLocal in BASELINES_TO_RUN
and not generate_plots_only.value
):
for delta in DELTAS:
for i in range(0, CONFIDENCE_INTERVALS_ITERATIONS):
path = os.path.join(
@@ -583,11 +565,15 @@
timestamp=timestamp,
delta=delta,
threshold=-1,
similarity_evaluator=similarity_evaluator,
)

#####################################################
### Baseline: vCache Global
if Baseline.VCacheGlobal in BASELINES_TO_RUN and not generate_results_only:
if (
Baseline.VCacheGlobal in BASELINES_TO_RUN
and not generate_plots_only.value
):
for delta in DELTAS:
path = os.path.join(
results_dir,
@@ -612,13 +598,14 @@
timestamp=timestamp,
delta=delta,
threshold=-1,
similarity_evaluator=similarity_evaluator,
)

#####################################################
### Baseline: Berkeley Embedding
if (
Baseline.BerkeleyEmbedding in BASELINES_TO_RUN
and not generate_results_only
and not generate_plots_only.value
):
for threshold in STATIC_THRESHOLDS:
if embedding_model == EmbeddingModel.E5_MISTRAL_7B:
Expand Down Expand Up @@ -658,13 +645,14 @@ def main():
timestamp=timestamp,
delta=-1,
threshold=threshold,
similarity_evaluator=similarity_evaluator,
)

#####################################################
### Baseline: vCache + Berkeley Embedding
if (
Baseline.VCacheBerkeleyEmbedding in BASELINES_TO_RUN
and not generate_results_only
and not generate_plots_only.value
):
for delta in DELTAS:
for i in range(0, CONFIDENCE_INTERVALS_ITERATIONS):
Expand Down Expand Up @@ -707,11 +695,12 @@ def main():
timestamp=timestamp,
delta=delta,
threshold=-1,
similarity_evaluator=similarity_evaluator,
)

#####################################################
### Baseline: IID Local
if Baseline.IID in BASELINES_TO_RUN and not generate_results_only:
if Baseline.IID in BASELINES_TO_RUN and not generate_plots_only.value:
for delta in DELTAS:
for i in range(0, CONFIDENCE_INTERVALS_ITERATIONS):
path = os.path.join(
@@ -737,11 +726,12 @@
timestamp=timestamp,
delta=delta,
threshold=-1,
similarity_evaluator=similarity_evaluator,
)

#####################################################
### Baseline: GPTCache
if Baseline.GPTCache in BASELINES_TO_RUN and not generate_results_only:
if Baseline.GPTCache in BASELINES_TO_RUN and not generate_plots_only.value:
for threshold in STATIC_THRESHOLDS:
path = os.path.join(
results_dir,
@@ -764,6 +754,7 @@
timestamp=timestamp,
delta=-1,
threshold=threshold,
similarity_evaluator=similarity_evaluator,
)

#####################################################
tests/ReadMe.md: 4 changes (4 additions & 0 deletions)
@@ -25,7 +25,11 @@ vCache includes both **unit tests** and **integration tests** to ensure correctness
Unit tests verify the **logic of individual module strategies** (e.g., caching policies, embedding engines, similarity evaluators) in isolation.
They are designed to be fast, deterministic, and independent of external services.

#### Running Unit Tests

```bash
python -m pytest tests/unit/
```
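You can also narrow the run during development; the flags below are standard pytest options, and the keyword shown is only an example:

```bash
# Verbose output for the whole unit suite
python -m pytest tests/unit/ -v

# Run only tests whose names match a keyword expression
python -m pytest tests/unit/ -k "similarity"
```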

### Integration Tests

tests/integration/test_concurrency.py: 114 changes (114 additions & 0 deletions)
@@ -0,0 +1,114 @@
import random
import time
import unittest
from concurrent.futures import ThreadPoolExecutor
from unittest.mock import MagicMock, patch

from dotenv import load_dotenv

from vcache import (
DynamicLocalThresholdPolicy,
HNSWLibVectorDB,
InMemoryEmbeddingMetadataStorage,
LangChainEmbeddingEngine,
StringComparisonSimilarityEvaluator,
VCache,
VCacheConfig,
)
from vcache.vcache_policy.strategies.dynamic_local_threshold import _Action

load_dotenv()


class TestConcurrency(unittest.TestCase):
def test_async_label_generation_and_timeout(self):
similarity_evaluator = StringComparisonSimilarityEvaluator()

mock_answers_similar = MagicMock()

def answers_similar(a, b):
if "Return 'xxxxxxxxx' as the answer" in a:
time.sleep(10)
print(f"Answers Similar (Execution time: 10s) => a: {a}, b: {b}\n")
return True
else:
execution_time = random.uniform(0.5, 3)
time.sleep(execution_time)
print(
f"Answers Similar (Execution time: {execution_time}s) => a: {a}, b: {b}\n"
)
return True

mock_answers_similar.side_effect = answers_similar

config = VCacheConfig(
embedding_engine=LangChainEmbeddingEngine(
model_name="sentence-transformers/all-mpnet-base-v2"
),
vector_db=HNSWLibVectorDB(),
embedding_metadata_storage=InMemoryEmbeddingMetadataStorage(),
similarity_evaluator=similarity_evaluator,
)

with DynamicLocalThresholdPolicy(delta=0.05) as policy:
vcache: VCache = VCache(config, policy)
vcache.vcache_policy.setup(config)

with (
patch.object(
policy.similarity_evaluator,
"answers_similar",
new=mock_answers_similar,
),
patch.object(
policy.bayesian, "select_action", return_value=_Action.EXPLORE
),
):
initial_prompt = "What is the capital of Germany?"
vcache.infer(prompt=initial_prompt)

concurrent_prompts_chunk_1 = [
"What is the capital of Germany?Germany's capital?",
"Capital of Germany is...",
"Return 'xxxxxxxxx' as the answer", # This is the slow prompt
"Berlin is the capital of what country?",
]
concurrent_prompts_chunk_2 = [
"Which city is the seat of the German government?",
"What is Germany's primary city?",
"Tell me about Berlin.",
"Is Frankfurt the capital of Germany?",
"What's the main city of Germany?",
"Where is the German government located?",
]

def do_inference(prompt):
prompt_index = total_prompts.index(prompt)
print(f"Inferring prompt {prompt_index}: {prompt}\n")
vcache.infer(prompt=prompt)

total_prompts = concurrent_prompts_chunk_1 + concurrent_prompts_chunk_2
with ThreadPoolExecutor(max_workers=len(total_prompts)) as executor:
executor.map(do_inference, concurrent_prompts_chunk_1)
time.sleep(1.5)
executor.map(do_inference, concurrent_prompts_chunk_2)

all_metadata_objects = vcache.vcache_config.embedding_metadata_storage.get_all_embedding_metadata_objects()
final_observation_count = len(all_metadata_objects)

for i, metadata_object in enumerate(all_metadata_objects):
print(f"metadata_object {i}: {metadata_object}")

print(f"\nfinal_observation_count: {final_observation_count}")

assert final_observation_count == 1, (
f"Expected 1 metadata object, got {final_observation_count}"
)
# We expect the 'slow prompt' to be the only prompt that is not part of the observations
assert len(all_metadata_objects[0].observations) == 12, (
f"Expected 12 observations (10 + 2 initial labels), got {len(all_metadata_objects[0].observations)}"
)


if __name__ == "__main__":
unittest.main()
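To run just this new concurrency test on its own (a sketch, assuming the file path above and an environment with the project's test dependencies installed):

```bash
python -m pytest tests/integration/test_concurrency.py -v
```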
File renamed without changes.