### Imports

In [None]:
import hnswlib
import numpy as np
import time
from collections import defaultdict
from sklearn.cluster import KMeans


##### HNSW Filtering Benchmark Notebook

This notebook performs benchmarking of inline HNSW (Hierarchical Navigable Small World) filtering performance using two approaches:

##### 1. Uniform Distribution (`LabelFilteredANNEvaluator`)
- Tests filtering on uniformly distributed labels (a, b, c)
- Equal distribution (~33.3% each)

##### 2. Skewed Distribution (`LabelFilteredANNEvaluator1`)
- Same evaluation but with skewed label distribution:
  - Label a: 60%
  - Label b: 30%
  - Label c: 10%

##### Key Metrics Evaluated
- Build Time
- Query Latency (filtered vs unfiltered)
- Filter Specificity
- Recall Scores
- Filter Friction (Latency Overhead and Recall Impact)

The inline filtering function takes Id and returns True or False.

Results are printed showing performance metrics for both distribution strategies, allowing comparison of filtering effectiveness under different data distributions.

In [108]:
class LabelFilteredANNEvaluator:
    """
    Design Metrics for Filtered ANN Search:
    1. Query Latency:
       - Measures search time with/without filters
       - Compares overhead of filtering
    2. Accuracy Impact:
       - Recall@k: proportion of true nearest neighbors found
       - How filtering affects quality of results
    3. Filter Friction:
       - Filter specificity: proportion of points passing filter
       - Impact of label distribution on performance
    """
    def __init__(self, dim=16, num_elements=3000):
        self.dim = dim
        self.num_elements = num_elements
        self.metrics = defaultdict(list)

    def generate_uniform_labeled_data(self):
        """Generate uniform data with three labels distributed equally"""
        self.data = np.float32(np.random.random((self.num_elements, self.dim)))
        
        num_per_label = self.num_elements // 3
        self.labels = np.array(['a'] * num_per_label + 
                             ['b'] * num_per_label + 
                             ['c'] * (self.num_elements - 2 * num_per_label))
        p = np.random.permutation(len(self.data))
        self.data = self.data[p]
        self.labels = self.labels[p]
        unique, counts = np.unique(self.labels, return_counts=True)
        self.metrics['label_distribution'] = dict(zip(unique, counts / len(self.labels)))
        return self.data, self.labels
 
    def build_index(self):
        """Build HNSW index with the generated data"""
        self.index = hnswlib.Index(space='cosine', dim=self.dim)
        self.index.init_index(max_elements=self.num_elements, ef_construction=100, M=16)
        self.index.set_ef(20)
        self.index.set_num_threads(1)
        start_time = time.time()
        self.index.add_items(self.data, ids=np.arange(self.num_elements))
        build_time = time.time() - start_time
        self.metrics['build_time'] = build_time

    def create_label_filter(self, target_label):
        """Create filter function for a specific label"""
        def filter_function(idx):
            return self.labels[idx] == target_label
        return filter_function
    
    def calculate_recall(self, filtered_results, true_results, query_points, target_label, k):
        """
    Calculate recall@k for filtered nearest neighbor search results.
    
    Args:
        filtered_results: Results from filtered knn search (n_queries x k)
        true_results: Results from unfiltered knn search (n_queries x k)
        query_points: Query points used for search (n_queries x dim)
        target_label: Label to filter for
        k: Number of nearest neighbors
    
    Returns:
        float: Average recall@k across all queries
        """
        recall = 0
        n_queries = len(query_points)
    
        target_mask = self.labels == target_label
        target_data = self.data[target_mask]
        target_indices = np.where(target_mask)[0]
    
        for i in range(n_queries):
            distances = np.linalg.norm(target_data - query_points[i], axis=1)
            true_neighbor_indices = target_indices[np.argsort(distances)[:k]]
        
            filtered_neighbor_indices = filtered_results[i]
        
            intersection = set(filtered_neighbor_indices) & set(true_neighbor_indices)
            recall += len(intersection) / k
    
        return recall / n_queries

    def evaluate_query_performance(self, num_queries=100, k=10):
        """Evaluate query performance with comprehensive metrics"""

        query_points = np.float32(np.random.random((num_queries, self.dim)))
        
        #  Unfiltered
        start_time = time.time()
        unfiltered_labels, unfiltered_distances = self.index.knn_query(query_points, k=k, num_threads=1)
        unfiltered_time = time.time() - start_time
        
        filter_times = {}
        recall_scores = {}
        filter_specificity = {}
        result_counts = {}
        
        # Per each label metrics
        for label in ['a', 'b', 'c']:
            filter_func = self.create_label_filter(label)
            
            # Latency
            start_time = time.time()
            filtered_labels, filtered_distances = self.index.knn_query(
                query_points, k=k, num_threads=1, filter=filter_func
            )
            filter_time = time.time() - start_time
            filter_times[label] = filter_time / num_queries
            # total_requested = len(self.labels)
            # matching_label = np.sum(self.labels == label)
            # result_counts[label] = matching_label / total_requested
            
            # Filter Specificity
            points_passing_filter = sum(filter_func(i) for i in range(self.num_elements))
            filter_specificity[label] = points_passing_filter / self.num_elements
            
            # Accuracy Impact (Recall)
            recall_scores[label] = self.calculate_recall(
    filtered_labels, 
    unfiltered_labels,
    query_points, 
    label,         
    k              
)

        self.metrics['query_latency'] = {
            'unfiltered': unfiltered_time / num_queries,
            'filtered': filter_times
        }
        # self.metrics['filter_specificity'] = result_counts
        self.metrics['filter_specificity'] = filter_specificity
        self.metrics['recall_scores'] = recall_scores
        self.metrics['filter_friction'] = {
            'latency_overhead': {label: filter_times[label]/self.metrics['query_latency']['unfiltered'] 
                        for label in filter_times},
            'specificity': filter_specificity,
            'recall_impact': recall_scores
}
        
        return self.metrics
    
    # def evaluate_query_performance(self, num_queries=100, k=10):
    #     """Evaluate query performance with different label filters"""
    #     query_points = np.float32(np.random.random((num_queries, self.dim)))
        
    #     start_time = time.time()
    #     unfiltered_labels, unfiltered_distances = self.index.knn_query(query_points, k=k, num_threads=1)
    #     unfiltered_time = time.time() - start_time
        
    #     filter_times = {}
    #     result_counts = {}
    #     filter_specificity = {}

        
    #     for label in ['a', 'b', 'c']:
    #         filter_func = self.create_label_filter(label)
            
    #         start_time = time.time()
    #         filtered_labels, filtered_distances = self.index.knn_query(
    #             query_points, k=k, num_threads=1, filter=filter_func
    #         )
    #         filter_time = time.time() - start_time
            
    #         filter_times[label] = filter_time / num_queries
    #         total_requested = len(self.labels)
    #         matching_label = np.sum(self.labels == label)
    #         result_counts[label] = matching_label / total_requested
    #         # Count how many points pass the filter
    #         points_passing_filter = sum(filter_func(i) for i in range(self.num_elements))
    #         filter_specificity[label] = points_passing_filter / self.num_elements
        
    #     self.metrics['query_latency'] = {
    #         'unfiltered': unfiltered_time / num_queries,
    #         'filtered': filter_times
    #     }
    #     # self.metrics['filter_specificity'] = result_counts

    
    #     self.metrics['filter_specificity'] = filter_specificity
        
    #     return self.metrics

In [109]:
class LabelFilteredANNEvaluator1:
    """
    Design Metrics for Filtered ANN Search:
    1. Query Latency:
       - Measures search time with/without filters
       - Compares overhead of filtering
    2. Accuracy Impact:
       - Recall@k: proportion of true nearest neighbors found
       - How filtering affects quality of results
    3. Filter Friction:
       - Filter specificity: proportion of points passing filter
       - Impact of label distribution on performance
    """
    def __init__(self, dim=16, num_elements=3000):
        self.dim = dim
        self.num_elements = num_elements
        self.metrics = defaultdict(list)

   
    def generate_skewed_labeled_data(self):
        """Generate skewed data with three labels distributed as 60%, 30%, 10%"""
        self.data = np.float32(np.random.random((self.num_elements, self.dim)))
    
        label_a_count = int(self.num_elements * 0.6)  # 60%
        label_b_count = int(self.num_elements * 0.3)  # 30%
        label_c_count = self.num_elements - label_a_count - label_b_count  # Remaining (10%)
    
        self.labels = np.array(['a'] * label_a_count + 
                          ['b'] * label_b_count + 
                          ['c'] * label_c_count)
    
        p = np.random.permutation(len(self.data))
        self.data = self.data[p]
        self.labels = self.labels[p]
    
        unique, counts = np.unique(self.labels, return_counts=True)
        self.metrics['label_distribution'] = dict(zip(unique, counts / len(self.labels)))
    
        return self.data, self.labels
    
    def build_index(self):
        """Build HNSW index with the generated data"""
        self.index = hnswlib.Index(space='cosine', dim=self.dim)
        self.index.init_index(max_elements=self.num_elements, ef_construction=100, M=16)
        self.index.set_ef(20)
        self.index.set_num_threads(1)
        start_time = time.time()
        self.index.add_items(self.data, ids=np.arange(self.num_elements))
        build_time = time.time() - start_time
        self.metrics['build_time'] = build_time

    def create_label_filter(self, target_label):
        """Create filter function for a specific label"""
        def filter_function(idx):
            return self.labels[idx] == target_label
        return filter_function
    
    def calculate_recall(self, filtered_results, true_results, query_points, target_label, k):
        """
    Calculate recall@k for filtered nearest neighbor search results.
    
    Args:
        filtered_results: Results from filtered knn search (n_queries x k)
        true_results: Results from unfiltered knn search (n_queries x k)
        query_points: Query points used for search (n_queries x dim)
        target_label: Label to filter for
        k: Number of nearest neighbors
    
    Returns:
        float: Average recall@k across all queries
        """
        recall = 0
        n_queries = len(query_points)
    
        target_mask = self.labels == target_label
        target_data = self.data[target_mask]
        target_indices = np.where(target_mask)[0]
    
        for i in range(n_queries):
            distances = np.linalg.norm(target_data - query_points[i], axis=1)
            true_neighbor_indices = target_indices[np.argsort(distances)[:k]]
        
            filtered_neighbor_indices = filtered_results[i]
        
            intersection = set(filtered_neighbor_indices) & set(true_neighbor_indices)
            recall += len(intersection) / k
    
        return recall / n_queries

    def evaluate_query_performance(self, num_queries=100, k=10):
        """Evaluate query performance with comprehensive metrics"""

        query_points = np.float32(np.random.random((num_queries, self.dim)))
        
        # Unfiltered
        start_time = time.time()
        unfiltered_labels, unfiltered_distances = self.index.knn_query(query_points, k=k, num_threads=1)
        unfiltered_time = time.time() - start_time
        
        filter_times = {}
        recall_scores = {}
        filter_specificity = {}
        result_counts = {}
        
        # Per each label metrics
        for label in ['a', 'b', 'c']:
            filter_func = self.create_label_filter(label)
            
            # Latency
            start_time = time.time()
            filtered_labels, filtered_distances = self.index.knn_query(
                query_points, k=k, num_threads=1, filter=filter_func
            )
            filter_time = time.time() - start_time
            filter_times[label] = filter_time / num_queries
            # total_requested = len(self.labels)
            # matching_label = np.sum(self.labels == label)
            # result_counts[label] = matching_label / total_requested
            
            # Filter Specificity
            points_passing_filter = sum(filter_func(i) for i in range(self.num_elements))
            filter_specificity[label] = points_passing_filter / self.num_elements
            
            # Accuracy Impact (Recall)
            recall_scores[label] = self.calculate_recall(
    filtered_labels, 
    unfiltered_labels,
    query_points,
    label,         
    k            
)

        self.metrics['query_latency'] = {
            'unfiltered': unfiltered_time / num_queries,
            'filtered': filter_times
        }
        # self.metrics['filter_specificity'] = result_counts
        self.metrics['filter_specificity'] = filter_specificity
        self.metrics['recall_scores'] = recall_scores
        self.metrics['filter_friction'] = {
            'latency_overhead': {label: filter_times[label]/self.metrics['query_latency']['unfiltered'] 
                        for label in filter_times},
            'specificity': filter_specificity,
            'recall_impact': recall_scores
}
        
        return self.metrics
    
    # def evaluate_query_performance(self, num_queries=100, k=10):
    #     """Evaluate query performance with different label filters"""
    #     query_points = np.float32(np.random.random((num_queries, self.dim)))
        
    #     start_time = time.time()
    #     unfiltered_labels, unfiltered_distances = self.index.knn_query(query_points, k=k, num_threads=1)
    #     unfiltered_time = time.time() - start_time
        
    #     filter_times = {}
    #     result_counts = {}
    #     filter_specificity = {}

        
    #     for label in ['a', 'b', 'c']:
    #         filter_func = self.create_label_filter(label)
            
    #         start_time = time.time()
    #         filtered_labels, filtered_distances = self.index.knn_query(
    #             query_points, k=k, num_threads=1, filter=filter_func
    #         )
    #         filter_time = time.time() - start_time
            
    #         filter_times[label] = filter_time / num_queries
    #         total_requested = len(self.labels)
    #         matching_label = np.sum(self.labels == label)
    #         result_counts[label] = matching_label / total_requested
    #         # Count how many points pass the filter
    #         points_passing_filter = sum(filter_func(i) for i in range(self.num_elements))
    #         filter_specificity[label] = points_passing_filter / self.num_elements
        
    #     self.metrics['query_latency'] = {
    #         'unfiltered': unfiltered_time / num_queries,
    #         'filtered': filter_times
    #     }
    #     # self.metrics['filter_specificity'] = result_counts

    
    #     self.metrics['filter_specificity'] = filter_specificity
        
    #     return self.metrics

In [110]:
def print_evaluation_results(metrics):
    print("\nEvaluation Results:")
    print(f"Build Time: {metrics['build_time']:.3f} seconds")
    
    print("\nLabel Distribution:")
    for label, freq in metrics['label_distribution'].items():
        print(f"- Label {label}: {freq*100:.1f}%")
    
    print(f"\nQuery Latency:")
    print(f"- Unfiltered: {metrics['query_latency']['unfiltered']*1000:.2f} ms per query")
    for label, latency in metrics['query_latency']['filtered'].items():
        print(f"- Filtered (label {label}): {latency*1000:.2f} ms per query")
    
    print(f"\nFilter Specificity:")
    for label, specificity in metrics['filter_specificity'].items():
        print(f"- Label {label}: {specificity*100:.1f}%")
        
    print("\nRecall Scores:")
    for label, recall in metrics['recall_scores'].items():
        print(f"- Label {label}: {recall*100:.1f}%")
        
    print("\nFilter Friction:")
    print("Latency Overhead:")
    for label, overhead in metrics['filter_friction']['latency_overhead'].items():
        print(f"- Label {label}: {overhead:.2f}x")
    print("Recall Impact:")
    for label, impact in metrics['filter_friction']['recall_impact'].items():
        print(f"- Label {label}: {impact*100:.1f}%")

In [111]:
def run_labeled_evaluation():
    evaluator = LabelFilteredANNEvaluator()
    
    print("Generating labeled uniform data...")
    evaluator.generate_uniform_labeled_data()
    
    print("Building index...")
    evaluator.build_index()
    
    print("Evaluating performance...")
    metrics = evaluator.evaluate_query_performance()
    
    print_evaluation_results(metrics)

In [112]:
def run_labeled_evaluation1():
    evaluator = LabelFilteredANNEvaluator1()
    
    print("Generating labeled skewed data...")
    evaluator.generate_skewed_labeled_data()
    
    print("Building index...")
    evaluator.build_index()
    
    print("Evaluating performance...")
    metrics = evaluator.evaluate_query_performance()
    
    print_evaluation_results(metrics)

In [113]:
if __name__ == "__main__":
    run_labeled_evaluation()

Generating labeled uniform data...
Building index...
Evaluating performance...

Evaluation Results:
Build Time: 0.268 seconds

Label Distribution:
- Label a: 33.3%
- Label b: 33.3%
- Label c: 33.3%

Query Latency:
- Unfiltered: 0.02 ms per query
- Filtered (label a): 0.18 ms per query
- Filtered (label b): 0.16 ms per query
- Filtered (label c): 0.21 ms per query

Filter Specificity:
- Label a: 33.3%
- Label b: 33.3%
- Label c: 33.3%

Recall Scores:
- Label a: 72.7%
- Label b: 72.9%
- Label c: 73.8%

Filter Friction:
Latency Overhead:
- Label a: 9.00x
- Label b: 8.00x
- Label c: 10.50x
Recall Impact:
- Label a: 72.7%
- Label b: 72.9%
- Label c: 73.8%


In [114]:
if __name__ == "__main__":
    run_labeled_evaluation1()

Generating labeled skewed data...
Building index...
Evaluating performance...

Evaluation Results:
Build Time: 0.201 seconds

Label Distribution:
- Label a: 60.0%
- Label b: 30.0%
- Label c: 10.0%

Query Latency:
- Unfiltered: 0.06 ms per query
- Filtered (label a): 0.11 ms per query
- Filtered (label b): 0.24 ms per query
- Filtered (label c): 0.39 ms per query

Filter Specificity:
- Label a: 60.0%
- Label b: 30.0%
- Label c: 10.0%

Recall Scores:
- Label a: 69.8%
- Label b: 66.4%
- Label c: 71.8%

Filter Friction:
Latency Overhead:
- Label a: 1.84x
- Label b: 4.00x
- Label c: 6.50x
Recall Impact:
- Label a: 69.8%
- Label b: 66.4%
- Label c: 71.8%


##### HNSW Parameter Notes

For the ef parameter, I set it bigger than k, which I found recommended in an article. While this increases build time,I also used cosine similarity as space metric (L2 norm showed similar results) it's worth noting that these parameters are considered in the benchmark of HNSW itself without filtering. Since my focus is on benchmarking the inline filtering, I just fixed these parameters for this implementation.

These HNSW parameters primarily affect non-filtered search:

ef_construction: higher → better recall, slower build

ef: higher → better recall, slower search

M: more connections → better recall, more memory usage

Filtering Metrics:

Specificity = (Number of points passing filter) / (Total number of points) (selectivity of the filter)

Recall = |Retrieved True Nearest Neighbors ∩ Actual True Nearest Neighbors| / |Actual True Nearest Neighbors|

Latency Overhead = Filtered Query Time / Unfiltered Query Time

Warning: search with a filter works slow in python in multithreaded mode, therefore we set num_threads=1


#### Results

The results show that, as expected, the query latency for unfiltered search (0.02ms) is significantly faster than inline filtered search (0.18-0.21ms) in the uniform distribution case. This is evident from:

Query Latency:

Unfiltered: 0.02ms
Filtered: ranges from 0.18-0.21ms across labels


Label Distribution and Specificity both show perfect uniform distribution:

Each label (a, b, c) represents exactly 33.3% of data


Performance Impact:

Latency Overhead shows filtering is 9x-10x slower than unfiltered search
Recall scores around 72% for all labels, showing consistent accuracy across uniformly distributed data



The uniform distribution case demonstrates that filtering adds significant overhead but maintains consistent performance across labels due to their equal distribution.




The skewed distribution shows a notable pattern in performance impact:

Query Latency varies significantly with label frequency:

Label 'a' (60% of data): fastest filtered search at 0.11ms
Label 'b' (30% of data): slower at 0.24ms
Label 'c' (10% of data): slowest at 0.39ms
Unfiltered remains fast at 0.06ms


Latency Overhead shows inverse relationship with label frequency:

Most frequent (label 'a'): lowest overhead at 1.84x
Least frequent (label 'c'): highest overhead at 6.50x


Interestingly, Recall scores remain relatively consistent (66-71%) despite skewed distribution, suggesting filtering maintains accuracy regardless of label frequency.

This demonstrates that filtering performance is strongly influenced by label frequency, with rare labels incurring significantly higher latency overhead.


#### Observation (relationship between label specificity and query latency in the inline filtering results):

Label 'a':

Uniform: 33.3% specificity → 0.18ms latency
Skewed: 60.0% specificity → 0.11ms latency
Shows faster performance with higher specificity


Label 'c':

Uniform: 33.3% specificity → 0.21ms latency
Skewed: 10.0% specificity → 0.39ms latency
Shows much slower performance with lower specificity



This suggests that filter performance is better when searching for more frequent labels (higher specificity) and worse when searching for rare labels (lower specificity). The relationship appears to be inverse - as specificity decreases, latency increases significantly.


Code source: https://github.com/nmslib/hnswlib

### Correlated Data using K means cluster 

This data is positively correlated as for HNSW it searches by distance the nearest k neighbors, so for the case of negative correlation we may inverse the search by multiplying by -1 if possible knowing that the HNSW index does not support that.

In [2]:
class LabelFilteredANNEvaluator2:
    """
    Design Metrics for Filtered ANN Search:
    1. Query Latency:
       - Measures search time with/without filters
       - Compares overhead of filtering
    2. Accuracy Impact:
       - Recall@k: proportion of true nearest neighbors found
       - How filtering affects quality of results
    3. Filter Friction:
       - Filter specificity: proportion of points passing filter
       - Impact of label distribution on performance
    """
    def __init__(self, dim=16, num_elements=3000):
        self.dim = dim
        self.num_elements = num_elements
        self.metrics = defaultdict(list)

   
    def generate_correlated_data(self, correlation_strength=0.7):
        """
        Generate data where points with same labels are clustered together.
        Args:
            correlation_strength: Float between 0 and 1, controls cluster tightness
        """
        # Generate cluster centers for each label
        num_clusters = 3  # one per label
        cluster_centers = np.random.random((num_clusters, self.dim))
        
        # Initialize data array
        self.data = np.zeros((self.num_elements, self.dim), dtype=np.float32)
        
        # Generate points for each label/cluster
        points_per_cluster = self.num_elements // num_clusters
        self.labels = []
        
        for i, label in enumerate(['a', 'b', 'c']):
            start_idx = i * points_per_cluster
            end_idx = start_idx + points_per_cluster if i < 2 else self.num_elements
            
            # Generate random points
            cluster_points = np.random.random((end_idx - start_idx, self.dim))
            
            # Move points closer to their cluster center
            cluster_points = (1 - correlation_strength) * cluster_points + correlation_strength * cluster_centers[i]
            
            # Store points and labels
            self.data[start_idx:end_idx] = cluster_points
            self.labels.extend([label] * (end_idx - start_idx))
        
        self.labels = np.array(self.labels)
        
        # Shuffle data and labels together
        p = np.random.permutation(len(self.data))
        self.data = self.data[p]
        self.labels = self.labels[p]
        
        # Calculate distribution
        unique, counts = np.unique(self.labels, return_counts=True)
        self.metrics['label_distribution'] = dict(zip(unique, counts / len(self.labels)))
        
        return self.data, self.labels
    
    def build_index(self):
        """Build HNSW index with the generated data"""
        self.index = hnswlib.Index(space='cosine', dim=self.dim)
        self.index.init_index(max_elements=self.num_elements, ef_construction=100, M=16)
        self.index.set_ef(20)
        self.index.set_num_threads(1)
        start_time = time.time()
        self.index.add_items(self.data, ids=np.arange(self.num_elements))
        build_time = time.time() - start_time
        self.metrics['build_time'] = build_time

    def create_label_filter(self, target_label):
        """Create filter function for a specific label"""
        def filter_function(idx):
            return self.labels[idx] == target_label
        return filter_function
    
    def calculate_recall(self, filtered_results, true_results, query_points, target_label, k):
        """
    Calculate recall@k for filtered nearest neighbor search results.
    
    Args:
        filtered_results: Results from filtered knn search (n_queries x k)
        true_results: Results from unfiltered knn search (n_queries x k)
        query_points: Query points used for search (n_queries x dim)
        target_label: Label to filter for
        k: Number of nearest neighbors
    
    Returns:
        float: Average recall@k across all queries
        """
        recall = 0
        n_queries = len(query_points)
    
        target_mask = self.labels == target_label
        target_data = self.data[target_mask]
        target_indices = np.where(target_mask)[0]
    
        for i in range(n_queries):
            distances = np.linalg.norm(target_data - query_points[i], axis=1)
            true_neighbor_indices = target_indices[np.argsort(distances)[:k]]
        
            filtered_neighbor_indices = filtered_results[i]
        
            intersection = set(filtered_neighbor_indices) & set(true_neighbor_indices)
            recall += len(intersection) / k
    
        return recall / n_queries

    def evaluate_query_performance(self, num_queries=100, k=10):
        """Evaluate query performance with comprehensive metrics"""

        query_points = np.float32(np.random.random((num_queries, self.dim)))
        
        # Unfiltered
        start_time = time.time()
        unfiltered_labels, unfiltered_distances = self.index.knn_query(query_points, k=k, num_threads=1)
        unfiltered_time = time.time() - start_time
        
        filter_times = {}
        recall_scores = {}
        filter_specificity = {}
        result_counts = {}
        
        # Per each label metrics
        for label in ['a', 'b', 'c']:
            filter_func = self.create_label_filter(label)
            
            # Latency
            start_time = time.time()
            filtered_labels, filtered_distances = self.index.knn_query(
                query_points, k=k, num_threads=1, filter=filter_func
            )
            filter_time = time.time() - start_time
            filter_times[label] = filter_time / num_queries
            # total_requested = len(self.labels)
            # matching_label = np.sum(self.labels == label)
            # result_counts[label] = matching_label / total_requested
            
            # Filter Specificity
            points_passing_filter = sum(filter_func(i) for i in range(self.num_elements))
            filter_specificity[label] = points_passing_filter / self.num_elements
            
            # Accuracy Impact (Recall)
            recall_scores[label] = self.calculate_recall(
    filtered_labels, 
    unfiltered_labels,
    query_points,
    label,         
    k            
)

        self.metrics['query_latency'] = {
            'unfiltered': unfiltered_time / num_queries,
            'filtered': filter_times
        }
        # self.metrics['filter_specificity'] = result_counts
        self.metrics['filter_specificity'] = filter_specificity
        self.metrics['recall_scores'] = recall_scores
        self.metrics['filter_friction'] = {
            'latency_overhead': {label: filter_times[label]/self.metrics['query_latency']['unfiltered'] 
                        for label in filter_times},
            'specificity': filter_specificity,
            'recall_impact': recall_scores
}
        
        return self.metrics
    
    # def evaluate_query_performance(self, num_queries=100, k=10):
    #     """Evaluate query performance with different label filters"""
    #     query_points = np.float32(np.random.random((num_queries, self.dim)))
        
    #     start_time = time.time()
    #     unfiltered_labels, unfiltered_distances = self.index.knn_query(query_points, k=k, num_threads=1)
    #     unfiltered_time = time.time() - start_time
        
    #     filter_times = {}
    #     result_counts = {}
    #     filter_specificity = {}

        
    #     for label in ['a', 'b', 'c']:
    #         filter_func = self.create_label_filter(label)
            
    #         start_time = time.time()
    #         filtered_labels, filtered_distances = self.index.knn_query(
    #             query_points, k=k, num_threads=1, filter=filter_func
    #         )
    #         filter_time = time.time() - start_time
            
    #         filter_times[label] = filter_time / num_queries
    #         total_requested = len(self.labels)
    #         matching_label = np.sum(self.labels == label)
    #         result_counts[label] = matching_label / total_requested
    #         # Count how many points pass the filter
    #         points_passing_filter = sum(filter_func(i) for i in range(self.num_elements))
    #         filter_specificity[label] = points_passing_filter / self.num_elements
        
    #     self.metrics['query_latency'] = {
    #         'unfiltered': unfiltered_time / num_queries,
    #         'filtered': filter_times
    #     }
    #     # self.metrics['filter_specificity'] = result_counts

    
    #     self.metrics['filter_specificity'] = filter_specificity
        
    #     return self.metrics

In [3]:
def print_evaluation_results(metrics):
    print("\nEvaluation Results:")
    print(f"Build Time: {metrics['build_time']:.3f} seconds")
    
    print("\nLabel Distribution:")
    for label, freq in metrics['label_distribution'].items():
        print(f"- Label {label}: {freq*100:.1f}%")
    
    print(f"\nQuery Latency:")
    print(f"- Unfiltered: {metrics['query_latency']['unfiltered']*1000:.2f} ms per query")
    for label, latency in metrics['query_latency']['filtered'].items():
        print(f"- Filtered (label {label}): {latency*1000:.2f} ms per query")
    
    print(f"\nFilter Specificity:")
    for label, specificity in metrics['filter_specificity'].items():
        print(f"- Label {label}: {specificity*100:.1f}%")
        
    print("\nRecall Scores:")
    for label, recall in metrics['recall_scores'].items():
        print(f"- Label {label}: {recall*100:.1f}%")
        
    print("\nFilter Friction:")
    print("Latency Overhead:")
    for label, overhead in metrics['filter_friction']['latency_overhead'].items():
        print(f"- Label {label}: {overhead:.2f}x")
    print("Recall Impact:")
    for label, impact in metrics['filter_friction']['recall_impact'].items():
        print(f"- Label {label}: {impact*100:.1f}%")

In [6]:
def run_labeled_evaluation2():
    evaluator = LabelFilteredANNEvaluator2()
    
    print("Generating labeled correlated data...")
    evaluator.generate_correlated_data()
    
    print("Building index...")
    evaluator.build_index()
    
    print("Evaluating performance...")
    metrics = evaluator.evaluate_query_performance()
    
    print_evaluation_results(metrics)

In [7]:
if __name__ == "__main__":
    run_labeled_evaluation2()

Generating labeled correlated data...
Building index...
Evaluating performance...

Evaluation Results:
Build Time: 0.686 seconds

Label Distribution:
- Label a: 33.3%
- Label b: 33.3%
- Label c: 33.3%

Query Latency:
- Unfiltered: 0.05 ms per query
- Filtered (label a): 2.60 ms per query
- Filtered (label b): 2.91 ms per query
- Filtered (label c): 3.10 ms per query

Filter Specificity:
- Label a: 33.3%
- Label b: 33.3%
- Label c: 33.3%

Recall Scores:
- Label a: 75.9%
- Label b: 70.7%
- Label c: 72.2%

Filter Friction:
Latency Overhead:
- Label a: 48.67x
- Label b: 54.34x
- Label c: 57.99x
Recall Impact:
- Label a: 75.9%
- Label b: 70.7%
- Label c: 72.2%


#### Results

The results show that, as expected, the query latency for unfiltered search (0.05ms) is significantly faster than inline filtered search (2.60-3.10ms) in the correlated distribution case. 
The build time is slower for correlated due to clustered structure.

Label Distribution and Specificity are the same as the uniform distribution.

Each label (a, b, c) represents exactly 33.3% of data.


Performance Impact:

Latency Overhead shows filtering is 48x-57x slower than unfiltered search .This makes sense because clustered data changes how HNSW traverses the index.
Recall scores around 70%-75% for all labels as it shows the impact of clustering on search accuracy.

The correlated distribution case demonstrates that filtering adds significant overhead and 
shows a notable pattern in performance impact due to clustered structure.


In [17]:
class LabelFilteredANNEvaluator3:
    """
    Design Metrics for Filtered ANN Search:
    1. Query Latency:
       - Measures search time with/without filters
       - Compares overhead of filtering
    2. Accuracy Impact:
       - Recall@k: proportion of true nearest neighbors found
       - How filtering affects quality of results
    3. Filter Friction:
       - Filter specificity: proportion of points passing filter
       - Impact of label distribution on performance
    """
    def __init__(self, dim=16, num_elements=3000):
        self.dim = dim
        self.num_elements = num_elements
        self.metrics = defaultdict(list)

    def generate_all_a_data(self):
        """Generate data where all points have label 'a' (100% specificity)"""
        self.data = np.float32(np.random.random((self.num_elements, self.dim)))
        self.labels = np.array(['a'] * self.num_elements)
    
        unique, counts = np.unique(self.labels, return_counts=True)
        self.metrics['label_distribution'] = dict(zip(unique, counts / len(self.labels)))
    
        return self.data, self.labels
 
    def build_index(self):
        """Build HNSW index with the generated data"""
        self.index = hnswlib.Index(space='cosine', dim=self.dim)
        self.index.init_index(max_elements=self.num_elements, ef_construction=100, M=16)
        self.index.set_ef(20)
        self.index.set_num_threads(1)
        start_time = time.time()
        self.index.add_items(self.data, ids=np.arange(self.num_elements))
        build_time = time.time() - start_time
        self.metrics['build_time'] = build_time

    def create_label_filter(self, target_label):
        """Create filter function for a specific label"""
        def filter_function(idx):
            return self.labels[idx] == target_label
        return filter_function
    
    def calculate_recall(self, filtered_results, true_results, query_points, target_label, k):
        """
    Calculate recall@k for filtered nearest neighbor search results.
    
    Args:
        filtered_results: Results from filtered knn search (n_queries x k)
        true_results: Results from unfiltered knn search (n_queries x k)
        query_points: Query points used for search (n_queries x dim)
        target_label: Label to filter for
        k: Number of nearest neighbors
    
    Returns:
        float: Average recall@k across all queries
        """
        recall = 0
        n_queries = len(query_points)
    
        target_mask = self.labels == target_label
        target_data = self.data[target_mask]
        target_indices = np.where(target_mask)[0]
    
        for i in range(n_queries):
            distances = np.linalg.norm(target_data - query_points[i], axis=1)
            true_neighbor_indices = target_indices[np.argsort(distances)[:k]]
        
            filtered_neighbor_indices = filtered_results[i]
        
            intersection = set(filtered_neighbor_indices) & set(true_neighbor_indices)
            recall += len(intersection) / k
    
        return recall / n_queries

    def evaluate_query_performance(self, num_queries=100, k=10):
        """Evaluate query performance with comprehensive metrics"""

        query_points = np.float32(np.random.random((num_queries, self.dim)))
        
        #  Unfiltered
        start_time = time.time()
        unfiltered_labels, unfiltered_distances = self.index.knn_query(query_points, k=k, num_threads=1)
        unfiltered_time = time.time() - start_time
        
        filter_times = {}
        recall_scores = {}
        filter_specificity = {}
        result_counts = {}
        
        # Per each label metrics
        for label in ['a']:
            filter_func = self.create_label_filter(label)
            
            # Latency
            start_time = time.time()
            filtered_labels, filtered_distances = self.index.knn_query(
                query_points, k=k, num_threads=1, filter=filter_func
            )
            filter_time = time.time() - start_time
            filter_times[label] = filter_time / num_queries
            # total_requested = len(self.labels)
            # matching_label = np.sum(self.labels == label)
            # result_counts[label] = matching_label / total_requested
            
            # Filter Specificity
            points_passing_filter = sum(filter_func(i) for i in range(self.num_elements))
            filter_specificity[label] = points_passing_filter / self.num_elements
            
            # Accuracy Impact (Recall)
            recall_scores[label] = self.calculate_recall(
    filtered_labels, 
    unfiltered_labels,
    query_points, 
    label,         
    k              
)

        self.metrics['query_latency'] = {
            'unfiltered': unfiltered_time / num_queries,
            'filtered': filter_times
        }
        # self.metrics['filter_specificity'] = result_counts
        self.metrics['filter_specificity'] = filter_specificity
        self.metrics['recall_scores'] = recall_scores
        self.metrics['filter_friction'] = {
            'latency_overhead': {label: filter_times[label]/self.metrics['query_latency']['unfiltered'] 
                        for label in filter_times},
            'specificity': filter_specificity,
            'recall_impact': recall_scores
}
        
        return self.metrics
    
    # def evaluate_query_performance(self, num_queries=100, k=10):
    #     """Evaluate query performance with different label filters"""
    #     query_points = np.float32(np.random.random((num_queries, self.dim)))
        
    #     start_time = time.time()
    #     unfiltered_labels, unfiltered_distances = self.index.knn_query(query_points, k=k, num_threads=1)
    #     unfiltered_time = time.time() - start_time
        
    #     filter_times = {}
    #     result_counts = {}
    #     filter_specificity = {}

        
    #     for label in ['a', 'b', 'c']:
    #         filter_func = self.create_label_filter(label)
            
    #         start_time = time.time()
    #         filtered_labels, filtered_distances = self.index.knn_query(
    #             query_points, k=k, num_threads=1, filter=filter_func
    #         )
    #         filter_time = time.time() - start_time
            
    #         filter_times[label] = filter_time / num_queries
    #         total_requested = len(self.labels)
    #         matching_label = np.sum(self.labels == label)
    #         result_counts[label] = matching_label / total_requested
    #         # Count how many points pass the filter
    #         points_passing_filter = sum(filter_func(i) for i in range(self.num_elements))
    #         filter_specificity[label] = points_passing_filter / self.num_elements
        
    #     self.metrics['query_latency'] = {
    #         'unfiltered': unfiltered_time / num_queries,
    #         'filtered': filter_times
    #     }
    #     # self.metrics['filter_specificity'] = result_counts

    
    #     self.metrics['filter_specificity'] = filter_specificity
        
    #     return self.metrics

In [18]:
def print_evaluation_results(metrics):
    print("\nEvaluation Results:")
    print(f"Build Time: {metrics['build_time']:.3f} seconds")
    
    print("\nLabel Distribution:")
    for label, freq in metrics['label_distribution'].items():
        print(f"- Label {label}: {freq*100:.1f}%")
    
    print(f"\nQuery Latency:")
    print(f"- Unfiltered: {metrics['query_latency']['unfiltered']*1000:.2f} ms per query")
    for label, latency in metrics['query_latency']['filtered'].items():
        print(f"- Filtered (label {label}): {latency*1000:.2f} ms per query")
    
    print(f"\nFilter Specificity:")
    for label, specificity in metrics['filter_specificity'].items():
        print(f"- Label {label}: {specificity*100:.1f}%")
        
    print("\nRecall Scores:")
    for label, recall in metrics['recall_scores'].items():
        print(f"- Label {label}: {recall*100:.1f}%")
        
    print("\nFilter Friction:")
    print("Latency Overhead:")
    for label, overhead in metrics['filter_friction']['latency_overhead'].items():
        print(f"- Label {label}: {overhead:.2f}x")
    print("Recall Impact:")
    for label, impact in metrics['filter_friction']['recall_impact'].items():
        print(f"- Label {label}: {impact*100:.1f}%")

In [19]:
def run_labeled_evaluation3():
    evaluator = LabelFilteredANNEvaluator3()
    
    print("Generating labeled data with only one label...")
    evaluator.generate_all_a_data()
    
    print("Building index...")
    evaluator.build_index()
    
    print("Evaluating performance...")
    metrics = evaluator.evaluate_query_performance()
    
    print_evaluation_results(metrics)

In [20]:
if __name__ == "__main__":
    run_labeled_evaluation3()

Generating labeled data with only one label...
Building index...
Evaluating performance...

Evaluation Results:
Build Time: 0.402 seconds

Label Distribution:
- Label a: 100.0%

Query Latency:
- Unfiltered: 0.05 ms per query
- Filtered (label a): 0.14 ms per query

Filter Specificity:
- Label a: 100.0%

Recall Scores:
- Label a: 69.3%

Filter Friction:
Latency Overhead:
- Label a: 2.96x
Recall Impact:
- Label a: 69.3%


#### Results

The performance comparison shows the filter overhead even when all points have label 'a':

Unfiltered search: 0.05ms
Filtered search (100% label 'a'): 0.14ms

2.96x overhead despite all points passing filter
Better than 33.33% case but similar to 60% case
Shows baseline filter overhead exists even with 100% passing



This demonstrates that even when all points pass the filter, the mere presence of filtering logic adds latency compared to unfiltered search.