# Entity Resolution Evaluation Notebook

This notebook evaluates the results of the entity resolution pipeline, analyzing the performance of various stages and the final clustering outcomes.

In [None]:
import os
import sys
import json
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import jsonlines

# Put the notebook's parent directory on the import path (once) so that the
# `src` package imported right after this can be resolved.
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

# Import pipeline modules as needed
from src.utils import compute_vector_similarity, compute_levenshtein_similarity

## Load Configuration and Results

In [None]:
# Parse the YAML pipeline configuration located one directory above the notebook.
config_path = Path('../config.yml')
with config_path.open('r') as config_file:
    config = yaml.safe_load(config_file)

# Every artefact read below lives under the configured output directory.
# NOTE(review): a relative `output_dir` is resolved against the notebook's
# working directory, not against the config file's location — confirm intended.
output_dir = Path(config['general']['output_dir'])
analysis_dir = output_dir.joinpath('analysis')
detailed_dir = output_dir.joinpath('detailed')

## 1. Summary Report Analysis

In [None]:
# Load the pipeline's summary report (if one was written) and print the
# headline figures for each stage, followed by two derived ratios.
summary_path = output_dir / 'summary_report.json'
if summary_path.exists():
    with open(summary_path, 'r') as f:
        summary = json.load(f)

    # Display summary information
    print(f"Pipeline executed on: {summary['timestamp']}\n")
    print(f"Mode: {summary['config']['mode']}")
    print(f"Embedding model: {summary['config']['openai_model']}\n")

    print("=== Preprocessing ===")
    print(f"Unique strings: {summary['preprocessing']['unique_strings']}")
    print(f"Records: {summary['preprocessing']['records']}")
    print(f"Person IDs: {summary['preprocessing']['person_ids']}\n")

    print("=== Classification and Clustering ===")
    print(f"Matches: {summary['classification']['matches']}")
    print(f"Match threshold: {summary['classification']['threshold']}")
    print(f"Clusters: {summary['clustering']['clusters']}")
    print(f"Clustering algorithm: {summary['clustering']['algorithm']}\n")

    # Calculate derived metrics. Both ratios fall back to 0 when their
    # denominator is 0, so a run with no person IDs or no clusters still
    # prints a report instead of raising ZeroDivisionError.
    person_count = summary['preprocessing']['person_ids']
    cluster_count = summary['clustering']['clusters']
    match_rate = summary['classification']['matches'] / person_count if person_count > 0 else 0
    avg_cluster_size = person_count / cluster_count if cluster_count > 0 else 0

    print("=== Derived Metrics ===")
    print(f"Match rate: {match_rate:.4f} matches per person")
    print(f"Average cluster size: {avg_cluster_size:.2f} persons per cluster")
else:
    print("Summary report not found")

## 2. Preprocessing Analysis

In [None]:
# Load the preprocessing statistics (if present), chart unique strings per
# field, and summarise how often individual strings occur.
preprocessing_stats_path = analysis_dir / 'preprocessing_stats.json'
if preprocessing_stats_path.exists():
    with open(preprocessing_stats_path, 'r') as f:
        preprocessing_stats = json.load(f)

    # Display field distribution: one bar per field, sized by its unique-string count.
    field_counts = {field: data['unique_strings'] for field, data in preprocessing_stats['fields'].items()}
    # Materialise the dict views as lists so the DataFrame constructor receives
    # plain sequences.
    field_df = pd.DataFrame({
        'Field': list(field_counts.keys()),
        'Unique Strings': list(field_counts.values())
    })

    plt.figure(figsize=(10, 6))
    sns.barplot(data=field_df, x='Field', y='Unique Strings')
    plt.title('Unique Strings by Field')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Display frequency distribution
    print("\n=== String Frequency Statistics ===")
    print(f"Min frequency: {preprocessing_stats['frequency_distribution']['min']}")
    print(f"Max frequency: {preprocessing_stats['frequency_distribution']['max']}")
    print(f"Mean frequency: {preprocessing_stats['frequency_distribution']['mean']:.2f}")
    print(f"Median frequency: {preprocessing_stats['frequency_distribution']['median']:.2f}")

    # Plot frequency distribution (the 20 lowest frequencies).
    # Keys loaded from JSON are strings, so they are converted to ints and
    # sorted numerically; taking the first 20 keys in file order could select
    # an arbitrary subset. Each count travels with its key, which avoids an
    # int -> str round trip to look it up again.
    freq_dist = preprocessing_stats['frequency_distribution']['counts']
    lowest_freqs = sorted(
        ((int(key), count) for key, count in freq_dist.items()),
        key=lambda pair: pair[0],
    )[:20]
    freqs = [freq for freq, _ in lowest_freqs]
    counts = [count for _, count in lowest_freqs]

    plt.figure(figsize=(10, 6))
    plt.bar(freqs, counts)
    plt.title('String Frequency Distribution')
    plt.xlabel('Frequency')
    plt.ylabel('Count')
    plt.yscale('log')
    plt.tight_layout()
    plt.show()
else:
    print("Preprocessing statistics not found")

## 3. Classification Analysis

In [None]:
# Report the classifier's test-set scores and feature importance, when the
# metrics file exists and contains those sections.
classification_metrics_path = detailed_dir / 'classification_metrics.json'
if not classification_metrics_path.exists():
    print("Classification metrics not found")
else:
    with classification_metrics_path.open('r') as metrics_file:
        classification_metrics = json.load(metrics_file)

    # Test-set scores (only bound when the section is present)
    if 'test_metrics' in classification_metrics:
        print("=== Test Set Metrics ===")
        test_metrics = classification_metrics['test_metrics']
        for label, key in (("Precision", 'precision'), ("Recall", 'recall')):
            print(f"{label}: {test_metrics[key]:.4f}")
        print(f"F1 Score: {test_metrics['f1']:.4f}\n")

    # Feature importance table and chart. `feature_importance` and `feature_df`
    # are only bound when the section is present; later cells test for them.
    if 'feature_importance' in classification_metrics:
        feature_importance = classification_metrics['feature_importance']
        unsorted_features = pd.DataFrame({
            'Feature': list(feature_importance.keys()),
            'Importance': list(feature_importance.values()),
        })
        feature_df = unsorted_features.sort_values(by='Importance', ascending=False)

        print("=== Feature Importance ===")
        display(feature_df)

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(data=feature_df, x='Importance', y='Feature', ax=ax)
        ax.set_title('Feature Importance')
        fig.tight_layout()
        plt.show()
    
# Summarise the predicted match pairs and plot how their confidence scores
# are distributed relative to the configured match threshold.
match_pairs_path = detailed_dir / 'match_pairs.csv'
if not match_pairs_path.exists():
    print("Match pairs not found")
else:
    match_pairs_df = pd.read_csv(match_pairs_path)

    print("\n=== Match Pairs Analysis ===")
    print(f"Total match pairs: {len(match_pairs_df)}")

    # Summary statistics of the confidence column
    confidence = match_pairs_df['confidence']
    print("\nConfidence Distribution:")
    for label, statistic in (
        ("Min", confidence.min),
        ("Max", confidence.max),
        ("Mean", confidence.mean),
        ("Median", confidence.median),
    ):
        print(f"{label}: {statistic():.4f}")

    # Histogram with the decision threshold drawn as a dashed red line
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(confidence, bins=20, alpha=0.7)
    ax.set_title('Match Confidence Distribution')
    ax.set_xlabel('Confidence')
    ax.set_ylabel('Count')
    match_threshold = config['classification']['match_threshold']
    ax.axvline(match_threshold, color='red', linestyle='--',
               label=f"Threshold: {match_threshold}")
    ax.legend()
    fig.tight_layout()
    plt.show()

## 4. Clustering Analysis

In [None]:
# Load the clustering statistics (if present), print the size summary, and
# draw the precomputed cluster-size histogram.
clustering_stats_path = analysis_dir / 'clustering_stats.json'
if clustering_stats_path.exists():
    with open(clustering_stats_path, 'r') as f:
        clustering_stats = json.load(f)

    print("=== Clustering Statistics ===")
    print(f"Total clusters: {clustering_stats['total_clusters']}")
    print(f"Total entities: {clustering_stats['total_entities']}")

    # Cluster size distribution
    print("\nCluster Size Distribution:")
    print(f"Min size: {clustering_stats['cluster_size_distribution']['min']}")
    print(f"Max size: {clustering_stats['cluster_size_distribution']['max']}")
    print(f"Mean size: {clustering_stats['cluster_size_distribution']['mean']:.2f}")
    print(f"Median size: {clustering_stats['cluster_size_distribution']['median']:.2f}")

    # Plot cluster size histogram
    bins = clustering_stats['cluster_size_histogram']['bins']
    counts = clustering_stats['cluster_size_histogram']['counts']

    # `bins[:-1]` is paired with `counts`, i.e. `bins` is treated as bin edges
    # with one more entry than `counts`. Each bar is therefore anchored at its
    # left edge (`align='edge'`; the default would centre it on the edge and
    # shift the histogram half a bin to the left) and given its own width, so
    # non-uniform bins are drawn correctly too.
    if len(bins) >= 2:
        bin_edges = np.asarray(bins, dtype=float)
        plt.figure(figsize=(10, 6))
        plt.bar(bin_edges[:-1], counts, width=np.diff(bin_edges), align='edge')
        plt.title('Cluster Size Distribution')
        plt.xlabel('Cluster Size')
        plt.ylabel('Count')
        plt.yscale('log')  # Log scale for better visualization
        plt.tight_layout()
        plt.show()
    else:
        # Fewer than two edges describe no bin at all; there is nothing to draw.
        print("Cluster size histogram has no bins to plot")
else:
    print("Clustering statistics not found")
    
# Rank clusters by the number of entity IDs assigned to them and show the
# ten largest as a table and a bar chart.
clusters_path = detailed_dir / 'clusters.csv'
if not clusters_path.exists():
    print("Cluster assignments not found")
else:
    clusters_df = pd.read_csv(clusters_path)

    # Count (non-null) entity IDs per cluster and expose the count as `size`
    entity_counts = clusters_df.groupby('cluster_id')['entity_id'].count()
    cluster_sizes = entity_counts.reset_index().rename(columns={'entity_id': 'size'})

    # Largest first, keep the first ten rows
    ranked_clusters = cluster_sizes.sort_values(by='size', ascending=False)
    top_clusters = ranked_clusters.head(10)

    print("\n=== Top 10 Largest Clusters ===")
    display(top_clusters)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=top_clusters, x='cluster_id', y='size', ax=ax)
    ax.set_title('Top 10 Largest Clusters')
    ax.set_xlabel('Cluster ID')
    ax.set_ylabel('Size')
    fig.tight_layout()
    plt.show()

## 5. Entity Resolution Quality Analysis

If ground truth data is available, we can evaluate the quality of entity resolution.

In [None]:
# Compare the predicted match pairs with the labelled ground-truth pairs
# (when both are available) and report pairwise precision, recall and F1.
ground_truth_path = Path(config['dataset']['ground_truth_file'])
if ground_truth_path.exists():
    ground_truth_df = pd.read_csv(ground_truth_path)

    positive_count = ground_truth_df['match'].sum()
    print(f"Ground truth data: {len(ground_truth_df)} pairs")
    print(f"Positive pairs: {positive_count}")
    print(f"Negative pairs: {len(ground_truth_df) - positive_count}")

    # If we have match pairs, we can compute precision, recall, and F1 score.
    # `match_pairs_df` is only bound when the match-pairs file was found by an
    # earlier cell.
    if 'match_pairs_df' in locals():
        # Build both pair sets straight from the columns. Each pair is sorted
        # so that (a, b) and (b, a) compare equal. Zipping columns avoids
        # `iterrows()`, which builds a Series per row and coerces the whole
        # row to one dtype (integer IDs become floats next to a float column).
        predicted_pairs = {
            tuple(sorted(pair))
            for pair in zip(match_pairs_df['entity1'], match_pairs_df['entity2'])
        }

        # Only rows whose `match` value is truthy count as true matches.
        true_pairs = {
            tuple(sorted((left, right)))
            for left, right, is_match in zip(
                ground_truth_df['left'], ground_truth_df['right'], ground_truth_df['match']
            )
            if is_match
        }

        # Compute metrics
        # NOTE(review): every predicted pair that is not a labelled positive
        # is counted as a false positive, including pairs that do not appear
        # in the ground truth at all. If the ground truth is only a labelled
        # sample, precision is understated — confirm this is intended.
        true_positives = len(predicted_pairs & true_pairs)
        false_positives = len(predicted_pairs - true_pairs)
        false_negatives = len(true_pairs - predicted_pairs)

        # Each ratio falls back to 0 when its denominator is 0.
        predicted_total = true_positives + false_positives
        actual_total = true_positives + false_negatives
        precision = true_positives / predicted_total if predicted_total > 0 else 0
        recall = true_positives / actual_total if actual_total > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        print("\n=== Entity Resolution Quality ===")
        print(f"True Positives: {true_positives}")
        print(f"False Positives: {false_positives}")
        print(f"False Negatives: {false_negatives}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
else:
    print("Ground truth data not found")

## 6. Feature Analysis

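Analyze feature distributions and correlations. The feature vectors themselves are not loaded in this notebook, so the cell below is a placeholder that charts the top features by importance when the classification metrics provide them.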

In [None]:
# A full feature analysis needs the feature vectors, which are not loaded here.
# Until then, chart the share of the eight most important features, provided an
# earlier cell produced `feature_df`.
if 'feature_df' not in locals():
    print("Feature data not available")
else:
    # `feature_df` is sorted by descending importance, so head(8) is the top 8
    top_features = feature_df.head(8)
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.pie(top_features['Importance'], labels=top_features['Feature'], autopct='%1.1f%%')
    ax.set_title('Top 8 Features by Importance')
    fig.tight_layout()
    plt.show()

## 7. Model Weights Analysis

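Analyze the learned model weights. Loading the classifier state from a checkpoint is not implemented yet, so the cell below is a placeholder that only reports whether feature importance is available to serve as a proxy.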

In [None]:
# Placeholder for the model-weights analysis. The classifier class is imported
# for the eventual checkpoint loading, but nothing in this cell uses it yet.
from src.classification import Classifier

# Without a loaded checkpoint, only report whether an earlier cell made
# feature importance available to stand in for the weights.
if 'feature_importance' not in locals():
    print("Model weights not available")
else:
    print("Model weights analysis would go here, using feature importance as proxy")

## 8. Runtime Performance Analysis

In [None]:
# Extract per-stage runtimes from the pipeline logfile and chart them,
# slowest stage first.
logfile_path = Path(config['monitoring']['logging']['file'])
if logfile_path.exists():
    # Simple parsing of log lines containing timing information
    import re

    # Matches "<stage> completed in <seconds> seconds". The numeric part must be
    # a well-formed decimal ("12", "12.", "12.5" or ".5"); the looser `[\d\.]+`
    # also accepted tokens such as "." or "1.2.3", which made float() raise
    # ValueError and abort the whole cell.
    timing_pattern = r'(\w+) completed in (\d+(?:\.\d*)?|\.\d+) seconds'
    timing_regex = re.compile(timing_pattern)  # compiled once, reused per line
    timings = {}

    with open(logfile_path, 'r') as f:
        for line in f:
            timing_match = timing_regex.search(line)
            if timing_match:
                stage = timing_match.group(1).lower()
                # A stage that is logged more than once keeps its last value.
                timings[stage] = float(timing_match.group(2))

    if timings:
        # Display timing information
        timing_df = pd.DataFrame({
            'Stage': list(timings.keys()),
            'Time (seconds)': list(timings.values())
        }).sort_values('Time (seconds)', ascending=False)

        print("=== Runtime Performance ===")
        display(timing_df)

        plt.figure(figsize=(12, 6))
        sns.barplot(data=timing_df, x='Time (seconds)', y='Stage')
        plt.title('Pipeline Stage Runtime Performance')
        plt.tight_layout()
        plt.show()
    else:
        print("No timing information found in logfile")
else:
    print("Logfile not found")

## 9. Conclusions and Recommendations

Based on the analysis above, here are some key findings and recommendations:

1. **Entity Resolution Quality**:
   - Precision: [Value from analysis]
   - Recall: [Value from analysis]
   - F1 Score: [Value from analysis]
   - Recommendation: [Based on metrics]

2. **Feature Importance**:
   - Top features: [List top features]
   - Recommendation: [Based on feature analysis]

3. **Clustering Performance**:
   - Total clusters: [Value from analysis]
   - Cluster size distribution: [Brief description]
   - Recommendation: [Based on clustering analysis]

4. **Runtime Performance**:
   - Bottlenecks: [Identify slowest stages]
   - Recommendation: [Based on timing analysis]

5. **Overall Recommendations**:
   - [General recommendations for improving the entity resolution pipeline]
   - [Suggestions for potential optimizations]
   - [Ideas for further exploration]