# Entity Resolution Data Exploration Notebook

This notebook explores the Yale University Library catalog data to better understand patterns and characteristics that will inform the entity resolution process.

In [None]:
import os
import sys
import json
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
from collections import Counter
from sklearn.manifold import TSNE
import weaviate

# Make the directory above this notebook importable so the project's `src`
# package can be resolved regardless of where the kernel was started from.
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

# Project-local similarity helpers used by the analysis cells further down
from src.utils import compute_vector_similarity, compute_levenshtein_similarity

# Plot styling. The seaborn call comes second.
# NOTE(review): sns.set presumably overrides most of the ggplot settings —
# confirm whether both calls are needed.
plt.style.use('ggplot')
sns.set(style="whitegrid")

## Load Configuration and Data

In [None]:
# Read the YAML configuration that sits one directory above this notebook
config_path = Path('../config.yml')
with config_path.open('r') as config_file:
    config = yaml.safe_load(config_file)

# Dataset locations come from the `dataset` section of the configuration.
# NOTE(review): relative paths resolve against the notebook's working
# directory, not against the config file — confirm this is intended.
dataset_config = config['dataset']
data_dir = Path(dataset_config['input_dir'])
ground_truth_file = Path(dataset_config['ground_truth_file'])

In [None]:
# List available data files.
# Directory listings come back in filesystem order, which is not guaranteed to
# be stable across runs or machines. Sorting makes the `csv_files[:5]` sample
# taken later in the notebook reproducible.
csv_files = sorted(data_dir.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files in {data_dir}")

# Load ground truth data if available. `ground_truth_df` is only defined when
# the file exists; later cells check for it with `'ground_truth_df' in locals()`.
if ground_truth_file.exists():
    ground_truth_df = pd.read_csv(ground_truth_file)
    n_labeled_pairs = len(ground_truth_df)
    print(f"Ground truth data: {n_labeled_pairs} labeled pairs")

    if n_labeled_pairs:
        n_positive = ground_truth_df['match'].sum()
        positive_rate = ground_truth_df['match'].mean()
        print(f"Positive examples: {n_positive} ({positive_rate*100:.1f}%)")
        print(f"Negative examples: {n_labeled_pairs - n_positive} ({(1-positive_rate)*100:.1f}%)")
    else:
        # Avoid printing "nan%" for a header-only file
        print("Ground truth file contains no labeled pairs")
else:
    print("Ground truth file not found.")

## 1. Initial Data Inspection

Let's load a sample of the data to understand its structure and characteristics.

In [None]:
# Read only the first 5 CSV files so that exploration stays fast
sample_files = csv_files[:5]
dfs = []

for csv_path in sample_files:
    try:
        frame = pd.read_csv(csv_path)
        frame['source_file'] = csv_path.name  # remember which file each row came from
        dfs.append(frame)
        print(f"Loaded {csv_path.name}: {len(frame)} records")
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest
        print(f"Error loading {csv_path.name}: {e}")

if not dfs:
    print("No data loaded")
else:
    # One combined frame for all exploration cells below
    sample_df = pd.concat(dfs, ignore_index=True)
    print(f"\nCombined sample dataset: {len(sample_df)} records")

    # Preview of the first rows
    display(sample_df.head())

    # Null count and share per data column (the bookkeeping column is skipped)
    print("\nColumn information:")
    data_columns = [col for col in sample_df.columns if col != 'source_file']
    null_counts = sample_df[data_columns].isna().sum()
    null_percents = null_counts / len(sample_df) * 100
    for col in data_columns:
        print(f"{col}: {null_counts[col]} null values ({null_percents[col]:.1f}%)")

## 2. Person Entity Analysis

Let's analyze the 'person' field to understand name patterns and characteristics.

In [None]:
# Generational suffixes (Jr., Sr., II, III, IV, V) that follow a comma.
# "Jr."/"Sr." must not be followed by \b: there is never a word boundary between
# the trailing "." and a following comma, space or end of string, so `Jr\.\b`
# silently misses ordinary headings such as "King, Martin Luther, Jr., 1929-1968".
# A lone "V" is only a suffix when it is not an initial ("Smith, V. S.").
suffix_pattern = re.compile(r',\s+((?:Jr|Sr)\.|(?:I{2,}|IV)\b|V(?![\w.]))')

# Life dates of the form ", 1800-1870" or ", 1800-?".
# NOTE(review): open-ended dates such as ", 1950-" are not matched — confirm
# whether they occur in this catalog and should be counted.
birth_death_pattern = re.compile(r',\s*(\d{4})-(\d{4}|\?)')


def percent_of(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero."""
    return part / whole * 100 if whole else 0.0


if 'sample_df' in locals() and 'person' in sample_df.columns:
    # Missing values are dropped up front; otherwise NaN is counted as a unique
    # "name" and is stringified to 'nan' in the pattern checks below.
    person_series = sample_df['person'].dropna()
    unique_persons = person_series.unique()
    n_unique = len(unique_persons)
    print(f"Total unique person names: {n_unique}")

    if n_unique == 0:
        print("The 'person' column contains no names to analyze")
    else:
        # Frequency distribution
        person_counts = person_series.value_counts()

        print(f"\nTop 10 most frequent person names:")
        display(person_counts.head(10))

        # Plot frequency distribution
        plt.figure(figsize=(12, 6))
        person_counts.head(20).plot(kind='bar')
        plt.title('Top 20 Most Frequent Person Names')
        plt.xlabel('Person Name')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

        # Analyze name patterns
        print("\nName pattern analysis:")
        name_strings = [str(name) for name in unique_persons]

        # Names carrying birth/death years
        names_with_years = [name for name in name_strings if birth_death_pattern.search(name)]
        print(f"Names with birth/death years: {len(names_with_years)} ({percent_of(len(names_with_years), n_unique):.1f}%)")

        if names_with_years:
            print("\nSample names with birth/death years:")
            for name in names_with_years[:5]:
                print(f"  - {name}")

        # Names carrying a generational suffix
        names_with_suffix = [name for name in name_strings if suffix_pattern.search(name)]
        print(f"\nNames with suffixes (Jr., Sr., etc.): {len(names_with_suffix)} ({percent_of(len(names_with_suffix), n_unique):.1f}%)")

        if names_with_suffix:
            print("\nSample names with suffixes:")
            for name in names_with_suffix[:5]:
                print(f"  - {name}")

        # Distribution of name lengths
        name_lengths = [len(name) for name in name_strings]

        plt.figure(figsize=(10, 6))
        sns.histplot(name_lengths, bins=30, kde=True)
        plt.title('Distribution of Person Name Lengths')
        plt.xlabel('Name Length (characters)')
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.show()

        # Name structure: any comma is taken as the 'Last, First' form
        comma_names = [name for name in name_strings if ',' in name]
        n_other = n_unique - len(comma_names)
        print(f"\nNames in 'Last, First' format: {len(comma_names)} ({percent_of(len(comma_names), n_unique):.1f}%)")
        print(f"Names in other formats: {n_other} ({percent_of(n_other, n_unique):.1f}%)")
else:
    print("Person data not available for analysis")

## 3. Role and Title Analysis

Let's analyze the roles and titles associated with persons to understand the context of their contributions.

In [None]:
if 'sample_df' not in locals():
    print("Sample data not available for analysis")
else:
    # ---- Roles: how often each role value occurs ----
    if 'roles' not in sample_df.columns:
        print("Roles data not available for analysis")
    else:
        role_counts = sample_df['roles'].value_counts()
        top_roles = role_counts.head(10)

        print(f"Unique roles: {len(role_counts)}")
        print("\nTop 10 most common roles:")
        display(top_roles)

        # Bar chart of the ten most frequent roles
        plt.figure(figsize=(12, 6))
        top_roles.plot(kind='bar')
        plt.title('Top 10 Most Common Roles')
        plt.xlabel('Role')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

    # ---- Titles: length (in characters) of each title ----
    if 'title' not in sample_df.columns:
        print("Title data not available for analysis")
    else:
        title_lengths = sample_df['title'].str.len()

        # Min/Max are printed as-is, Mean/Median rounded to one decimal
        length_summary = (
            ("Min", f"{title_lengths.min()}"),
            ("Max", f"{title_lengths.max()}"),
            ("Mean", f"{title_lengths.mean():.1f}"),
            ("Median", f"{title_lengths.median():.1f}"),
        )
        print(f"\nTitle length statistics:")
        for stat_name, stat_text in length_summary:
            print(f"{stat_name}: {stat_text}")

        # Histogram of title lengths
        plt.figure(figsize=(10, 6))
        sns.histplot(title_lengths, bins=30, kde=True)
        plt.title('Distribution of Title Lengths')
        plt.xlabel('Title Length (characters)')
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.show()

## 4. Temporal Analysis

Let's analyze temporal aspects of the data to understand the distribution of publication dates and person lifespans.

In [None]:
def century_label(year):
    """Return a label such as '19th century' or '21st century' for a calendar year.

    The grouping is `year // 100 + 1` (so 1900-1999 is the 20th century). The
    ordinal suffix is chosen properly instead of always appending "th", which
    used to yield "21th century" for years from 2000 onwards.
    """
    century = (year // 100) + 1
    if 10 <= century % 100 <= 20:
        suffix = 'th'  # 11th, 12th, 13th, ...
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(century % 10, 'th')
    return f"{century}{suffix} century"


if 'sample_df' in locals():
    # ---- Publication years, extracted from the free-text 'provision' field ----
    if 'provision' in sample_df.columns:
        # Four-digit years between 1500 and 2099
        year_pattern = re.compile(r'\b(1[5-9]\d{2}|20\d{2})\b')

        years = []
        for provision in sample_df['provision'].dropna():
            found = year_pattern.search(str(provision))
            if found:
                years.append(int(found.group(1)))  # Take the first year found

        if years:
            print(f"Extracted {len(years)} publication years")

            # Year distribution
            year_counts = pd.Series(years).value_counts().sort_index()

            print(f"\nPublication year range: {min(years)} - {max(years)}")

            plt.figure(figsize=(15, 6))
            year_counts.plot(kind='line')
            plt.title('Publication Year Distribution')
            plt.xlabel('Year')
            plt.ylabel('Frequency')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

            # Group by century. Sorting the labels as text is chronological here
            # because the year pattern only admits 1500-2099, i.e. two-digit
            # centuries (16 to 21).
            centuries = [century_label(year) for year in years]
            century_counts = pd.Series(centuries).value_counts().sort_index()

            plt.figure(figsize=(10, 6))
            century_counts.plot(kind='bar')
            plt.title('Distribution by Century')
            plt.xlabel('Century')
            plt.ylabel('Frequency')
            plt.tight_layout()
            plt.show()
        else:
            print("No publication years extracted")
    else:
        print("Provision data not available for temporal analysis")

    # ---- Birth/death years embedded in person names ----
    if 'person' in sample_df.columns:
        # Matches ", 1800-1870" and ", 1800-?" (death year unknown)
        birth_death_pattern = re.compile(r',\s*(\d{4})-(\d{4}|\?)')

        birth_years = []
        death_years = []
        lifespans = []

        # Iterates over rows, not unique names, so a person who appears on many
        # records contributes once per record.
        for name in sample_df['person'].dropna():
            match = birth_death_pattern.search(str(name))
            if match:
                birth_year = int(match.group(1))
                birth_years.append(birth_year)

                death_year_str = match.group(2)
                if death_year_str != '?':
                    death_year = int(death_year_str)
                    death_years.append(death_year)
                    lifespans.append(death_year - birth_year)

        if birth_years:
            print(f"\n\nExtracted {len(birth_years)} birth years and {len(death_years)} death years")

            # Birth year distribution
            plt.figure(figsize=(15, 6))
            sns.histplot(birth_years, bins=30, kde=True)
            plt.title('Birth Year Distribution')
            plt.xlabel('Birth Year')
            plt.ylabel('Frequency')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

            # Lifespan distribution (only for names with a known death year)
            if lifespans:
                print(f"\nLifespan statistics:")
                print(f"Min: {min(lifespans)} years")
                print(f"Max: {max(lifespans)} years")
                print(f"Mean: {np.mean(lifespans):.1f} years")
                print(f"Median: {np.median(lifespans):.1f} years")

                plt.figure(figsize=(10, 6))
                sns.histplot(lifespans, bins=20, kde=True)
                plt.title('Lifespan Distribution')
                plt.xlabel('Lifespan (years)')
                plt.ylabel('Frequency')
                plt.grid(True)
                plt.tight_layout()
                plt.show()
        else:
            print("No birth/death years extracted")
else:
    print("Sample data not available for analysis")

## 5. Subject Analysis

Let's analyze the subjects associated with persons to understand the distribution of topics.

In [None]:
def split_subjects(subject_str):
    """Split one raw `subjects` value into individual, stripped subject terms.

    A semicolon takes precedence as the delimiter; a comma is used only when no
    semicolon is present; otherwise the whole value is a single term. Empty
    fragments (from trailing or doubled delimiters) are discarded so they do not
    show up as a bogus '' subject in the counts. The value is stringified first
    so non-string cells do not raise.
    """
    text = str(subject_str)
    if ';' in text:
        fragments = text.split(';')
    elif ',' in text:
        # NOTE(review): a comma may also occur inside a single subject heading —
        # confirm the delimiter actually used by the source data.
        fragments = text.split(',')
    else:
        fragments = [text]
    return [fragment.strip() for fragment in fragments if fragment.strip()]


if 'sample_df' in locals() and 'subjects' in sample_df.columns:
    # Count null values
    null_count = sample_df['subjects'].isna().sum()
    print(f"Subjects with null values: {null_count} ({null_count/len(sample_df)*100:.1f}%)")

    # Flatten every non-null subjects value into one list of terms
    non_null_subjects = sample_df['subjects'].dropna()
    all_subjects = [
        subject
        for subject_str in non_null_subjects
        for subject in split_subjects(subject_str)
    ]

    if not all_subjects:
        print("\nNo subject terms found in the sample data")
    else:
        # Count subject frequency
        subject_counts = Counter(all_subjects)

        print(f"\nTotal unique subjects: {len(subject_counts)}")
        print(f"Total subject mentions: {sum(subject_counts.values())}")

        # Most common subjects
        print("\nTop 20 most common subjects:")
        for subject, count in subject_counts.most_common(20):
            print(f"  - {subject}: {count}")

        # Plot subject distribution
        plt.figure(figsize=(12, 8))
        top_subjects = dict(subject_counts.most_common(15))
        plt.bar(top_subjects.keys(), top_subjects.values())
        plt.title('Top 15 Most Common Subjects')
        plt.xlabel('Subject')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

        # Subject length distribution
        subject_lengths = [len(subject) for subject in all_subjects]

        plt.figure(figsize=(10, 6))
        sns.histplot(subject_lengths, bins=30, kde=True)
        plt.title('Distribution of Subject Lengths')
        plt.xlabel('Subject Length (characters)')
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.show()
else:
    print("Subject data not available for analysis")

## 6. Ground Truth Analysis

Let's analyze the ground truth data to understand the characteristics of matching and non-matching pairs.

In [None]:
def describe_record(record, max_title_chars=50):
    """Format one catalog record as '<person> - <first chars of title>...'.

    `record` is any mapping with 'person' and 'title' keys (e.g. a DataFrame
    row). A missing title is NaN, a float that cannot be sliced, so it is
    rendered as '(no title)' instead of raising.
    """
    title = record['title']
    if pd.isna(title):
        title_text = "(no title)"
    else:
        title_text = f"{str(title)[:max_title_chars]}..."
    return f"{record['person']} - {title_text}"


def show_example_pairs(pairs_df, records_by_id, max_examples=5):
    """Print up to `max_examples` labeled pairs whose two ids are both known.

    `pairs_df` needs 'left' and 'right' id columns; `records_by_id` is a frame
    indexed by unique person id. Pairs are filtered to those present in the
    sample *before* taking the first `max_examples`, so examples are found even
    when the first few labeled pairs fall outside the loaded sample.
    Returns the number of examples printed.
    """
    known_ids = records_by_id.index
    in_sample = pairs_df['left'].isin(known_ids) & pairs_df['right'].isin(known_ids)

    shown = 0
    for _, row in pairs_df[in_sample].head(max_examples).iterrows():
        shown += 1
        print(f"\nExample {shown}:")
        print(f"Left:  {describe_record(records_by_id.loc[row['left']])}")
        print(f"Right: {describe_record(records_by_id.loc[row['right']])}")
    return shown


if 'ground_truth_df' in locals():
    n_pairs = len(ground_truth_df)
    print(f"Analyzing {n_pairs} labeled pairs")

    # Split into matches and non-matches
    matches_df = ground_truth_df[ground_truth_df['match'] == True]
    non_matches_df = ground_truth_df[ground_truth_df['match'] == False]

    if n_pairs:  # guard the percentage against an empty ground-truth file
        print(f"Matches: {len(matches_df)} ({len(matches_df)/n_pairs*100:.1f}%)")
        print(f"Non-matches: {len(non_matches_df)} ({len(non_matches_df)/n_pairs*100:.1f}%)")

    # If we have access to the sample data, try to find some example pairs
    if 'sample_df' in locals():
        missing_columns = {'personId', 'person', 'title'} - set(sample_df.columns)
        if missing_columns:
            print(f"\nCannot show example pairs; sample data lacks columns: {sorted(missing_columns)}")
        else:
            # One row per person id (the first occurrence), indexed for lookup
            records_by_id = (
                sample_df
                .dropna(subset=['personId'])
                .drop_duplicates(subset='personId')
                .set_index('personId')
            )

            print("\nSample matching pairs:")
            match_count = show_example_pairs(matches_df, records_by_id)
            if match_count == 0:
                print("No matching examples found in the sample data")

            print("\nSample non-matching pairs:")
            non_match_count = show_example_pairs(non_matches_df, records_by_id)
            if non_match_count == 0:
                print("No non-matching examples found in the sample data")
else:
    print("Ground truth data not available for analysis")

## 7. Name Similarity Analysis

Let's analyze the similarity between person names to understand potential matching challenges.

In [None]:
def draw_name_pair(names, rng):
    """Return two names drawn from `names` at two different positions.

    `names` must hold at least two entries; `rng` is a numpy random Generator.
    """
    first_idx, second_idx = rng.choice(len(names), 2, replace=False)
    return str(names[first_idx]), str(names[second_idx])


if 'sample_df' in locals() and 'person' in sample_df.columns:
    # Tunables for this cell
    max_names = 1000             # cap on the number of names compared (performance)
    max_pair_samples = 10000     # random pairs used for the distribution
    high_similarity_draws = 1000 # extra random pairs searched for near-duplicates
    similarity_threshold = 0.8

    # A dedicated, always-seeded generator keeps every draw in this cell
    # reproducible (previously the seed was only set when > 1000 names existed).
    rng = np.random.default_rng(42)

    # Drop missing values first so NaN is not analysed as the name 'nan'
    unique_persons = sample_df['person'].dropna().astype(str).unique()
    if len(unique_persons) > max_names:
        unique_persons = rng.choice(unique_persons, max_names, replace=False)

    n_names = len(unique_persons)
    if n_names < 2:
        # Pair sampling and min()/max() below need at least one pair
        print(f"Only {n_names} unique person name(s) available; need at least 2 for similarity analysis")
    else:
        print(f"Analyzing similarities among {n_names} unique person names")

        # Levenshtein similarity for a random sample of pairs
        # (pairs are drawn independently, so a pair can repeat)
        sample_size = min(max_pair_samples, n_names * (n_names - 1) // 2)
        similarities = [
            compute_levenshtein_similarity(*draw_name_pair(unique_persons, rng))
            for _ in range(sample_size)
        ]

        # Analyze similarity distribution
        print(f"\nLevenshtein similarity statistics:")
        print(f"Min: {min(similarities):.4f}")
        print(f"Max: {max(similarities):.4f}")
        print(f"Mean: {np.mean(similarities):.4f}")
        print(f"Median: {np.median(similarities):.4f}")

        plt.figure(figsize=(10, 6))
        sns.histplot(similarities, bins=50, kde=True)
        plt.title('Distribution of Levenshtein Similarities Between Person Names')
        plt.xlabel('Similarity (0-1)')
        plt.ylabel('Frequency')
        plt.axvline(x=0.7, color='r', linestyle='--', label='Typical similarity threshold (0.7)')
        plt.legend()
        plt.tight_layout()
        plt.show()

        # Find examples of highly similar but different names. The names are
        # unique strings drawn at distinct positions, so a pair is never an
        # exact match and needs no extra equality check.
        high_similarity_pairs = []
        for _ in range(high_similarity_draws):
            name1, name2 = draw_name_pair(unique_persons, rng)
            similarity = compute_levenshtein_similarity(name1, name2)
            if similarity >= similarity_threshold:
                high_similarity_pairs.append((name1, name2, similarity))

        # Display high similarity pairs, most similar first
        if high_similarity_pairs:
            print(f"\n{len(high_similarity_pairs)} pairs with similarity >= {similarity_threshold}:")
            for name1, name2, similarity in sorted(high_similarity_pairs, key=lambda x: x[2], reverse=True)[:10]:
                print(f"  - '{name1}' / '{name2}': {similarity:.4f}")
        else:
            print(f"\nNo pairs found with similarity >= {similarity_threshold}")
else:
    print("Person data not available for similarity analysis")

## 8. Vector Embedding Visualization

Let's generate and visualize some example embeddings to understand how they might differentiate entities.

In [None]:
# Note: This requires access to OpenAI API, so it may not run in all environments
import os

# The openai package is optional for this notebook: report its absence instead
# of failing the whole cell with ImportError.
try:
    from openai import OpenAI
except ImportError:
    OpenAI = None

# Tunables for this cell
max_embedded_names = 30   # keep the API request and the plots small
tsne_perplexity = 5

# The config names the environment variable that holds the API key
openai_config = config.get('openai') or {}
api_key_env = openai_config.get('api_key_env')
api_key = os.environ.get(api_key_env) if api_key_env else None

if OpenAI is None:
    print("openai package not installed; skipping embedding visualization")
elif not api_key:
    print("OpenAI API key not available for embedding visualization")
elif 'sample_df' not in locals() or 'person' not in sample_df.columns:
    print("Person data not available for embedding visualization")
else:
    client = OpenAI(api_key=api_key)

    # Sample a small, reproducible subset of names; stringify once so the API
    # input and every plot label are built from the same values.
    rng = np.random.default_rng(42)
    sample_names = sample_df['person'].dropna().astype(str).unique()
    if len(sample_names) > max_embedded_names:
        sample_names = rng.choice(sample_names, max_embedded_names, replace=False)
    sample_names = [str(name) for name in sample_names]
    short_labels = [name.split(',')[0] for name in sample_names]  # text before the first comma

    embeddings_array = None
    if len(sample_names) < 2:
        print("Need at least two person names for embedding visualization")
    else:
        print(f"Generating embeddings for {len(sample_names)} person names")
        # Only the remote call is best-effort; errors in the plotting code below
        # are not swallowed and mislabeled as embedding failures.
        try:
            response = client.embeddings.create(
                model=config['openai']['embedding_model'],
                input=sample_names
            )
            embeddings_array = np.array([data.embedding for data in response.data])
        except Exception as e:
            print(f"Error generating embeddings: {e}")

    if embeddings_array is not None:
        n_embeddings = len(embeddings_array)
        print(f"Embedding shape: {embeddings_array.shape}")

        # Reduce dimensionality for visualization. t-SNE needs the perplexity to
        # be smaller than the number of samples, so shrink it for tiny samples.
        tsne = TSNE(
            n_components=2,
            perplexity=min(tsne_perplexity, n_embeddings - 1),
            random_state=42
        )
        embeddings_2d = tsne.fit_transform(embeddings_array)

        # Create dataframe for plotting
        viz_df = pd.DataFrame({
            'x': embeddings_2d[:, 0],
            'y': embeddings_2d[:, 1],
            'name': sample_names
        })

        # Plot embeddings
        plt.figure(figsize=(12, 10))
        sns.scatterplot(data=viz_df, x='x', y='y', s=100)

        # Add name labels
        for (_, row), label in zip(viz_df.iterrows(), short_labels):
            plt.text(row['x'], row['y'], label, fontsize=9)

        plt.title('t-SNE Visualization of Person Name Embeddings')
        plt.xlabel('t-SNE Dimension 1')
        plt.ylabel('t-SNE Dimension 2')
        plt.tight_layout()
        plt.show()

        # Pairwise cosine similarities. A separate name keeps this matrix from
        # overwriting the `similarities` list of the name-similarity cell.
        similarity_matrix = np.zeros((n_embeddings, n_embeddings))
        for i in range(n_embeddings):
            for j in range(n_embeddings):
                similarity_matrix[i, j] = compute_vector_similarity(
                    embeddings_array[i], embeddings_array[j], metric='cosine'
                )

        # Create similarity heatmap
        plt.figure(figsize=(12, 10))
        sns.heatmap(similarity_matrix, annot=False, cmap='viridis',
                    xticklabels=short_labels,
                    yticklabels=short_labels)
        plt.title('Cosine Similarity Heatmap of Person Name Embeddings')
        plt.tight_layout()
        plt.show()

        # Most similar pairs: each unordered pair once (upper triangle)
        similar_pairs = [
            (sample_names[i], sample_names[j], similarity_matrix[i, j])
            for i in range(n_embeddings)
            for j in range(i + 1, n_embeddings)
        ]
        similar_pairs.sort(key=lambda x: x[2], reverse=True)

        print("\nMost similar name pairs (by embedding):")
        for name1, name2, similarity in similar_pairs[:10]:
            print(f"  - '{name1}' / '{name2}': {similarity:.4f}")

## 9. Entity Resolution Challenges

Based on our exploration, here are the key challenges for entity resolution in this dataset:

### 9.1 Name Variation Challenges

1. **Name Format Variations**:
   - Last, First vs. First Last
   - Inclusion/exclusion of middle names or initials
   - Variations in suffixes (Jr., Sr., III)

2. **Name Completeness**:
   - Some entries have birth/death years, others don't
   - Abbreviated vs. full names
   - Inclusion/exclusion of titles (Dr., Prof.)

3. **Common Names**:
   - Many common names that could refer to different individuals
   - Need to use contextual information from other fields

### 9.2 Temporal Challenges

1. **Publication Date vs. Author Lifetime**:
   - Publications may occur long after an author's death
   - Historical figures may have works published centuries later

2. **Missing Temporal Information**:
   - Many records lack birth/death years
   - Publication dates may be uncertain or missing

### 9.3 Role and Context Challenges

1. **Multiple Roles**:
   - Same person may appear with different roles across records
   - Need to distinguish between primary and secondary roles

2. **Subject Matter Diversity**:
   - Same person may be associated with diverse subjects
   - Need to account for multidisciplinary individuals

### 9.4 Data Quality Challenges

1. **Missing Values**:
   - Significant proportion of records with null values in some fields
   - Need robust imputation strategies

2. **Inconsistent Formats**:
   - Variations in how the same information is recorded
   - Need to normalize data before comparison

### 9.5 Multilingual Challenges

1. **Name Transliteration**:
   - Same person may have differently transliterated names
   - Names from non-Latin scripts may have variant spellings

2. **Title/Subject Translation**:
   - Same work may have titles in different languages
   - Subject terms may be in multiple languages

## 10. Conclusions and Recommendations

Based on our exploration, here are key recommendations for the entity resolution pipeline:

### 10.1 Feature Engineering Recommendations

1. **Name-Based Features**:
   - Create specialized features for name comparison (normalized Levenshtein)
   - Implement logic to handle birth/death years as strong signals
   - Develop features for name parts (first name, last name, middle initial)

2. **Contextual Features**:
   - Use title and subject similarity as contextual signals
   - Develop temporal overlap features that account for posthumous publications
   - Create interaction features between name and context similarities

3. **Vector-Based Features**:
   - Use field-specific embeddings for specialized comparison
   - Create composite embeddings that combine multiple fields
   - Implement harmonic means of similarities to handle missing values

### 10.2 Architecture Recommendations

1. **Robust Imputation**:
   - Implement vector-based hot deck imputation for missing fields
   - Create specialized imputation strategies for temporal data

2. **Similarity Thresholds**:
   - Use adaptive thresholds based on name uniqueness
   - Implement higher thresholds for common names

3. **Classification Strategy**:
   - Implement a two-stage approach: blocking followed by detailed comparison
   - Use embeddings for efficient blocking
   - Prioritize precision over recall for ambiguous cases

4. **Result Validation**:
   - Implement logical constraints to ensure consistency
   - Flag potentially problematic matches for human review
   - Create a feedback mechanism to learn from corrections

### 10.3 Implementation Considerations

1. **Scalability**:
   - Optimize batch size for API rate limits
   - Implement efficient vector indexing with Weaviate
   - Use parallel processing for feature extraction

2. **Evaluation Strategy**:
   - Develop specialized metrics for library catalog contexts
   - Create visualization tools for cluster analysis
   - Implement detailed error analysis capabilities

3. **Extensibility**:
   - Design for multilingual support from the beginning
   - Create configurable feature sets for different collection types
   - Implement modular architecture for easy component updates