In [1]:
# Cell 1: Import libraries
import spacy
import os 
import json
import re
from collections import Counter
import pandas as pd
import xml.etree.ElementTree as ET # Added for XML parsing

print("Libraries imported.")

Libraries imported.


## Load Cleaned Text
Read the content of the cleaned text file created by `00_pre_proc.ipynb`.

In [3]:
# Cell 2: Load cleaned text
input_file_path = "../data/pp_cleaned.txt" # Make sure this path is correct
cleaned_text = None

try:
    with open(input_file_path, 'r', encoding='utf-8') as file:
        cleaned_text = file.read()
    print(f"Successfully loaded cleaned text from: {input_file_path}")
    print(f"Text length: {len(cleaned_text)} characters")
except FileNotFoundError:
    print(f"Error: Cleaned text file not found at {input_file_path}")
except Exception as e:
    print(f"An error occurred loading the file: {e}")

Successfully loaded cleaned text from: ../data/pp_cleaned.txt
Text length: 723733 characters


## Load spaCy Model - First attempt with _lg then _trf

In [None]:
# Cell 3: Load spaCy model
nlp = None
if cleaned_text:
    try:
        # Make sure you have downloaded the model: python -m spacy download en_core_web_lg
        nlp = spacy.load("en_core_web_trf") 
        print("spaCy model loaded.")
        print("Pipeline components:", nlp.pipe_names) # Should show 'ner' among others
    except OSError:
        print("Error: spaCy model not found.")
        print("Download it by running: python -m spacy download name")
    except Exception as e:
        print(f"An error occurred loading the spaCy model: {e}")
else:
    print("Skipping spaCy model loading as cleaned_text is not available.")

In [None]:
# Cell 4: Process text with spaCy NER pipeline
doc = None
if cleaned_text and nlp:
     print("Processing text with spaCy NER pipeline (this may take some time)...")
     # Increase max_length if needed, but be mindful of memory usage
     # nlp.max_length = len(cleaned_text) + 100 
     try:
         doc = nlp(cleaned_text)
         print("Text processing complete.")
     except ValueError as ve:
         print(f"ValueError during processing: {ve}")
         print("The text might be too long for the default spaCy model settings.")
         print("Consider increasing nlp.max_length or processing in chunks.")
     except Exception as e:
         print(f"An unexpected error occurred during text processing: {e}")
else:
     print("Skipping text processing as cleaned_text or nlp model is not available.")

## Process Text for Named Entities

## Extract PERSON Entities

In [None]:
# Cell 5: Extract PERSON entities found by NER
person_mentions = []
if doc:
    print("\n--- Extracting PERSON Entities ---")
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            # Store text, start/end character offsets
            person_mentions.append({
                "text": ent.text,
                "start_char": ent.start_char,
                "end_char": ent.end_char
            })
    print(f"Found {len(person_mentions)} PERSON mentions.")
    # Optional: Print first few mentions
    if person_mentions:
         print("First 10 mentions:", person_mentions[:10])
else:
     print("Skipping extraction as doc object is not available.")

In [None]:
# Cell 6: Save raw PERSON mentions (optional but good for debugging)
if person_mentions:
    output_data_path = "../data/ner_person_mentions_trf.json" # Choose your output format/name

    print(f"\nSaving extracted PERSON mentions to {output_data_path}...")
    try:
        # Ensure data directory exists
        os.makedirs(os.path.dirname(output_data_path), exist_ok=True)
        with open(output_data_path, 'w', encoding='utf-8') as f:
            json.dump(person_mentions, f, ensure_ascii=False, indent=4)
        print("Raw NER results saved successfully.")
    except Exception as e:
        print(f"Error saving NER results: {e}")
else:
    print("\nNo PERSON mentions extracted to save.")

## We should improve NER - Second attempt way better

In [8]:
from transformers import pipeline

# --- Load your full cleaned_text variable ---
# (Make sure this is the complete text)
# cleaned_text = ...

# --- Initialize the pipeline ---
literary_ner = pipeline("ner", model="compnet-renard/bert-base-cased-literary-NER", aggregation_strategy="first")

# --- Define Chunking Parameters (Example) ---
# You'll need to determine good values based on model limits (e.g., target ~400-500 tokens)
# For simplicity, let's use character count here, but token count is better.
chunk_size = 2000 # Number of characters per chunk (adjust!)
overlap = 200    # Number of characters overlap (adjust!)

# --- Process in Chunks ---
all_person_mentions = []
all_location_mentions = []
current_pos = 0

print(f"DEBUG: Starting chunk processing. Total text length: {len(cleaned_text)}")

while current_pos < len(cleaned_text):
    chunk_start = current_pos
    chunk_end = min(current_pos + chunk_size, len(cleaned_text))
    text_chunk = cleaned_text[chunk_start:chunk_end]

    # print(f"DEBUG: Processing chunk: {chunk_start} - {chunk_end}") # Optional debug

    if not text_chunk.strip(): # Skip empty chunks if any
        current_pos += chunk_size - overlap
        continue

    try:
         # Run NER on the chunk
         ner_results_chunk = literary_ner(text_chunk)

         # Process results for this chunk
         for entity in ner_results_chunk:
             # Adjust character offsets to be relative to the full text
             original_start = chunk_start + entity['start']
             original_end = chunk_start + entity['end']

             mention_data = {
                 "text": entity['word'],
                 "start_char": original_start,
                 "end_char": original_end
                 # Add score if needed: "score": entity['score']
             }

             if entity.get('entity_group') == 'PER':
                 all_person_mentions.append(mention_data)
             elif entity.get('entity_group') == 'LOC':
                 all_location_mentions.append(mention_data)

    except Exception as e:
         print(f"ERROR processing chunk {chunk_start}-{chunk_end}: {e}") # Log errors

    # Move to the next chunk position
    # If it's the last chunk, stop
    if chunk_end == len(cleaned_text):
        break
    current_pos += chunk_size - overlap # Move forward, maintaining overlap


print(f"DEBUG: Chunk processing finished.")
print(f"DEBUG: Total PERSON mentions found across all chunks: {len(all_person_mentions)}")
print(f"DEBUG: Total LOCATION mentions found across all chunks: {len(all_location_mentions)}")

# --- Remove duplicates ---
# Entities might be detected in overlapping regions, need deduplication
# Simple deduplication based on exact start/end/text match
unique_person_mentions_set = set()
unique_person_mentions = []
for mention in all_person_mentions:
    mention_tuple = (mention['text'], mention['start_char'], mention['end_char'])
    if mention_tuple not in unique_person_mentions_set:
        unique_person_mentions_set.add(mention_tuple)
        unique_person_mentions.append(mention)

print(f"DEBUG: Unique PERSON mentions after deduplication: {len(unique_person_mentions)}")


# --- Now use unique_person_mentions for filtering, saving, consolidation ---
person_mentions = unique_person_mentions # Assign to the variable name used later

# After your NER processing and deduplication...
if person_mentions:
    import json
    output_data_path = "../data/ner_person_mentions_bert.json"
    
    # Save to JSON file
    try:
        with open(output_data_path, 'w', encoding='utf-8') as f:
            json.dump(person_mentions, f, ensure_ascii=False, indent=2)
        print(f"Successfully saved {len(person_mentions)} person mentions to {output_data_path}")
    except Exception as e:
        print(f"Error saving to JSON: {e}")
else:
    print("No person mentions found to save")

# Optional: Print first few entries to verify the data
print("\nFirst few person mentions:")
for mention in person_mentions[:5]:
    print(mention)

Device set to use cpu


DEBUG: Starting chunk processing. Total text length: 723733
DEBUG: Chunk processing finished.
DEBUG: Total PERSON mentions found across all chunks: 5498
DEBUG: Total LOCATION mentions found across all chunks: 538
DEBUG: Unique PERSON mentions after deduplication: 5198
Successfully saved 5198 person mentions to ../data/ner_person_mentions_bert.json

First few person mentions:
{'text': 'Jane Austen', 'start_char': 28, 'end_char': 39}
{'text': 'George Saintsbury', 'start_char': 62, 'end_char': 79}
{'text': 'Hugh Thomson Ruskin', 'start_char': 101, 'end_char': 123}
{'text': 'George Allen', 'start_char': 179, 'end_char': 191}
{'text': 'Walt Whitman', 'start_char': 294, 'end_char': 306}


## Filtering

In [12]:

# List of standalone titles/honorifics to filter if they appear alone
standalone_titles = {"mr", "mrs", "miss", "ms", "dr", "lady", "sir", "colonel", "captain", "lord"} # Add more
# Characters considered punctuation for stripping/checking
import string
punctuation_chars = string.punctuation # Gets '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# --- Implement Post-NER Filtering ---
filtered_person_mentions = []
if person_mentions: # Use the list generated by the HF model
    print(f"DEBUG: Starting Post-NER Filtering on {len(person_mentions)} mentions...")
    for mention in person_mentions:
        original_text = mention['text']

        # 1. Basic Cleaning: Remove leading/trailing whitespace and punctuation
        #    Example: ". Bennet " -> "Bennet" ; "." -> "" ; "Mr." -> "Mr"
        cleaned_text = original_text.strip().strip(punctuation_chars)
        cleaned_text_lower = cleaned_text.lower()

        # --- Apply Filters ---
        # Filter 1: Check if empty after cleaning (e.g., if it was just ".")
        if not cleaned_text:
            # print(f"Filtering empty/punctuation mention: '{original_text}'") # Optional debug
            continue

        # Filter 2: Filter if it's just a standalone title
        if cleaned_text_lower in standalone_titles:
            # print(f"Filtering standalone title: '{original_text}' -> '{cleaned_text}'") # Optional debug
            continue


        # Filter 3: Optional - Add more sophisticated checks if needed (e.g., for plurals)

        # --- If it passes all filters, add it ---
        # Decide whether to store the cleaned or original text for consolidation
        mention_to_add = {
            # Use cleaned_text if you want consolidation rules to work on "Bennet" instead of ". Bennet"
            "text": cleaned_text,
            # Keep original if useful for reference
            # "original_text": original_text,
            "start_char": mention['start_char'],
            "end_char": mention['end_char']
        }
        filtered_person_mentions.append(mention_to_add)

    print(f"DEBUG: Mentions remaining after filtering: {len(filtered_person_mentions)}")
else:
     print("DEBUG: Initial person_mentions list was empty. Skipping filtering.")


# --- Save the FILTERED list ---
# Make sure to save 'filtered_person_mentions' to your JSON file now
output_filtered_path = "../data/ner_person_mentions_bert_filtered.json"
if filtered_person_mentions:
    print(f"DEBUG: Saving {len(filtered_person_mentions)} filtered mentions to {output_filtered_path}...")
    import json
    import os
    try:
        os.makedirs(os.path.dirname(output_filtered_path), exist_ok=True)
        with open(output_filtered_path, 'w', encoding='utf-8') as f:
            json.dump(filtered_person_mentions, f, indent=4)
        print(f"DEBUG: Successfully saved filtered mentions.")
    except Exception as e:
        print(f"DEBUG: Error saving filtered JSON: {e}")
else:
     print("DEBUG: No filtered person mentions to save.")


# --- Proceed with Consolidation ---
# Load the *filtered* JSON file in the next steps for consolidation.

DEBUG: Starting Post-NER Filtering on 5198 mentions...
DEBUG: Mentions remaining after filtering: 3986
DEBUG: Saving 3986 filtered mentions to ../data/ner_person_mentions_bert_filtered.json...
DEBUG: Successfully saved filtered mentions.


In [None]:
## Consolidate Characters using Alias Map - xml approach downn below
## here is the one by avoiding using annotation and use nicknames

In [15]:
# --- Cell for Character Consolidation (in Notebook 01) ---
import json
from collections import Counter, defaultdict
import re
from nicknames import NickNamer # Import the library

# --- Load Filtered Mentions ---
filtered_mentions_path = "../data/ner_person_mentions_bert_filtered.json" # Path to filtered mentions
try:
    with open(filtered_mentions_path, 'r', encoding='utf-8') as f:
        filtered_person_mentions = json.load(f)
    print(f"Loaded {len(filtered_person_mentions)} filtered mentions from {filtered_mentions_path}")
except Exception as e:
    print(f"Error loading filtered mentions JSON: {e}")
    filtered_person_mentions = []

# --- Initialize NickNamer ---
try:
    nn = NickNamer()
    print("NickNamer initialized.")
except Exception as e:
    print(f"Warning: Could not initialize NickNamer. Nickname lookup disabled. Error: {e}")
    nn = None # Disable nickname lookup if initialization fails

# --- Define Titles (for fallback/normalization if not a nickname) ---
# (You might already have this list defined earlier in the notebook)
titles = {"mr", "mrs", "miss", "ms", "dr", "lady", "sir", "colonel", "captain", "lord"} # Lowercase

# --- Consolidation Logic ---
consolidated_characters = defaultdict(lambda: {"count": 0, "variations": set()})
mention_texts_for_fallback = [] # Collect texts to determine fallback keys

if filtered_person_mentions:
    print("Starting character consolidation using NickNamer and rules...")
    for mention_data in filtered_person_mentions:
        mention_text = mention_data['text'] # Assumes 'text' key holds the cleaned text from filtering step
        mention_lower = mention_text.lower()

        canonical_base = None

        # --- Step 1: Check Nickname Dictionary ---
        if nn: # Only if NickNamer initialized successfully
            formal_names = nn.canonicals_of(mention_lower)
            if formal_names:
                # Found potential formal name(s)
                # Simple strategy: use the first one found.
                # Could be enhanced to handle multiple formal names if needed.
                canonical_base = list(formal_names)[0]
                # print(f"DEBUG: Nickname mapping: '{mention_text}' -> '{canonical_base}'") # Optional debug

        # --- Step 2: Fallback to Rule-Based Normalization (if not found in nicknames) ---
        if canonical_base is None:
            parts = mention_text.split()
            # Simple title stripping (if title is first word)
            if parts and parts[0].lower().strip('.') in titles:
                 canonical_base = " ".join(parts[1:]) # Use name after title
            else:
                 canonical_base = mention_text # Use original (cleaned) text

            # If after stripping title, the name is empty, maybe use original text?
            if not canonical_base:
                 canonical_base = mention_text

            canonical_base = canonical_base.lower() # Ensure consistent casing for grouping

        # --- Grouping ---
        # Use the derived canonical_base for grouping
        # Let's refine the canonical key later based on frequency
        consolidated_characters[canonical_base]["count"] += 1
        consolidated_characters[canonical_base]["variations"].add(mention_text) # Add original mention text as variation
        mention_texts_for_fallback.append(mention_text)

    # --- Step 3: Refine Canonical Keys (Example: Use most frequent variation as key) ---
    print("Refining canonical keys based on frequency...")
    final_consolidated = defaultdict(lambda: {"count": 0, "variations": set()})
    # Count frequencies of original mentions
    mention_counts = Counter(mention_texts_for_fallback)

    processed_bases = set()
    for base, data in consolidated_characters.items():
        if base in processed_bases: continue # Avoid reprocessing if already handled

        # Find the most frequent original mention text associated with this base group
        most_frequent_variation = base # Default to the base itself
        max_freq = 0
        current_group_variations = data["variations"]

        for variation in current_group_variations:
             if mention_counts[variation] > max_freq:
                 max_freq = mention_counts[variation]
                 most_frequent_variation = variation

        # Create the final canonical key (e.g., Title_Case, underscore for space)
        # Handle potential titles still present if base == variation
        parts = most_frequent_variation.split()
        final_key_parts = []
        title_prefix = ""
        start_index = 0
        if parts and parts[0].lower().strip('.') in titles:
             title_prefix = parts[0].strip('.').capitalize() + "_"
             start_index = 1

        final_key_parts = [part.capitalize() for part in parts[start_index:]]
        final_canonical_key = title_prefix + "_".join(final_key_parts)

        # If key ends up empty, use a fallback
        if not final_canonical_key:
            final_canonical_key = f"Unknown_{base[:10]}" # Fallback key

        # Merge data for this group under the final key
        final_consolidated[final_canonical_key]["count"] += data["count"]
        final_consolidated[final_canonical_key]["variations"].update(data["variations"])
        processed_bases.add(base)


    # --- Convert sets to lists for saving ---
    final_output_list = []
    for key, data in final_consolidated.items():
        final_output_list.append({
            "canonical_key": key,
            "total_mentions": data["count"],
            "variations": sorted(list(data["variations"])), # Sorted list
            "variation_count": len(data["variations"])
            # Note: gender_from_xml is removed as we are not using XML aliases
        })

    # Sort by total mentions (descending)
    final_output_list.sort(key=lambda x: x['total_mentions'], reverse=True)

    # --- Save to CSV ---
    output_csv = "../data/character_analysis_consolidated_nicknames.csv" # New name
    output_json = "../data/character_groups_consolidated_nicknames.json" # New name
    try:
        df_consolidated = pd.DataFrame(final_output_list)
        df_consolidated.to_csv(output_csv, index=False)
        # Save the dict version to JSON as before (optional)
        # with open(output_json, 'w', encoding='utf-8') as f:
        #     json.dump(final_consolidated, f, indent=4, default=lambda x: list(x) if isinstance(x, set) else x) # Handle sets for JSON
        print(f"\nSuccessfully saved consolidated results (using nicknames) to '{output_csv}'")

    except Exception as e:
        print(f"\nError saving consolidated results: {e}")

else:
    print("\nSkipping consolidation as no filtered PERSON mentions were loaded.")

Loaded 3986 filtered mentions from ../data/ner_person_mentions_bert_filtered.json
NickNamer initialized.
Starting character consolidation using NickNamer and rules...
Refining canonical keys based on frequency...

Successfully saved consolidated results (using nicknames) to '../data/character_analysis_consolidated_nicknames.csv'


## Load Aliases from Annotated XML

To improve character consolidation (e.g., grouping 'Lizzy' with 'Elizabeth'), we load the character list and aliases from the annotated XML file (`pp_full.xml`). This acts as a lookup table based on our 'training' data.

In [13]:
# Cell 7: Load aliases from XML
xml_file_path = "../data/pp_full.xml" # Path to the annotated file
alias_to_canonical = {}
canonical_name_gender = {} # Store gender info from XML as well

print(f"Loading aliases from {xml_file_path}...")
try:
    tree = ET.parse(xml_file_path)
    root = tree.getroot()
    characters_element = root.find('characters')
    
    if characters_element is not None:
        for character in characters_element.findall('character'):
            canonical_name = character.get('name')
            aliases_str = character.get('aliases')
            gender = character.get('gender') # Get gender attribute
            
            if not canonical_name:
                continue # Skip if character has no canonical name
                
            # Store gender for the canonical name
            if gender:
                canonical_name_gender[canonical_name] = gender.capitalize()
            
            # Add canonical name itself to map (normalized)
            normalized_canonical = canonical_name.lower().strip()
            if normalized_canonical:
                 alias_to_canonical[normalized_canonical] = canonical_name
            
            # Add aliases to map (normalized)
            if aliases_str:
                aliases = aliases_str.split(';')
                for alias in aliases:
                    normalized_alias = alias.lower().strip()
                    if normalized_alias:
                        # Simple approach: map alias to canonical name
                        # More complex: handle ambiguous aliases like 'Miss Bennet' if needed
                        if normalized_alias not in alias_to_canonical: # Avoid overwriting if already mapped
                             alias_to_canonical[normalized_alias] = canonical_name
                        # else: print(f"Ambiguous or duplicate alias ignored: {normalized_alias}")
                             
        print(f"Created alias map with {len(alias_to_canonical)} entries.")
        # print("Sample alias map:", dict(list(alias_to_canonical.items())[:15]))
        # print("Canonical genders:", canonical_name_gender)
    else:
        print("Error: <characters> tag not found in XML.")

except FileNotFoundError:
    print(f"Error: XML file not found at {xml_file_path}")
except ET.ParseError as pe:
    print(f"Error parsing XML file {xml_file_path}: {pe}")
except Exception as e:
    print(f"An unexpected error occurred loading aliases: {e}")


Loading aliases from ../data/pp_full.xml...
Created alias map with 100 entries.


## Consolidate Characters using Alias Map

Now, group the PERSON mentions found by NER using the alias map. If a mention matches an alias, group it under the canonical name. Otherwise, use fallback logic (e.g., normalized name or surname).

In [14]:
# Cell 8: Consolidate character mentions using the alias map

character_groups = {}
mentions_processed = 0
mentions_mapped = 0

#print(person_mentions)

if filtered_person_mentions: # Check if NER ran successfully
    print("Consolidating mentions using alias map...")
    for mention in filtered_person_mentions:
        mentions_processed += 1
        original_mention_text = mention['text']
        normalized_mention = original_mention_text.lower().strip()
        
        # Attempt to find canonical name using the alias map
        canonical_name = alias_to_canonical.get(normalized_mention)
        
        char_key = None
        if canonical_name:
            # Found in alias map, use the canonical name as the key
            char_key = canonical_name
            mentions_mapped += 1
        else:
            # Fallback: Use the original mention text as key for now
            # More sophisticated fallback (like surname grouping) could be added here
            # We also might want to filter out non-names like 'Meryton' here
            # Simple filter: skip if it looks like a place (e.g., capitalized, maybe check against known places)
            # For now, let's just use the original text as key if no alias match
            char_key = original_mention_text 
            # Basic check to avoid adding obvious non-names if desired
            if char_key in ['Meryton', 'London', 'Hertfordshire', 'Kent', 'Derbyshire']: # Example filter
                 continue
            
        # Add to groups
        if char_key not in character_groups:
            character_groups[char_key] = {
                'variations': set(),
                'count': 0
            }
        
        character_groups[char_key]['variations'].add(original_mention_text)
        character_groups[char_key]['count'] += 1
        
    print(f"Consolidation complete. Processed {mentions_processed} mentions.")
    print(f"Mapped {mentions_mapped} mentions using aliases.")
    print(f"Resulting unique character keys: {len(character_groups)}")

    # --- Create DataFrame for analysis ---
    character_data_list = []
    for key, data in character_groups.items():
         # Get gender from XML if available for this canonical key
         xml_gender = canonical_name_gender.get(key, None) 
         character_data_list.append({
            'canonical_key': key, # Changed 'key' to 'canonical_key'
            'total_mentions': data['count'],
            'variations': ', '.join(sorted(list(data['variations']))), # Sort variations for consistency
            'variation_count': len(data['variations']),
            'gender_from_xml': xml_gender # Add gender from XML if found
        })
        
    character_df = pd.DataFrame(character_data_list)

    # Sort by total mentions to see main characters
    character_df = character_df.sort_values('total_mentions', ascending=False)

    print("\nTop characters after consolidation:")
    print(character_df.head(20))

    # --- Save the structured character data ---
    output_csv = '../data/character_analysis_consolidated.csv'
    output_json = '../data/character_groups_consolidated.json'
    
    try:
        character_df.to_csv(output_csv, index=False)
        # Save JSON compatible format
        save_groups = {k: {'variations': list(v['variations']), 'count': v['count']} 
                       for k, v in character_groups.items()}
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(save_groups, f, indent=4, ensure_ascii=False)
        print(f"\nConsolidated character analysis data saved to '{output_csv}' and '{output_json}'")
    except Exception as e:
        print(f"\nError saving consolidated results: {e}")
        
else:
    print("\nSkipping consolidation as no PERSON mentions were extracted.")

Consolidating mentions using alias map...
Consolidation complete. Processed 3986 mentions.
Mapped 2969 mentions using aliases.
Resulting unique character keys: 206

Top characters after consolidation:
           canonical_key  total_mentions  \
25      Elizabeth_Bennet             803   
24              Mr_Darcy             406   
19           Jane_Bennet             296   
36            Mrs_Bennet             286   
20            Mr_Bingley             235   
17            Mr_Wickham             211   
21               Collins             178   
16          Lydia_Bennet             176   
50        Lady_Catherine             132   
62             Charlotte             113   
81      Caroline_Bingley              98   
139             Gardiner              80   
54          Kitty_Bennet              72   
76           Sir_William              42   
7            Mary_Bennet              42   
60            Miss Darcy              40   
80                 Hurst              32   
150  Co

## Next Steps

1.  **Review Output:** Check the new `character_analysis_consolidated.csv`. Does it correctly group 'Lizzy' under 'Elizabeth_Bennet'? Are other characters consolidated better?
2.  **Refine Alias Map/Fallback:** You might need to refine how ambiguous aliases (like 'Miss Bennet') are handled or improve the fallback logic for mentions not in the alias map.
3.  **Proceed to Gender Classification:** Now you can run the `02_gender_classification.ipynb` notebook, making sure it loads this *new* consolidated CSV (`character_analysis_consolidated.csv`). The gender classification should now operate on the canonical keys.

In [None]:
# Cell: Fuzzy Matching for Character Consolidation (Corrected)
import json
from collections import defaultdict
from thefuzz import fuzz # Or from fuzzywuzzy import fuzz
from thefuzz import process # Or from fuzzywuzzy import process

# --- Configuration ---
filtered_mentions_path = "../data/ner_person_mentions_bert_filtered.json" # Path to filtered mentions
similarity_threshold = 91 # Adjust this threshold (0-100) as needed
output_grouped_path = "../data/character_groups_fuzzy_matched.json" # Optional: Save results

# --- Load Filtered Mentions ---
try:
    with open(filtered_mentions_path, 'r', encoding='utf-8') as f:
        filtered_person_mentions = json.load(f)
    print(f"Loaded {len(filtered_person_mentions)} filtered mentions from {filtered_mentions_path}")
except Exception as e:
    print(f"Error loading filtered mentions JSON: {e}")
    filtered_person_mentions = []

# --- Extract Unique Names ---
unique_names = sorted(list(set(mention['text'] for mention in filtered_person_mentions)))
print(f"Found {len(unique_names)} unique names for fuzzy matching.")

# --- Perform Fuzzy Matching ---
grouped_names = defaultdict(list)
processed_names = set() # Keep track of names already assigned to a group

if unique_names:
    print(f"Starting fuzzy matching with threshold {similarity_threshold}...")
    for name in unique_names:
        if name in processed_names:
            continue # Skip if already part of a group

        # Create a list of names not yet processed
        choices = [n for n in unique_names if n not in processed_names]

        # Find similar names
        similar_matches = process.extractBests(name, choices, scorer=fuzz.token_sort_ratio, score_cutoff=similarity_threshold, limit=None) # limit=None gets all matches above cutoff

        # The base name for the group will be the current 'name'
        current_group = [name]
        processed_names.add(name)

        # Add similar matches found to the group and mark them as processed
        # --- CORRECTED LINE BELOW ---
        for match, score in similar_matches:
            if match != name: # Don't add the name itself again
                 current_group.append(match)
                 processed_names.add(match)
                 # print(f"  Grouping '{match}' with '{name}' (Score: {score})") # Optional debug

        # Use the first name in the sorted group as the canonical key (or keep 'name')
        canonical_key = sorted(current_group)[0]
        grouped_names[canonical_key].extend(current_group)

    print(f"Fuzzy matching finished. Found {len(grouped_names)} groups.")

    # --- Print Groups (Optional) ---
    print("\n--- Fuzzy Matched Groups ---")
    group_count = 0
    for key, names in grouped_names.items():
        if len(names) > 1: # Only print groups with more than one variation
             print(f"Group '{key}': {sorted(list(set(names)))}") # Show unique names per group
             group_count += 1
    print(f"\nDisplayed {group_count} groups with multiple variations.")

    # --- Optional: Save Grouped Results ---
    try:
        # Convert sets to lists for JSON compatibility if needed, though list is used above
        save_data = {key: sorted(list(set(names))) for key, names in grouped_names.items()}
        with open(output_grouped_path, 'w', encoding='utf-8') as f:
            json.dump(save_data, f, indent=2)
        print(f"\nSuccessfully saved fuzzy matched groups to {output_grouped_path}")
    except Exception as e:
        print(f"\nError saving fuzzy matched groups: {e}")

else:
    print("Skipping fuzzy matching as no unique names were found.")

# --- NEXT STEPS ---
# Remember to adapt the next cell (b524dadd) to use 'grouped_names' or the output file.

Loaded 3986 filtered mentions from ../data/ner_person_mentions_bert_filtered.json
Found 235 unique names for fuzzy matching.
Starting fuzzy matching with threshold 91...
Fuzzy matching finished. Found 205 groups.

--- Fuzzy Matched Groups ---
Group ' Addison': [' Addison', 'Addison']
Group ' Austen Leigh': [' Austen Leigh', 'Austen Leigh']
Group ' Bennet': [' Bennet', 'Bennet', 'Bennets']
Group ' Bingley': [' Bingley', 'Bingley', 'Bingleys']
Group ' Collins': [' Collins', 'Collins']
Group ' Darcy': [' Darcy', 'Darcy']
Group ' Denny': [' Denny', 'Denny']
Group ' Forster': [' Forster', 'Forster', 'Forsters']
Group ' Gardiner': [' Gardiner', 'Gardiner', 'Gardiners']
Group ' Hill': [' Hill', 'Hill']
Group ' Hurst': [' Hurst', 'Hurst']
Group ' Jones': [' Jones', 'Jones']
Group ' Long': [' Long', 'Long']
Group ' Nichols': [' Nichols', 'Nicholls']
Group ' Norris': [' Norris', 'Norris']
Group ' Philips': [' Philips', 'Philips']
Group ' Reynolds': [' Reynolds', 'Reynolds']
Group ' Robinson': ['