In [9]:
# Cell 1: Import libraries
import spacy
import os 
import json
import re
from collections import Counter
import pandas as pd
import xml.etree.ElementTree as ET # Added in the early steps for XML parsing

print("Libraries imported.")

Libraries imported.


## Load Cleaned Text
Read the content of the cleaned text file created by `00_pre_proc.ipynb`.

In [10]:
# Cell 2: Load cleaned text
# Reads the entire cleaned text file into `cleaned_text`. On any failure the
# variable keeps its initial value of None so later cells can skip their work.
input_file_path = "../data/pp_cleaned.txt" # Make sure this path is correct
cleaned_text = None

try:
    with open(input_file_path, mode='r', encoding='utf-8') as text_file:
        cleaned_text = text_file.read()
    print(f"Successfully loaded cleaned text from: {input_file_path}")
    char_count = len(cleaned_text)
    print(f"Text length: {char_count} characters")
except FileNotFoundError:
    # Missing input: point the reader at the notebook that produces it.
    print(f"Error: Cleaned text file not found at {input_file_path}")
    print("Please ensure '00_pre_proc.ipynb' was run successfully and saved the file.")
except Exception as load_error:
    # Anything else (permissions, decoding, ...) is reported, not raised.
    print(f"An error occurred loading the file: {load_error}")

Successfully loaded cleaned text from: ../data/pp_cleaned.txt
Text length: 723733 characters


## Load spaCy Model

In [11]:
# Cell 3: Load spaCy model
# Loads the transformer pipeline into `nlp`. `nlp` stays None when the text was
# not loaded or the model cannot be loaded, so later cells can skip their work.
#
# The model name lives in one constant so that the load call, the progress
# message and the "how to install" hints can never disagree (previously the
# hints named 'en_core_web_lg' while the code loaded 'en_core_web_trf').
SPACY_MODEL_NAME = "en_core_web_trf"

nlp = None
if cleaned_text:
    print(f"Loading spaCy model '{SPACY_MODEL_NAME}'...")
    try:
        # Make sure you have downloaded the model: python -m spacy download en_core_web_trf
        nlp = spacy.load(SPACY_MODEL_NAME)
        print("spaCy model loaded.")
        print("Pipeline components:", nlp.pipe_names) # Should show 'ner' among others
    except OSError:
        # This branch reports the model as not installed and prints the
        # matching download command.
        print(f"Error: spaCy model '{SPACY_MODEL_NAME}' not found.")
        print(f"Download it by running: python -m spacy download {SPACY_MODEL_NAME}")
    except Exception as e:
        print(f"An error occurred loading the spaCy model: {e}")
else:
    print("Skipping spaCy model loading as cleaned_text is not available.")

Loading spaCy model 'en_core_web_trf'...
spaCy model loaded.
Pipeline components: ['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


## Process Text for Named Entities

In [12]:
# Cell 4: Process text with spaCy NER pipeline
# Runs the loaded pipeline over the full text and keeps the result in `doc`.
# `doc` remains None when prerequisites are missing or processing fails.
doc = None
if not (cleaned_text and nlp):
    print("Skipping text processing as cleaned_text or nlp model is not available.")
else:
    print("Processing text with spaCy NER pipeline (this may take some time)...")
    # If the text is too long for the pipeline, nlp.max_length can be raised
    # (e.g. to len(cleaned_text) + 100) at the cost of more memory.
    try:
        doc = nlp(cleaned_text)
        print("Text processing complete.")
    except ValueError as ve:
        # Reported with a hint, since the text length is the suspected cause.
        print(f"ValueError during processing: {ve}")
        print("The text might be too long for the default spaCy model settings.")
        print("Consider increasing nlp.max_length or processing in chunks.")
    except Exception as processing_error:
        print(f"An unexpected error occurred during text processing: {processing_error}")

Processing text with spaCy NER pipeline (this may take some time)...
Text processing complete.


## Extract PERSON Entities

In [13]:
# Cell 5: Extract PERSON entities found by NER
# Builds `person_mentions`: one dict per PERSON entity holding the entity text
# and its start/end character offsets in the processed document.
person_mentions = []
if not doc:
    print("Skipping extraction as doc object is not available.")
else:
    print("\n--- Extracting PERSON Entities ---")
    person_mentions = [
        {
            "text": entity.text,
            "start_char": entity.start_char,
            "end_char": entity.end_char,
        }
        for entity in doc.ents
        if entity.label_ == "PERSON"
    ]
    print(f"Found {len(person_mentions)} PERSON mentions.")
    # Show a small preview only; the full list is saved by the next cell.
    if person_mentions:
        print("First 10 mentions:", person_mentions[:10])


--- Extracting PERSON Entities ---
Found 3768 PERSON mentions.
First 10 mentions: [{'text': 'Jane Austen', 'start_char': 28, 'end_char': 39}, {'text': 'George Saintsbury', 'start_char': 62, 'end_char': 79}, {'text': 'Hugh Thomson\n\n\n\nRuskin', 'start_char': 101, 'end_char': 123}, {'text': 'George Allen', 'start_char': 179, 'end_char': 191}, {'text': 'Walt Whitman', 'start_char': 294, 'end_char': 306}, {'text': 'Edmund', 'start_char': 1621, 'end_char': 1627}, {'text': 'Fanny', 'start_char': 1638, 'end_char': 1643}, {'text': 'Mary', 'start_char': 1652, 'end_char': 1656}, {'text': 'Fanny', 'start_char': 1679, 'end_char': 1684}, {'text': 'Crawford', 'start_char': 1714, 'end_char': 1722}]


In [14]:
# Cell 6: Save raw PERSON mentions (optional but good for debugging)
# Writes the unconsolidated mention list as pretty-printed UTF-8 JSON.
if not person_mentions:
    print("\nNo PERSON mentions extracted to save.")
else:
    output_data_path = "../data/ner_person_mentions.json" # Choose your output format/name

    print(f"\nSaving extracted PERSON mentions to {output_data_path}...")
    try:
        # Create the target directory first so the write cannot fail on a missing folder.
        output_dir = os.path.dirname(output_data_path)
        os.makedirs(output_dir, exist_ok=True)
        with open(output_data_path, 'w', encoding='utf-8') as json_file:
            json.dump(person_mentions, json_file, ensure_ascii=False, indent=4)
        print("Raw NER results saved successfully.")
    except Exception as save_error:
        print(f"Error saving NER results: {save_error}")


Saving extracted PERSON mentions to ../data/ner_person_mentions.json...
Raw NER results saved successfully.


## Load Aliases from Annotated XML

To improve character consolidation (e.g., grouping 'Lizzy' with 'Elizabeth'), we load the character list and aliases from the annotated XML file (`pp_full.xml`). This acts as a lookup table based on our 'training' data.

In [15]:
# Cell 7: Load aliases from XML
# Builds two lookup tables from the <characters> element of the annotated XML:
#   alias_to_canonical    - lower-cased, stripped name/alias -> canonical name
#   canonical_name_gender - canonical name -> capitalised gender attribute
# Both stay empty if the file is missing, unparsable, or has no <characters>.
xml_file_path = "../data/pp_full.xml" # Path to the annotated file
alias_to_canonical = {}
canonical_name_gender = {} # Store gender info from XML as well

print(f"Loading aliases from {xml_file_path}...")
try:
    characters_element = ET.parse(xml_file_path).getroot().find('characters')

    if characters_element is None:
        print("Error: <characters> tag not found in XML.")
    else:
        for character in characters_element.findall('character'):
            canonical_name = character.get('name')
            if not canonical_name:
                continue # A character without a canonical name cannot be a map target

            gender = character.get('gender')
            if gender:
                canonical_name_gender[canonical_name] = gender.capitalize()

            # The canonical name always maps to itself (this assignment may
            # replace an alias entry registered by an earlier character).
            canonical_lookup = canonical_name.lower().strip()
            if canonical_lookup:
                alias_to_canonical[canonical_lookup] = canonical_name

            # Aliases are ';'-separated. The first character to claim an alias
            # keeps it, so ambiguous aliases such as 'Miss Bennet' are not
            # silently re-pointed by later characters.
            for raw_alias in (character.get('aliases') or '').split(';'):
                alias_lookup = raw_alias.lower().strip()
                if alias_lookup:
                    alias_to_canonical.setdefault(alias_lookup, canonical_name)

        print(f"Created alias map with {len(alias_to_canonical)} entries.")

except FileNotFoundError:
    print(f"Error: XML file not found at {xml_file_path}")
except ET.ParseError as pe:
    print(f"Error parsing XML file {xml_file_path}: {pe}")
except Exception as alias_error:
    print(f"An unexpected error occurred loading aliases: {alias_error}")


Loading aliases from ../data/pp_full.xml...
Created alias map with 100 entries.


## Consolidate Characters using Alias Map

Now, group the PERSON mentions found by NER using the alias map. If a mention matches an alias, it is grouped under the canonical name. Otherwise the mention text itself becomes the group key (surname grouping is not implemented yet), and a short list of known place names is skipped.

In [16]:
# Cell 8: Consolidate character mentions using the alias map
#
# Groups every PERSON mention under a character key:
#   * if the whitespace-collapsed, lower-cased mention is in `alias_to_canonical`,
#     the key is the canonical name from the XML;
#   * otherwise the whitespace-collapsed mention text itself is the key, unless it
#     is one of the known place names in NON_PERSON_KEYS, in which case it is dropped.
# The result is kept in `character_groups` (key -> variations set + count) and
# `character_df` (one row per key), and written to CSV and JSON.

# Place names that should not become character keys when no alias matches.
NON_PERSON_KEYS = {'Meryton', 'London', 'Hertfordshire', 'Kent', 'Derbyshire'}

# Explicit column list so the DataFrame has the right schema even with zero rows.
CONSOLIDATED_COLUMNS = [
    'canonical_key',
    'total_mentions',
    'variations',
    'variation_count',
    'gender_from_xml',
]

character_groups = {}
mentions_processed = 0
mentions_mapped = 0

if person_mentions: # Check if NER ran successfully
    print("Consolidating mentions using alias map...")
    for mention in person_mentions:
        mentions_processed += 1

        # NER spans can straddle line breaks (e.g. 'Hugh Thomson\n\n\n\nRuskin').
        # Collapse every whitespace run to one space so that lookups, keys and
        # the CSV 'variations' column never contain raw newlines.
        original_mention_text = re.sub(r'\s+', ' ', mention['text']).strip()
        if not original_mention_text:
            continue # Nothing left to group once whitespace is removed
        normalized_mention = original_mention_text.lower()

        # Attempt to find canonical name using the alias map
        canonical_name = alias_to_canonical.get(normalized_mention)

        if canonical_name:
            # Found in alias map, use the canonical name as the key
            char_key = canonical_name
            mentions_mapped += 1
        else:
            # Fallback: the mention text itself is the key.
            # More sophisticated fallback (like surname grouping) could be added here.
            char_key = original_mention_text
            if char_key in NON_PERSON_KEYS:
                continue

        group = character_groups.setdefault(char_key, {'variations': set(), 'count': 0})
        group['variations'].add(original_mention_text)
        group['count'] += 1

    print(f"Consolidation complete. Processed {mentions_processed} mentions.")
    print(f"Mapped {mentions_mapped} mentions using aliases.")
    print(f"Resulting unique character keys: {len(character_groups)}")

    # --- Create DataFrame for analysis ---
    character_data_list = [
        {
            'canonical_key': key,
            'total_mentions': data['count'],
            'variations': ', '.join(sorted(data['variations'])), # Sorted for consistency
            'variation_count': len(data['variations']),
            # Gender is only known for keys that are canonical names from the XML.
            'gender_from_xml': canonical_name_gender.get(key),
        }
        for key, data in character_groups.items()
    ]

    # Passing `columns` keeps sort_values from raising KeyError when every
    # mention was filtered out and the record list is empty.
    character_df = (
        pd.DataFrame(character_data_list, columns=CONSOLIDATED_COLUMNS)
        .sort_values('total_mentions', ascending=False) # Main characters first
    )

    print("\nTop characters after consolidation:")
    print(character_df.head(20))

    # --- Save the structured character data ---
    output_csv = '../data/character_analysis_consolidated.csv'
    output_json = '../data/character_groups_consolidated.json'

    try:
        # Ensure the output directories exist (same approach as the raw-mention save cell).
        for output_path in (output_csv, output_json):
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

        character_df.to_csv(output_csv, index=False)

        # Sets are not JSON-serialisable; sorted lists also make the file
        # identical from run to run.
        save_groups = {
            key: {'variations': sorted(data['variations']), 'count': data['count']}
            for key, data in character_groups.items()
        }
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(save_groups, f, indent=4, ensure_ascii=False)
        print(f"\nConsolidated character analysis data saved to '{output_csv}' and '{output_json}'")
    except Exception as e:
        print(f"\nError saving consolidated results: {e}")

else:
    print("\nSkipping consolidation as no PERSON mentions were extracted.")

Consolidating mentions using alias map...
Consolidation complete. Processed 3768 mentions.
Mapped 2827 mentions using aliases.
Resulting unique character keys: 163

Top characters after consolidation:
        canonical_key  total_mentions  \
26   Elizabeth_Bennet             755   
25           Mr_Darcy             422   
36         Mrs_Bennet             319   
19         Mr_Bingley             306   
18        Jane_Bennet             295   
16         Mr_Wickham             198   
21            Collins             189   
15       Lydia_Bennet             175   
49     Lady_Catherine             116   
102          Gardiner              94   
58          Charlotte              87   
50       Kitty_Bennet              71   
79              Lucas              44   
7         Mary_Bennet              39   
89            Forster              39   
125       Fitzwilliam              35   
52            Philips              34   
80              Hurst              33   
78            Willia

## Next Steps

1.  **Review Output:** Check the new `character_analysis_consolidated.csv`. Does it correctly group 'Lizzy' under 'Elizabeth_Bennet'? Are other characters consolidated better?
2.  **Refine Alias Map/Fallback:** You might need to refine how ambiguous aliases (like 'Miss Bennet') are handled or improve the fallback logic for mentions not in the alias map.
3.  **Proceed to Gender Classification:** Now you can run the `02_gender_classification.ipynb` notebook, making sure it loads this *new* consolidated CSV (`character_analysis_consolidated.csv`). The gender classification should now operate on the canonical keys.

In [None]:
import spacy
import coreferee # Import coreferee
import pandas as pd

# --- Configuration ---
GENDERED_CONTEXT_CSV_PATH = "../data/character_analysis_gendered_contextual.csv" # Input from Notebook 02b
CLEANED_TEXT_PATH = "../data/pp_cleaned.txt" # Input from Notebook 00
OUTPUT_CSV_PATH = "../data/character_analysis_gendered_coref.csv"
GENDER_FEMALE = "Female"
GENDER_MALE = "Male"
GENDER_UNKNOWN = "Unknown"
MALE_PRONOUNS = {'he', 'him', 'his'}
FEMALE_PRONOUNS = {'she', 'her', 'hers'}

# --- Load spaCy model and add coreferee ---
print("Loading spaCy model and adding coreferee...")
# Load your spaCy model (make sure it's compatible with coreferee)
nlp = spacy.load('en_core_web_trf')
# Add the coreferee pipe
nlp.add_pipe('coreferee')
print("Pipeline:", nlp.pipe_names)

# --- Load Data ---
# On failure each input is set to None so the later stages can be skipped.
print(f"Loading data from {GENDERED_CONTEXT_CSV_PATH}...")
try:
    char_df = pd.read_csv(GENDERED_CONTEXT_CSV_PATH)
    print(f"Loaded {len(char_df)} characters.")
except Exception as e:
    print(f"Error loading CSV: {e}")
    char_df = None

print(f"Loading text from {CLEANED_TEXT_PATH}...")
try:
    with open(CLEANED_TEXT_PATH, 'r', encoding='utf-8') as f:
        full_text = f.read()
    print(f"Loaded text ({len(full_text)} chars).")
except Exception as e:
    print(f"Error loading text: {e}")
    full_text = None

# --- Process the full text ---
doc = None
if full_text:
    print("Processing text with spaCy and coreferee (this can take time)...")
    # Increase max_length if your text is very long
    # nlp.max_length = len(full_text) + 100
    doc = nlp(full_text)
    print("Text processing complete.")
    # --- Access Coreference Chains ---
    if doc._.coref_chains:
        print(f"Found {len(doc._.coref_chains)} coreference chains.")
        # coreferee has a built-in print method: doc._.coref_chains.print()
    else:
        print("No coreference chains found by coreferee.")

# --- Apply Coref Results to Gender Classification (Conceptual) ---
if char_df is not None and doc is not None and doc._.coref_chains:
    print("Applying coreference results to Unknown characters...")
    coref_chains = doc._.coref_chains

    # Aggregated gender evidence per chain index.
    chain_gender_evidence = {
        chain_index: {'male': 0, 'female': 0, 'known_gender': GENDER_UNKNOWN}
        for chain_index in range(len(coref_chains))
    }
    # Exact mention text -> indices of every chain containing a mention with
    # that text. Built once here so Pass 2 does a dictionary lookup per
    # variation instead of rescanning every chain for every character.
    mention_text_to_chains = {}

    # --- Pass 1: Index mentions and count pronouns per chain ---
    # TODO: 'known_gender' is never populated yet. Doing so requires mapping
    # already-classified characters back onto chain mentions (e.g. a chain
    # containing "Elizabeth" -> GENDER_FEMALE). Until then only pronoun counts
    # drive the classification below.
    for chain_index, chain in enumerate(coref_chains):
        evidence = chain_gender_evidence[chain_index]
        for mention in chain:
            # A mention exposes token indices; take the span from its first to its last token.
            mention_span = doc[mention.token_indices[0]:mention.token_indices[-1] + 1]
            mention_text_to_chains.setdefault(mention_span.text, set()).add(chain_index)

            mention_text = mention_span.text.lower()
            if mention_text in MALE_PRONOUNS:
                evidence['male'] += 1
            elif mention_text in FEMALE_PRONOUNS:
                evidence['female'] += 1

    # --- Pass 2: Classify 'Unknown' characters based on their chain's evidence ---
    char_df['coref_gender'] = char_df['final_gender_contextual'] # Start with previous best guess
    unknown_indices = char_df[char_df['coref_gender'] == GENDER_UNKNOWN].index

    for index in unknown_indices:
        char_name = char_df.loc[index, 'canonical_key']
        variations_cell = char_df.loc[index, 'variations']
        # A missing cell would otherwise turn into the bogus variation 'nan'.
        if pd.notna(variations_cell):
            variations = {var.strip() for var in str(variations_cell).split(',') if var.strip()}
        else:
            variations = set()
        variations.add(char_name)

        # Every chain in which some mention's text equals one of the variations (case-sensitive).
        found_chain_indices = set()
        for variation in variations:
            found_chain_indices |= mention_text_to_chains.get(variation, set())

        # Aggregate evidence from all chains this character belongs to
        final_male = 0
        final_female = 0
        final_known = GENDER_UNKNOWN
        for chain_idx in found_chain_indices:
            evidence = chain_gender_evidence[chain_idx]
            final_male += evidence['male']
            final_female += evidence['female']
            if evidence['known_gender'] != GENDER_UNKNOWN:
                # If conflicting known genders in different chains, maybe mark ambiguous?
                final_known = evidence['known_gender'] # Simplistic: take last known

        # A known gender wins; otherwise the pronoun majority decides and a tie stays Unknown.
        new_gender = GENDER_UNKNOWN
        if final_known != GENDER_UNKNOWN:
            new_gender = final_known
        elif final_male > final_female: # Add thresholds if needed
            new_gender = GENDER_MALE
        elif final_female > final_male:
            new_gender = GENDER_FEMALE

        char_df.loc[index, 'coref_gender'] = new_gender

    # --- Display/Save Results ---
    print("\nCharacters re-classified using Coreference:")
    print(char_df[['canonical_key', 'final_gender_contextual', 'coref_gender']].head(20))

    # Write the table, now including 'coref_gender', to OUTPUT_CSV_PATH.
    try:
        char_df.to_csv(OUTPUT_CSV_PATH, index=False)
        print(f"Saved coreference-based classification to {OUTPUT_CSV_PATH}")
    except Exception as e:
        print(f"Error saving CSV: {e}")

else:
    print("Skipping coreference application due to missing data or coref chains.")

print("\n--- Coreference Gender Classification Attempt Finished ---")