# SETUP

## Import

In [1]:
import json
import os
import re
# import requests
# from bs4 import BeautifulSoup

import spacy
from spacy.language import Language
from spacy.lang.en import English

from shared.constants import *
from shared.corpus import Corpus
from shared.utils import load_json

In [2]:
# # Get characters

# url = "https://pemberley.com/janeinfo/ppdrmtis.html"
# res = requests.get(url)
# soup = BeautifulSoup(res.content, "html.parser")

# character_names = []
# target_h2 = soup.find("h2", string="Brief, Organized Listing of Characters")
# target_ul = target_h2.find_next("ul")
# for a in target_ul.find_all("a"):
#     name = a.get_text(strip=True).replace('\n', ' ')
#     if name:
#         character_names.append(name)

# print(character_names)
# with open(JSON_CHARACTERS_PNP, "w", encoding="utf-8") as f:
#     json.dump(character_names, f, indent=4)

# Rule-Based NER (Gazetteer Method)

In [3]:
# Load data: the first five chapter bodies of the novel and the character gazetteer
with open(TXT_PNP, "r", encoding="utf-8") as f:
    raw_text = f.read().strip()
# Element 0 is whatever precedes the first chapter heading, so keep elements 1..5
text = re.split(PATTERN_CHAPTER_LINE, raw_text)[1:6]

with open(JSON_CHARACTERS_PNP, "r", encoding="utf-8") as f:
    characters = json.load(f)

# Titles and particles that are part of a listed character but are not names themselves
non_name_tokens = {'Mr.', 'The', 'Mrs.', 'Miss', 'Old', 'Lady', 'de', 'Sir', 'Captain', 'Colonel'}
names = sorted(
    {
        token
        for character in characters
        for token in character.split()
        if token not in non_name_tokens
    }
)

print(characters)
print(names)

['Mr. Bennet', 'Mrs. Bennet', 'Jane', 'Elizabeth', 'Mary', 'Kitty', 'Lydia', 'Bingley', 'Louisa Hurst', 'Caroline', 'Mr. Collins', 'Old Mr. Darcy', 'Lady Anne Darcy', 'Darcy', 'Georgiana Darcy', 'Lady Catherine', 'Anne de Bourgh', 'Colonel Fitzwilliam', 'Mr. Gardiner', 'Mrs. Gardiner', 'Sir William', 'Lady Lucas', 'Charlotte', 'Maria', 'Old Mr. Wickham', 'Wickham', 'Mrs. Annesley', 'Captain Carter', 'Mr. Chamberlayne', 'Dawson', 'Mr. Denny', 'Colonel Forster', 'William Goulding', 'Miss Grantley', 'Haggerston', 'The Harringtons', 'Mrs. Hill', 'Mr. Hurst', 'Mrs. Jenkinson', 'Mr. Jones', 'Miss Mary King', 'Mrs. Long', 'Lady Metcalfe', 'Mr. Morris', 'Mrs. Nicholls', 'Mr. Philips', 'Miss Pope', 'Mr. Pratt', 'Mrs. Reynolds', 'Mr. Robinson', 'Mr. Stone', 'Miss Watson', 'The Miss Webbs', 'Mrs. Younge']
['Anne', 'Annesley', 'Bennet', 'Bingley', 'Bourgh', 'Caroline', 'Carter', 'Catherine', 'Chamberlayne', 'Charlotte', 'Collins', 'Darcy', 'Dawson', 'Denny', 'Elizabeth', 'Fitzwilliam', 'Forster', 

In [4]:
# Characters by Chapter
# The translation table does not depend on the chapter, so build it once.
strip_punctuation = str.maketrans('', '', PUNCTUATION)

for n_chapter, chapter in enumerate(text, start=1):
    print(f'\n\nChapter {n_chapter}:')
    chapter = chapter.strip().replace('\n', ' ').translate(strip_punctuation)
    words = chapter.split() # NOTE: Mr and Mrs will be split

    chapter_names = set()
    for n, word in enumerate(words):
        if word not in names:
            continue

        # Look back explicitly: a negative index would silently wrap around to
        # the END of the chapter for the first two words.
        prev_word = words[n-1] if n >= 1 else ''
        prev_prev_word = words[n-2] if n >= 2 else ''

        if prev_prev_word == 'Old' or prev_word == 'de':
            # Handle 3-word names; clamp the start so the slice is never empty
            chapter_names.add(' '.join(words[max(n-2, 0):n+1]))
        elif prev_word[:1].isupper():
            # Handle 2-word names.
            # NOTE: any capitalised predecessor qualifies, so sentence openers
            # such as "Now Kitty" are picked up as well.
            chapter_names.add(' '.join(words[n-1:n+1]))
        else:
            chapter_names.add(word)
    chapter_names = sorted(chapter_names)
    print(', '.join(chapter_names))



Chapter 1:
Bingley, Jane, Lady Lucas, Lydia, Mr Bennet, Mr Bingley, Mr Morris, Mrs Long, Sir William


Chapter 2:
Elizabeth, Kitty, Lydia, Mary, Mr Bennet, Mr Bingley, Mrs Bennet, Mrs Long, Now Kitty, While Mary


Chapter 3:
Bingley, Catherine, Come Darcy, Elizabeth, Elizabeth Bennet, Jane, Lady Lucas, Lydia, Maria, Maria Lucas, Mary, Miss Bennet, Miss Bingley, Miss King, Miss Lucas, Mr Bennet, Mr Bingley, Mr Darcy, Mr Hurst, Mrs Bennet, Mrs Hurst, Sir William


Chapter 4:
Bingley, Darcy, Elizabeth, Miss Bennet, Miss Bingley, Mr Bingley, Mrs Hurst, When Jane


Chapter 5:
Charlotte, Elizabeth, Jane, Lady Lucas, Lucas, Mary, Miss Bennet, Miss Bingley, Miss Lucas, Mr Darcy, Mr Robinson, Mrs Bennet, Mrs Long, Sir William, William Lucas


# NER with SpaCy

## Data

In [5]:
# Data: build the corpus from DIR_HP and keep the text of the first chapter
# of the first book as the sample used by the following cells.
hp_corpus = Corpus(DIR_HP)
chapter1 = hp_corpus.books[0].chapters[0].text
# Preview the first 100 characters
chapter1[:100]

'Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly norma'

In [6]:
# url = "https://en.wikipedia.org/wiki/List_of_Harry_Potter_characters"
# res = requests.get(url)
# soup = BeautifulSoup(res.content, "html.parser")

# characters = []
# divs = soup.find_all("div", class_="mw-heading mw-heading3")
# for div in divs:
#     ul = div.find_next("ul")
#     for li in ul.find_all("li"):
#             text = li.get_text(strip=True)
#             name = re.search(r"^(.*?)–", text).group(1).replace(' and ', ',').translate(str.maketrans('()', ',,'))
#             if ',' in name:
#                 split_name = [part.strip() for part in name.split(',') if part.strip()]
#                 last_name = split_name[0]

#                 # Case: <last_name>, <name 1>, <name2>, ...
#                 if len(split_name[1:]) > 1:
#                     for first_name in split_name[1:]:
#                         characters.append(f'{first_name.strip()} {last_name.strip()}')
#                 # Case: <last_name>, <first_name>
#                 else:
#                     characters.append(f'{split_name[1].strip()} {last_name.strip()}')
#             else:
#                 # Case: <name>
#                 characters.append(name.strip())

# print(characters[:10])
# with open(JSON_CHARACTERS_HP, "w", encoding="utf-8") as f:
#     json.dump(characters, f, indent=4)

In [7]:
# Load the `en_core_web_lg` spaCy pipeline (must be installed in the environment)
nlp = spacy.load('en_core_web_lg')

In [8]:
# Run the loaded pipeline over the sample chapter; entities are read from `doc.ents` below
doc = nlp(chapter1)

In [9]:
# De-duplicate entities by surface text: first-seen order is kept, and a later
# span with the same text replaces the earlier one.
ents_by_text = {}
for ent in doc.ents:
    ents_by_text[ent.text] = ent
unique_ents = list(ents_by_text.values())

# Show the first ten as "text / label" columns
for ent in unique_ents[:10]:
    print(f'{ent.text:<15}{ent.label_:<10}')

Dursley        PERSON    
number four    CARDINAL  
Privet Drive   PERSON    
Grunnings      ORG       
Dursleys       NORP      
Dudley         PERSON    
Potters        ORG       
Potter         PERSON    
several years  DATE      
Tuesday        DATE      


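The pretrained model mislabels several entities (e.g. "Privet Drive" as PERSON and "Potters" as ORG), so we need to customize the model to improve NER on the HP corpus.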

# MODELS

## Rule-Based Model (with EntityRuler)

Using the EntityRuler pipe, we can build a rule-based NER model with spaCy.  
This approach can also be used to generate training datasets.

In [10]:
# Generate patterns for characters

def generate_characters(characters: list[str]) -> list[str]:
    """Generate combinations of character names and titles.

    For every entry the standalone article "The"/"the" is dropped, then the
    full name and each of its individual words are collected. Every collected
    name is returned both bare and prefixed with each known title.

    Args:
        characters: Character names, e.g. ``["Harry Potter", "The Fat Lady"]``.

    Returns:
        Sorted list of unique name strings (bare names and "<title> <name>").
    """
    titles = ["Dr.", "Professor", "Mr.", "Mrs.", "Ms.", "Miss", "Aunt", "Uncle", "Mr. and Mrs."]
    articles = {"The", "the"}

    names = set()
    for item in characters:
        # Drop the article only as a whole word: a plain substring replace
        # would also mangle names that merely contain it ("Theodore" -> "odore").
        words = [word for word in item.split() if word not in articles]
        if not words:
            continue  # nothing left to match; avoids patterns such as "Dr. "
        names.add(' '.join(words))
        names.update(words)

    generated_characters = set(names)
    for title in titles:
        for name in names:
            generated_characters.add(f'{title} {name}')
    return sorted(generated_characters)

def generate_characters_patterns(generated_characters: list) -> list[dict]:
    """Wrap every character string in a pattern dict labelled ``PERSON``.

    Args:
        generated_characters: Name strings to turn into patterns.

    Returns:
        One ``{"label": "PERSON", "pattern": <name>}`` dict per input item,
        in input order.
    """
    return [
        {"label": "PERSON", "pattern": character}
        for character in generated_characters
    ]

# Load the character list stored at JSON_CHARACTERS_HP, expand it into
# name/title variants, and wrap each variant in a PERSON pattern dict.
data = load_json(JSON_CHARACTERS_HP)
generated_characters = generate_characters(data)
patterns = generate_characters_patterns(generated_characters)
# Sanity check: total number of patterns and a small sample
print(len(patterns))
for pattern in patterns[:5]:
    print(pattern)

4360
{'label': 'PERSON', 'pattern': '"Delphi"'}
{'label': 'PERSON', 'pattern': 'Abbott'}
{'label': 'PERSON', 'pattern': 'Aberforth'}
{'label': 'PERSON', 'pattern': 'Aberforth Dumbledore'}
{'label': 'PERSON', 'pattern': 'Alastor'}


In [11]:
# Create rule-based NER model with EntityRuler pipe

# Directory the rule-based pipeline is written to and later loaded from
path_hp_rule_based_ner_model = os.path.join(DIR_MODELS_SPACY, "hp_rule_based_ner")

def create_rule_based_model(
        patterns: list[dict],
        path=path_hp_rule_based_ner_model
    ) -> None:
    """Create and save a rule-based NER model using custom patterns.

    Args:
        patterns: Pattern dicts passed to the entity ruler's ``add_patterns``
            (here: ``{"label": ..., "pattern": ...}`` entries).
        path: Directory the pipeline is saved to. Missing parent directories
            are created.
    """

    # Initialize model with an empty pipeline
    nlp = English() # no pre-trained components (only basic tokenizer included)

    # Add EntityRuler to the model
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)

    # Create the whole directory chain up front so saving also works on a
    # fresh checkout where the models directory does not exist yet.
    os.makedirs(path, exist_ok=True)

    # Save model
    nlp.to_disk(path)

def test_rule_based_model(nlp: Language, text: str) -> list[str]:
    """Run ``nlp`` on ``text`` and return the distinct entity texts, sorted."""
    entity_texts = {entity.text for entity in nlp(text).ents}
    return sorted(entity_texts)

# Build the pipeline from the generated patterns and save it to the default path
create_rule_based_model(patterns)

In [12]:
# Test model

hp_corpus = Corpus(DIR_HP)
# NOTE: this rebinds `nlp`, which until now held the `en_core_web_lg` pipeline
nlp = spacy.load(path_hp_rule_based_ner_model)

for n_book, book in enumerate(hp_corpus.books[:1], start=1):
    for n_chapter, chapter in enumerate(book.chapters[:3], start=1): # NOTE: only book 1 chapters 1-3
        print(f'\nBook {n_book}, Chapter {n_chapter}')
        # test_rule_based_model already returns a sorted, de-duplicated list,
        # so no further sorted(set(...)) pass is needed here.
        results = test_rule_based_model(nlp, chapter.text)
        print(results)


Book 1, Chapter 1
['Albus', 'Albus Dumbledore', 'Dedalus Diggle', 'Dudley', 'Dumbledore', 'Godric', 'Hagrid', 'Harry', 'Harry Potter', 'James', 'James Potter', 'Lily', 'Madam', 'Mr. Dursley', 'Mr. and Mrs. Dursley', 'Mrs. Dursley', 'Mrs. Potter', 'Muggle', 'Petunia', 'Pomfrey', 'Potter', 'Professor Dumbledore', 'Professor McGonagall', 'Sirius', 'Sirius Black', 'Ted', 'Voldemort']

Book 1, Chapter 2
['Aunt Petunia', 'Dudley', 'Dudley Dursley', 'Harry', 'Harry Potter', 'Marge', 'Mr. Dursley', 'Mrs. Figg', 'Nearly', 'Uncle Vernon', 'Vernon']

Book 1, Chapter 3
['Aunt Petunia', 'Dudley', 'Harry', 'Marge', 'Mrs. Figg', 'Nearly', 'Petunia', 'Potter', 'Uncle Vernon', 'Vernon']


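There are still many unrecognized entities because a hardcoded list can never be complete (e.g. "Ronald" was not in the original list), and typos can occur. Splitting every name into single words also produces false positives such as "Nearly", "Madam" and "Muggle". However, this approach may help generate training data.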