# SETUP

## Import

In [1]:
import re
import json
import requests
from bs4 import BeautifulSoup

from shared.constants import *
from shared.corpus import Corpus

import spacy

In [2]:
# # Get characters

# url = "https://pemberley.com/janeinfo/ppdrmtis.html"
# res = requests.get(url)
# soup = BeautifulSoup(res.content, "html.parser")

# character_names = []
# target_h2 = soup.find("h2", string="Brief, Organized Listing of Characters")
# target_ul = target_h2.find_next("ul")
# for a in target_ul.find_all("a"):
#     name = a.get_text(strip=True).replace('\n', ' ')
#     if name:
#         character_names.append(name)

# print(character_names)
# with open(JSON_CHARACTERS_PNP, "w", encoding="utf-8") as f:
#     json.dump(character_names, f, indent=4)

# Rule-Based NER (Gazetteer Method)

In [3]:
# Load data
with open(TXT_PNP, "r", encoding="utf-8") as f:
    text = f.read().strip()
    text = re.split(PATTERN_CHAPTER_LINE, text)[1:6]

with open(JSON_CHARACTERS_PNP, "r", encoding="utf-8") as f:
    characters = json.load(f)
    names = set()
    for character in characters:
        names.update(
            [
                i for i in character.split() 
                if not i in ['Mr.', 'The', 'Mrs.', 'Miss', 'Old', 'Lady', 'de', 'Sir', 'Captain', 'Colonel']
            ]
        )
    names = list(names)

print(characters)
print(names)

['Mr. Bennet', 'Mrs. Bennet', 'Jane', 'Elizabeth', 'Mary', 'Kitty', 'Lydia', 'Bingley', 'Louisa Hurst', 'Caroline', 'Mr. Collins', 'Old Mr. Darcy', 'Lady Anne Darcy', 'Darcy', 'Georgiana Darcy', 'Lady Catherine', 'Anne de Bourgh', 'Colonel Fitzwilliam', 'Mr. Gardiner', 'Mrs. Gardiner', 'Sir William', 'Lady Lucas', 'Charlotte', 'Maria', 'Old Mr. Wickham', 'Wickham', 'Mrs. Annesley', 'Captain Carter', 'Mr. Chamberlayne', 'Dawson', 'Mr. Denny', 'Colonel Forster', 'William Goulding', 'Miss Grantley', 'Haggerston', 'The Harringtons', 'Mrs. Hill', 'Mr. Hurst', 'Mrs. Jenkinson', 'Mr. Jones', 'Miss Mary King', 'Mrs. Long', 'Lady Metcalfe', 'Mr. Morris', 'Mrs. Nicholls', 'Mr. Philips', 'Miss Pope', 'Mr. Pratt', 'Mrs. Reynolds', 'Mr. Robinson', 'Mr. Stone', 'Miss Watson', 'The Miss Webbs', 'Mrs. Younge']
['Forster', 'Catherine', 'Watson', 'Pratt', 'Jones', 'Goulding', 'Harringtons', 'Metcalfe', 'Fitzwilliam', 'Robinson', 'Lydia', 'Webbs', 'Collins', 'Reynolds', 'Georgiana', 'Maria', 'King', 'Hil

In [4]:
# Characters by Chapter
for n_chapter, chapter in enumerate(text, start=1):
    print(f'\n\nChapter {n_chapter}:')
    chapter = chapter.strip().replace('\n', ' ').translate(str.maketrans('', '', PUNCTUATION))
    words = chapter.split() # NOTE: Mr and Mrs will be split

    chapter_names = set()
    for n, word in enumerate(words):
        if word in names:
            if (words[n-2] == 'Old') or (words[n-1] == 'de'):
                chapter_names.add(' '.join(words[n-2:n+1])) # Handle 3-word names
            elif words[n-1][0].isupper():
                chapter_names.add(' '.join(words[n-1:n+1])) # Handle 2-word names
            else:
                chapter_names.add(word)
    print(', '.join(chapter_names))



Chapter 1:
Mrs Long, Mr Morris, Mr Bennet, Bingley, Sir William, Lydia, Mr Bingley, Jane, Lady Lucas


Chapter 2:
Mrs Long, Mrs Bennet, Elizabeth, Now Kitty, Mr Bennet, Kitty, Mary, Lydia, Mr Bingley, While Mary


Chapter 3:
Catherine, Elizabeth Bennet, Mr Darcy, Lydia, Miss King, Mr Hurst, Maria, Miss Bennet, Mary, Bingley, Mrs Hurst, Miss Lucas, Elizabeth, Sir William, Mr Bingley, Jane, Lady Lucas, Mrs Bennet, Miss Bingley, Maria Lucas, Mr Bennet, Come Darcy


Chapter 4:
Elizabeth, When Jane, Miss Bingley, Miss Bennet, Bingley, Darcy, Mr Bingley, Mrs Hurst


Chapter 5:
Mrs Long, Mrs Bennet, Miss Lucas, Elizabeth, Mr Robinson, Miss Bingley, Miss Bennet, Lucas, Charlotte, Mr Darcy, Mary, William Lucas, Sir William, Jane, Lady Lucas


# NER with SpaCy

## Data

In [5]:
# Data
hp_corpus = Corpus(DIR_HP)
chapter1 = hp_corpus.books[0].chapters[0].text
chapter1[:100]

'Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly norma'

In [None]:
# url = "https://en.wikipedia.org/wiki/List_of_Harry_Potter_characters"
# res = requests.get(url)
# soup = BeautifulSoup(res.content, "html.parser")

# characters = []
# divs = soup.find_all("div", class_="mw-heading mw-heading3")
# for div in divs:
#     ul = div.find_next("ul")
#     for li in ul.find_all("li"):
#             text = li.get_text(strip=True)
#             name = re.search(r"^(.*?)–", text).group(1).replace(' and ', ',').translate(str.maketrans('()', ',,'))
#             if ',' in name:
#                 split_name = [part.strip() for part in name.split(',') if part.strip()]
#                 last_name = split_name[0]

#                 # Case: <last_name>, <name 1>, <name2>, ...
#                 if len(split_name[1:]) > 1:
#                     for first_name in split_name[1:]:
#                         characters.append(f'{first_name.strip()} {last_name.strip()}')
#                 # Case: <last_name>, <first_name>
#                 else:
#                     characters.append(f'{split_name[1].strip()} {last_name.strip()}')
#             else:
#                 # Case: <name>
#                 characters.append(name.strip())

# print(characters[:10])
# with open(JSON_CHARACTERS_HP, "w", encoding="utf-8") as f:
#     json.dump(characters, f, indent=4)

['Hannah Abbott', 'Ludo Bagman', 'Bathilda Bagshot', 'Katie Bell', 'Cuthbert Binns', 'Phineas Nigellus Black', 'Sirius Black', 'Regulus Black', 'Amelia Bones', 'Edgar Bones']


In [7]:
nlp = spacy.load('en_core_web_lg')

In [8]:
doc = nlp(chapter1)

In [9]:
unique_ents = list({ent.text: ent for ent in doc.ents}.values())
for ent in unique_ents[:10]:
    print(f'{ent.text:<15}{ent.label_:<10}')

Dursley        PERSON    
number four    CARDINAL  
Privet Drive   PERSON    
Grunnings      ORG       
Dursleys       NORP      
Dudley         PERSON    
Potters        ORG       
Potter         PERSON    
several years  DATE      
Tuesday        DATE      


We need to customize the model to make NER for the HP corpus better

## EntityRuler

Using EntityRuler we can generate rule-based NER with SpaCy.  
This approach can be used to generate train datasets.