## https://www.youtube.com/watch?v=ytAyCO-n8tY&t=38s

In [5]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_md
# !pip install nltk

Collecting nltk
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
Collecting click
  Using cached click-8.0.1-py3-none-any.whl (97 kB)
Collecting regex
  Downloading regex-2021.4.4-cp37-cp37m-win_amd64.whl (269 kB)
Installing collected packages: regex, click, nltk
Successfully installed click-8.0.1 nltk-3.6.2 regex-2021.4.4


In [106]:
import spacy
import nltk
import pickle
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json

In [9]:
nlp = spacy.load('en_core_web_lg')

In [118]:
def load_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def generate_better_characters(file):
    characters = load_data(file)
    new_characters = [character for character in characters]
    print("Original number of new_characters {}".format(len(new_characters)))
    
    for item in characters:
        item  = item.replace("The", "").replace("the", "").replace("and", "").replace("And", "")
        names = item.split(" ")
        for name in names:
            name = name.strip()
            new_characters.append(name)
        if "(" in item:
            names = item.split("(")
            for name in names:
                name = name.replace(")", "").strip()
                new_characters.append(name)
        if "," in item:
            names = item.split(",")
            for name in names:
                name = name.replace("and", "").strip()
                if " " in name:
                    new_names = name.split()
                    for x in new_names:
                        x = x.strip()
                        new_characters.append(x)
                new_characters.append(name)
    final_characters = []
    titles = ["Dr.", "Professor", "Mr.", "Mrs.", "Ms.", "Miss", "Aunt", "Uncle", "Mr. and Mrs."]
    
    for character in new_characters:
        if "" != character:
            final_characters.append(character)
            for title in titles:
                titled_char = f"{title} {character}"
                final_characters.append(titled_char)
            
    final_characters = list(set(final_characters))
    final_characters.sort()
    
    print("After stripping, total number of new characters {}".format(len(final_characters)))
    return final_characters


def create_training_data(file, label_type):
    data = generate_better_characters(file)
    patterns = []
    for item in data:
        pattern = {
            "label": label_type,
            "pattern": item
        }
        patterns.append(pattern)
    return patterns

def generate_rules(patterns):
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    nlp.to_disk('hr_ner')

def test_model(model, text):
    doc = nlp(text)
    results = []
    for ent in doc.ents:
        results.append(ent.text)
    return results

def save_data(file, data):
    with open(file, "w", encoding='utf-8') as file:
        json.dump(data, file, indent=4)
    

In [111]:
patterns = create_training_data("../data/digital_humanity/hp_characters.json", 'PERSON')
generate_rules(patterns)

Original number of new_characters 207
After stripping, total number of new characters 5119


In [112]:
nlp = spacy.load("hr_ner")

In [121]:
with open("../data/digital_humanity/hp.txt") as f:
    text = f.read()
    chapters = text.split("CHAPTER")[1:]
    ie_data = {}
    for chapter in chapters:
        chapter_num, chapter_title =  chapter.split("\n\n")[0:2]
        chapter_num = chapter_num.strip()
        segments = chapter.split("\n\n")[2:]
        hits = []
        
        for segment in segments:
            segment = segment.strip()
            segment = segment.replace("\n", " ")
            results =  test_model(nlp,segment)
            for result in results:
                hits.append(result)
        ie_data[chapter_num] = hits

save_data("../data/digital_humanity/results/hp_data.json", ie_data)

{'ONE': ['Mr. and Mrs. Dursley',
  'Mr. Dursley',
  'Mrs. Dursley',
  'Dudley',
  'Mrs. Potter',
  'Mrs. Dursley',
  'Mrs. Dursley',
  'Dudley',
  'Mr. and Mrs. Dursley',
  'Mr. Dursley',
  'Mrs. Dursley',
  'Dudley',
  'Mr. Dursley',
  'Mrs. Dursley',
  'Dudley',
  'Dudley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Harry',
  'Mr. Dursley',
  'Potter',
  'Potter',
  'Harry',
  'Harry',
  'Mrs. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mr. Dursley',
  'Mrs. Dursley',
  'Mrs.',
  'Dudley',
  'Mr. Dursley',
  'Dudley',
  'Ted',
  'Mr. Dursley',
  'Mrs. Dursley',
  'Petunia',
  'Mrs. Dursley',
  'Mr. Dursley',
  'Mrs. Dursley',
  'Mrs. Dursley',
  'Mr. Dursley',
  'Potter',
  'Dudley',
  'Mrs. Dursley',
  'Harry',
  'Mr. Dursley',
  'Mrs. Dursley',
  'Mr. Dursley',
  'Mrs. Dursley',
  'Mr. Dursley',
  'Mr