In [5]:
import spacy
import json
import random

In [12]:
def load_data(file):
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Args:
        file: path of the JSON file to read.

    Returns:
        Whatever `json.load` produces for the file's contents.
    """
    with open(file, "r", encoding="utf-8") as handle:
        return json.load(handle)

def save_data(file, data):
    """Serialise `data` as JSON (4-space indent) into a UTF-8 text file.

    Args:
        file: destination path; an existing file is overwritten.
        data: any JSON-serialisable object.
    """
    with open(file, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)


#TRAIN_DATA = [(text, {"entities": [(start, end, label)]})]
def test_model(model, text):
    doc = nlp(text)
    results = []
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
    if len(entities) > 0:
        results = [text, {"entities": entities}]
        return (results)

In [13]:
# Bootstrap pipeline used to auto-annotate the book text.
nlp = spacy.load("hp_ner")

TRAIN_DATA = []
# Explicit encoding, consistent with load_data/save_data; without it the
# platform default decides how the book is decoded.
with open("data/hp.txt", "r", encoding="utf-8") as f:
    text = f.read()

# The file only needs to be open for the read above. Everything before the
# first "CHAPTER" marker is dropped by the [1:] slice.
chapters = text.split("CHAPTER")[1:]
for chapter in chapters:
    parts = chapter.split("\n\n")
    # First two blank-line-separated parts are the chapter number and title;
    # the remaining parts are the text segments to annotate.
    chapter_num, chapter_title = parts[0:2]
    chapter_num = chapter_num.strip()
    segments = parts[2:]
    for segment in segments:
        segment = segment.strip().replace("\n", " ")
        results = test_model(nlp, segment)
        # test_model returns None when the segment contains no entities.
        if results is not None:
            TRAIN_DATA.append(results)

print(len(TRAIN_DATA))


2213


In [16]:
# Inspect the first auto-annotated example: [text, {"entities": [(start, end, label), ...]}].
TRAIN_DATA[0]

["Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.",
 {'entities': [(0, 20, 'PERSON')]}]

In [17]:
# Persist the auto-annotated examples as JSON so they can be reloaded later with load_data.
save_data("data/hp_training_data.json", TRAIN_DATA)

In [26]:
from spacy.training.example import Example

def train_spacy(data, iterations):
    """Train a blank English pipeline whose only trained component is NER.

    Args:
        data: iterable of `(text, annotations)` pairs, where `annotations`
            is a dict with an "entities" list of `(start, end, label)` items.
            The caller's sequence is not modified.
        iterations: number of full passes over `data`.

    Returns:
        The trained pipeline object created by `spacy.blank("en")`.
    """
    # Work on a shallow copy: random.shuffle below reorders in place, and the
    # caller's list must not be reordered as a side effect of training.
    train_data = list(data)
    nlp = spacy.blank("en")

    # Get (or create) the NER component.
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")

    # Register every label seen in the annotations (these may be custom).
    # A missing "entities" key is treated as "no entities" instead of crashing.
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Disable every component except NER while training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.initialize()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            # Losses are accumulated per iteration and printed after the pass.
            losses = {}
            for text, annotations in train_data:
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update(
                    [example],
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses
                )
            print(losses)
    return nlp

In [None]:
# Keep the trained pipeline (a bare call discards the return value) and write
# it to disk so that spacy.load("hp_ner_model") picks up this training run.
nlp = train_spacy(TRAIN_DATA, 30)
nlp.to_disk("hp_ner_model")

Starting iteration 0
{'ner': 882.4063242711264}
Starting iteration 1
{'ner': 224.5125649694635}
Starting iteration 2
{'ner': 119.971207645899}
Starting iteration 3
{'ner': 135.8510571489502}
Starting iteration 4
{'ner': 81.66980922921383}
Starting iteration 5
{'ner': 65.39113734567212}
Starting iteration 6
{'ner': 79.33539710237855}
Starting iteration 7
{'ner': 76.1587136903285}
Starting iteration 8
{'ner': 43.04621944385383}
Starting iteration 9
{'ner': 40.385160226992205}
Starting iteration 10
{'ner': 35.203288566882506}
Starting iteration 11
{'ner': 36.958736954395114}
Starting iteration 12
{'ner': 35.80949118649959}
Starting iteration 13
{'ner': 9.733852724509132}
Starting iteration 14
{'ner': 1.740386369374765}
Starting iteration 15
{'ner': 0.006831875724748506}
Starting iteration 16
{'ner': 70.33368192341253}
Starting iteration 17
{'ner': 29.8763223222195}
Starting iteration 18
{'ner': 13.567793729582318}
Starting iteration 19
{'ner': 16.633665949743076}
Starting iteration 20


In [None]:
test = "Harry James[59] Potter (b. 31 July 1980[1]) was an English half-blood[2] wizard, and one of the most famous wizards of modern times. He was the only child and son of James and Lily Potter (née Evans), both members of the original Order of the Phoenix. Harry's birth was overshadowed by a prophecy, naming either himself or Neville Longbottom as the one with the power to vanquish Lord Voldemort. After half of the prophecy was reported to Voldemort, courtesy of Severus Snape, Harry was chosen as the target due to his many similarities with the Dark Lord. In turn, this caused the Potter family to go into hiding. Voldemort made his first vain attempt to circumvent the prophecy when Harry was a year and three months old. During this attempt, he murdered Harry's parents as they tried to protect him, but this unsuccessful attempt to kill Harry led to Voldemort's first downfall. This downfall marked the end of the First Wizarding War, and to Harry henceforth being known as the 'Boy Who Lived',[5] as he was the only known survivor of the Killing Curse. One consequence of Lily's loving sacrifice was that her orphaned son had to be raised by her only remaining blood relative, his Muggle aunt, Petunia Dursley. While in her care he would be protected from Lord Voldemort, due to the Bond of Blood charm Albus Dumbledore placed upon him.[60] This powerful charm would protect him until he became of age, or no longer called his aunt's house home. Due to Petunia's resentment of her sister and her magic gifts, Harry grew up abused and neglected. On his eleventh birthday, Harry learned that he was a wizard, from Rubeus Hagrid.[61] He began attending Hogwarts School of Witchcraft and Wizardry in 1991. The Sorting Hat was initially going to Sort Harry into Slytherin House, but Harry pleaded 'not Slytherin' and the Hat heeded this plea, instead sorting the young wizard into Gryffindor House.[62] At school, Harry became best friends with Ron Weasley and Hermione Granger. 
He later became the youngest Quidditch Seeker in over a century and eventually the captain of the Gryffindor House Quidditch Team in his sixth year, winning two Quidditch Cups.[63] He became even better known in his early years for protecting the Philosopher's Stone from Voldemort, saving Ron's sister Ginny Weasley, solving the mystery of the Chamber of Secrets, slaying Salazar Slytherin's basilisk, and learning how to conjure a corporeal stag Patronus at the age of thirteen. In his fourth year, Harry won the Triwizard Tournament, although the competition ended with the tragic death of Cedric Diggory and the return of Lord Voldemort. During the next school year, Harry reluctantly taught and led Dumbledore's Army. He also fought in the Battle of the Department of Mysteries, during which he lost his godfather, Sirius Black."

import re

def clean_text(text):
    """Remove parenthesised and square-bracketed spans from `text`.

    Brackets are matched by type and nesting is handled innermost-first, so
    `"(b. 1980[1])"` is removed completely. Spans never cross a newline, and
    unbalanced brackets are left in place. Surrounding whitespace is not
    collapsed.

    Args:
        text: the string to clean.

    Returns:
        The string with all balanced `(...)` and `[...]` spans removed.
    """
    # Only matches a pair with no bracket characters inside it, i.e. the
    # innermost groups; repeating until nothing changes peels nested groups.
    innermost = r"\([^()\[\]\n]*\)|\[[^()\[\]\n]*\]"
    cleaned = text
    while True:
        stripped = re.sub(innermost, "", cleaned)
        if stripped == cleaned:
            return cleaned
        cleaned = stripped

# Strip the bracketed citation markers from the sample text before tagging it.
test = clean_text(test)
people = []

# Load the saved pipeline from disk and print every entity it finds in the sample.
nlp = spacy.load("hp_ner_model")
doc = nlp(test)
for entity in doc.ents:
    print(entity)

In [None]:
# Write the pipeline currently bound to `nlp` to the "hp_ner_model" directory.
# NOTE(review): in top-to-bottom order `nlp` was last bound by
# spacy.load("hp_ner_model"), and the train_spacy(...) call's return value is
# not assigned -- confirm this saves the model you intend.
nlp.to_disk("hp_ner_model")