# Training a custom Holocaust NER model

The model should be able to identify concentration camps mentioned in text and tag them with a custom camp label.

- Create a blank spaCy model.
- Add a custom pipeline that uses NER to identify custom labels.
- The label used here is `CONC_CAMP` (concentration camp).


In [1]:
# Basic Functions
import re, glob
from hc_ner import *

In [2]:
# Finding and Downloading Data

# https://en.wikipedia.org/wiki/List_of_subcamps_of_Auschwitz
# https://collections.ushmm.org/search/


def clean_hc_labels(file: str) -> list[str]:
    """Extract one label per line from a tab-separated text file.

    For every line of ``file`` (read as UTF-8) the label is the text that
    starts at the first ASCII capital letter and runs up to, but not
    including, the next tab character. Lines that contain no such span are
    skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        The extracted labels, in file order.
    """
    # Non-greedy so the match stops at the first tab after the capital letter.
    label_re = re.compile(r"[A-Z].*?\t")
    with open(file, 'r', encoding='utf-8') as handle:
        hits = (label_re.search(row) for row in handle)
        return [hit.group(0).replace("\t", "") for hit in hits if hit is not None]

# Clean camps and ghetto
# Each raw .txt list is reduced to a list of names with clean_hc_labels and then
# handed to save_data together with a .json path (save_data is not defined in this
# notebook; presumably it comes from the hc_ner star import and writes JSON — TODO confirm).
camp_file = './data/hc/camps.txt'
# camp_data is reused by later cells to build ruler patterns and to annotate the book text.
camp_data = clean_hc_labels(camp_file)
save_data('./data/hc/camps.json', camp_data)

ghetto_file = './data/hc/ghettos.txt'
# ghetto_data is only saved here; no visible cell reads it afterwards.
ghetto_data = clean_hc_labels(ghetto_file)
save_data('./data/hc/ghettos.json', ghetto_data)


In [3]:
# Name passed to the model helpers below and to train_spacy_ner; the test cell
# later loads a model from the same string with spacy.load.
MODEL_NAME = "holocaust.ner"
# Entity label attached to every camp pattern and every training annotation.
CAMP_LABEL = "CONC_CAMP"

# Generate patterns from camp data
# The cell output shows the result as dicts of the form
# {'label': 'CONC_CAMP', 'pattern': '<camp name>'}.
labels = create_training_data_for_rule_generation(camp_data, CAMP_LABEL)

# Process camps into model using entity ruler
# NOTE(review): `nlp` is rebound by the training cell further down.
nlp = generate_entity_ruler_model(MODEL_NAME, labels)

Built 56 training examples for ruler: [{'label': 'CONC_CAMP', 'pattern': 'Theresienstadt'}, {'label': 'CONC_CAMP', 'pattern': 'Trawniki'}, {'label': 'CONC_CAMP', 'pattern': 'Treblinka'}, {'label': 'CONC_CAMP', 'pattern': 'Vaivara'}, {'label': 'CONC_CAMP', 'pattern': 'Westerbork'}]


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
# Header/footer lines (letter-spaced titles from the PDF layout) that should be
# dropped from the extracted text. Passed to extract_pdf_text_pymupdf below; the
# cell output logs entries such as [E C H O E S] and [of memory] being removed.
SCRUB_LIST = [
    "U N I T E D",
    "S T A T E S",
    "H O L O C A U S T",
    "M E M O R I A L",
    "M U S E U M",
    "C R E D I T S",
    "E C H O E S",
    "O F",
    "M E M O R Y",
    "of memory",
    "e c h o e s"
]

# Whole-line regex: one or more ASCII capital letters, each optionally followed by
# spaces, then optional trailing capitals (i.e. a letter-spaced upper-case heading).
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether
# it is still needed.
PATTERN = r'^([A-Z]( )*)+[A-Z]*$'

# Read in book data

# Relative to the working directory, like the other ./data/hc paths in this
# notebook, so the cell is not tied to one user's home directory.
BOOKS_GLOB = './data/hc/pdfs/*.pdf'

# glob.glob gives no ordering guarantee; sorting keeps the corpus (and therefore
# the order of the training segments derived from it) identical between runs.
books = sorted(glob.glob(BOOKS_GLOB))
book_texts = []
for book in books:
    print(book)
    # `pages` is returned by the helper but not used by any visible cell.
    pages, text = extract_pdf_text_pymupdf(book, SCRUB_LIST)
    book_texts.append(text)

# Every book is followed by a blank line, so the later split on "\n\n" never
# merges the last paragraph of one book with the first paragraph of the next.
final_text = "".join(book_text + "\n\n" for book_text in book_texts)

# Clean book text
cleaned_text = clean_text_strict(final_text)



/Users/tapiwamaruni/projects/spacy-ner/ner_youtube/data/hc/pdfs/200080819-echoes-vol_4.pdf
Document has 71 pages
<<< Page 0 >>>
>>> Removing line [E C H O E S]
>>> Removing line [of memory]
>>> Removing line [4]
>>> Removing line [V]
>>> Removing line [O]
>>> Removing line [L]
>>> Removing line [U]
>>> Removing line [M]
>>> Removing line [E]
>>> Removing line []
<<< Page 1 >>>
>>> Removing line [U N I T E D]
>>> Removing line [S T A T E S]
>>> Removing line [H O L O C A U S T]
>>> Removing line [M E M O R I A L]
>>> Removing line [M U S E U M]
>>> Removing line [M E M O R Y]
>>> Removing line [E C H O E S]
>>> Removing line [of memory]
>>> Removing line [4]
>>> Removing line [V]
>>> Removing line [O]
>>> Removing line [L]
>>> Removing line [U]
>>> Removing line [M]
>>> Removing line [E]
>>> Removing line []
<<< Page 2 >>>
>>> Removing line [m]
>>> Removing line []
<<< Page 3 >>>
>>> Removing line [U N I T E D]
>>> Removing line [S T A T E S]
>>> Removing line [H O L O C A U S T]
>>> Re

In [5]:
# Manual annotation of text

#TRAIN_DATA = [(text, {"entities": [(start, end, label)]})]

def annotate_text(LABEL, tags: list[str], segments: list[str]) -> list[tuple]:
    """Build spaCy-style NER training examples by exact string matching.

    Each segment is stripped and has its newlines replaced by spaces. Every
    occurrence of every tag in the normalised segment becomes one
    ``(start, end, LABEL)`` character span. Segments without any match are
    left out of the result.

    Spans within one segment never overlap: when two tags compete for the
    same characters (for example "Auschwitz" and "Auschwitz II") the longer
    tag wins, because spaCy's entity annotations cannot contain overlapping
    spans. Empty and duplicate tags are ignored.

    Args:
        LABEL: Entity label attached to every span.
        tags: Strings to search for (case-sensitive substring match).
        segments: Text segments to annotate.

    Returns:
        A list of ``(text, {"entities": [(start, end, LABEL), ...]})`` tuples
        with the spans of each segment sorted by start offset.
    """
    # Longest first so a longer name claims its characters before any shorter
    # tag it contains; ties are broken alphabetically to keep runs deterministic.
    ordered_tags = sorted({tag for tag in tags if tag}, key=lambda t: (-len(t), t))

    TRAIN_DATA: list[tuple] = []
    for segment in segments:
        segment = segment.strip().replace("\n", " ")

        claimed: list[tuple[int, int]] = []
        for tag in ordered_tags:
            start = segment.find(tag)
            while start != -1:
                end = start + len(tag)
                # Keep the span only if it is disjoint from every span kept so far.
                if all(end <= s or start >= e for s, e in claimed):
                    claimed.append((start, end))
                start = segment.find(tag, end)

        # Add to training data
        if claimed:
            annotations = [(s, e, LABEL) for s, e in sorted(claimed)]
            TRAIN_DATA.append((segment, {"entities": annotations}))

    return TRAIN_DATA




# Treat every blank-line-separated chunk of the cleaned book text as one candidate
# training text.
segments = cleaned_text.split("\n\n")
# Only segments that mention at least one name from camp_data end up in TRAIN_DATA.
TRAIN_DATA = annotate_text(CAMP_LABEL, camp_data, segments)
print(f"Compiled {len(TRAIN_DATA)} training examples")

Compiled 17 training examples


In [6]:
# Train model
# MODEL_NAME and the annotated TRAIN_DATA are handed to train_spacy_ner (not defined
# in this notebook; presumably from hc_ner). The trailing 30 is presumably the number
# of training iterations — the cell output counts "Starting iteration 0", 1, 2, ... —
# TODO confirm. Rebinds `nlp`, replacing the entity-ruler model assigned earlier.
nlp = train_spacy_ner(MODEL_NAME, TRAIN_DATA, 30)

Starting training for spacy NER model: holocaust.ner, training data length: 17
Found labels from TRAIN_DATA: {'CONC_CAMP'}
Starting iteration 0




Losses at iteration 0 : {'ner': 27325.1246519768}
Starting iteration 1
Losses at iteration 1 : {'ner': 40.388141277124184}
Starting iteration 2
Losses at iteration 2 : {'ner': 60.16103915955571}
Starting iteration 3
Losses at iteration 3 : {'ner': 34.41090552133292}
Starting iteration 4
Losses at iteration 4 : {'ner': 26.518622853410857}
Starting iteration 5
Losses at iteration 5 : {'ner': 20.64736281383545}
Starting iteration 6
Losses at iteration 6 : {'ner': 17.109430326050607}
Starting iteration 7
Losses at iteration 7 : {'ner': 18.046899015929966}
Starting iteration 8
Losses at iteration 8 : {'ner': 8.941540290892004}
Starting iteration 9
Losses at iteration 9 : {'ner': 10.415300061525151}
Starting iteration 10
Losses at iteration 10 : {'ner': 10.539982445474248}
Starting iteration 11
Losses at iteration 11 : {'ner': 12.45317693160774}
Starting iteration 12
Losses at iteration 12 : {'ner': 3.85623112626943}
Starting iteration 13
Losses at iteration 13 : {'ner': 7.583733991859121}
S

In [13]:
# Test the model

# Load via MODEL_NAME rather than repeating the literal, so this cell always loads
# the same model name that the ruler and training cells were given.
nlp = spacy.load(MODEL_NAME)

# NOTE: TRAIN_DATA[0][0] is a text the model was trained on, so this is only a
# smoke test that the pipeline loads and emits entities, not an evaluation.
doc = nlp(TRAIN_DATA[0][0])

# One line per predicted entity: text, character start, character end, label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Auschwitz 22612 22621 CONC_CAMP
Buchenwald 22622 22632 CONC_CAMP
