# Basic Functions

- Create blank spacy model.
- Add a custom NER pipeline component that identifies custom labels.
- The label used here is CONC_CAMP (concentration camp).

In [None]:
# Basic Functions
import re
from hc_ner import *

def clean_hc_file(file):
    """Collect the first capital-led, tab-terminated field of each line.

    The file is read as UTF-8. For every line that contains a span starting
    at an uppercase ASCII letter and running (non-greedily) up to the next
    tab, that span is kept with the tab removed. Lines without such a span
    are skipped.

    Returns the collected strings as a list, in file order.
    """
    first_field = re.compile(r"[A-Z].*?\t")
    entries = []
    with open(file, 'r', encoding='utf-8') as handle:
        for row in handle:
            hit = first_field.search(row)
            if hit is None:
                continue
            entries.append(hit.group(0).replace("\t", ""))
    return entries

In [None]:
# Finding and Downloading Data

# https://en.wikipedia.org/wiki/List_of_subcamps_of_Auschwitz
# https://collections.ushmm.org/search/

# Clean the raw camp and ghetto lists.
# clean_hc_file keeps, per line, the first capital-led field that ends at a
# tab; the resulting list of names is then handed to save_data.
# NOTE(review): save_data is not defined in this file -- presumably it comes
# from the `from hc_ner import *` star import and serialises to JSON; confirm.
camp_file = './data/hc/camps.txt'
camp_data = clean_hc_file(camp_file)
save_data('./data/hc/camps.json', camp_data)

# Same processing for the ghetto list.
ghetto_file = './data/hc/ghettos.txt'
ghetto_data = clean_hc_file(ghetto_file)
save_data('./data/hc/ghettos.json', ghetto_data)


In [None]:
# import PyPDF2
import pdfplumber


def extract_pdf(file_name):
    """Return the extracted text of every page in a PDF, one string per page.

    Args:
        file_name: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one entry per page, in page order. Each entry is always
        a ``str`` so the result can be joined directly.
    """
    with pdfplumber.open(file_name) as pdf:
        # NOTE(review): some pdfplumber releases are understood to return
        # None from extract_text() for a page without text -- confirm for
        # the installed version. Normalising to "" keeps the later
        # "\n".join(...) from raising TypeError in that case.
        return [page.extract_text() or "" for page in pdf.pages]


# Generate training examples
# Path of the source PDF the training text is taken from.
pdf = './data/hc/pdfs/200090122-echoes-vol_1.pdf'

# Read all pages of the PDF, then drop the first three entries of the
# returned per-page list (presumably front matter -- TODO confirm).
data = extract_pdf(pdf)[3:]

In [None]:
# Merge the per-page strings into one newline-separated text.
# (No cleaning is performed in this cell; it only joins.)
full_text = "\n".join(data)

In [None]:
#print(full_text)
# Hand the merged text to save_data under a .txt path.
# NOTE(review): save_data is not defined in this file (presumably from
# hc_ner); check what on-disk format it writes for a plain string.
save_data('./data/hc/full_text1.txt', full_text)

In [None]:
# Imported here so this cell does not rely on a later cell (or on the star
# import from hc_ner) having already brought `spacy` into scope.
import spacy

TRAIN_DATA = []

# New spacy model just for training preparation
nlp = spacy.blank("en")

# Split the merged book text on the literal marker 'CHAPTER'.
chapters = full_text.split('CHAPTER')

for chapter in chapters:
    # Within a chapter, segments are separated by a blank line.
    for segment in chapter.split("\n\n"):
        # Trim the segment and flatten its remaining line breaks to spaces.
        segment = segment.strip().replace("\n", " ")
        # Collecting training examples is currently disabled; the cell only
        # previews each segment.
        # results = build_test_model(nlp, segment)
        # if results != None:
        #     TRAIN_DATA.append(results)
        print(segment)

In [None]:
# Dump the whole merged text for visual inspection.
print(full_text)

In [None]:

def prepare_training_data(data):
    """Pair every item of ``data`` with the ``"CONC_CAMP"`` label.

    Returns a new list of two-element lists ``[item, "CONC_CAMP"]``, in the
    same order as the input.
    """
    return [[entry, "CONC_CAMP"] for entry in data]

In [None]:
import spacy

# Create a blank English spaCy pipeline to hold the NER component.
nlp = spacy.blank("en")


# Build the training data from the saved full-text file.
# NOTE(review): book_extract_train_data_doublespaced_segments is not defined
# in this file (presumably from hc_ner); its return format is not visible here.
TRAIN_DATA = book_extract_train_data_doublespaced_segments('./data/hc/full_text1.txt')

# Add NER pipeline
ner = nlp.add_pipe("ner")

# Add labels before training
ner.add_label("CONC_CAMP")




# NOTE(review): the training loop below is commented out, so nothing in this
# cell trains the "ner" component before the pipeline is written to disk.
# The loop also refers to `random`, `Example` and `optimizer`, none of which
# are defined in the visible code -- they must be supplied before re-enabling.
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
# with nlp.select_pipes(disable=other_pipes):
#     nlp.begin_training()
#     for itn in range(10):
#         random.shuffle(TRAIN_DATA)
#         losses = {}
#         for text, annotations in TRAIN_DATA:
#             example = Example.from_dict(nlp.make_doc(text), annotations)
#             nlp.update([example], drop=0.2, sgd=optimizer, losses=losses)
#         print(losses)

# Alternative (disabled): obtain the pipeline from a train_spacy helper.
#nlp = train_spacy

# Serialise the pipeline to the path "holocaust.ner".
nlp.to_disk("holocaust.ner")

In [None]:
# Bare expression: shows the current value of TRAIN_DATA as the cell output.
TRAIN_DATA

In [54]:

import fitz, re
from string import printable

# Whole-line strings to scrub from the extracted page text. strip_headers
# strips each entry, and is_valid_line compares the stripped line against
# them for exact equality (so "O F" only matches a line that is just "O F").
SCRUB_LIST = [
    "U N I T E D",
    "S T A T E S",
    "H O L O C A U S T",
    "M E M O R I A L",
    "M U S E U M",
    "C R E D I T S",
    "E C H O E S",
    "O F",
    "M E M O R Y",
    "of memory",
    "e c h o e s"
]

# Matches a line made up solely of capital letters A-Z, optionally separated
# by spaces (e.g. "V" or "U N I T E D  S T A T E S").
PATTERN = r'^([A-Z]( )*)+[A-Z]*$'

def is_valid_line(line, clean_list):
    """Return True when ``line`` should be scrubbed as boilerplate.

    Despite the name, a True result marks the line for *removal*. A line
    qualifies when, after surrounding whitespace is stripped, it

    * equals one of the entries in ``clean_list``, or
    * consists only of capital letters and spaces (see ``PATTERN``), or
    * consists only of digits (e.g. a page number).

    All three checks use the stripped line. Previously only the scrub-list
    check did, so an indented heading or a padded page number was missed.

    Args:
        line: One line of extracted text.
        clean_list: Already-stripped strings to match exactly.

    Returns:
        bool
    """
    stripped = line.strip()
    return (
        stripped in clean_list
        or re.search(PATTERN, stripped) is not None
        or stripped.isdigit()
    )

# Strip head lines
def strip_headers(text, scrub_list=None):
    """Remove header/footer boilerplate lines from one page of text.

    The text is split on newlines and each line is reduced to the characters
    found in ``string.printable``. A line is then dropped when

    * ``is_valid_line`` flags it (scrub-list entry, spaced capitals, or
      digits only), or
    * it has a flagged line directly before AND directly after it.

    Every removal is reported with ``print``. Surviving lines are stripped of
    surrounding whitespace and re-joined with newlines.

    NOTE(review): ``string.printable`` is ASCII-only, so accented letters and
    typographic quotes/dashes are silently deleted from the text. That may
    matter for place names; confirm it is intended.

    Args:
        text: Raw text of a page.
        scrub_list: Optional iterable of exact line strings to remove.
            ``None`` (the default) means no extra strings.

    Returns:
        The cleaned text as a single newline-joined string.
    """
    # Strip scrub list into new list
    clean_list = [s.strip() for s in (scrub_list or [])]

    # Cleanup hidden characters
    allowed = set(printable)
    cleaned = [
        ''.join(char for char in line if char in allowed)
        for line in text.split('\n')
    ]

    # Classify each line once instead of up to three times.
    flagged = [is_valid_line(line, clean_list) for line in cleaned]
    last = len(cleaned) - 1

    final_list = []
    for i, line in enumerate(cleaned):
        if flagged[i]:
            print(f">>> Removing line [{line}]")
        elif 0 < i < last and flagged[i - 1] and flagged[i + 1]:
            # Both neighbours must really exist: without the `0 < i` guard,
            # index -1 wraps round to the last line for the first line.
            print(f">>> Removing line [Inbetween]: [{line}]")
        else:
            final_list.append(line.strip())

    return '\n'.join(final_list)

def extract_pdf_text_pymupdf(file_name, scrub_list=None):
    """Extract header-stripped text from every page of a PDF with PyMuPDF.

    Each page's text is passed through ``strip_headers``. Progress (the page
    count and a marker per page) is printed.

    Args:
        file_name: Path of the PDF, passed to ``fitz.open``.
        scrub_list: Optional list of exact line strings to remove from every
            page. ``None`` (the default) means no extra strings.

    Returns:
        A tuple ``(pages, final_text)``. ``pages`` is the list of cleaned
        page strings. ``final_text`` is those strings concatenated, each
        followed by a newline.
    """
    scrub_list = [] if scrub_list is None else scrub_list

    pages = []
    # The context manager closes the document handle even if a page fails.
    with fitz.open(file_name) as doc:
        print(f'Document has {doc.page_count} pages')

        for i, page in enumerate(doc):
            print(f'<<< Page {i} >>>')
            pages.append(strip_headers(page.get_text(), scrub_list))

    # One join instead of repeated string concatenation in the loop.
    final_text = "".join(p + "\n" for p in pages)

    return pages, final_text

# Test main
# Uses the same relative location as the `pdf = './data/hc/pdfs/...'`
# assignment earlier in this notebook, instead of an absolute path into one
# user's home directory, so the cell runs on any checkout.
pages, final_text = extract_pdf_text_pymupdf('./data/hc/pdfs/200090122-echoes-vol_1.pdf', SCRUB_LIST)



Document has 97 pages
<<< Page 0 >>>
>>> Removing line [e c h o e s]
>>> Removing line [ of memory]
>>> Removing line [1]
>>> Removing line [U N I T E D  S T A T E S  H O L O C A U S T  M E M O R I A L  M U S E U M]
<<< Page 1 >>>
>>> Removing line [E C H O E S]
>>> Removing line [of memory]
>>> Removing line [1]
>>> Removing line [V]
>>> Removing line [O]
>>> Removing line [L]
>>> Removing line [U]
>>> Removing line [M]
>>> Removing line [E]
>>> Removing line [U N I T E D]
>>> Removing line [S T A T E S]
>>> Removing line [H O L O C A U S T]
>>> Removing line [M E M O R I A L]
>>> Removing line [M U S E U M]
>>> Removing line [S T O R I E S]
>>> Removing line [F R O M]
>>> Removing line [T H E]
>>> Removing line [M E M O R Y]
>>> Removing line [P R O J E C T]
<<< Page 2 >>>
<<< Page 3 >>>
>>> Removing line [U N I T E D]
>>> Removing line [S T A T E S]
>>> Removing line [H O L O C A U S T]
>>> Removing line [M E M O R I A L]
>>> Removing line [M U S E U M]
>>> Removing line [C O N T E 

In [55]:
# pages[27]
# print(final_text)

'Ah, there is where our shelter must have been, the man said. I recognize remnants of our blanket.\nThey were coming closer; I could sense they were just steps away.\nLook, the woman exclaimed, as she pointed down at me, the teapot. She leaned down and picked\nup what was left of me. Look, it is ruined. Oh, the teapotruined. The man reached out to hold me.\nBoth cradled my ravaged body in their hands.\nOver there, the woman said, pointing, the lid. So my top had rolled farther after separating from\nmy body. She went to pick it up. Please take me away with you, I pined.\nTo think what this teapot has gone through. I feel it has been like a trusted friend, always there when\nneeded, but now gravely ill. We cannot leave the teapot here, at least it will be a souvenir of our past life,\nsaid the man. It survived, let us take it along.\nHad I a set of hands I would have clasped them in a thanksgiving prayer.\nk\nFor me, life was changed forever. The couple wrapped me in old rags and took m

In [56]:
# Clean each segment


v
o
l
u
m
e


This program has been made possible in part with support from the
Helena Rubinstein Foundation.


Echoes of Memory
LETTER FROM THE DIRECTOR ................................................................i
FOREWORD ......................................................................................iii
Elizabeth Anthony, Memory Project Coordinator
INTRODUCTION ..................................................................................v
Margaret Peterson, Memory Project Instructor
ERIKA ECKSTUT ................................................................................1
Teach Love, Not Hate ....................................................................2
Lasting Memory ..........................................................................4
FRANK EPHRAIM ................................................................................5
Sardines ....................................................................................6
Lunch Trade* ...........