# Extracting place names using spaCy and Named Entity Recognition

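This notebook works through all the text files, extracting place names with spaCy's Named Entity Recognition (entities labelled `GPE`). For each book, every distinct place name is appended to the `spacy_ner.txt` file as a tab-separated `book id`, `place name` line for further processing.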

In [1]:
import spacy
from pathlib import Path
import itertools
from tqdm.auto import tqdm
# Load the spaCy pipeline once; the same `nlp` object is reused for every
# block of text processed in the extraction cell below.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Directory holding the `.txt` files that the extraction loop globs over.
output_path = Path("/media/tim/workingData/loc/")

In [3]:
# Looks like this is going to take days to run...

In [None]:
# %%time
# Optional toggles for test runs, left commented out:
#   - delete the previous results file before starting over
#   - limit the run to the first 50 files
# Path("spacy_ner.txt").unlink(missing_ok=True)
# for text_file in list(itertools.islice(output_path.glob("*.txt"), 50)):

# Names of the files handled by earlier runs, so an interrupted run can resume.
# On the very first run the log doesn't exist yet, so start with an empty list
# instead of crashing with FileNotFoundError.
processed_log = Path("spacy_processed.txt")
processed = processed_log.read_text().split("\n") if processed_log.exists() else []
# Set copy of the log for constant-time membership checks; scanning the list
# for every file gets slow once many thousands of names have been logged.
already_processed = set(processed)

# This sets the maximum number of chars spaCy will accept.
# This value is bigger than the default, but not too big or there'll be memory probs.
# Bigger texts are split into chunks lower down...
nlp.max_length = 1600000

for text_file in tqdm(output_path.glob("*.txt")):
    # Skip anything already recorded in the log (lets a restart pick up where it left off)
    if text_file.name in already_processed:
        continue

    places = []
    text_id = text_file.stem
    text = text_file.read_text()

    # If the text files are too big there are memory problems and the kernel dies,
    # so split up big files into paragraph-ish blocks
    if len(text) < 1500000:
        texts = [text]
    else:
        texts = text.split("\n\n")

    for block in texts:
        # Nothing to extract from whitespace-only blocks
        if not block.strip():
            continue
        try:
            doc = nlp(block)
        # spaCy raises a ValueError for blocks longer than nlp.max_length.
        # This is probably because they're undecoded byte strings.
        # We'll just record the id for now and carry on with the next block.
        except ValueError:
            print(text_id)
        else:
            # Get place entities (GPE), trimmed and with embedded newlines removed
            places.extend(
                e.text.strip().replace('\n', '') for e in doc.ents if e.label_ == "GPE"
            )

    if places:
        with Path("spacy_ner.txt").open("a") as results_file:
            # Just saving the place name and the book id for now;
            # set() removes duplicates within this book
            for ent in set(places):
                results_file.write(f"{text_id}\t{ent}\n")

    # Update the record of processed files, in memory and on disk, even when
    # no places were found, so the file isn't re-read after a restart
    processed.append(text_file.name)
    already_processed.add(text_file.name)
    with processed_log.open("a") as processed_file:
        processed_file.write(f"{text_file.name}\n")

Files that raised a `ValueError` during processing (probably encoding errors):

- 09011037.txt