# Extract sentences containing country names from the texts

Using the `refs_countries.csv` dataset, which matches place name references to Wikidata countries, we can extract the sentences containing those place names from the original texts.

In [1]:
import pandas as pd
import spacy
import re
from pathlib import Path
import itertools
from tqdm.auto import tqdm
from spacy.matcher import Matcher, PhraseMatcher
# Load spaCy's small English pipeline; the extraction cell below relies on it for
# sentence boundaries (span.sent) and part-of-speech tags (token.pos_)
nlp = spacy.load("en_core_web_sm")

In [2]:
# Read the table of place-name references matched to Wikidata countries.
# book_id and text are read as strings so zero-padded ids (e.g. "00000003") keep their padding.
df = pd.read_csv(
    "refs_countries.csv",
    dtype={"book_id": str, "text": str},
    low_memory=False,
)

In [3]:
# Preview the loaded reference table
df

Unnamed: 0,book_id,text,country,startDate,endDate,countryLabel,countryTypeLabel,lat,lon
0,00000003,Scotland,http://www.wikidata.org/entity/Q230791,0843-01-01T00:00:00Z,1707-04-30T00:00:00Z,Kingdom of Scotland,historical country,,
1,00000036,Scotland,http://www.wikidata.org/entity/Q230791,0843-01-01T00:00:00Z,1707-04-30T00:00:00Z,Kingdom of Scotland,historical country,,
2,00000064,Scotland,http://www.wikidata.org/entity/Q230791,0843-01-01T00:00:00Z,1707-04-30T00:00:00Z,Kingdom of Scotland,historical country,,
3,00000068,Scotland,http://www.wikidata.org/entity/Q230791,0843-01-01T00:00:00Z,1707-04-30T00:00:00Z,Kingdom of Scotland,historical country,,
4,00000108,Scotland,http://www.wikidata.org/entity/Q230791,0843-01-01T00:00:00Z,1707-04-30T00:00:00Z,Kingdom of Scotland,historical country,,
...,...,...,...,...,...,...,...,...,...
294229,06014839,Columbia District,http://www.wikidata.org/entity/Q2980922,1810-01-01T00:00:00Z,1858-08-02T00:00:00Z,Columbia District,historical country,,
294230,06015303,Morgannwg,http://www.wikidata.org/entity/Q21005050,1063-01-01T00:00:00Z,1091-01-01T00:00:00Z,Kingdom of Morgannwg,historical country,,
294231,06016559,Valachia,http://www.wikidata.org/entity/Q389004,1330-01-01T00:00:00Z,1859-01-01T00:00:00Z,Principality of Wallachia,historical country,,
294232,06022733,Lower Burgundy,http://www.wikidata.org/entity/Q33242,0855-09-23T00:00:00Z,0933-01-01T00:00:00Z,Lower Burgundy,historical country,,


In [None]:
# Extract, for every book, the sentences that mention one of its place names.
#
# Results are appended to `sentences.tsv` as: book_id <TAB> place name <TAB> sentence.
# Finished books are logged in `books_processed.txt` so an interrupted run can be
# resumed by simply re-running this cell.

output_file = Path("sentences.tsv")
processed_log = Path("books_processed.txt")
# Directory holding one plain-text file per book, named <book_id>.txt
text_dir = Path("/media/tim/workingData/loc")

# Raise spaCy's document-length limit, and split anything above the (lower)
# chunking threshold into paragraphs so each piece stays comfortably below it
nlp.max_length = 1600000
chunk_threshold = 1400000

verb_tags = {"VERB", "AUX"}


def build_term_patterns(term):
    """Return the Matcher token patterns used to find `term` in a parsed text.

    Place names consisting of several words are split on whitespace and each
    word is matched exactly (ORTH), in order.

    Patterns:
      1. place name followed by a verb or auxiliary
      2. preposition followed by the place name
      3. punctuation, place name, punctuation (to catch lists of names)
    """
    term_pattern = [{"ORTH": word} for word in term.split()]
    return [
        term_pattern + [{"POS": {"IN": ["AUX", "VERB"]}}],
        [{"POS": "ADP"}] + term_pattern,
        [{"IS_PUNCT": True}] + term_pattern + [{"IS_PUNCT": True}],
    ]


# On a first run the log does not exist yet -- start with nothing processed
if processed_log.exists():
    processed = processed_log.read_text(encoding="utf-8").split("\n")
else:
    processed = []
# Set copy of the log for fast membership checks
already_done = set(processed)

# Loop through each individual book, getting the list of references for that book
for book_id, refs in tqdm(df.groupby(by="book_id")):
    if book_id in already_done:
        continue

    text_file = text_dir / f"{book_id}.txt"
    if not text_file.exists():
        # Report and move on rather than aborting the whole run; the book is NOT
        # logged as processed, so it is retried once the file is available
        print(f"Skipping {book_id}: {text_file} not found")
        continue

    # The same place name can be listed several times for one book (e.g. matched to
    # more than one country); search for each name once. Missing and blank names are
    # dropped -- a blank name would yield a pattern that matches every verb.
    terms = [term for term in dict.fromkeys(refs["text"].dropna()) if term.strip()]
    # Patterns depend only on the term, so build them once per book
    patterns_by_term = {term: build_term_patterns(term) for term in terms}

    text = text_file.read_text(encoding="utf-8")
    # Chunkify if too big to keep spaCy happy
    if len(text) < chunk_threshold:
        texts = [text]
    else:
        texts = text.split("\n\n")

    # Rows are buffered and written in one go once the book is finished, which
    # narrows the window in which an interruption leaves partial output behind
    rows = []
    for block in texts:
        if not block.strip():
            continue
        doc = nlp(block)
        # Loop through the list of place names in this book
        for term, patterns in patterns_by_term.items():
            # Author's note: instantiating the Matcher once and removing the patterns
            # at the end of each iteration made memory consumption keep growing;
            # re-initialising it every time avoids the problem
            matcher = Matcher(nlp.vocab)
            matcher.add("country_refs", patterns)

            # Collect the distinct sentences containing a match, in order of first
            # appearance (a dict is used as an ordered set)
            sentences = {}
            for match_id, start, end in matcher(doc):
                # Collapse line breaks and runs of whitespace into single spaces
                sentence = re.sub(r"\s+", " ", str(doc[start:end].sent))
                sentences[sentence] = None

            # Only keep sentences that contain a verb
            for sent in sentences:
                if any(token.pos_ in verb_tags for token in nlp(sent)):
                    rows.append(f"{book_id}\t{term}\t{sent}\n")

    # Save book id, place name, and sentence to the output file
    if rows:
        with output_file.open("a", encoding="utf-8") as output:
            output.writelines(rows)

    # Record the book as done so it is skipped when the cell is re-run
    processed.append(book_id)
    already_done.add(book_id)
    with processed_log.open("a", encoding="utf-8") as processed_file:
        processed_file.write(f"{book_id}\n")