In [17]:
import spacy
from pathlib import Path
import itertools
from tqdm.auto import tqdm
from spacy.matcher import Matcher, PhraseMatcher
nlp = spacy.load("en_core_web_sm")

In [16]:
# Tiny inline sample: three sentences, two of which mention "Australia".
text = """
This is a sentence. This is a sentence about Australia. You see, Australia was a nice country.
"""

In [5]:
# Directory holding the plain-text book files. Kept as a single constant so the
# notebook can be pointed at another machine's copy of the data in one place.
DATA_DIR = Path("/media/tim/workingData/loc")

# The one book used for the timing experiments in this notebook.
text_file = DATA_DIR / "00000410.txt"

Does it make any difference to processing speed if we just look for a single token rather than POS?

In [25]:
%%time
# Token pattern for the exact two-token sequence "North America". ORTH compares
# the verbatim token text only; the POS-based variant of this experiment
# ("America" followed by an AUX/VERB token) is timed in a separate cell.
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "North"}, {"ORTH": "America"}]
matcher.add("country1", [pattern])

# Decode explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the book file is UTF-8 - confirm.
# The timings recorded in this notebook are ~11 s for both the ORTH-only and the
# POS pattern, so the cost appears to be dominated by this nlp(...) call rather
# than by the matching itself.
doc = nlp(text_file.read_text(encoding="utf-8"))
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    # Show the whole sentence that contains the match, not just the two tokens.
    print(span.sent)


It spread to China 
and Hindustan, to the Indian tribes of North America, 
and to South America.
The Jesuits in North America in the Seventeenth 

Century. 
Parsons, J. Russell.
CPU times: user 10.6 s, sys: 1.05 s, total: 11.7 s
Wall time: 11.7 s


If I'm not getting POS then I can use the PhraseMatcher to search for all terms in a text at once! Much faster. Also avoids problems with tokenising the texts.

In [22]:

# Search for every term in one pass with a PhraseMatcher.
matcher = PhraseMatcher(nlp.vocab)
terms = ["America", "North America", "Germany", "China"]
# Only run nlp.make_doc to speed things up: the patterns just need tokenising.
# The loop variable is called `term` so it is not confused with the
# notebook-level `text` sample string.
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("TerminologyList", patterns)

# Decode explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the book file is UTF-8 - confirm.
doc = nlp(text_file.read_text(encoding="utf-8"))
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    # Matched term first, then the sentence it occurs in.
    print(span.text, span.sent)

China If desirable the teacher can omit the chapters on China, 
India, Persia, and Israel.
China CHAPTER II 

China 20 

I. Geography and history
Germany In Germany.
Germany The School System of Germany 289 

I. Administration.
Germany The last chapters of this book, therefore, are devoted to a 
study of the school systems of Germany, France, England, 
and America.
America The last chapters of this book, therefore, are devoted to a 
study of the school systems of Germany, France, England, 
and America.
China r China. 

China The result 
in China furnishes the best argument against a method of 
instruction that appeals solely to the memory.
China The 

1 Mrs. E. E. Baldwin, Foochow, China. 

'■^ Houghton, " Women of the Orient," p. 14. 




China China is divided into provinces which are subdivided 
into districts.
China During the last few years, military, polytechnic, and 
other high schools have been founded in China, and teach- 
ers from France, Germany, England, and America have 
b

In [9]:
%%time
# Token pattern: the token "America" directly followed by a token whose
# part-of-speech tag is AUX or VERB.
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "America"}, {"POS": {"IN": ["AUX", "VERB"]}}]
matcher.add("country1", [pattern])

# Decode explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the book file is UTF-8 - confirm.
doc = nlp(text_file.read_text(encoding="utf-8"))
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    # Show the whole sentence that contains the match.
    print(span.sent)

During the last few years, military, polytechnic, and 
other high schools have been founded in China, and teach- 
ers from France, Germany, England, and America have 
been placed in charge of them.
With free schools, abler teachers, consecrated to their 
calling, and better courses of instruction ; with a people 
generous in expenditures for educational purposes, a co- 
operation of parents and teachers, and a willingness to learn 
from other nations ; with the many educational periodicals, 
the pedagogical books, and teachers' institutes to broaden 
and stimulate the teacher, — the friends of education in 
America may labor on assured that the new century will 
give abundant fruitage to the work which has so marvel- 
ously prospered in the old. 




CPU times: user 10.4 s, sys: 948 ms, total: 11.3 s
Wall time: 11.3 s


If I collect all the sentences first, can I then filter by POS? Yes, and it's quick.

In [11]:
%%time
# Token pattern: the token "America" directly followed by a token whose
# part-of-speech tag is AUX or VERB.
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "America"}, {"POS": {"IN": ["AUX", "VERB"]}}]
matcher.add("country1", [pattern])

# Two already-extracted sentences: the first contains "America have" and should
# match; the second mentions America without a following verb and should not.
sample_sentences = [
    """During the last few years, military, polytechnic, and 
other high schools have been founded in China, and teach- 
ers from France, Germany, England, and America have 
been placed in charge of them.""",
    """Another sentence with the word America in it.""",
]

# One parse/match/print pass per sentence (previously the same three steps were
# written out twice by hand).
for sentence in sample_sentences:
    doc = nlp(sentence)
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        print(span.sent)

During the last few years, military, polytechnic, and 
other high schools have been founded in China, and teach- 
ers from France, Germany, England, and America have 
been placed in charge of them.
CPU times: user 19.4 ms, sys: 0 ns, total: 19.4 ms
Wall time: 17.9 ms


Ok, so I'll get all the sentences that mention a country, and then I'll filter later to find specific POS patterns. This means I only do the time-consuming full parsing once, and have the flexibility later to change what I'm looking for.

In [None]:
# Get text refs that are linked to countries

# Get books containing text refs
# Get sentences containing text refs from books

In [27]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher

# Load the pipeline and build a Matcher that shares its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Appended to by the collect_sents callback below and later handed to
# displacy.render; each entry is a dict with "text" and "ents" keys.
matched_sents = []  # Collect data of matched sentences to be visualized

def collect_sents(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: record the sentence around match ``i``.

    Appends one entry to the module-level ``matched_sents`` list. Each entry is
    a dict with the sentence text under ``"text"`` and, under ``"ents"``, a
    single mock entity labelled ``"MATCH"`` that marks the matched span.

    Args:
        matcher: The matcher that fired the callback (unused here).
        doc: The document the matches were found in.
        i: Index into ``matches`` of the match being handled.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, tok_start, tok_end = matches[i]
    hit = doc[tok_start:tok_end]
    sentence = hit.sent

    # The entry's text is the sentence alone, so the span's document-level
    # character offsets are shifted to be relative to the sentence start.
    shift = sentence.start_char
    mock_entity = {
        "start": hit.start_char - shift,
        "end": hit.end_char - shift,
        "label": "MATCH",
    }
    matched_sents.append({"text": sentence.text, "ents": [mock_entity]})

# Pattern, one token spec per line: "facebook" in any casing, an auxiliary,
# zero or more adverbs, then an adjective.
pattern = [
    {"LOWER": "facebook"},
    {"POS": "AUX"},
    {"POS": "ADV", "OP": "*"},
    {"POS": "ADJ"},
]
# Register the pattern with collect_sents as its on_match callback.
matcher.add("FacebookIs", [pattern], on_match=collect_sents)

doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
# The callback appends to matched_sents as a side effect of running the matcher.
matches = matcher(doc)

# manual=True: render the pre-built dicts in matched_sents instead of a Doc.
displacy.render(matched_sents, style="ent", manual=True)