In [2]:
import spacy
from spacy.matcher import PhraseMatcher

# Terminology lookup: find exact occurrences of known phrases in a text.
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)

# Phrases to look for. "non mention" does not occur in the sample text below,
# so it serves as a negative case.
terms = [
    "statistics",
    "continuous probability distribution",
    "Gaussian distribution",
    "non mention",
]

# Patterns are built with nlp.make_doc rather than the full pipeline call,
# which keeps pattern construction fast.
patterns = []
for term in terms:
    patterns.append(nlp.make_doc(term))
matcher.add("TerminologyList", patterns)

sample_text = (
    "In statistics, a normal distribution or Gaussian distribution is a type of "
    "continuous probability distribution for a real-valued random variable. "
    "The general form of its probability density function is"
)
doc = nlp(sample_text)
matches = matcher(doc)

# Each match is (match_id, start, end); print the token offsets and matched text.
for match_id, start, end in matches:
    print([start, end, doc[start:end].text])

[1, 2, 'statistics']
[7, 9, 'Gaussian distribution']
[13, 16, 'continuous probability distribution']


In [5]:
import spacy
from spacy.tokens import DocBin

# Location of the serialized DocBin to inspect. Kept in one named constant so
# the notebook can be repointed without touching the logic below.
DOCBIN_PATH = "/Users/owner/myles-personal-env/Projects/wikiSearch/src/models/test.spacy"  # Replace with your .spacy file path

nlp = spacy.blank("en")  # Replace "en" with your model's language if different
doc_bin = DocBin().from_disk(DOCBIN_PATH)

# Report every entity whose text carries leading or trailing whitespace.
# Comparing against str.strip() flags tabs, newlines and other whitespace
# characters as well as the plain space, so it is a superset of a
# startswith(" ") / endswith(" ") check.
for doc in doc_bin.get_docs(nlp.vocab):
    for ent in doc.ents:
        if ent.text != ent.text.strip():
            print(f"Invalid span in Doc'{ent.text}' in '{doc}'")


# Sanity check on a plain sentence. `nlp` is the blank pipeline created above
# and no entity-recognition component was added to it in this cell, so
# doc.ents is empty here and this prints [].
doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])

Invalid span in Doc' distribution' in 'In the case σ > 0 ,  convergence in distribution means that the cumulative distribution functions of n ( X ¯ n − μ )  converge pointwise to the cdf of the N ( 0 , σ 2 )  distribution: for every real number z ,  where Φ ( z )  is the standard normal cdf evaluated at z .   The convergence is uniform in z  in the sense that where sup  denotes the least upper bound (or supremum) of the set.'
[]


In [8]:
import spacy

# Combine the pretrained small English pipeline with a rule-based entity ruler.
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler")

# A single exact-phrase rule that labels the company name as ORG.
patterns = [
    {
        "label": "ORG",
        "pattern": "MyCorp Inc.",
    },
]
ruler.add_patterns(patterns)

doc = nlp("MyCorp Inc. is a company in the U.S.")

# Collect each recognized entity as a (text, label) pair and print the list.
found_entities = []
for ent in doc.ents:
    found_entities.append((ent.text, ent.label_))
print(found_entities)

[('MyCorp Inc.', 'ORG'), ('U.S.', 'GPE')]


In [10]:
from spacy.lang.en import English

# Chosen approach for string-literal NER matching: an entity ruler on a
# bare English pipeline, with an "id" on each pattern so that different
# surface forms resolve to the same entity id.

nlp = English()
ruler = nlp.add_pipe("entity_ruler")

# "San Francisco" and "San Fran" are matched case-insensitively (LOWER) and
# share the id "san-francisco"; "Apple" is an exact phrase pattern.
patterns = [
    {
        "label": "ORG",
        "pattern": "Apple",
        "id": "apple",
    },
    {
        "label": "GPE",
        "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}],
        "id": "san-francisco",
    },
    {
        "label": "GPE",
        "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}],
        "id": "san-francisco",
    },
]
ruler.add_patterns(patterns)

doc1 = nlp("Apple is opening its first big office in San Francisco.")
doc2 = nlp("Apple is opening its first big office in San Fran.")

# Print (text, label, entity id) for every entity, tagged with the doc's name.
for doc_name, parsed in (("doc1", doc1), ("doc2", doc2)):
    entity_rows = []
    for ent in parsed.ents:
        entity_rows.append((ent.text, ent.label_, ent.ent_id_))
    print(doc_name, entity_rows)

doc1 [('Apple', 'ORG', 'apple'), ('San Francisco', 'GPE', 'san-francisco')]
doc2 [('Apple', 'ORG', 'apple'), ('San Fran', 'GPE', 'san-francisco')]
