<a href="https://colab.research.google.com/github/torquerxf/learn-spacy/blob/main/NLP_livecoding6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# RegEx

In [1]:
import re

In [8]:
# Sample sentence containing two "<day> <month>" dates.
text = "This is a date 18 July. Another date would be 19 January."

# One or two digits, a space, then a full English month name.
# Caveat: `(\d){1,2}` repeats a *capturing* group, so that group keeps only the
# last digit it matched (e.g. '8' for '18'); the outer group holds the whole date.
pattern = (
    r"((\d){1,2} "
    r"(January|February|March|April|May|June|July|August|September|October|November|December))"
)

# With several groups in the pattern, findall yields one tuple of group values per match.
matches = re.compile(pattern).findall(text)
print(matches)

[('18 July', '8', 'July'), ('19 January', '9', 'January')]


In [10]:
# Sample sentence with one "<day> <month>" date and one "<month> <day>" date.
text = "This is a date 18 July. Another date would be January 19."

# Two alternatives wrapped in one outer group:
#   1. day (1-2 digits) followed by a month name
#   2. month name followed by day (1-2 digits)
# Only the groups of the alternative that matched are filled; the rest are ''.
pattern = (
    r"(((\d){1,2}( (January|February|March|April|May|June|July|August|September|October|November|December)))"
    r"|"
    r"(((January|February|March|April|May|June|July|August|September|October|November|December) (\d){1,2})))"
)

# findall returns a 9-tuple per match, one entry per capturing group.
matches = re.compile(pattern).findall(text)
print(matches)

[('18 July', '18 July', '8', ' July', 'July', '', '', '', ''), ('January 19', '', '', '', '', 'January 19', 'January 19', 'January', '9')]


In [None]:
# RegEx with spaCy

In [12]:
import spacy

In [13]:
# Sample sentence with a phone-number-like string; it is the input for the pipeline built below.
text = "This is a sample number 555-5555."

In [15]:
# Create a blank English pipeline (tokenizer only, no trained components).
nlp = spacy.blank("en")

In [16]:
# Add an entity ruler to the pipeline and keep the returned component so patterns can be registered on it.
ruler = nlp.add_pipe("entity_ruler")

In [17]:
# Token-level entity-ruler rule labelled PHONE_NUMBER:
# a three-digit token, an optional "-" token, then a four-digit token.
pattern = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"SHAPE": "ddd"},
            {"ORTH": "-", "OP": "?"},
            {"SHAPE": "dddd"},
        ],
    }
]

In [18]:
# Register the PHONE_NUMBER rule with the entity ruler.
ruler.add_patterns(pattern)

In [19]:
# Run the pipeline (tokenizer + entity ruler) over the sample sentence.
doc = nlp(text)

In [20]:
# Show every entity the ruler found, together with its label.
for ent in doc.ents:
  print(ent.text, ent.label_)

555-5555 PHONE_NUMBER


In [23]:
# Plain-`re` version of the phone-number demo: three digits, a hyphen, five digits.
text = "This is a sample number 555-55555"
pattern = r"((\d){3}-(\d){5})"

# Each repeated `(\d)` group reports only the last digit it captured;
# the outer group carries the full match.
matches = re.compile(pattern).findall(text)
print(matches)

[('555-55555', '5', '5')]


In [26]:
text = "This is a sample number 555-5555."

# Start from an empty English pipeline so the ruler is the only source of entities.
nlp = spacy.blank("en")

ruler = nlp.add_pipe("entity_ruler")

# REGEX is evaluated against each token's text on its own, not the whole sentence,
# which is why only the four-digit token is reported below.
# The regex is written as a raw string: in a normal string literal "\d" is an
# invalid escape sequence that Python warns about (SyntaxWarning on 3.12+).
pattern = [
    {
        "label": "PHONE_NUMBER", "pattern": [{"TEXT": {"REGEX": r"(\d){4}"}}]
    }
]

ruler.add_patterns(pattern)

doc = nlp(text)

# Print each matched entity with its label.
for ent in doc.ents:
  print(ent.text, ent.label_)

5555 PHONE_NUMBER


Matching multi-word tokens with RegEx

In [1]:
import re

# Two full names starting with "Paul" plus one bare "Paul" that must not match.
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

# "Paul", a space, then a capitalised word of at least two characters.
pattern = r"Paul [A-Z]\w+"

# finditer yields match objects lazily, giving access to character offsets.
matches = re.compile(pattern).finditer(text)

for match in matches:
  print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [2]:
import spacy
from spacy.tokens import Span

# Same sample sentence and "Paul <Capitalised word>" regex as in the plain-`re` demo.
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."
pattern = r"Paul [A-Z]\w+"

# A blank English pipeline only tokenizes; it has no entity recogniser.
nlp = spacy.blank("en")
doc = nlp(text)

# Copy the current entities into a mutable list so new spans can be appended later.
original_ents = list(doc.ents)

In [5]:
# Translate each regex hit (character offsets) into token offsets on the Doc.
mwt_ents = []
for match in re.finditer(pattern, doc.text):
  span = doc.char_span(*match.span())
  # char_span gives None when the character range does not line up with token boundaries.
  if span is None:
    continue
  mwt_ents.append((span.start, span.end, span.text))

In [6]:
# Each tuple is (span.start, span.end, span.text) for one regex match.
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


In [7]:
# Turn every collected (start, end, text) triple into a PERSON span and
# append it to the running entity list; the text element is not needed here.
original_ents.extend(
  Span(doc, start, end, label="PERSON") for start, end, _ in mwt_ents
)

In [8]:
# Replace the document's entities with the combined list (existing + regex-derived spans).
doc.ents = original_ents

In [9]:
# Confirm the regex-derived spans are now exposed as entities on the Doc.
for ent in doc.ents:
  print(ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON


In [12]:
from spacy.language import Language

@Language.component("paul_ner")
def paul_ner(doc):
  """Label every "Paul <Capitalised word>" sequence in ``doc`` as a PERSON entity.

  Entities already present on the document are kept. Regex hits are mapped from
  character offsets to token spans; hits that do not align with token
  boundaries are skipped.

  Args:
    doc: the spaCy ``Doc`` passed through the pipeline.

  Returns:
    The same ``Doc`` with ``doc.ents`` updated.
  """
  # Imported locally so the component does not depend on a later notebook cell.
  from spacy.util import filter_spans

  pattern = r"Paul [A-Z]\w+"
  entities = list(doc.ents)

  for match in re.finditer(pattern, doc.text):
    start_char, end_char = match.span()
    # char_span returns None when the offsets do not fall on token boundaries.
    span = doc.char_span(start_char, end_char, label="PERSON")
    if span is not None:
      entities.append(span)

  # doc.ents rejects overlapping spans, which happens as soon as an earlier
  # component (e.g. a trained NER) has already tagged the same tokens.
  # filter_spans resolves overlaps, preferring the longest span.
  doc.ents = filter_spans(entities)

  return doc

In [13]:
# Fresh blank English pipeline whose only component is the custom "paul_ner".
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

In [14]:
# Run the custom pipeline on the sample sentence and show the entities it set.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [24]:
from spacy.util import filter_spans

@Language.component("cinema_ner")
def cinema_ner(doc):
  """Add a CINEMA entity for each "Hollywood" occurrence, then de-overlap.

  Existing entities on ``doc`` are kept as candidates. Every regex hit that
  aligns with token boundaries becomes a CINEMA span. The combined candidates
  are passed through ``filter_spans`` before being written to ``doc.ents``, so
  overlapping spans are resolved rather than rejected.

  Args:
    doc: the spaCy ``Doc`` passed through the pipeline.

  Returns:
    The same ``Doc`` with ``doc.ents`` replaced by the filtered spans.
  """
  hollywood_re = r"Hollywood"
  candidates = list(doc.ents)

  # Map character-level regex hits to token-level (start, end) offsets.
  token_bounds = []
  for hit in re.finditer(hollywood_re, doc.text):
    aligned = doc.char_span(*hit.span())
    if aligned is None:
      # Hit does not line up with token boundaries; ignore it.
      continue
    token_bounds.append((aligned.start, aligned.end))

  candidates.extend(
    Span(doc, first, last, label="CINEMA") for first, last in token_bounds
  )

  doc.ents = filter_spans(candidates)
  return doc

In [25]:
# Load the "en_core_web_sm" pipeline (the package must be installed) and add the custom
# "cinema_ner" component to it.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

In [27]:
# Run the combined pipeline and list the final entities.
# NOTE(review): the recorded output has no CINEMA entity; presumably filter_spans kept the
# longer, overlapping "Paul Hollywood" PERSON span over "Hollywood" (confirm against the spaCy docs).
doc3 = nlp3(text)
for ent in doc3.ents:
  print(ent.text, ent.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON
