# Extract multiword token

In [45]:
import re 

In [46]:
# Sample sentence: two "Paul <Surname>" mentions plus one bare "Paul".
text = (
    "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. "
    "The name Paul is quite common."
)

In [47]:
# Match the literal "Paul", a single space, one uppercase ASCII letter and then
# one or more word characters, i.e. "Paul" followed by a capitalised word.
# A bare "Paul" (no capitalised word after it) does not match.
pattern = r"Paul [A-Z]\w+"

In [48]:
# re.finditer returns a one-shot iterator; materialise it into a list so that
# any cell looping over `matches` still produces output when it is re-run.
matches = list(re.finditer(pattern, text))

In [49]:
# Show every match object: its character span and the text it matched.
for found in matches:
    print(found)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


# Reconstructing the spans

In [50]:
import spacy
from spacy.tokens import Span

In [51]:
# Create a blank English pipeline and run it over the sample text.
nlp = spacy.blank('en')
doc = nlp(text)

# Snapshot the entities currently attached to the doc and display them.
orginal_ents = [*doc.ents]
print(orginal_ents)

[]


In [52]:
# The list is empty because we created a blank spaCy English model, which has no entity recognizer; we will add the entities to it ourselves below.

In [53]:
# Start again from a fresh blank pipeline so the doc carries no entities yet.
nlp = spacy.blank('en')
doc = nlp(text)
orginal_ents = list(doc.ents)

# Translate each regex hit (character offsets) into a token-level span.
# Hits for which char_span yields None are skipped.
char_spans = (doc.char_span(*hit.span()) for hit in re.finditer(pattern, doc.text))
mwt_ents = [(s.start, s.end, s.text) for s in char_spans if s is not None]

# Inject the spans into doc.ents, labelled as PERSON.
orginal_ents.extend(
    Span(doc, tok_start, tok_end, label="PERSON")
    for tok_start, tok_end, _ in mwt_ents
)
doc.ents = orginal_ents

for entity in doc.ents:
    print(entity.text, entity.label_)

Paul Newman PERSON
Paul Hollywood PERSON


# Giving priority to longer spans

Let's create a new entity type, say for words associated with cinema, so that we can
tag Hollywood as “CINEMA”.
In the text above, Hollywood is clearly part of the name Paul Hollywood, but
let’s imagine for a moment that it is not. Let’s try running the same code as above with the new label.
If we do, we get an error.

In [54]:
# Collect the regex matches again as (token_start, token_end, text) tuples.
# `pattern` is unchanged, so these are the same spans that were used for PERSON.
mwt_ents = []
for match in re.finditer(pattern, doc.text):  # iterate over the results from re.finditer
    start, end = match.span()
    span = doc.char_span(start, end)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))

# Build the candidate list on a copy so that `orginal_ents` is not mutated and
# re-running this cell always starts from the same state.
candidate_ents = list(orginal_ents)
for start, end, name in mwt_ents:
    candidate_ents.append(Span(doc, start, end, label="CINEMA"))

# Assigning overlapping spans to doc.ents is the error being demonstrated, so
# catch and display it instead of aborting a "Run All" of the notebook.
try:
    doc.ents = candidate_ents
except ValueError as err:
    print(f"ValueError: {err}")
else:
    for ent in doc.ents:
        print(ent.text, ent.label_)

ValueError: [E1010] Unable to set entity information for token 0 which is included in more than one span in entities, blocked, missing or outside.

This is because the same tokens would belong to two entities at once: one labelled PERSON and one labelled CINEMA. Entities in `doc.ents` may not overlap, so we have to give priority to the longer span.

In [59]:
# using custom components
from spacy.language import Language
from spacy.util import filter_spans # for giving priority to longer spans
@Language.component("cinema_ner")
def cinema_ner(doc):
    """Add "Hollywood" mentions to ``doc.ents`` as CINEMA entities.

    Every regex hit for "Hollywood" in ``doc.text`` is turned into a span
    labelled "CINEMA" and appended to the entities already on the doc. The
    combined list is then passed through ``filter_spans`` so that, where spans
    overlap, the longer one is kept (e.g. an existing "Paul Hollywood" entity
    wins over the bare "Hollywood" span).

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``doc`` with ``doc.ents`` replaced by the filtered entities.
    """
    pattern = r"Hollywood"
    candidate_ents = list(doc.ents)

    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        # char_span returns None when the character offsets do not line up
        # with token boundaries; such hits are skipped.
        span = doc.char_span(start, end, label="CINEMA")
        if span is not None:
            candidate_ents.append(span)

    # Resolve overlaps in favour of the longest span before assigning, because
    # doc.ents rejects overlapping entities.
    doc.ents = filter_spans(candidate_ents)
    return doc

In [60]:
# Load the pretrained small English pipeline, then register the custom
# "cinema_ner" component on it (the call's return value is the cell output).
nlp2 = spacy.load("en_core_web_sm")
nlp2.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [61]:
# Run the extended pipeline on the sample text and list the final entities.
doc2 = nlp2(text)
for entity in doc2.ents:
    print(entity.text, entity.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON
