In [188]:
import spacy

In [189]:
nlp = spacy.load("en_core_web_sm")

In [190]:
# Read the Wikipedia extract. The encoding is given explicitly so that the
# non-ASCII characters in the article (e.g. the en dash in date ranges) are
# decoded the same way on every platform instead of via the locale default.
with open("Data/wiki_us.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [191]:
# Print the raw text that was read from the file
print(text)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America. It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j] At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d] The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world. The national capital is Washington, D.C., and the most populous city is New York.

Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century. The United States emerged from the thirteen British colonies est

In [192]:
# creating a document object
doc = nlp(text)

In [193]:
print(len(text))
print(len(doc))

3521
654


In [194]:
# iterating over the text object
for token in text[:10]:
    print(token)

T
h
e
 
U
n
i
t
e
d


In [195]:
for token in doc[:10]:
    print(token)

The
United
States
of
America
(
U.S.A.
or
USA
)


In [196]:
for sent in doc.sents:
    print(sent)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.
It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j]
At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d]
The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world.
The national capital is Washington, D.C., and the most populous city is New York.


Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century.
The United States emerged from the thirteen British colonies es

In [197]:
# Sentence Boundary Detection
sentence1 = list(doc.sents)[0]
print(sentence1)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


In [198]:
# Token Attributes
token2 = sentence1[2]

In [199]:
print(token2)

States


In [200]:
# Finding information from a text
token2.text

'States'

In [201]:
# left_edge is the leftmost token of this token's syntactic subtree,
# i.e. where the multi-word phrase headed by this token begins
token2.left_edge

The

In [202]:
token2.right_edge

,

In [203]:
token2.right_edge

,

In [204]:
token2.ent_type

384

In [205]:
# GPE: geopolitical entity
token2.ent_type_

'GPE'

In [206]:
# IOB code of the entity tag: "B" = the token begins an entity,
# "I" = the token is inside an entity, "O" = the token is outside any entity
token2.ent_iob_

'I'

In [207]:
# The lemma is the base (dictionary) form of the word
token2.lemma_

'States'

In [208]:
# Lemma of token 12 of the sentence: the inflected form "known" reduces to "know"
sentence1[12].lemma_

'know'

In [209]:
# Print the surface form of that same token, for comparison with its lemma
print(sentence1[12])

known


In [210]:
# morphological extraction
token2.morph

Number=Sing

In [211]:
sentence1[12].morph

Aspect=Perf|Tense=Past|VerbForm=Part

In [212]:
# The pos means part of speech, where propn means proper noun
token2.pos_

'PROPN'

In [213]:
# dep_ is the token's syntactic dependency label, i.e. its relation to its head
# (for example, nsubj means nominal subject)
print(sentence1[12].dep_)
print(token2.dep_)

acl
nsubj


In [214]:
token2.lang_

'en'

In [215]:
# dependency parse, and part of speech
text = "Mike enjoys playing football ."
doc2 = nlp(text)
print(doc2)

Mike enjoys playing football .


In [216]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Mike PROPN nsubj
enjoys VERB ROOT
playing VERB xcomp
football NOUN dobj
. PUNCT punct


In [217]:
from spacy import displacy
displacy.render(doc2, style="dep")

In [218]:
for ent in doc.ents:
    print(ent.text, ent.label_)

The United States of America GPE
U.S.A. GPE
USA GPE
the United States GPE
U.S. GPE
US GPE
America GPE
North America LOC
50 CARDINAL
five CARDINAL
326 CARDINAL
Indian NORP
3.8 million square miles QUANTITY
9.8 million square kilometers QUANTITY
third- or fourth CARDINAL
The United States GPE
Canada GPE
Mexico GPE
Bahamas GPE
Cuba GPE
more than 331 million MONEY
third ORDINAL
Washington GPE
D.C. GPE
New York GPE
Paleo-Indians NORP
Siberia LOC
North American NORP
at least 12,000 years ago DATE
European NORP
the 16th century DATE
The United States GPE
thirteen CARDINAL
British NORP
the East Coast LOC
Great Britain GPE
the American Revolutionary War ORG
1775–1783 DATE
the late 18th century DATE
U.S. GPE
North America LOC
Native Americans NORP
1848 DATE
the United States GPE
United States GPE
the second half of the 19th century DATE
the American Civil War ORG
Spanish NORP
American War and World War I EVENT
U.S. GPE
World War II EVENT
the Cold War EVENT
the United States GPE
the Korean War EV

In [219]:
displacy.render(doc, style="ent")

In [220]:
# Word Vectors and spacy
import spacy
nlp = spacy.load("en_core_web_md")


In [221]:
# Read the same Wikipedia extract for the word-vector section. The encoding is
# explicit so the non-ASCII characters in the article decode identically on
# every platform rather than depending on the locale default.
with open("Data/wiki_us.txt", "r", encoding="utf-8") as file2:
    text_word = file2.read()

In [222]:
doc = nlp(text_word)
sentence2 = list(doc.sents)[0]
print(sentence2)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


In [223]:
# Using spaCy's word vectors: which entries in the model's vector table lie
# closest to the vector of a given word (here "country")?
import numpy as np

your_word = "country"

# Look the word up in the string store to get its hash, then fetch its vector
word_hash = nlp.vocab.strings[your_word]
query_vectors = np.asarray([nlp.vocab.vectors[word_hash]])

# One row of results per query vector; ms[0] holds the keys of the nearest
# entries and ms[2] the matching scores.
ms = nlp.vocab.vectors.most_similar(query_vectors, n=10)

# Translate the keys of the first (and only) query row back into strings
words = []
for w in ms[0][0]:
    words.append(nlp.vocab.strings[w])

# NOTE(review): ms[2] looks like similarity scores rather than distances - confirm
distances = ms[2]
print(words)

['country', 'COUNTRY', 'NATION', 'nation', 'COUNTIRES', 'nations', 'member-states', 'worLd', 'World', 'world']


In [224]:
# Calculating document similarity in spaCy
doc3 = nlp("I like salty fries and hamburgers. ")
doc4 = nlp("Fast food tastes very good.")

In [225]:
print(doc3 , "<->", doc4, doc3.similarity(doc4))

I like salty fries and hamburgers.  <-> Fast food tastes very good. 0.7799485853415737


In [226]:
doc5 = nlp("The empire state building is in New York , it has a kitchen and restaurant underground")
print(doc3 , "<->", doc5, doc3.similarity(doc5))

I like salty fries and hamburgers.  <-> The empire state building is in New York , it has a kitchen and restaurant underground 0.6129685625662438


In [227]:
# Start from a blank English pipeline;
# a sentencizer is added to it in the next cell.

nlp = spacy.blank("en")

In [228]:
# create a pipe
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x13b64cb80>

In [229]:
#  In this sentencizer pipe -> it is only interested in sentences 
nlp.analyze_pipes()

{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],
   'requires': [],
   'scores': ['sents_f', 'sents_p', 'sents_r'],
   'retokenizes': False}},
 'problems': {'sentencizer': []},
 'attrs': {'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []},
  'doc.sents': {'assigns': ['sentencizer'], 'requires': []}}}

In [230]:
# Inspect the components of a full trained pipeline such as en_core_web_sm
# ner: the named entity recognition pipe

nlp2 = spacy.load("en_core_web_sm")
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

ENTITY RULER

In [231]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = "West Chestertenfieldville was referenced in Mr. Deeds. "


In [232]:
docs = nlp(text)

In [233]:
# Entities found by the loaded model alone, before an entity ruler is added
for ent in docs.ents:
    print(ent.text, ent.label_)

West Chestertenfieldville GPE
Deeds PERSON


In [234]:
ruler = nlp.add_pipe("entity_ruler")

In [235]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [236]:
patterns = [
    {"label": "GPE", "pattern": "West Chestertenfieldville"}
]

In [237]:
# load pattern into ruler
ruler.add_patterns(patterns)

In [238]:
doc_obj = nlp(text)

for ent in doc_obj.ents:
    print(ent.text, ent.label_)

West Chestertenfieldville GPE
Deeds PERSON


In [239]:
nlp3 = spacy.load("en_core_web_sm")

In [240]:
ruler = nlp3.add_pipe("entity_ruler", before="ner")

In [241]:
ruler.add_patterns(patterns)

In [242]:
document = nlp3(text)
for ent in document.ents:
    print(ent.text, ent.label_)

West Chestertenfieldville GPE
Deeds PERSON


In [243]:
nlp3.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [244]:
nlp6 = spacy.load("en_core_web_sm")
ruler = nlp6.add_pipe("entity_ruler", before="ner")
patterns = [
    {"label": "GPE", "pattern": "West Chestertenfieldville"},
    {"label": "FILM", "pattern": "Mr. Deeds"}
]


In [245]:
ruler.add_patterns(patterns)
doc_obs = nlp6(text)
for ent in doc_obs.ents:
    print(ent.text, ent.label_)

West Chestertenfieldville GPE
Mr. Deeds FILM


MATCHER

In [246]:
# Toponym resolution problem: the same string can take different labels depending on its context.
import spacy
from spacy.matcher import Matcher

In [247]:
nlp = spacy.load("en_core_web_sm")

In [248]:
# how to create a matcher -> In this stage you will want to add extra features
matcher = Matcher(nlp.vocab)
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [pattern])

In [249]:
doc = nlp("This is an email address: progress_match@aol.com")
matches = matcher(doc)
print(matches)
# Each match is a tuple: (hash of the rule name, start token index, end token index).
# The hash can be turned back into the rule name through nlp.vocab.

[(16571425990740197027, 6, 7)]


In [250]:
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADDRESS


In [251]:
# Read the Martin Luther King Jr. extract. The encoding is explicit so the
# non-ASCII characters in the article (e.g. the en dash between the dates)
# decode the same way on every platform.
with open("Data/wiki_mlk.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [252]:
print(text)

Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 – April 4, 1968) was an American Baptist minister and activist who became the most visible spokesman and leader in the American civil rights movement from 1955 until his assassination in 1968. King advanced civil rights through nonviolence and civil disobedience, inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi. He was the son of early civil rights activist and minister Martin Luther King Sr.

King participated in and led marches for blacks' right to vote, desegregation, labor rights, and other basic civil rights.[1] King led the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some of the nonviolent 1963 protests in Birmingham, Alabama. King helped organize the 1963 March on Washington, where he delivered his famous 

In [253]:
# Goal: take this text and extract all the proper nouns,
# then pull out the multi-word names, kept in sequential order.
nlp = spacy.load("en_core_web_sm")


In [254]:
# This code finds the individual proper nouns in the text (one match per PROPN token)
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN"}]
matcher.add("PROPER_NOUN", [pattern])
doc = nlp(text)
matches = matcher(doc)
print(len(matches))
# Show the first ten matches together with the tokens they cover
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

103
(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 22, 23) American


In [255]:
# This code grabs runs of one or more consecutive proper nouns ("OP": "+");
# greedy="LONGEST" keeps the longest match where candidate matches overlap
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")
doc = nlp(text)
matches = matcher(doc)

# Sort the matches by their start token index so they appear in document order
matches.sort(key= lambda x: x[1])
print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

62
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 22, 24) American Baptist
(451313080118390996, 49, 50) King
(451313080118390996, 69, 71) Mahatma Gandhi
(451313080118390996, 83, 87) Martin Luther King Sr
(451313080118390996, 89, 90) King
(451313080118390996, 113, 114) King


In [256]:
# This code grabs runs of proper nouns that are immediately followed by a verb
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}, {"POS": "VERB"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")
doc = nlp(text)
matches = matcher(doc)
# Sort the matches by their start token index so they appear in document order
matches.sort(key= lambda x: x[1])
print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

6
(451313080118390996, 49, 51) King advanced
(451313080118390996, 89, 91) King participated
(451313080118390996, 113, 115) King led
(451313080118390996, 247, 252) Director J. Edgar Hoover considered
(451313080118390996, 322, 324) King won
(451313080118390996, 485, 488) United States beginning


In [257]:
# Read the raw Alice in Wonderland text. The encoding is explicit so the file
# is decoded the same way on every platform instead of via the locale default.
with open("Data/alice_unclean.txt", "r", encoding="utf-8") as file:
    data = file.read()

In [258]:
# Parse the text from character 265 onwards and take its first sentence.
# NOTE(review): the offset presumably skips the title/edition header at the top of the file - confirm
doc = nlp(data[265:])
sentence1 = list(doc.sents)[0]
print(sentence1)

Alice was beginning to get very tired of sitting by her sister
on the bank, and of having nothing to do:  once or twice she had
peeped into the book her sister was reading, but it had no
pictures or conversations in it, `and what is the use of a book,'
thought Alice `without pictures or conversation?'


In [259]:

# Work on the sentence as a plain string and turn the backtick opening quotes
# into apostrophes, so opening and closing quote marks are the same character
quote_replace = str(sentence1)
quote_replaced = quote_replace.replace("`", "'")
print(quote_replaced)
 

Alice was beginning to get very tired of sitting by her sister
on the bank, and of having nothing to do:  once or twice she had
peeped into the book her sister was reading, but it had no
pictures or conversations in it, 'and what is the use of a book,'
thought Alice 'without pictures or conversation?'


In [260]:
print(type(quote_replace))

<class 'str'>


In [261]:
# Match each quoted passage in the sentence. The speaker/verb part of the
# pattern is still commented out here; it is added in the next cell.
# NOTE(review): speak_lemmas is not used by the active pattern in this cell, and the
# rule name "PROPER_NOUN" does not describe a quotation pattern.
speak_lemmas = ["think", "say"]
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "'"},                    # opening quote mark
           {"IS_ALPHA": True, "OP": "+"},    # one or more alphabetic tokens
           {"IS_PUNCT": True, "OP": "*"},    # any trailing punctuation tokens
        #    {"ORTH": "'"},
        #    {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}
           ]

matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")
doc = nlp(quote_replaced)
matches = matcher(doc)

# Sort the matches by their start token index so they appear in document order
matches.sort(key= lambda x: x[1])
print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])


2
(451313080118390996, 51, 62) 'and what is the use of a book,'
(451313080118390996, 65, 72) 'without pictures or conversation?'


In [262]:
# Match a full "quote - speech verb - speaker - quote" construction, i.e. a quoted
# passage that is interrupted by who is thinking or saying it.
speak_lemmas = ["think", "say"]
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "'"},                                    # opening quote mark of the first part
           {"IS_ALPHA": True, "OP": "+"},                    # one or more alphabetic tokens
           {"IS_PUNCT": True, "OP": "*"},                    # any punctuation before the closing quote
           {"ORTH": "'"},                                    # closing quote mark of the first part
           {"IS_SPACE": True},                               # whitespace token (here the line break) before the verb
           {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}},   # a verb whose lemma is "think" or "say"
           {"POS": "PROPN", "OP": "+"},                      # the speaker: one or more proper nouns
           {"ORTH": "'"},                                    # opening quote mark of the second part
           {"IS_ALPHA": True, "OP": "+"},                    # one or more alphabetic tokens
           {"IS_PUNCT": True, "OP": "*"},                    # any punctuation before the closing quote
           {"ORTH": "'"}                                     # closing quote mark of the second part
           ]

# NOTE(review): the rule name "PROPER_NOUNS" does not describe a quotation pattern
matcher.add("PROPER_NOUNS", [pattern], greedy="LONGEST")
doc = nlp(quote_replaced)
print(doc)
matches = matcher(doc)

# Sort the matches by their start token index so they appear in document order
matches.sort(key= lambda x: x[1])
print(len(matches))
for match in matches[:40]:
    print(match, doc[match[1]:match[2]], end=" ")

Alice was beginning to get very tired of sitting by her sister
on the bank, and of having nothing to do:  once or twice she had
peeped into the book her sister was reading, but it had no
pictures or conversations in it, 'and what is the use of a book,'
thought Alice 'without pictures or conversation?'
1
(3232560085755078826, 51, 72) 'and what is the use of a book,'
thought Alice 'without pictures or conversation?' 

In [263]:
import requests
from bs4 import BeautifulSoup

# Download the Shakespeare text file.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces an HTTP error instead of silently parsing an
# error page as if it were the text.
link_shakespare = requests.get(
    "https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt",
    timeout=30,
)
link_shakespare.raise_for_status()

# The parser is named explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
# Afterwards, words hyphenated across a line break are re-joined and every
# remaining newline is turned into a full stop.
soup = (
    BeautifulSoup(link_shakespare.content, "html.parser")
    .text.replace("-\n", "")
    .replace("\n", ".")
)
# print(soup)

In [264]:
print(data)

Alice's Adventures in Wonderland

                ALICE'S ADVENTURES IN WONDERLAND

                          Lewis Carroll

               THE MILLENNIUM FULCRUM EDITION 3.0




                            CHAPTER I

                      Down the Rabbit-Hole


  Alice was beginning to get very tired of sitting by her sister
on the bank, and of having nothing to do:  once or twice she had
peeped into the book her sister was reading, but it had no
pictures or conversations in it, `and what is the use of a book,'
thought Alice `without pictures or conversation?'

  So she was considering in her own mind (as well as she could,
for the hot day made her feel very sleepy and stupid), whether
the pleasure of making a daisy-chain would be worth the trouble
of getting up and picking the daisies, when suddenly a White
Rabbit with pink eyes ran close by her.

  There was nothing so VERY remarkable in that; nor did Alice
think it so VERY much out of the way to hear the Rabbit say to
itself, `Oh d

In [265]:
# NOTE(review): this handle is opened outside a `with` block and is never closed in this cell.
file = open("Data/alice_unclean.txt", "r")
def read_in_chunk(file, chunk_size=100000):
    """Yield the contents of ``file`` in successive chunks.

    The previous version used ``return`` inside the loop, so it handed back
    only the first chunk and the rest of the file was never read. This version
    is a generator and has to be iterated, e.g.
    ``for chunk in read_in_chunk(f): ...``.

    Args:
        file: An open file-like object whose ``read(size)`` method is called.
        chunk_size: Maximum size passed to each ``read`` call.

    Yields:
        Each non-empty chunk returned by ``file.read(chunk_size)``, in order.
        Nothing is yielded for an empty file.
    """
    while True:
        data = file.read(chunk_size)
        # read() returns an empty value at end of file
        if not data:
            break
        yield data

# read_in_chunk(file)

In [266]:
with open("Data/alice_unclean.txt", "r", encoding="utf-8") as file:
    data = file.read()

# print(data)

# Flatten the hard-wrapped text onto one line. Each line break becomes a space
# rather than an empty string, so the last word of a line is not glued to the
# first word of the next one ("sister\non" -> "sister on", not "sisteron").
# Backslashes are stripped as well.
text = data.replace("\n", " ").replace("\\", "")
print(text.strip())



CUSTOM COMPONENTS IN spaCy

In [267]:
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp("Britain is a place. Mary is a doctor.")
for ent in doc.ents:
    print(ent.text, ent.label_)

Britain GPE
Mary PERSON


In [268]:
from spacy.language import Language

In [269]:
@Language.component("remove_gpe")
def remove_gpe(doc):
    """Pipeline component that drops every entity labelled ``GPE``.

    Args:
        doc: The document being processed; its ``ents`` are read and replaced.

    Returns:
        The same ``doc``, with all entities except the ``GPE`` ones kept.
    """
    kept_ents = [entity for entity in doc.ents if entity.label_ != "GPE"]
    doc.ents = kept_ents
    return doc
    

In [270]:
nlp.add_pipe("remove_gpe")

<function __main__.remove_gpe(doc)>

In [271]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'remove_gpe': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  

In [272]:

doc = nlp("Britain is a place. Mary is a doctor.")
for ent in doc.ents:
    print(ent.text, ent.label_)

Mary PERSON


In [273]:
# nlp.to_disk("data/new_en_core_web_sm")

REGEX

In [274]:
import re
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host.  "

In [275]:
pattern = r"Paul [A-Z]\w+"
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [276]:
import spacy
from spacy.tokens import Span

In [278]:
# Turn regex matches on the raw text into token-level spans of a spaCy Doc
nlp = spacy.blank("en")
doc = nlp(text)
original_ents = list(doc.ents)
# mwt_ents collects the multi-word token entities as (start token, end token, text)
mwt_ents = []
for match in re.finditer(pattern , doc.text):
    # match.span() is a character span, whereas the Doc object works at token level
    start, end = match.span()
    # char_span converts the character offsets into a token span; the result is
    # checked for None below before it is used
    span = doc.char_span(start, end)
    print(span)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))
    

Paul Newman
Paul Hollywood


In [279]:
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


In [283]:
# This code injects the regex matches as entities into a blank pipeline
nlp = spacy.blank("en")
doc = nlp(text)

# doc.ents is empty at this point because a blank pipeline has no component that assigns entities
print(doc.ents)
original_ents = list(doc.ents)
# mwt_ents collects the multi-word token entities as (start token, end token, text)
mwt_ents = []
for match in re.finditer(pattern , doc.text):
    # match.span() is a character span, whereas the Doc object works at token level
    start, end = match.span()
    span = doc.char_span(start, end)
    print(span)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))
# Build a PERSON-labelled Span for every collected match and add it to the entity list
for ent in mwt_ents:
    start, end , name = ent
    per_ent = Span(doc, start, end, label="PERSON")
    original_ents.append(per_ent)

# Write the combined entity list back onto the Doc
doc.ents = original_ents
print(doc.ents)
for ent in doc.ents:
    print(ent.text, ent.label_)


()
Paul Newman
Paul Hollywood
(Paul Newman, Paul Hollywood)
Paul Newman PERSON
Paul Hollywood PERSON


Building a Custom Component in spaCy

In [285]:
# 1. First import the Language class
from spacy.language import Language
# 2. Next, wrap the regex logic from above in a custom pipeline component

@Language.component("paul_ner")
def paul_ner(doc):
    """Pipeline component that tags "Paul <Capitalised word>" as PERSON.

    Every regex hit on ``doc.text`` is converted to a token-level span and
    printed. Hits that do not line up with token boundaries come back as
    ``None`` from ``char_span`` and are skipped. The remaining hits are
    appended to the entities already on the document.

    Args:
        doc: The document being processed.

    Returns:
        The same ``doc`` with the extra PERSON entities assigned.
    """
    name_regex = r"Paul [A-Z]\w+"
    entities = list(doc.ents)

    # Token boundaries (start, end) of each usable regex hit
    token_bounds = []
    for hit in re.finditer(name_regex, doc.text):
        # hit.span() is in characters; the Doc is indexed by tokens
        char_start, char_end = hit.span()
        candidate = doc.char_span(char_start, char_end)
        print(candidate)
        if candidate is None:
            continue
        token_bounds.append((candidate.start, candidate.end))

    entities.extend(
        Span(doc, first, last, label="PERSON") for first, last in token_bounds
    )

    doc.ents = entities
    return doc
    # print(doc.ents)
    # for ent in doc.ents:
    #     print(ent.text, ent.label_) 
    

In [286]:
nlp2 = spacy.blank("en")
# NOTE(review): add_pipe returns the pipeline component (the paul_ner function here),
# not a Doc, so the name `doc` is misleading.
doc = nlp2.add_pipe("paul_ner")

In [287]:
print(doc)

<function paul_ner at 0x1502239a0>


In [289]:
doc2 = nlp2(text)
print(doc2.ents)

Paul Newman
Paul Hollywood
(Paul Newman, Paul Hollywood)


In [296]:
# This cell lets the custom pipeline component build on an existing spaCy model.
# filter_spans is used to filter the list of spans that have been identified.
from spacy.util import filter_spans
from spacy.language import Language
# Next, define the custom pipeline component


@Language.component("cinema_ner")
def cinema_ner(doc):
    """Pipeline component that tags occurrences of "Hollywood" as CINEMA.

    Every regex hit on ``doc.text`` is converted to a token-level span and
    printed. Hits that do not align with token boundaries are skipped. The new
    spans are merged with the entities already on the document and passed
    through ``filter_spans`` before being assigned.

    The spans used to be labelled ``PERSON``, a leftover from the ``paul_ner``
    component this was copied from. They are now labelled ``CINEMA`` to match
    what the component is named for.

    Args:
        doc: The document being processed.

    Returns:
        The same ``doc`` with the filtered entity list assigned to ``doc.ents``.
    """
    pattern = r"Hollywood"
    original_ents = list(doc.ents)
    # mwt_ents collects (start token, end token, text) for each usable hit
    mwt_ents = []
    for match in re.finditer(pattern, doc.text):
        # match.span() is a character span, whereas the Doc works at token level
        start, end = match.span()
        span = doc.char_span(start, end)
        print(span)
        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))
    for start, end, _ in mwt_ents:
        cinema_ent = Span(doc, start, end, label="CINEMA")
        original_ents.append(cinema_ent)
    # A new span may overlap an entity that is already present (e.g. "Hollywood"
    # inside "Paul Hollywood"), and overlapping entities cannot be assigned to
    # doc.ents. filter_spans removes the overlaps, preferring the longest span.
    filtered = filter_spans(original_ents)

    doc.ents = filtered
    return doc
    # print(doc.ents)
    # for ent in doc.ents:
    #     print(ent.text, ent.label_) 
    
 

In [297]:
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")
doc3 = nlp3(text)
# work with the filter
for ent in doc3.ents:
    print(ent.text, ent.label_)

Hollywood
Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
