In [0]:
import spacy

In [0]:
nlp=spacy.load('en_core_web_sm')

In [0]:
doc=nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [0]:
doc

The quick brown fox jumped over the lazy dog's back.

In [0]:
print(doc)

The quick brown fox jumped over the lazy dog's back.


In [0]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [0]:
#Grab a particular token
print(doc[4])

jumped


In [0]:
#Grab the token text
print(doc[4].text)

jumped


In [0]:
#Grab the parts of speech tag
print(doc[4].pos_)

VERB


In [0]:
#Grab the fine grain tag
print(doc[4].tag_)

VBD


In [0]:
#Get the parts of speech tag ID
#Just remove the underscore
print(doc[4].pos)

100


In [0]:
#Get the fine grain tag ID alone
#Just remove the underscore
print(doc[4].tag)

17109001835818727656


In [0]:
#Print the token text, parts of speech, grain tag and explain them
for token in doc:
    print(f"{token.text} {token.tag_} {token.pos_} {spacy.explain(token.tag_)}")

The DT DET determiner
quick JJ ADJ adjective
brown JJ ADJ adjective
fox NN NOUN noun, singular or mass
jumped VBD VERB verb, past tense
over IN ADP conjunction, subordinating or preposition
the DT DET determiner
lazy JJ ADJ adjective
dog NN NOUN noun, singular or mass
's POS PART possessive ending
back NN NOUN noun, singular or mass
. . PUNCT punctuation mark, sentence closer


In [0]:
#Print the token text, parts of speech, grain tag and explain them
#Print them with indentation
for token in doc:
    print(f"{token.text:{10}}{token.pos_:{10}}{token.tag_:{10}}{spacy.explain(token.tag_)}")

The       DET       DT        determiner
quick     ADJ       JJ        adjective
brown     ADJ       JJ        adjective
fox       NOUN      NN        noun, singular or mass
jumped    VERB      VBD       verb, past tense
over      ADP       IN        conjunction, subordinating or preposition
the       DET       DT        determiner
lazy      ADJ       JJ        adjective
dog       NOUN      NN        noun, singular or mass
's        PART      POS       possessive ending
back      NOUN      NN        noun, singular or mass
.         PUNCT     .         punctuation mark, sentence closer


In [0]:
#Same strings can have different meanings
doc=nlp("I read books on NLP.")

In [0]:
doc

I read books on NLP.

In [0]:
#Grab the word read
word=doc[1]

In [0]:
word.text

'read'

In [0]:
print(f"{word.text:{10}}{word.pos_:{10}}{word.tag_:{10}}{spacy.explain(word.tag_)}")

read      VERB      VBD       verb, past tense


In [0]:
#This is past tense
#I read a book on NLP
doc=nlp(u"I read a book on NLP")

In [0]:
word=doc[1]

In [0]:
word.text

'read'

In [0]:
print(f"{word.text:{10}}{word.pos_:{10}}{word.tag_:{10}}{spacy.explain(word.tag_)}")

read      VERB      VBD       verb, past tense


In [0]:
#Count the parts of speech tags
doc=nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [0]:
doc

The quick brown fox jumped over the lazy dog's back.

In [0]:
#Count the parts of speech
POS_counts=doc.count_by(spacy.attrs.POS)

In [0]:
#The output will be a dictionary
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [0]:
#90 POS code is present 2 times
#84 POS code is present 3 times
#92 POS code is present 3 times
#100 POS code is present 1 time
#We want to know what 83 POS code is
doc.vocab[84].text
#There are 3 adjectives in the document

'ADJ'

In [0]:
doc[2].pos

84

In [0]:
#create a frequency list of pos tags in the entire document
for k,v in sorted(POS_counts.items()):
    print(f"{k}.    {doc.vocab[k].text:{10}}   {v}")

84.    ADJ          3
85.    ADP          1
90.    DET          2
92.    NOUN         3
94.    PART         1
97.    PUNCT        1
100.    VERB         1


In [0]:
#Create a frequency of fine grain parts of speech
TAG_counts=doc.count_by(spacy.attrs.TAG)

for k,v in sorted(TAG_counts.items()):
    print(f"{k}.    {doc.vocab[k].text:{10}}  {v}")

74.    POS         1
1292078113972184607.    IN          1
10554686591937588953.    JJ          3
12646065887601541794.    .           1
15267657372422890137.    DT          2
15308085513773655218.    NN          3
17109001835818727656.    VBD         1


In [0]:
#The numbers above are big because spacy hard codes certain values for certain words
#Thats the reasonn why spacy vocabulary is huge
len(doc.vocab)

508

In [0]:
#Create a frequency of syntactic dependencies
DEP_counts=doc.count_by(spacy.attrs.DEP)

for k,v in sorted(DEP_counts.items()):
    print(f"{k:<{30}}     {doc.vocab[k].text:{10}}        {v}")

402                                amod              3
415                                det               2
429                                nsubj             1
439                                pobj              1
440                                poss              1
443                                prep              1
445                                punct             1
8110129090154140942                case              1
8206900633647566924                ROOT              1


# Visualizing parts of speech

In [0]:
import spacy

In [0]:
nlp=spacy.load('en_core_web_sm')

In [0]:
doc=nlp(u"The quick brown fox jumped over the lazy dog.")

In [0]:
from spacy import displacy

In [0]:
displacy.render(doc,style='dep',jupyter=True)

In [0]:
#set the options
#Distance specifies teh distance between the words
#Depending on the distance chosen, the figure will be adjusted as compact
options={'distance':110,'compact':'True','color':'yellow','bg':'red','font':'Verdana'}

In [0]:
displacy.render(doc,style='dep', jupyter=True, options=options)

In [0]:
doc2=nlp("This is a sentence. This is another sentence, possibly longer than the other. ")

In [0]:
#We can create a list of spans
#We are going to seperate the sentences in the read doc
spans=list(doc2.sents)

In [0]:
print(spans)

[This is a sentence., This is another sentence, possibly longer than the other.]


In [0]:
displacy.serve(spans,style='dep',options={'distance':110})
#Go to 127.0.0.1.5000

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [26/Feb/2020 21:15:59] "GET / HTTP/1.1" 200 10602
127.0.0.1 - - [26/Feb/2020 21:15:59] "GET /favicon.ico HTTP/1.1" 200 10602


# Named Entity Recognition  - Part One

In [0]:
import spacy

In [0]:
nlp=spacy.load('en_core_web_sm')

In [0]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+ '----'+ ent.label_+ str(spacy.explain(ent.label_)))
    else:
        print("No entities found")

In [0]:
#Take a sample that has no named entities
doc=nlp(u'HI How are you?')

In [0]:
show_ents(doc)

No entities found


In [0]:
#Take a sample that has entities
doc=nlp(u"May I go to Washington, DC next May to see the Washington Monument?")

In [0]:
doc

May I go to Washington, DC next May to see the Washington Monument?

In [0]:
show_ents(doc)

Washington----GPECountries, cities, states
next May----DATEAbsolute or relative dates or periods
the Washington Monument----ORGCompanies, agencies, institutions, etc.


In [0]:
doc=nlp(u"Can I please have 500 dollars of Microsoft stock?")

In [0]:
doc

Can I please have 500 dollars of Microsoft stock?

In [0]:
show_ents(doc)

500 dollars----MONEYMonetary values, including unit
Microsoft----ORGCompanies, agencies, institutions, etc.


In [0]:
#Add a named entity to a span
doc=nlp(u"Tesla to build a U.K. factory for $6 million")

In [0]:
show_ents(doc)

U.K.----GPECountries, cities, states
$6 million----MONEYMonetary values, including unit


In [0]:
#Spacy isn't recognizing Tesla as an entity
#We will add Tesla to ORG entity now
from spacy.tokens import Span

In [0]:
ORG=doc.vocab.strings[u"ORG"]

In [0]:
ORG

383

In [0]:
Span(doc,0,1)

Tesla

In [0]:
Span(doc,0,3)

Tesla to build

In [0]:
#We want to create a span for the new entity
#Zero is indicating starting index of the Span being chosen
#one is indicating ending index of the Span being chosen
#Here we are labeling the content present in the selected span as ORG
#The Span selected here is 0 to 1 which is the word Tesla
new_ent=Span(doc,0,1,label=ORG)

In [0]:
#We are adding the new entity to the entities present in the document
doc.ents=list(doc.ents) + [new_ent ]

In [0]:
show_ents(doc)

Tesla----ORGCompanies, agencies, institutions, etc.
U.K.----GPECountries, cities, states
$6 million----MONEYMonetary values, including unit


# Named Entity Recognition  - Part Two

In [0]:
#Add multiple terms to Named Entity Recognition
doc=nlp(u"Our company created a brand new vaccum cleaner."
       u"This new vaccum-cleaner is teh best in show.")

In [0]:
doc

Our company created a brand new vaccum cleaner.This new vaccum-cleaner is teh best in show.

In [0]:
#First let us see teh existing entities in the document
show_ents(doc)

No entities found


In [0]:
#We want to add both "vaccum cleaner" "vaccum-cleaner" to the Named Entities
from spacy.matcher import PhraseMatcher

In [0]:
matcher=PhraseMatcher(nlp.vocab)

In [0]:
#create the desired list of patterns to be added as the Named entities
phrase_list=['vaccum cleaner','vaccum-cleaner']

In [0]:
phrase_patterns=[nlp(text) for text in phrase_list]

In [0]:
phrase_patterns

[vaccum cleaner, vaccum-cleaner]

In [0]:
#Here the name of the created new entity is newproduct
matcher.add('newproduct',None,*phrase_patterns)

In [0]:
found_matches=matcher(doc)

In [0]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [0]:
from spacy.tokens import Span

In [0]:
PROD=doc.vocab.strings[u"PRODUCT"]

In [0]:
#Here match[1] means match starting index 6
#Here match[2] means match ending index 
#Span(doc,match[1],match[2],label=PROD)

new_ents=[Span(doc,match[1],match[2],label=PROD) for match in found_matches]

In [0]:
#In list(doc.ents)-----> It is referring to document original entities
doc.ents=list(doc.ents) + new_ents

In [0]:
show_ents(doc)

vaccum cleaner----PRODUCTObjects, vehicles, foods, etc. (not services)
vaccum-cleaner----PRODUCTObjects, vehicles, foods, etc. (not services)


In [0]:
#We do not have built in method to count the number of entities
#We wanted to figure out how many times Money entity is mentioned in a document
doc=nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars")

In [0]:
doc

Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars

In [0]:
[ent for ent in doc.ents if ent.label_=="MONEY"]

[29.95, 10 dollars]

In [0]:
#Now check the length
len([ent for ent in doc.ents if ent.label_=="MONEY"])

2

# Visualizing Named Entity Recognition

In [0]:
import spacy

In [0]:
nlp=spacy.load('en_core_web_sm')

In [0]:
from spacy import displacy

In [0]:
doc=nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [0]:
doc

Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.

In [0]:
displacy.render(doc,style='dep',jupyter=True)

In [0]:
displacy.render(doc,style='ent',jupyter=True)

In [0]:
#Assume we have multiple lines
#We want to visualize entities line by line
doc=nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million."
       u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [0]:
doc

Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.By contrast, Sony only sold 8 thousand Walkman music players.

In [0]:
#To view them line by line
#First we should perform sentence segmentation
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [0]:
#If we want to displayonly certain entities
#For example we want to display only ORG entities
options={'ents':'ORGANISATION'}

In [0]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [0]:
#We want to display in certain colotrs of ORGANISATION entity and MONEY entity#####
colors={'ORG':'red'}
options={'ents':'ORGANISATION','colors':colors}

In [0]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [0]:
colors={'ORG':'radial-gradient(yellow,red)'}
options={'ents':'ORGANISATION','colors':colors}

In [0]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [0]:
colors={'ORG':'linear-gradient(blue,green,yellow)'}
options={'ents':'ORGANISATION','colors':colors}

In [0]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [0]:
displacy.serve(doc,style='ent',options=options)

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [26/Feb/2020 22:29:39] "GET / HTTP/1.1" 200 1282
127.0.0.1 - - [26/Feb/2020 22:29:39] "GET /favicon.ico HTTP/1.1" 200 1282


# Sentence Segmentation

In [0]:
import spacy

In [0]:
nlp=spacy.load('en_core_web_sm')

In [0]:
doc=nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [0]:
doc

This is the first sentence. This is another sentence. This is the last sentence.

In [0]:
#Grab the individual sentences
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [0]:
#We cannot index the sentences obtained above
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [0]:
#We can grab individual tokens from the doc
doc[0]

This

In [0]:
#But, we cannot grab individual sentences from the doc by indexing
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [0]:
#To index the sentences
#We should pass the extracted sentences to a list
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [0]:
list(doc.sents)[0]

This is the first sentence.

In [0]:
list(doc.sents)[1]

This is another sentence.

In [0]:
list(doc.sents)[2]

This is the last sentence.

In [0]:
list(doc.sents)[3]

IndexError: list index out of range

In [0]:
type(list(doc.sents)[2])

spacy.tokens.span.Span

In [0]:
#For example We want to segment  a sentence based on a semi colon
doc=nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [0]:
doc

"Management is doing the right things; leadership is doing the right things." - Peter Drucker

In [0]:
doc.text

'"Management is doing the right things; leadership is doing the right things." - Peter Drucker'

In [0]:
#Let us take a view at default segmentation
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."


- Peter Drucker




In [0]:
#Add a SEGMENTATION RULE
#We want to segment at the semi colon
#Every token has its index position
#Let us see this
def set_custom_boundaries(doc):
    for token in doc:
        print(token.i)           #Prints the token index

In [0]:
set_custom_boundaries(doc)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


In [0]:
#Let us print the token text and its index
def set_custom_boundaries(doc):
    for token in doc:
        print(token)          #prints the token
        print(token.i)        #prints the token index

In [0]:
set_custom_boundaries(doc)

"
0
Management
1
is
2
doing
3
the
4
right
5
things
6
;
7
leadership
8
is
9
doing
10
the
11
right
12
things
13
.
14
"
15
-
16
Peter
17
Drucker
18


In [0]:
#Now let us not include the very last token in the document
def set_custom_boundaries(doc):
    for token in doc[:-1]:                #We are specifying consider all the tokens excepyt the last one
        print(token)
        print(token.i)      

In [0]:
doc[:-1]

"Management is doing the right things; leadership is doing the right things." - Peter

In [0]:
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text ==";":
            doc[token.i+1].is_sent_start=True      #Matched index +1 i.e. matched next index should be the segmentation point
    return doc

In [0]:
nlp.add_pipe(set_custom_boundaries,before='parser')

In [0]:
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [0]:
doc4=nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [0]:
doc4

"Management is doing the right things; leadership is doing the right things." - Peter Drucker

In [0]:
for sent in doc4.sents:
    print(sent)
#We notice that now it gets seperated at the semi colon

"Management is doing the right things;
leadership is doing the right things."
- Peter Drucker


In [0]:
#Change the SEGMENTATION RULES
#We want to segment when tehre is a line break i.e. new line i.e. \n

In [0]:
nlp=spacy.load('en_core_web_sm')

In [0]:
mystring=u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [0]:
mystring

'This is a sentence. This is another.\n\nThis is a \nthird sentence.'

In [0]:
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [0]:
#Lets check the default segmentation
doc=nlp(mystring)

In [0]:
#Lets check the default segmentation
for sent in doc.sents:
    print(sent)
#Here a full stop is considered as a new segmentation by default which is expected
#But we need only the line braker i.e. new line only as teh segmentation
#Now, we are going to modify the default segmentation rule in such a way that
#segmentation happens only during a new line

This is a sentence.
This is another.


This is a 
third sentence.


In [0]:
from spacy.pipeline import SentenceSegmenter

In [0]:
#We will create a function which will tell what should be considered for segmentation
def split_on_newlines(doc):
    start=0
    seen_newline=False
    
    
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start=word.i
            seen_newline=False
        elif word.text.startswith('\n'):
            seen_newline=True
    yield doc[start:]

In [0]:
#Here we are replacing default strategy with the created rule
sbd=SentenceSegmenter(nlp.vocab,strategy=split_on_newlines)

In [0]:
#We are going to add the above to nlp pipeline
nlp.add_pipe(sbd)

In [0]:
doc=nlp(mystring)

In [0]:
for sentence in doc.sents:
    print(sentence)
#So, Here we are segmenting based on the new rule i.e. line breaker

This is a sentence. This is another.


This is a 

third sentence.
