In [28]:
import spacy
# Load the small English pipeline once; the cells below all reuse this `nlp` object.
nlp=spacy.load('en_core_web_sm')

In [5]:
# Run the pipeline over a sample sentence and echo the raw text back.
sample_text = "the quick brown for jumper over the lazy dog's back."
doc = nlp(sample_text)
print(doc.text)

the quick brown for jumper over the lazy dog's back.


In [4]:
# Inspect the token at index 4: text, coarse POS, fine-grained tag, and the tag's description.
fifth = doc[4]
print(fifth.text, fifth.pos_, fifth.tag_, spacy.explain(fifth.tag_))

jumper PROPN NNP noun, proper singular


In [10]:
# One row per token: text, coarse POS, fine-grained tag, tag description,
# left-aligned into fixed-width columns.
for token in doc:
    tag_description = spacy.explain(token.tag_)
    print(f"{token.text:<10}{token.pos_:<8}{token.tag_:<6}{tag_description}")

the       DET     DT    determiner
quick     ADJ     JJ    adjective (English), other noun-modifier (Chinese)
brown     NOUN    NN    noun, singular or mass
for       ADP     IN    conjunction, subordinating or preposition
jumper    NOUN    NN    noun, singular or mass
over      ADP     IN    conjunction, subordinating or preposition
the       DET     DT    determiner
lazy      ADJ     JJ    adjective (English), other noun-modifier (Chinese)
dog       NOUN    NN    noun, singular or mass
's        PART    POS   possessive ending
back      NOUN    NN    noun, singular or mass
.         PUNCT   .     punctuation mark, sentence closer


In [17]:
# Show how "read" (token 1) is tagged in this sentence.
doc = nlp("I read books on NLP.")
r = doc[1]
print(f"{r.text:<10}{r.pos_:<8}{r.tag_:<6}{spacy.explain(r.tag_)}")

read      VERB    VBP   verb, non-3rd person singular present


In [18]:
# Same check as the previous cell with a singular object: tag of "read" (token 1).
doc = nlp("I read book on NLP.")
r = doc[1]
columns = (r.text.ljust(10), r.pos_.ljust(8), r.tag_.ljust(6), str(spacy.explain(r.tag_)))
print("".join(columns))

read      VERB    VBP   verb, non-3rd person singular present


In [22]:
# Frequency of each dependency label in `doc`, keyed by the label's integer ID.
Dep_counts = doc.count_by(spacy.attrs.DEP)
# Print "id.label:count" in ascending ID order; the vocab maps the ID back to its text.
for k, v in sorted(Dep_counts.items()):
    dep_label = doc.vocab[k].text
    print(f"{k}.{dep_label:<4}:{v}")

416.dobj:1
429.nsubj:1
439.pobj:1
443.prep:1
445.punct:1
8206900633647566924.ROOT:1


In [25]:
from spacy import displacy


In [28]:
# One row per token: text, coarse POS, dependency label, and the description
# of the token's fine-grained tag.
doc = nlp("the quick brown fox jumbed over the lazy dog's back")
for token in doc:
    tag_description = spacy.explain(token.tag_)
    print(f"{token.text:<10}{token.pos_:<7}{token.dep_:<7}{tag_description}")
    

the       DET    det    determiner
quick     ADJ    amod   adjective (English), other noun-modifier (Chinese)
brown     ADJ    amod   adjective (English), other noun-modifier (Chinese)
fox       NOUN   nsubj  noun, singular or mass
jumbed    VERB   ROOT   verb, past tense
over      ADP    prep   conjunction, subordinating or preposition
the       DET    det    determiner
lazy      ADJ    amod   adjective (English), other noun-modifier (Chinese)
dog       NOUN   pobj   noun, singular or mass
's        PART   case   possessive ending
back      ADV    advmod adverb


In [32]:
# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options=dict(distance=110),
)

In [42]:
# displacy.render draws the visualisation inline in the Jupyter notebook;
# displacy.serve starts a web server so it can be viewed outside the notebook.
doc2 = nlp(u"thisis a sentence this is antoher possibly longer sesntence.")
spans = list(doc2.sents)
# 'compact' must be a real boolean: the string 'False' is non-empty and therefore
# truthy, which would switch compact mode ON instead of off.
options = {
    'distance': 110,
    'compact': False,
    'color': 'blue',
    'bg': '#FF0000',
    'font': 'Times',
}
displacy.render(doc2, style='dep', jupyter=True, options=options)

In [10]:
def show_ents(doc):
    """Print one ``text-label-description`` line per named entity in ``doc``.

    The description comes from ``spacy.explain`` for the entity's label.
    When ``doc.ents`` is empty, a single fallback message is printed instead.
    """
    if not doc.ents:
        print('No named entities found.')
        return
    for ent in doc.ents:
        # str() keeps the line printable when no description is available.
        fields = (ent.text, ent.label_, str(spacy.explain(ent.label_)))
        print('-'.join(fields))

In [11]:
# Named entities found in an all-lowercase sentence.
lowercase_question = "may i go to washington, de next may to see the washington monuments?"
doc = nlp(lowercase_question)
show_ents(doc)

washington-GPE-Countries, cities, states
washington-GPE-Countries, cities, states


In [14]:
# For each entity print: text, token start/end, character start/end, label.
doc = nlp('can i please borrow 500 dolars from you to buy some microsoft stock?')
for ent in doc.ents:
    token_bounds = (ent.start, ent.end)
    char_bounds = (ent.start_char, ent.end_char)
    print(ent.text, *token_bounds, *char_bounds, ent.label_)

500 4 5 20 23 CARDINAL


In [16]:
# Named entities in a headline where the company name is lowercase.
headline = "tesla to build U.K. factory for $6 million"
doc = nlp(headline)
show_ents(doc)

U.K.-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


In [18]:
# Manually add an entity to the ones the model found.
doc = nlp("Tesla to build a U.K. facotry for $6million")
# ID of the 'ORG' label in the vocab's string store.
ORG = doc.vocab.strings['ORG']
# Span over token 0 only, labelled ORG.
new_ent = spacy.tokens.Span(doc, 0, 1, label=ORG)
# Keep the existing entities and append the new one.
doc.ents = [*doc.ents, new_ent]
show_ents(doc)

Tesla-ORG-Companies, agencies, institutions, etc.
U.K.-GPE-Countries, cities, states
6million-MONEY-Monetary values, including unit


In [21]:
# Named entities in a two-sentence product announcement (adjacent literals are joined).
doc = nlp(
    "Our company plans to introduce a new vaccum cleaner."
    "if sucessfull, the vaccum cleaner willl ve our first product."
)
show_ents(doc)

first-ORDINAL-"first", "second", etc.


In [22]:
from spacy.matcher import PhraseMatcher
# Build a phrase matcher on the pipeline's vocabulary.
matcher=PhraseMatcher(nlp.vocab)

In [25]:
doc = nlp(U"Our company plans to introduce a new vaccum cleaner "U"Iif successful, the vaccum cleaner wil be our first product.")
phrase_list = ['vaccum cleaner', 'vaccum_cleaner']
# Phrase patterns only need tokenisation, so make_doc skips the rest of the pipeline.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]
# spaCy v3 signature is add(key, docs, *, on_match=None): the patterns are passed
# as a single list, not as `None, *patterns` (the v2 call shape).
matcher.add('newProduct', phrase_patterns)
# Each match is a (match_id, start_token, end_token) tuple.
matches = matcher(doc)
matches

[(4452177204818730156, 7, 9), (4452177204818730156, 13, 15)]

In [29]:
# spaCy's default sentence segmentation, before any custom boundary rule is added.
quote_text = '"Management is doing things right;leadership is doing the right thigs."-peter drucker'
doc = nlp(quote_text)
for sent in doc.sents:
    # Sentence text followed by three newlines (its own line break plus two blank lines).
    print(sent, end='\n\n\n')

"Management is doing things right;leadership is doing the right thigs.


"-peter drucker




In [30]:
# Display the whole Doc as the cell output.
doc

"Management is doing things right;leadership is doing the right thigs."-peter drucker

In [31]:
# A negative slice bound counts from the end: doc[:-1] is every token except the last one.
doc[:-1]

"Management is doing things right;leadership is doing the right thigs."-peter

In [32]:
from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Pipeline component that starts a new sentence after every semicolon.

    Marks the token that follows each ``;`` as a sentence start and returns
    the same ``doc`` so the next pipeline component can continue with it.
    """
    # The last token is skipped: it has no following token to mark.
    for token in doc[:-1]:
        if token.text != ';':
            continue
        following = doc[token.i + 1]
        following.is_sent_start = True
    return doc

In [33]:
# Register the custom boundary component ahead of the parser.
# The membership check keeps this cell re-runnable: adding a component name
# that is already in the pipeline raises an error.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("set_custom_boundaries", before='parser')
print(nlp.pipe_names)

['tok2vec', 'tagger', 'set_custom_boundaries', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [35]:
# Re-run the quote through the pipeline now that the semicolon rule is installed,
# and print each resulting sentence on its own line.
quote_with_space = '"Management is doing things right; leadership is doing the right thigs."-peter drucker'
doc4 = nlp(quote_with_space)
for sent in doc4.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right thigs.
"-peter drucker
