In [1]:
import spacy
# Load the 'en_core_web_sm' pipeline; later cells parse text by calling `nlp`.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Parse the example sentence into a Doc (plain str literal; the u prefix is a no-op in Python 3).
doc = nlp("The quick brown fox jumped over the lazy dog's back")

In [3]:
# Show the raw text held by the parsed Doc.
print(doc.text)

The quick brown fox jumped over the lazy dog's back


In [6]:
# Fine-grained tag of the token at index 4 ("jumped" in the example sentence).
print(doc[4].tag_)

VBD


In [7]:
# Coarse part-of-speech label of the same token (index 4).
print(doc[4].pos_)

VERB


In [11]:
# One row per token: text, coarse POS, fine-grained tag (each padded to
# 10 characters) followed by spaCy's plain-English explanation of the tag.
for token in doc:
    row = f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}"
    print(row)

The        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass


In [12]:
# The same surface word can be tagged differently depending on context;
# start with a sentence containing "read".
doc = nlp("I read books on NLP.")

In [13]:
# Token at index 1 is the word of interest; the bare expression echoes its text.
word = doc[1]
word.text

'read'

In [15]:
# Print text, coarse POS, fine-grained tag and the tag's explanation for `word`.
token = word
row = f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}"
print(row)

read       VERB       VBP        verb, non-3rd person singular present


In [16]:
# A second sentence that also contains "read", for comparison with the previous one.
doc = nlp("I read a book on NLP.")

In [17]:
# Same word, different sentence: compare the tag reported here with the one
# printed for the earlier sentence to see how spaCy tells the tenses apart.
token = doc[1]
row = f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}"
print(row)

read       VERB       VBD        verb, past tense


In [18]:
# Count how many tokens in the document carry each coarse POS attribute
# (the result is keyed by POS code, not by word).
POS_counts = doc.count_by(spacy.attrs.POS)

In [19]:
# Keys are integer POS codes; values are how often each occurs in the document.
POS_counts

{96: 1, 99: 1, 84: 1, 89: 1, 91: 1, 94: 1, 95: 1}

In [23]:
# Turn a POS code back into its text label by looking it up in the vocab
# (96 is one of the keys in POS_counts).
doc.vocab[96].text

'PUNCT'

In [24]:
# Frequency table of coarse POS tags, ordered by POS code.
# Columns: POS code (k), POS label padded to 5 characters, frequency (v).
for k in sorted(POS_counts):
    v = POS_counts[k]
    print(f"{k} {doc.vocab[k].text:{5}} {v}")

84 ADP   1
89 DET   1
91 NOUN  1
94 PRON  1
95 PROPN 1
96 PUNCT 1
99 VERB  1


In [27]:
# Count tokens per fine-grained tag (the TAG attribute) instead of coarse POS.
TAG_counts = doc.count_by(spacy.attrs.TAG)

# NOTE(review): the tag codes printed here are very large integers, unlike the
# small POS codes. They look like 64-bit string hashes rather than small
# predefined symbol IDs, so their size is probably unrelated to how commonly
# the tags are used -- confirm against the spaCy StringStore documentation.
for k, v in sorted(TAG_counts.items()):
    print(f"{k} {doc.vocab[k].text:{5}} {v}")

1292078113972184607 IN    1
12646065887601541794 .     1
13656873538139661788 PRP   1
15267657372422890137 DT    1
15308085513773655218 NN    1
15794550382381185553 NNP   1
17109001835818727656 VBD   1


In [28]:
# Size of the vocab attached to this Doc. This is not the number of tokens in
# the sentence; presumably it is the number of lexemes currently stored in the
# vocab -- confirm against the spaCy Vocab documentation.
len(doc.vocab)

57864

In [29]:
# Count tokens per syntactic dependency label, then list them ordered by code.
# Columns: dependency code (k), label padded to 5 characters, frequency (v).
DEP_counts = doc.count_by(spacy.attrs.DEP)

for k in sorted(DEP_counts):
    v = DEP_counts[k]
    print(f"{k} {doc.vocab[k].text:{5}} {v}")

412 det   1
413 dobj  1
426 nsubj 1
436 pobj  1
440 prep  1
442 punct 1
8206900633647566924 ROOT  1


In [30]:
# Visualising parts of speech with displaCy: parse a fresh sentence to render.
doc = nlp("The quick brown fox jumped over the lazy dog")

from spacy import displacy

In [32]:
# Render the dependency parse of `doc` with displaCy's notebook mode enabled.
displacy.render(doc, style='dep', jupyter=True)

In [33]:
# Rendering options passed to displacy.render(..., options=options).
# `compact` is a real boolean: the previous string 'True' only worked because
# any non-empty string is truthy, so 'False' would have enabled it as well.
options = {
    'distance': 110,
    'compact': True,
    'color': 'yellow',
    'bg': '#09a3d5',
    'font': 'Times',
}

In [34]:
# Render the same parse again, this time applying the custom `options` dict.
displacy.render(doc, style='dep', jupyter=True, options=options)

In [35]:
# displaCy can also be given a list of spans; build a two-sentence document for that.
doc2 = nlp("This is a sentence. This is another sentence, possibly longer than another")

In [36]:
# Materialise doc2.sents into a list (presumably one span per sentence).
spans = list(doc2.sents)