### Installation
    pip install -U spaCy
    python -m spacy download en
 

### Tokenization

In [1]:
import spacy

# Load the small English pipeline by its full package name. The bare "en"
# shortcut link is a spaCy 2.x feature and is no longer supported by
# spacy.load() as of spaCy v3; the package name works on both versions.
nlp = spacy.load("en_core_web_sm")

# The input deliberately contains a run of spaces between the two words:
# the tokenizer keeps the extra whitespace as a token of its own.
doc = nlp("Hello     World!")
for token in doc:
    # Quote each token so whitespace-only tokens are visible in the output.
    print('"' + token.text + '"')

"Hello"
"    "
"World"
"!"


Tokenization preserves each token's character offset in the original text, available as `token.idx`:

In [2]:
# Reuse the `nlp` pipeline created in the earlier cell: importing spaCy and
# loading the model a second time only costs time and memory, and the "en"
# shortcut used for that reload is no longer supported as of spaCy v3.
doc = nlp("Hello     World!")
for token in doc:
    # token.idx is the offset of the token's first character within the
    # original string, so whitespace is accounted for in the positions.
    print(token.idx, '"' + token.text + '"')

0 "Hello"
6 "    "
10 "World"
15 "!"


The `Token` class exposes many word-level attributes, such as the lemma, punctuation and whitespace flags, word shape, and part-of-speech tags:

In [3]:
# One tab-separated row per token, in this column order: character offset,
# text, lemma, is-punctuation flag, is-whitespace flag, word shape,
# coarse-grained POS tag, fine-grained tag.
ROW_FORMAT = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}"

doc = nlp("Next week I'll   be in Madrid.")
for token in doc:
    columns = (
        token.idx,
        token.text,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_,
    )
    print(ROW_FORMAT.format(*columns))

0	Next	next	False	False	Xxxx	ADJ	JJ
5	week	week	False	False	xxxx	NOUN	NN
10	I	-PRON-	False	False	X	PRON	PRP
11	'll	will	False	False	'xx	AUX	MD
15	  	  	False	True	  	SPACE	_SP
17	be	be	False	False	xx	VERB	VB
20	in	in	False	False	xx	ADP	IN
23	Madrid	Madrid	False	False	Xxxxx	PROPN	NNP
29	.	.	True	False	.	PUNCT	.


### Sentence detection

In [4]:
# doc.sents yields one span per detected sentence; print each on its own line.
doc = nlp("These are apples. These are oranges.")

for sentence in doc.sents:
    print(sentence)

These are apples.
These are oranges.


### Part-of-Speech Tagging

In [5]:
# Pair every token with its fine-grained part-of-speech tag (token.tag_).
doc = nlp("Next week I'll be in Madrid.")
tagged_tokens = [(tok.text, tok.tag_) for tok in doc]
print(tagged_tokens)

[('Next', 'JJ'), ('week', 'NN'), ('I', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('in', 'IN'), ('Madrid', 'NNP'), ('.', '.')]


### Named Entity Recognition
- Doing NER with spaCy is super easy and the pretrained model performs pretty well

In [6]:
# doc.ents holds the entity spans found by the pipeline; show each span's
# text next to its entity label.
doc = nlp("Next week I'll be in Madrid.")
for entity in doc.ents:
    print(entity.text, entity.label_)

Next week DATE
Madrid GPE


#### IOB-style tagging

In [7]:
from nltk.chunk import conlltags2tree


def iob_tag(token):
    """Return the CoNLL-style chunk tag for a spaCy token.

    Args:
        token: a spaCy token; only its ``ent_iob_`` and ``ent_type_``
            attributes are read.

    Returns:
        ``"B-<TYPE>"`` or ``"I-<TYPE>"`` when the token is part of an entity,
        otherwise ``"O"``.
    """
    # spaCy documents ent_iob_ as "B", "I", "O" or "" (no entity annotation
    # set). Comparing against "O" alone would turn "" into the tag "-",
    # which conlltags2tree rejects, so everything except B/I maps to "O".
    if token.ent_iob_ in ("B", "I"):
        return "{0}-{1}".format(token.ent_iob_, token.ent_type_)
    return "O"


doc = nlp("Next week I'll be in Madrid.")

# (word, POS tag, IOB tag) triples: the input format conlltags2tree expects.
iob_tagged = [(token.text, token.tag_, iob_tag(token)) for token in doc]

print(iob_tagged)

# In case you like the nltk.Tree format
print(conlltags2tree(iob_tagged))

[('Next', 'JJ', 'B-DATE'), ('week', 'NN', 'I-DATE'), ('I', 'PRP', 'O'), ("'ll", 'MD', 'O'), ('be', 'VB', 'O'), ('in', 'IN', 'O'), ('Madrid', 'NNP', 'B-GPE'), ('.', '.', 'O')]
(S
  (DATE Next/JJ week/NN)
  I/PRP
  'll/MD
  be/VB
  in/IN
  (GPE Madrid/NNP)
  ./.)


In [9]:
# A longer sentence mixing numbers, a time, a percentage, a duration and an
# organisation, to show the range of entity labels the model assigns.
doc = nlp(
    "I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ"
)
for entity in doc.ents:
    print(entity.text, entity.label_)

2 CARDINAL
9 a.m. TIME
30% PERCENT
just 2 days DATE
WSJ ORG


In [8]:
from spacy import displacy

# Render the recognised entities inline in the notebook output
# (style="ent" selects the entity visualizer).
doc = nlp(
    "I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ"
)
displacy.render(doc, jupyter=True, style="ent")

### Chunking
spaCy also detects noun phrases automatically, exposing them as `doc.noun_chunks`:


In [11]:
# For every noun chunk print the phrase text, its label, and the text of
# its root token.
doc = nlp(
    "Wall Street Journal just published an interesting piece on crypto currencies"
)
for noun_phrase in doc.noun_chunks:
    root_word = noun_phrase.root.text
    print(noun_phrase.text, "\t", noun_phrase.label_, "\t", root_word)

Wall Street Journal 	 NP 	 Journal
an interesting piece 	 NP 	 piece
crypto currencies 	 NP 	 currencies


### Dependency Parsing


In [17]:
# Each row reads "child/tag <--relation-- head/tag": the token, the
# dependency label linking it to its head, and the head token.
ARC_FORMAT = "{0}/{1} <--{2}-- {3}/{4}"

doc = nlp(
    "Wall Street Journal just published an interesting piece on crypto currencies"
)

for token in doc:
    head = token.head
    print(ARC_FORMAT.format(token.text, token.tag_, token.dep_, head.text, head.tag_))

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/JJ <--amod-- currencies/NNS
currencies/NNS <--pobj-- on/IN


In [18]:
from spacy import displacy

# Draw the dependency tree inline in the notebook; the "distance" option is
# passed through to the visualizer to control spacing between words.
render_options = {"distance": 90}

doc = nlp(
    "Wall Street Journal just published an interesting piece on crypto currencies"
)
displacy.render(doc, style="dep", jupyter=True, options=render_options)