### SpaCy demo


In [3]:
import spacy
from nltk.corpus import wordnet as wn
from spacy.tokens import Token
from scipy import spatial
import nltk
from spacy import displacy
from nltk.chunk import conlltags2tree

# Fetch the WordNet corpus used later by the synset pipeline component.
nltk.download('wordnet')

# Need to run these from a command prompt after activating your Conda environment:
# python3 -m spacy download en_core_web_sm
# python3 -m spacy download en_core_web_lg


[nltk_data] Downloading package wordnet to /Users/macbook/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# 1 Example

# Load the small English model and tokenize a sentence that contains a run of spaces.
nlp = spacy.load('en_core_web_sm')
doc = nlp('Hello     testing Whizlabs demo!')

# Wrap every token in double quotes so that whitespace-only tokens stay visible.
for tok in doc:
    print('"{}"'.format(tok.text))

"Hello"
"    "
"testing"
"Whizlabs"
"demo"
"!"


In [None]:
# 2 Index

nlp = spacy.load('en_core_web_sm')
doc = nlp('Hello     testing Whizlabs demo!')

# Show each quoted token followed by its `idx` attribute.
for tok in doc:
    quoted = '"{}"'.format(tok.text)
    print(quoted, tok.idx)

In [None]:
# 3 Word-level attributes

doc = nlp("I'm learning how spaCy works in Python.")

# One tab-separated row per token: text, idx, lemma, punctuation flag,
# whitespace flag, shape, coarse POS and fine-grained tag.
for tok in doc:
    columns = (
        tok.text,
        tok.idx,
        tok.lemma_,
        tok.is_punct,
        tok.is_space,
        tok.shape_,
        tok.pos_,
        tok.tag_,
    )
    print(*columns, sep="\t")


In [5]:
# 4 Sentence detection
doc = nlp("These are apples. These are oranges.")

# `doc.sents` yields one span per detected sentence; print each on its own line.
for sentence in doc.sents:
    print(sentence)

These are apples.
These are oranges.


In [6]:
# 5 Named Entity Recognition

doc = nlp("I'm learning how spaCy works in Python.")

# List every entity the model found together with its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Python GPE


In [7]:
# 5.1

# Same entity listing as above, on a longer sentence.
doc = nlp("I'm learning how spaCy works in Python using an Apple mac at 6 p.m.")
for entity in doc.ents:
    print(entity.text, entity.label_)

Python ORG
Apple ORG
6 p.m. TIME


In [8]:
# 5.2 Visualize the named entities with displaCy

from spacy import displacy

doc = nlp("I'm learning how spaCy works in Python using an Apple mac at 6 p.m.")
# style='ent' requests the entity view; jupyter=True asks displaCy to render inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [9]:
# 6 Part Of Speech Tagging

doc = nlp("I'm learning how spaCy works in Python.")

# Pair every token with its fine-grained tag and print the whole list at once.
tagged_tokens = [(tok.text, tok.tag_) for tok in doc]
print(tagged_tokens)

[('I', 'PRP'), ("'m", 'VBP'), ('learning', 'VBG'), ('how', 'WRB'), ('spaCy', 'JJ'), ('works', 'NNS'), ('in', 'IN'), ('Python', 'NNP'), ('.', '.')]


In [10]:
# 6.1

doc = nlp("I'm learning how spaCy works in Python.")

# Build (word, tag, IOB entity tag) triples, the shape passed to conlltags2tree below.
iob_tagged = []
for token in doc:
    if token.ent_iob_ == 'O':
        # Token outside any entity: keep the bare 'O' marker.
        entity_tag = token.ent_iob_
    else:
        # Otherwise combine the IOB code with the entity type, e.g. 'B-GPE'.
        entity_tag = "{0}-{1}".format(token.ent_iob_, token.ent_type_)
    iob_tagged.append((token.text, token.tag_, entity_tag))

print(iob_tagged)
print(conlltags2tree(iob_tagged))

[('I', 'PRP', 'O'), ("'m", 'VBP', 'O'), ('learning', 'VBG', 'O'), ('how', 'WRB', 'O'), ('spaCy', 'JJ', 'O'), ('works', 'NNS', 'O'), ('in', 'IN', 'O'), ('Python', 'NNP', 'B-GPE'), ('.', '.', 'O')]
(S
  I/PRP
  'm/VBP
  learning/VBG
  how/WRB
  spaCy/JJ
  works/NNS
  in/IN
  (GPE Python/NNP)
  ./.)


In [11]:
# 7 Chunking

doc = nlp("I'm learning how spaCy works in Python from a course from Whizlabs")

# For each noun chunk show its text, its label and the text of its root token.
for noun_chunk in doc.noun_chunks:
    root_word = noun_chunk.root.text
    print(noun_chunk.text, noun_chunk.label_, root_word)

I NP I
Python NP Python
a course NP course
Whizlabs NP Whizlabs


In [12]:
# 8 Dependency Parsing

doc = nlp("I'm learning how spaCy works in Python from a course from Whizlabs")

# Print one arc per token: token/tag <--relation-- head/tag.
arc_template = "{}/{} <--{}-- {}/{}"
for tok in doc:
    head = tok.head
    print(arc_template.format(tok.text, tok.tag_, tok.dep_, head.text, head.tag_))

I/PRP <--nsubj-- learning/VBG
'm/VBP <--aux-- learning/VBG
learning/VBG <--ROOT-- learning/VBG
how/WRB <--advmod-- works/NNS
spaCy/JJ <--nsubj-- works/NNS
works/NNS <--ccomp-- learning/VBG
in/IN <--prep-- works/NNS
Python/NNP <--pobj-- in/IN
from/IN <--prep-- works/NNS
a/DT <--det-- course/NN
course/NN <--pobj-- from/IN
from/IN <--prep-- course/NN
Whizlabs/NNP <--pobj-- from/IN


In [13]:
# 8.1 Visualize the dependency parse with displaCy

doc = nlp("I'm learning how spaCy works in Python from a course from Whizlabs")
# style='dep' requests the dependency view; the 'distance' option is passed through to the renderer.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [14]:
# 9 Word Vectors

# Requires the large model:
# python -m spacy download en_core_web_lg

nlp = spacy.load('en_core_web_lg')

# Look up the vocabulary entry for "banana" and print its vector.
banana_vector = nlp.vocab['banana'].vector
print(banana_vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [15]:
# 9.1 "man" - "woman" + "queen" = "king"


def cosine_similarity(x, y):
    """Return the cosine similarity of x and y (one minus SciPy's cosine distance)."""
    return 1 - spatial.distance.cosine(x, y)


man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector

# Find the vocabulary entries whose vectors lie closest to "man" - "woman" + "queen".
maybe_king = man - woman + queen

# Score every vocabulary entry that has a vector; entries without one are skipped.
computed_similarities = [
    (lexeme, cosine_similarity(maybe_king, lexeme.vector))
    for lexeme in nlp.vocab
    if lexeme.has_vector
]

# Highest similarity first, then show the ten best matches.
computed_similarities.sort(key=lambda pair: -pair[1])
print([pair[0].text for pair in computed_similarities[:10]])


['queen', 'king', 'man', 'Mr.', 'He', 'he', 'Let', 'let', 'nothin', 'Nothin']


In [16]:
# 10 Compute Similarity

banana = nlp.vocab['banana']
dog = nlp.vocab['dog']
fruit = nlp.vocab['fruit']
animal = nlp.vocab['animal']

# Compare each word with both category words.
dog_vs_animal = dog.similarity(animal)
dog_vs_fruit = dog.similarity(fruit)
print(dog_vs_animal, dog_vs_fruit)

banana_vs_fruit = banana.similarity(fruit)
banana_vs_animal = banana.similarity(animal)
print(banana_vs_fruit, banana_vs_animal)

0.66185343 0.2355285
0.67148364 0.2427285


In [17]:
# 10.1 Sentences

target = nlp("Cats are beautiful animals.")

doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")

# Score the target sentence against each candidate, one result per line.
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))

0.8901766262114666
0.9115828449161616
0.7822955760597128


In [18]:
# 11 Pipelines
def penn_to_wn(tag):
    """Translate a Penn Treebank tag into a WordNet part-of-speech letter.

    Tags starting with 'N', 'V', 'J' or 'R' map to 'n', 'v', 'a' and 'r'
    respectively; any other tag yields None.
    """
    prefix_to_pos = (('N', 'n'), ('V', 'v'), ('J', 'a'), ('R', 'r'))
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    return None
 
class WordnetPipeline(object):
    """Pipeline component that stores a WordNet synset on each token.

    Constructing the component registers the custom token extension
    ``synset`` (default ``None``). Calling it on a doc looks up every token
    whose tag maps to a WordNet part of speech and stores the first synset
    returned by WordNet under ``token._.synset``. Tokens with an unmapped tag,
    or for which WordNet returns no synset, keep the default ``None``.
    """

    def __init__(self, nlp):
        # `nlp` is accepted but not used by this component.
        # force=True overwrites an already-registered extension, so the cell can be re-run.
        Token.set_extension('synset', default=None, force=True)

    def __call__(self, doc):
        """Annotate `doc` in place and return it so the pipeline can continue."""
        for token in doc:
            wn_tag = penn_to_wn(token.tag_)
            if wn_tag is None:
                continue

            synsets = wn.synsets(token.text, wn_tag)
            if not synsets:
                # Word unknown to WordNet (e.g. a brand name): indexing [0] would
                # raise IndexError, so leave the extension at its default None.
                continue

            token._.set('synset', synsets[0])

        return doc
 




In [19]:
# Reload the small model and append the WordNet component at the end of its pipeline.
nlp = spacy.load('en_core_web_sm')
wn_pipeline = WordnetPipeline(nlp)
nlp.add_pipe(wn_pipeline, name='wn_synsets')

doc = nlp("apple world this")

# Show each token next to the synset stored by the custom component.
for tok in doc:
    print(tok.text, tok._.synset, sep=" - ")

apple - Synset('apple.n.01')
world - Synset('universe.n.01')
this - None


In [20]:
# 11.1 Pipeline structure

# List the component names of the current pipeline, including the 'wn_synsets' component added above.
print(nlp.pipe_names)

['tagger', 'parser', 'ner', 'wn_synsets']
