Reference:
- https://spacy.io/docs/usage/

In [1]:
import spacy

# Load Language engine

## First, select a language and load its engine. This might take a while (~15 s).
- If you are using spaCy for the first time, you have to download the model via `python -m spacy download en`.
- You can customize your pipeline by passing custom modules as keyword arguments.

In [2]:
# Load the English pipeline once; the resulting `nlp` object is called on text in the cells below.
nlp = spacy.load('en')

In [3]:
sentence = 'New York city is located east side of America.'

## That's it! The `nlp` engine takes care of all the NLP processing in a single call

In [4]:
nlp(sentence)

New York city is located east side of America.

# Brief overview of the tagging schema

In [5]:
for token in nlp(sentence):
    print(token)

New
York
city
is
located
east
side
of
America
.


In [6]:
# Index into the processed sentence to grab its first token; the cells below inspect its attributes.
token = nlp(sentence)[0]
token

New

In [7]:
type(token)

spacy.tokens.token.Token

In [8]:
token.lemma_

'new'

In [9]:
token.is_digit

False

In [10]:
token.is_alpha

True

In [11]:
token.is_oov

False

In [12]:
token.is_title

True

In [13]:
token.is_stop

False

In [14]:
# Printing a token shows the same string as its `.text` attribute (see the paired output below).
for token in nlp(sentence):
    print(token, token.text)

New New
York York
city city
is is
located located
east east
side side
of of
America America
. .


## Tag

In [15]:
for token in nlp(sentence):
    print(token, token.tag_)

New NNP
York NNP
city NN
is VBZ
located VBN
east JJ
side NN
of IN
America NNP
. .


## Part-of-speech

In [16]:
for word in nlp(sentence):
    print(word.text, word.pos_)

New PROPN
York PROPN
city NOUN
is VERB
located VERB
east ADJ
side NOUN
of ADP
America PROPN
. PUNCT


## All at once!

In [17]:
# Each attribute comes in two forms: the bare name (lemma, tag, pos) prints as an
# integer ID, while the underscore-suffixed name (lemma_, tag_, pos_) prints as text.
for word in nlp(sentence):
    print(word.text, word.lemma, word.lemma_, word.tag, word.tag_, word.pos, word.pos_)

New 652 new 441 NNP 93 PROPN
York 19784 york 441 NNP 93 PROPN
city 1210 city 440 NN 89 NOUN
is 488 be 459 VBZ 97 VERB
located 14550 locate 457 VBN 97 VERB
east 3462 east 433 JJ 82 ADJ
side 850 side 440 NN 89 NOUN
of 471 of 432 IN 83 ADP
America 6413 america 441 NNP 93 PROPN
. 419 . 419 . 94 PUNCT


# Vocabulary

In [18]:
nlp.vocab

<spacy.vocab.Vocab at 0x108bd1e18>

In [19]:
nlp.vocab.lang

'en'

In [20]:
len(nlp.vocab) # vocab size

1297536

In [21]:
nlp.vocab.length # vocab size

1297536

In [22]:
banana = nlp.vocab['banana']

In [23]:
banana.lower_

'banana'

# Word Vectors

- Default: GloVe
- If you need to train a word2vec model, I recommend Gensim.

In [24]:
for token in nlp(sentence):
    print(token, token.vector)

New [ 0.06091202 -0.02814697  0.05866676  0.05346338 -0.03529691 -0.03849789
  0.02129635  0.03168401 -0.05434816 -0.03154293  0.08842376 -0.00135269
 -0.02059177  0.06654339  0.00568608 -0.13325265  0.03638098 -0.05855861
  0.01254767 -0.0814036   0.00161687 -0.04857254 -0.00398078 -0.03185371
  0.04168598  0.00447796  0.01811987  0.00621205 -0.04407947  0.12058681
  0.01810922  0.02297143 -0.06686816  0.07535616 -0.02029658 -0.07539672
  0.04005671  0.01079927 -0.04817985  0.02248427 -0.07212576  0.05554451
  0.0516827   0.03630877 -0.05155753  0.00084072  0.00144955  0.04128152
 -0.00107929 -0.02126518 -0.04690605 -0.01571302  0.04980611  0.01955733
 -0.10944337  0.04211747  0.11314263 -0.09329472  0.0511044  -0.07160425
  0.0127802  -0.01953491  0.00854017 -0.02044434  0.08825135 -0.01502737
 -0.02833337 -0.02666052  0.01279467  0.09357623 -0.00849786  0.0517188
 -0.11997576  0.08887577  0.03696658 -0.04806582  0.09018949 -0.04436592
  0.10431013 -0.05176604  0.11444873 -0.04567201

In [25]:
for token in nlp('apple, peach, banana and orange'):
    print(token)

apple
,
peach
,
banana
and
orange


In [26]:
# Unpack the seven tokens printed above; the two comma tokens are discarded into `_`.
# NOTE: this rebinds `banana`, which earlier held the vocabulary entry nlp.vocab['banana'].
apple, _, peach, _, banana, and_, orange = nlp('apple, peach, banana and orange')

In [27]:
apple.vector.shape

(300,)

In [28]:
apple.similarity(peach)

0.27504403340499511

In [29]:
peach.similarity(apple)

0.27504403340499511

In [30]:
apple.similarity(orange)

0.22885475784967607

In [31]:
peach.similarity(banana)

0.3370778377992128

In [32]:
apple.similarity(and_)

0.088770645850664423

## The most similar words to a given word!
- Find the words whose vectors are most similar to the given word's vector.

In [33]:
def most_similar(word, top_n=10, min_prob=-15):
    """Return the lowercased text of the vocabulary entries most similar to `word`.

    Candidates are the entries of ``word.vocab`` whose ``is_lower`` flag equals
    that of ``word`` and whose ``prob`` is at least ``min_prob``. They are
    ranked by ``word.similarity(candidate)``, highest first. ``word`` itself is
    not excluded from the candidates.

    Args:
        word: an object exposing ``vocab``, ``is_lower`` and ``similarity()``
            (the notebook passes entries looked up via ``nlp.vocab[...]``).
        top_n: maximum number of entries to return (default 10).
        min_prob: lower bound on a candidate's ``prob`` (default -15).

    Returns:
        A list of at most ``top_n`` ``lower_`` strings, most similar first.
    """
    # Local import keeps this cell runnable on its own.
    import heapq

    candidates = (
        w for w in word.vocab
        if w.is_lower == word.is_lower and w.prob >= min_prob
    )
    # nlargest avoids sorting the full vocabulary just to keep a handful of
    # entries; it is documented as equivalent to sorted(..., reverse=True)[:n].
    best = heapq.nlargest(top_n, candidates, key=word.similarity)
    return [lexeme.lower_ for lexeme in best]

In [34]:
most_similar(nlp.vocab['apple'])

['apple',
 'blackberry',
 'doritos',
 'asus',
 'onion',
 'apples',
 'oreo',
 'melon',
 'mango',
 'dell']

In [35]:
most_similar(nlp.vocab['ipad'])

['ipad',
 'iphone',
 'ipod',
 'ios',
 'ps3',
 'smartphone',
 'dreamcast',
 'netbook',
 'smartphones',
 'xbox']

In [36]:
most_similar(nlp.vocab['reddit'])

['reddit',
 'digg',
 'tumblr',
 'vimeo',
 'deviantart',
 'github',
 'craigslist',
 'instagram',
 'pinterest',
 'wordpress']

In [37]:
most_similar(nlp.vocab['fuck'])

['fuck',
 'bugger',
 'shit',
 'nigga',
 'fucking',
 'goddamn',
 'piss',
 'whoop',
 "c'mon",
 'whoa']

# spaCy processes multiple sentences at once!

In [38]:
sentences = "I'd like to drink some beer. Why do you learn deep learning? IPad is awesome."

In [39]:
doc = nlp(sentences)

In [40]:
list(doc.sents)

[I'd like to drink some beer.,
 Why do you learn deep learning?,
 IPad is awesome.]

# Parsing
- visualizer:
    - https://demos.explosion.ai/displacy/
    - https://github.com/explosion/displacy

In [41]:
doc = nlp('London is a big city in the United Kingdom.')
doc

London is a big city in the United Kingdom.

In [42]:
# One row per token in `doc`: the token, its POS, its dependency label,
# its head token, and the head's POS.
parsed_tokens = [
    [tok, tok.pos_, tok.dep_, tok.head, tok.head.pos_]
    for tok in doc
]

In [43]:
import pandas as pd

In [44]:
columns = ['Word', 'POS', 'Dependency', 'head', 'POS of head']

In [45]:
pd.DataFrame(parsed_tokens, columns=columns)

Unnamed: 0,Word,POS,Dependency,head,POS of head
0,London,PROPN,nsubj,is,VERB
1,is,VERB,ROOT,is,VERB
2,a,DET,det,city,NOUN
3,big,ADJ,amod,city,NOUN
4,city,NOUN,attr,is,VERB
5,in,ADP,prep,city,NOUN
6,the,DET,det,Kingdom,PROPN
7,United,PROPN,compound,Kingdom,PROPN
8,Kingdom,PROPN,pobj,in,ADP
9,.,PUNCT,punct,is,VERB


### Visualization
- [word-level](https://demos.explosion.ai/displacy/?text=London%20is%20a%20big%20city%20in%20the%20United%20Kingdom.&model=en&cpu=1&cph=1)
- [phrase-level](https://demos.explosion.ai/displacy/?text=London%20is%20a%20big%20city%20in%20the%20United%20Kingdom.&model=en&cpu=1&cph=0)

### Traversing tree

In [46]:
from spacy.symbols import nsubj, VERB

In [47]:
verbs = set()

In [48]:
# Add the head of every token whose dependency is nsubj and whose head's POS is VERB.
verbs.update(
    tok.head
    for tok in doc
    if tok.dep == nsubj and tok.head.pos == VERB
)

In [49]:
verbs

{is}

# Entity Recognition

IOB formatting (Inside, Outside, Beginning).

CONLL 2002
- geo = Geographical Entity
- org = Organization
- per = Person
- gpe = Geopolitical Entity
- tim = Time indicator
- art = Artifact
- eve = Event
- nat = Natural Phenomenon

In [50]:
# In the output below, ent_iob is 3 on the first token of an entity, 1 on the
# following tokens of that entity, and 2 on tokens with no entity type.
for token in nlp('London is a big city in the United Kingdom.'):
    print(token.text, token.ent_iob, token.ent_type_) # GPE = Geopolitical Entity

London 3 GPE
is 2 
a 2 
big 2 
city 2 
in 2 
the 3 GPE
United 1 GPE
Kingdom 1 GPE
. 2 
