In [None]:
# Install spaCy and an English model before running the cells below.
# Installation instructions:
#    https://spacy.io/docs#getting-started

In [1]:
import spacy

In [2]:
# Load the pipeline registered under the shortcut name 'en'.
# NOTE(review): newer spaCy releases appear to have dropped shortcut names such
# as 'en' in favour of full package names — confirm against the installed version.
nlp = spacy.load('en')

In [3]:
# Three-sentence sample passage analysed by the rest of the notebook.
# Adjacent literals are joined by the parser; the trailing space on a line is
# the separator between sentences, so keep it when editing.
text = (
    u"Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. "
    u"Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group. "
    u"Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, "
    u"was named a director of this British industrial conglomerate."
)


In [4]:
# Run the loaded pipeline over the sample text. The cells below read their
# sentences, tokens, noun chunks and entities from the resulting `doc`.
doc = nlp(text)

In [7]:
# Print every sentence the pipeline segmented, one per line.
for sentence in doc.sents:
    print(sentence)

Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.
Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.
Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a director of this British industrial conglomerate.


In [9]:
# Show the detected tokens, each prefixed by its position in the document.
# enumerate() supplies the running position, so no hand-maintained counter
# has to be initialised and incremented alongside the loop.
for index, tok in enumerate(doc):
    print(index, tok)

0 Pierre
1 Vinken
2 ,
3 61
4 years
5 old
6 ,
7 will
8 join
9 the
10 board
11 as
12 a
13 nonexecutive
14 director
15 Nov.
16 29
17 .
18 Mr.
19 Vinken
20 is
21 chairman
22 of
23 Elsevier
24 N.V.
25 ,
26 the
27 Dutch
28 publishing
29 group
30 .
31 Rudolph
32 Agnew
33 ,
34 55
35 years
36 old
37 and
38 former
39 chairman
40 of
41 Consolidated
42 Gold
43 Fields
44 PLC
45 ,
46 was
47 named
48 a
49 director
50 of
51 this
52 British
53 industrial
54 conglomerate
55 .


In [10]:
# Compare a few word pairs by their similarity score.
# Token positions: 29 is "group", 39 is "chairman", 49 is "director", 54 is "conglomerate"
group_tok = doc[29]
chairman_tok = doc[39]
director_tok = doc[49]
conglomerate_tok = doc[54]

print(chairman_tok.similarity(director_tok), "chairman <-> director")
print(chairman_tok.similarity(conglomerate_tok), "chairman <-> conglomerate")
print("----")
print(group_tok.similarity(director_tok), "group <-> director")
print(group_tok.similarity(conglomerate_tok), "group <-> conglomerate")


0.615261651339 chairman <-> director
0.35017640956 chairman <-> conglomerate
----
0.311771623351 group <-> director
0.322598796783 group <-> conglomerate


In [11]:
# List each token next to its lemma, skipping every token whose lowercased
# text is already equal to the lemma.
for tok in doc:
    if tok.lower_ == tok.lemma_:
        continue
    print(tok, tok.lemma_)

years year
is be
years year
was be
named name


In [12]:
# Order the distinct (prob, word) pairs from the highest `prob` to the lowest.
# The set collapses repeated words; the ascending sort is then flipped.
word_probs = {(tok.prob, tok.orth_) for tok in doc}
sorted_word_probs = sorted(word_probs, key=lambda pair: pair[0])[::-1]
for token_prob, token_text in sorted_word_probs:
    print(token_prob, token_text)

-3.0678977966308594 .
-3.4549596309661865 ,
-3.528766632080078 the
-3.92978835105896 a
-4.113108158111572 and
-4.27587366104126 of
-4.457748889923096 is
-5.252320289611816 was
-5.36181640625 this
-5.53448486328125 as
-6.199834823608398 will
-7.368987560272217 years
-7.845602989196777 old
-8.761202812194824 group
-9.715755462646484 board
-9.882078170776367 join
-10.133160591125488 named
-10.352326393127441 British
-10.373620986938477 former
-10.547660827636719 Mr.
-10.910798072814941 Gold
-11.282051086425781 29
-11.463842391967773 director
-11.507254600524902 industrial
-11.646064758300781 55
-11.688076972961426 Dutch
-12.308273315429688 publishing
-12.375371932983398 Nov.
-12.898519515991211 61
-13.167049407958984 Pierre
-13.772281646728516 Fields
-13.928001403808594 chairman
-14.147188186645508 Rudolph
-14.507896423339844 conglomerate
-15.370661735534668 PLC
-16.369218826293945 Consolidated
-16.572589874267578 Agnew
-16.884357452392578 Elsevier
-19.190162658691406 N.V.
-19.50202941894

In [13]:
# Get the first sentence.
# doc.sents is advanced exactly once with next(), so only its first item is
# taken; the cells below reuse it as `s1`.
s1 = next(doc.sents)

In [14]:
# Print each token of the first sentence alongside its shape string.
for token in s1:
    print(token, token.shape_)

Pierre Xxxxx
Vinken Xxxxx
, ,
61 dd
years xxxx
old xxx
, ,
will xxxx
join xxxx
the xxx
board xxxx
as xx
a x
nonexecutive xxxx
director xxxx
Nov. Xxx.
29 dd
. .


In [15]:
# Print the part-of-speech tag in front of each token of the first sentence.
for token in s1:
    print(token.pos_, token)

PROPN Pierre
PROPN Vinken
PUNCT ,
NUM 61
NOUN years
ADJ old
PUNCT ,
VERB will
VERB join
DET the
NOUN board
ADP as
DET a
ADJ nonexecutive
NOUN director
PROPN Nov.
NUM 29
PUNCT .


In [16]:
# Print, for each token of the first sentence, its part-of-speech tag followed
# by the fine-grained tag and the token itself.
for token in s1:
    print(token.pos_, token.tag_, token)

PROPN NNP Pierre
PROPN NNP Vinken
PUNCT , ,
NUM CD 61
NOUN NNS years
ADJ JJ old
PUNCT , ,
VERB MD will
VERB VB join
DET DT the
NOUN NN board
ADP IN as
DET DT a
ADJ JJ nonexecutive
NOUN NN director
PROPN NNP Nov.
NUM CD 29
PUNCT . .


In [17]:
# Print every noun-phrase chunk found in the document, preceded by its label.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.label_, noun_chunk.orth_)

NP Pierre Vinken
NP the board
NP a nonexecutive director
NP Mr. Vinken
NP chairman
NP Elsevier N.V.
NP Rudolph Agnew
NP Consolidated Gold Fields PLC
NP this British industrial conglomerate


In [18]:
# Print every named entity in the document, preceded by its entity label.
for entity in doc.ents:
    print(entity.label_, entity)

PERSON Pierre Vinken
DATE 61 years old
DATE Nov. 29
PERSON Vinken
ORG Elsevier N.V.
NORP Dutch
PERSON Rudolph Agnew
DATE 55 years old
ORG Consolidated Gold Fields PLC
NORP British


In [19]:
# For each token of the first sentence, print the labelled dependency arc
# pointing at its syntactic head.
for token in s1:
    arc = "--%s-->" % token.dep_
    print(token, arc, token.head.orth_)

Pierre --compound--> Vinken
Vinken --nsubj--> join
, --punct--> Vinken
61 --nummod--> years
years --npadvmod--> old
old --amod--> Vinken
, --punct--> Vinken
will --aux--> join
join --ROOT--> join
the --det--> board
board --dobj--> join
as --prep--> join
a --det--> director
nonexecutive --amod--> director
director --pobj--> as
Nov. --npadvmod--> join
29 --nummod--> Nov.
. --punct--> join


In [20]:
# Show the path from a token to the root. Adapted from spaCy docs.
def dependency_labels_to_root(token):
    '''Walk up the syntactic tree from `token`, describing each arc on the way.

    Args:
        token: Starting token. It must expose `orth_`, `dep_` and `head`;
            the walk stops at the token that compares equal to its own head.

    Returns:
        A list of strings of the form "<word> --<dep>--> <head word>", one
        per arc, ordered from `token` upwards. The list is empty when
        `token` is already its own head.
    '''
    dep_labels = []
    # Compare by value rather than identity: `token.head` is not guaranteed to
    # hand back the very same Python object on every access, and an identity
    # test would then never recognise the root and the loop would not end.
    while token.head != token:
        dependency = token.orth_ + " " + ("--%s-->" % token.dep_) + " " + token.head.orth_
        dep_labels.append(dependency)
        token = token.head
    return dep_labels

In [21]:
# Show the first sentence again, as a reminder of the words that the
# following cells pick out of `s1` by position.
print (s1)

Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.


In [22]:
# Arcs leading from "years" (position 4 of the first sentence) up to the root.
for arc in dependency_labels_to_root(s1[4]):
    print(arc)

years --npadvmod--> old
old --amod--> Vinken
Vinken --nsubj--> join


In [23]:
# Arcs leading from "nonexecutive" (position 13 of the first sentence) up to the root.
for arc in dependency_labels_to_root(s1[13]):
    print(arc)

nonexecutive --amod--> director
director --pobj--> as
as --prep--> join
