In [None]:
# Follow the instructions here to install spaCy so that you can run the
# commands given below:
#    https://spacy.io/docs#getting-started

In [None]:
from __future__ import print_function

import spacy

In [None]:
# Load the spaCy model registered under the name 'en' (presumably the English
# model -- it must already be installed). Later cells call `nlp` to parse text.
nlp = spacy.load('en')

In [None]:
# Sample passage used throughout the notebook. The literal is split with
# implicit string concatenation so each piece fits on a readable line; the
# resulting value is a single unicode string.
text = (
    u"Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. "
    u"Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group. "
    u"Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, "
    u"was named a director of this British industrial conglomerate."
)


In [None]:
# Run the loaded pipeline over the sample text. The cells below read sentences,
# tokens, noun chunks and entities from the resulting `doc`.
doc = nlp(text)

In [None]:
# Show the detected sentences, one per output line.
for sent in doc.sents:
    # Call form of print: valid on Python 3, and unchanged on Python 2 for a
    # single argument.
    print(sent)

In [None]:
# Show the detected tokens, each prefixed with its position in the document.
# enumerate() supplies the position, replacing the hand-maintained counter.
for index, tok in enumerate(doc):
    print(index, tok)

In [None]:
# Show word similarities.
# 29 is "group", 39 is "chairman", 49 is "director", 54 is "conglomerate"
# (positions as printed by the token listing above). Binding the tokens to
# names once keeps the magic indices in a single place.
group, chairman, director, conglomerate = doc[29], doc[39], doc[49], doc[54]

print(chairman.similarity(director), "chairman <-> director")
print(chairman.similarity(conglomerate), "chairman <-> conglomerate")
print("----")
print(group.similarity(director), "group <-> director")
print(group.similarity(conglomerate), "group <-> conglomerate")


In [None]:
# Show the lemma of each token whose lemma differs from its lowercased
# orthographic form.
for tok in doc:
    if tok.lower_ != tok.lemma_:
        print(tok, tok.lemma_)

In [None]:
# Rank the words from most probable to least.
# The set collapses repeated (probability, word) pairs. Sorting the whole tuple
# in descending order breaks ties on probability by the word itself, so the
# listing no longer depends on set iteration order.
word_probs = {(tok.prob, tok.orth_) for tok in doc}
sorted_word_probs = sorted(word_probs, reverse=True)
for prob, word in sorted_word_probs:
    print(prob, word)

In [None]:
# Get the first sentence.
# iter() makes this work whether `doc.sents` hands back an iterator or any
# other iterable; next() alone requires an iterator.
s1 = next(iter(doc.sents))

In [None]:
# Show the shape string of each token in the first sentence.
for tok in s1:
    print(tok, tok.shape_)

In [None]:
# Show the part-of-speech tag of each token in the first sentence.
for tok in s1:
    print(tok.pos_, tok)

In [None]:
# Fine-grained POS tags too: print `pos_` followed by `tag_` for each token.
for tok in s1:
    print(tok.pos_, tok.tag_, tok)

In [None]:
# Noun-phrase chunks: label followed by the chunk's text.
for chunk in doc.noun_chunks:
    print(chunk.label_, chunk.orth_)

In [None]:
# Named entities: entity label followed by the entity span.
for ent in doc.ents:
    print(ent.label_, ent)

In [None]:
# Show the syntactic head for each token, printed as: token --dep--> head
for tok in s1:
    print(tok, "--%s-->" % tok.dep_, tok.head.orth_)

In [None]:
# Show the path from a token to the root. Adapted from spaCy docs.
def dependency_labels_to_root(token):
    '''Walk up the syntactic tree, collecting one labelled arc per step.

    Args:
        token: the starting token. It and each of its ancestors must expose
            ``orth_``, ``dep_``, ``head`` and ``i``.

    Returns:
        A list of strings of the form ``"child --dep--> head"``, ordered from
        ``token`` upwards. The list is empty when ``token`` is its own head.
    '''
    dep_labels = []
    # Compare document positions rather than object identity: `token.head` is
    # not guaranteed to return the very same Python object on every access, and
    # an `is` test would then never succeed at the root, so the loop would not
    # terminate.
    while token.head.i != token.i:
        head = token.head
        dep_labels.append("%s --%s--> %s" % (token.orth_, token.dep_, head.orth_))
        token = head
    return dep_labels

In [None]:
# Show the first sentence again, as a reminder.
# Call form of print: valid on Python 3, unchanged on Python 2 for one argument.
print(s1)

In [None]:
# Path from "years" (token 4 of the first sentence) up to the root, one arc per line.
for dep in dependency_labels_to_root(s1[4]):
    print(dep)

In [None]:
# Path from "nonexecutive" (token 13 of the first sentence) up to the root, one arc per line.
for dep in dependency_labels_to_root(s1[13]):
    print(dep)