In [1]:
from spacy.en import English

In [2]:
# Build the English pipeline once. Later cells call it as parser(text) to
# analyse a string, and read its vocabulary through parser.vocab.
parser = English()

In [3]:
# Four-sentence running example for the cells below. Adjacent string literals
# are concatenated with no separator, so every fragment that ends a sentence
# carries its own trailing space; without it the text would read
# "flying.The knack" and "miss.In the beginning".
multi_sentence = (
    "There is an art, it says, or rather, a knack to flying. "
    "The knack lies in learning how to throw yourself at the ground and miss. "
    "In the beginning the Universe was created. This has made a lot of people "
    "very angry and been widely regarded as a bad move."
)

In [4]:
# Run the pipeline over the example text. The plain string literal is
# converted with unicode() first; the later cells pass u"" literals instead.
parsed_data = parser(unicode(multi_sentence))

In [5]:
# Attributes that come in pairs: in the recorded run the bare name printed an
# integer and the name with a trailing underscore printed the matching text.
id_and_text_fields = [
    ("original:", "orth"),
    ("lowercased:", "lower"),
    ("lemma:", "lemma"),
    ("shape:", "shape"),
    ("prefix:", "prefix"),
    ("suffix:", "suffix"),
]

# Dump the attributes of the first three tokens only.
# NOTE: under Python 2, print(...) with several arguments prints a tuple,
# which is why the recorded output is shown in parentheses.
for i, token in enumerate(parsed_data):
    for label, attr in id_and_text_fields:
        print(label, getattr(token, attr), getattr(token, attr + "_"))
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    if i >= 2:
        break

('original:', 640, u'There')
('lowercased:', 530, u'there')
('lemma:', 530, u'there')
('shape:', 489815, u'Xxxxx')
('prefix:', 2907, u'T')
('suffix:', 48458, u'ere')
('log probability:', -7.347356796264648)
('Brown cluster id:', 1918)
----------------------------------------
('original:', 474, u'is')
('lowercased:', 474, u'is')
('lemma:', 488, u'be')
('shape:', 21581, u'xx')
('prefix:', 570, u'i')
('suffix:', 474, u'is')
('log probability:', -4.457748889923096)
('Brown cluster id:', 762)
----------------------------------------
('original:', 523, u'an')
('lowercased:', 523, u'an')
('lemma:', 523, u'an')
('shape:', 21581, u'xx')
('prefix:', 469, u'a')
('suffix:', 523, u'an')
('log probability:', -6.014852046966553)
('Brown cluster id:', 3)
----------------------------------------


In [13]:
# DISPLAY SENTENCES
sents = []
for span in parsed_data.sents:
    sent = ''.join(parsed_data[i].string for i in range(span.start,span.end)).strip()
    sents.append(sent)
for sentence in sents:
    print sentence

There is an art, it says, or rather, a knack to flying.
The knack lies in learning how to throw yourself at the ground and miss.
In the beginning the Universe was created.
This has made a lot of people very angry and been widely regarded as a bad move.


In [15]:
# PART-OF-SPEECH TAGGING
for span in parsed_data.sents:
    sent = [parsed_data[i] for i in range(span.start,span.end)]
    break

for token in sent:
    print token.orth_, token.pos_

There ADV
is VERB
an DET
art NOUN
, PUNCT
it PRON
says VERB
, PUNCT
or CONJ
rather ADV
, PUNCT
a DET
knack NOUN
to ADP
flying NOUN
. PUNCT


In [18]:
# DEPENDENCIES
example = u"The boy with the spotted dog quickly ran after the firetruck."
parsed_example = parser(example)
for token in parsed_example:
    print token.orth_, token.dep_, token.head.orth_, \
         [t.orth_ for t in token.lefts], [t.orth_ for t in token.rights]

The det boy [] []
boy nsubj ran [u'The'] [u'with']
with prep boy [] [u'dog']
the det dog [] []
spotted amod dog [] []
dog pobj with [u'the', u'spotted'] []
quickly advmod ran [] []
ran ROOT ran [u'boy', u'quickly'] [u'after', u'.']
after prep ran [] [u'firetruck']
the det firetruck [] []
firetruck pobj after [u'the'] []
. punct ran [] []


In [20]:
# NAMED ENTITY RECOGNITION
example = u"Apple's stocks dropped dramatically after the death of Steve Jobs in October."
parsed_example = parser(example)
for token in parsed_example:
    print token.orth_, token.ent_type_ if token.ent_type_ != "" else "(not an entity)"
    
print "----------------- entities only -----------------"
ents = list(parsed_example.ents)
for entity in ents:
    print entity.label, entity.label_, ' '.join(t.orth_ for t in entity)

Apple ORG
's (not an entity)
stocks (not an entity)
dropped (not an entity)
dramatically (not an entity)
after (not an entity)
the (not an entity)
death (not an entity)
of (not an entity)
Steve PERSON
Jobs PERSON
in (not an entity)
October DATE
. (not an entity)
----------------- entities only -----------------
349 ORG Apple
346 PERSON Steve Jobs
356 DATE October


In [24]:
# PARSING MESSY DATA
messy_data = u"lol that is rly funny :) This is gr8 i rate it 8/8!!!"
parsed_data = parser(messy_data)
for token in parsed_data:
    print token.orth_, token.pos_, token.lemma_

lol NOUN lol
that ADJ that
is VERB be
rly ADV rly
funny ADJ funny
:) PUNCT :)
This DET this
is VERB be
gr8 VERB gr8
i PRON i
rate VERB rate
it PRON it
8/8 NUM 8/8
! PUNCT !
! PUNCT !
! PUNCT !


In [111]:
# WORD EMBEDDINGS AND SIMILARITY
from numpy import dot
from numpy.linalg import norm

nasa = parser.vocab[u'NASA']

cosine = lambda v1, v2: dot(v1,v2) / (norm(v1)*norm(v2))

all_words = [w for w in parser.vocab
             if w.has_vector and w.orth_.islower() and w.lower_ != u'NASA']

all_words.sort(key=lambda w: cosine(w.vector,nasa.vector), reverse=True)
print "Top 10 most similar words to NASA"
for word in all_words[:10]:
    print word.orth_

Top 10 most similar words to NASA
nasa
jpl
noaa
darpa
esa
cern
nih
norad
spacex
fema


In [116]:
# WORD VECTOR ALGEBRA
king = parser.vocab[u'king']
man = parser.vocab[u'man']
woman = parser.vocab[u'queen']

result = king.vector - man.vector + woman.vector

all_words = [w for w in parser.vocab
             if w.has_vector and w.orth_.islower() 
                             and w.lower_ not in ['king','man','woman']]
all_words.sort(key=lambda w: cosine(w.vector,result), reverse=True)
print '\n-----------------------\nTop 3 closest results for king - man + woman:'
for word in all_words[:3]:
    print word.orth_


-----------------------
Top 3 closest results for king - man + woman:
queen
norodom
hrh
