In [None]:
# This is a typical set of instructions for importing and working with spaCy.
# Don't be surprised if this takes a while - spaCy has a fairly large library to load:

In [2]:
# Import spaCy and load the small English language pipeline
import spacy
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over a sentence to build a Doc object
doc = nlp(u"Tesla is looking at buying U.S. startup for $6 million ")

# Show each token on its own line: text, part-of-speech label, dependency label
for token in doc:
    print(" ".join((token.text, token.pos_, token.dep_)))

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN dobj
startup VERB advcl
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [None]:
# TOKENIZATION
# The first step in processing text is to split up all the component parts (words & punctuation) into "tokens".
#  These tokens are annotated inside the Doc object to contain descriptive information.
#  We'll go into much more detail on tokenization in an upcoming lecture. For now, let's look at another example:

In [3]:
# The contraction and the run of extra spaces in this sentence each show up as tokens of their own
doc2 = nlp(u"Tesla isn't   looking into startups anymore.")
for token in doc2:
    # One line per token: text, part-of-speech label, dependency label
    print(" ".join((token.text, token.pos_, token.dep_)))

Tesla PROPN nsubj
is AUX aux
n't PART neg
   SPACE dep
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [4]:
# Evaluating a Doc as the cell's last expression displays the original text, extra spaces included
doc2

Tesla isn't   looking into startups anymore.

In [5]:
# Integer indexing retrieves a single item by position; index 0 is displayed as "Tesla"
doc2[0]

Tesla

In [7]:
# Check what kind of object nlp() returned for this text
type(doc2)

spacy.tokens.doc.Doc

In [None]:
# Part-of-speech (POS) tagging
# The next step after splitting the text up into tokens is to assign parts of speech.
# In the above example, Tesla was recognized to be a proper noun. Here some statistical modeling is required. For example, words that follow "the" are typically nouns.
# For a full list of POS tags visit https://spacy.io/api/annotation#pos-tagging

In [8]:
# Part-of-speech label of the first token ("Tesla")
doc2[0].pos_

'PROPN'

In [9]:
# Dependency label of the first token ("Tesla")
doc2[0].dep_

'nsubj'

In [10]:
# spacy.explain() turns a label abbreviation into a readable description
spacy.explain("PROPN")

'proper noun'

In [11]:
# The same lookup works for dependency labels
spacy.explain("nsubj")

'nominal subject'

In [12]:
# Lemma: the base form of a word. Print the token text, then its lemma, one per line.
print(doc2[4].text, doc2[4].lemma_, sep='\n')  # looking, then look

looking
look


In [14]:
# Simple parts-of-speech & detailed tags:
print(doc2[4].pos_)  # VERB
# spacy.explain() returns None for a label missing from its glossary; passing the values
# as separate print() arguments avoids the TypeError that str + None would raise.
print(doc2[4].tag_, spacy.explain(doc2[4].tag_), sep=' / ')  # VBG / verb, gerund or present participle

VERB
VBG / verb, gerund or present participle


In [20]:
# Word shapes (capitals shown as X, lowercase letters as x, punctuation unchanged):
print('{}: {}'.format(doc2[0].text, doc2[0].shape_))  # Tesla: Xxxxx
print('{} : {}'.format(doc[5].text, doc[5].shape_))  # U.S. : X.X.

Tesla: Xxxxx
U.S. : X.X.


In [22]:
# Boolean token attributes, printed one per line:
print(doc2[0].is_alpha, doc2[0].is_stop, sep='\n')  # True, then False

True
False


In [24]:
# A longer passage; adjacent string literals are concatenated into one text before parsing
doc3 = nlp(
    u"Although commonly attributed to John Lennon from his song 'Beautiful Boy', the phrase "
    u"'Life is what happens to us while we are making other plans' was written by cartoonist "
    u"Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [25]:
# Slice out the quoted phrase; the slice bounds take in the opening and closing quote marks too
life_quote = doc3[16:30]
print(life_quote.text)  # 'Life is what happens to us while we are making other plans'

'Life is what happens to us while we are making other plans'


In [26]:
# Check what kind of object slicing a Doc produces
type(life_quote)

spacy.tokens.span.Span

In [None]:
# Sentences

In [29]:
# Iterating over Doc.sents walks through the sentences found in the text, one at a time
doc4 = nlp(u"This is a sentence. This is another sentence. This is a third.")
for sent in doc4.sents:
    print(sent.text)

This is a sentence.
This is another sentence.
This is a third.
