### Tokenization:

In [27]:
import spacy # importing the library

In [7]:
# Load spaCy's small English pipeline package; the returned `nlp` object is
# called on raw text to produce a processed document.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Sample sentence: the single-quoted literal holds the double quotes verbatim
# and needs a backslash only for the apostrophe in "We're".
mystring = '"We\'re moving to L.A!"'

In [4]:
mystring  # bare expression: the notebook echoes the string's repr, escape included

'"We\'re moving to L.A!"'

In [5]:
print(mystring)  # print() shows the text itself, without the outer quotes or the backslash

"We're moving to L.A!"


In [14]:
# Run the sample sentence through the pipeline to get a tokenized document.
doc = nlp(mystring)

# Gather the text of every token first, then echo them one per line.
token_list = [tok.text for tok in doc]
for text in token_list:
    print(text)

print(token_list)
print('\n')  # '\n' plus print's own newline leaves two blank lines before the count
print(len(token_list))

"
We
're
moving
to
L.A
!
"
['"', 'We', "'re", 'moving', 'to', 'L.A', '!', '"']


8


In [15]:
# Sample text mixing a contraction, a hyphenated word, an e-mail address and a URL.
doc2 = nlp(
    "We're here to help! send us snail-mail at support@oursite.com, "
    "or contact us at http://www.outsite.com"
)

In [17]:
# Print one token per line. In the output, punctuation is split off and
# "snail-mail" becomes three tokens, while the e-mail address and the URL
# each stay whole.
for tok in doc2:
    print(tok)

We
're
here
to
help
!
send
us
snail
-
mail
at
support@oursite.com
,
or
contact
us
at
http://www.outsite.com


In [20]:
# "5km" is split into the number and its unit and "$" gets its own token,
# while the amount "10.30" is kept together as one token.
doc3 = nlp("A 5km NYC cab ride costs $10.30")

for tok in doc3:
    print(tok.text)

A
5
km
NYC
cab
ride
costs
$
10.30


In [21]:
# Abbreviations containing periods: in the output "St." and "U.S" each come
# out as a single token, while the final "!" is split off.
doc4 = nlp("Let's visit St. Louis in U.S next year!")

for tok in doc4:
    print(tok.text)

Let
's
visit
St.
Louis
in
U.S
next
year
!


In [25]:
# Size of the vocabulary object attached to the document.
# NOTE(review): per spaCy's Vocab docs this is the number of lexemes currently
# stored, so it may differ between runs or grow as more text is processed —
# confirm against the installed spaCy version.
len(doc4.vocab)

57852

In [26]:
type(doc4)  # the object returned by nlp(...) is a spaCy Doc

spacy.tokens.doc.Doc

### Named Entity Recognition:

In [28]:
# Sample sentence for entity recognition. The wording is kept exactly as-is
# because the recorded outputs below depend on it.
doc5 = nlp("Apple to built a Hong Kong factory for $6 millions!")

In [29]:
# Show all tokens on a single line, each followed by '| ' instead of a newline.
for tok in doc5:
    print(tok.text, end='| ')

Apple| to| built| a| Hong| Kong| factory| for| $| 6| millions| !| 

In [31]:
# doc5.ents holds the entity spans found in the text; label_ is the entity
# type as a short string code (e.g. ORG, GPE, MONEY in the output).
for ent in doc5.ents:
    print(ent, ent.label_, sep='\n')  # entity text, then its label on the next line
    print('\n')                       # blank lines between entities

Apple
ORG


Hong Kong
GPE


$6 millions
MONEY




In [35]:
# To see what each entity label means, spacy.explain() turns a label code
# such as 'ORG' into a short human-readable description.

for ent in doc5.ents:
    label = ent.label_
    print(ent)
    print(spacy.explain(label))  # print() converts its argument to text itself
    print(label)
    print('\n')

Apple
Companies, agencies, institutions, etc.
ORG


Hong Kong
Countries, cities, states
GPE


$6 millions
Monetary values, including unit
MONEY




### Noun Chunks:

In [40]:
# spaCy can also pull the noun chunks out of a text.
# A noun chunk is a noun together with the words that describe it,
# e.g. "Autonomous cars": the noun "cars" plus its modifier "Autonomous".

doc6 = nlp("Autonomous cars shift insurance liability toward the manufacturers")

# Print every noun chunk on one line, each followed by ' | '.
for phrase in doc6.noun_chunks:
    print(phrase, end=' | ')

Autonomous cars | insurance liability | the manufacturers | 