In [3]:
#!pip install -U spacy

In [5]:
#!python -m spacy download en_core_web_sm

In [7]:
import spacy

In [9]:
# Load the `en_core_web_sm` English pipeline; later cells call `sp(...)` to process text.
sp = spacy.load('en_core_web_sm')

In [11]:
# Run the pipeline on a sample sentence; the result is iterated token by token below.
sentence = sp('Manchester United is looking to sign a forward for $90 million')

In [13]:
# Print the text of every token, one per line.
for token in sentence:
    print(token.text)

Manchester
United
is
looking
to
sign
a
forward
for
$
90
million


In [16]:
# Parts of speech: each token exposes its part-of-speech tag via `pos_`.
for token in sentence:
    print(f'{token.text} {token.pos_}')

Manchester PROPN
United PROPN
is AUX
looking VERB
to PART
sign VERB
a DET
forward NOUN
for ADP
$ SYM
90 NUM
million NUM


In [18]:
# Dependency parsing: sample sentence containing a negation.
sentence2 = sp("Manchester United isn't looking to sign any forward.")

In [20]:
# The `dep_` attribute holds each token's dependency label;
# print it next to the token text and its part-of-speech tag.
for token in sentence2:
    print(f'{token.text} {token.pos_} {token.dep_}')

Manchester PROPN compound
United PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
to PART aux
sign VERB xcomp
any DET advmod
forward ADV advmod
. PUNCT punct


In [22]:
# A short multi-sentence text used to demonstrate sentence detection.
document = sp('Hello from Stackabuse. The site with the best Python Tutorials. What are you looking for?')

In [24]:
# Iterate over the sentences spaCy found in `document`.
# The loop variable is named `sent` so that it does not overwrite the
# `sentence` Doc created in an earlier cell (re-running those cells would
# otherwise operate on the last sentence of `document` instead).
for sent in document.sents:
    print(sent)

Hello from Stackabuse.
The site with the best Python Tutorials.
What are you looking for?


In [31]:
# Token at index 4 of `document` (indexing is zero-based).
document[4]

The

In [33]:
# True here: the token at index 4 ("The") is the first token of the second sentence.
document[4].is_sent_start

True

In [35]:
# Tokenization: a sentence mixing punctuation, a hyphenated word and an e-mail address.
sentence4 = sp("Hello, I am non-vegetarian, email me the menu at abc-xyz@gmai.com")
print(sentence4)

Hello, I am non-vegetarian, email me the menu at abc-xyz@gmai.com


In [37]:
# Show how the sentence was split into tokens, one token per line.
for token in sentence4:
    print(token.text)

Hello
,
I
am
non
-
vegetarian
,
email
me
the
menu
at
abc-xyz@gmai.com


In [40]:
# Detecting entities: start from the plain tokens of a sentence that
# contains multi-word names and a monetary amount.
sentence5 = sp('Manchester United is looking to sign Harry Kane for $90 million')
for token in sentence5:
    print(token.text)

Manchester
United
is
looking
to
sign
Harry
Kane
for
$
90
million


In [41]:
# Here "Manchester United" is a single entity (one name made of two words), so it should be treated as one unit rather than two separate words.
# Similarly, "Harry Kane" is the name of a person, and "$90 million" is a currency value.
# These should not be split into separate units either.

In [45]:
# Named entity recognition: print each entity span, its label,
# and spaCy's human-readable explanation of that label.
for ent in sentence5.ents:
    print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')

Manchester United - PERSON - People, including fictional
Harry Kane - PERSON - People, including fictional
$90 million - MONEY - Monetary values, including unit


In [47]:
# Detecting noun chunks (noun phrases) in the sentence.
for chunk in sentence5.noun_chunks:
    print(chunk.text)

Manchester United
Harry Kane


In [50]:
#Stemming
#spaCy doesn't contain any function for stemming as it relies on lemmatization

# Hence we use NLTK for stemming.

#There are two types of stemmers in NLTK: 

#1. Porter Stemmer 
#2. Snowball stemmers. 

#Both of them have been implemented using different algorithms

In [52]:
import nltk
# Explicit import instead of `import *`: it shows where PorterStemmer comes
# from and keeps unrelated names out of the notebook namespace.
from nltk.stem.porter import PorterStemmer

In [54]:
# Porter stemmer instance used by the stemming loop below.
stemmer = PorterStemmer()

In [55]:
# Related word forms used to compare the output of the stemmers.
tokens = [
    'compute',
    'computer',
    'computed',
    'computing',
]

In [56]:
# Show the stem produced by `stemmer` for each word form.
for token in tokens:
    print(f'{token} --> {stemmer.stem(token)}')

compute --> comput
computer --> comput
computed --> comput
computing --> comput


In [57]:
#Snowball stemmer is a slightly improved version of the Porter stemmer and is usually preferred over the latter

In [63]:
from nltk.stem.snowball import SnowballStemmer

# NOTE: this rebinds `stemmer` (previously the Porter stemmer) to the
# English Snowball stemmer, so the same loop can be reused for comparison.
stemmer = SnowballStemmer(language='english')

# Same word forms as in the Porter example.
tokens = [
    'compute',
    'computer',
    'computed',
    'computing',
]

for token in tokens:
    print(f'{token} --> {stemmer.stem(token)}')

compute --> comput
computer --> comput
computed --> comput
computing --> comput


In [64]:
# You can see that the results are the same.
# We still got "comput" as the stem. Again, "comput" isn't actually a dictionary word.

# This is where lemmatization comes in handy.

In [62]:
# Lemmatization

# - Lemmatization reduces a word to its base form (lemma) as it appears in the dictionary.
# - The lemmas returned through lemmatization are actual dictionary words.
# - They are semantically complete, unlike the stems returned by a stemmer.

In [66]:
# The same word forms that were stemmed above, now processed by spaCy for lemmatization.
sentence6 = sp('compute computer computed computing')

In [68]:
# Print each token next to its lemma.
for token in sentence6:
    print(f'{token.text} {token.lemma_}')

compute compute
computer computer
computed compute
computing computing


In [69]:
# Lemmatization converts inflected forms of a word (e.g. past tense or past participle) to their base form

In [70]:
# Lemmatize a full sentence and show each token next to its lemma.
sentence7 = sp('A letter has been written, asking him to be released')

for token in sentence7:
    print(f'{token.text}  ===> {token.lemma_}')

A  ===> a
letter  ===> letter
has  ===> have
been  ===> be
written  ===> write
,  ===> ,
asking  ===> ask
him  ===> -PRON-
to  ===> to
be  ===> be
released  ===> release
