In [1]:
import nltk

from nltk.stem.porter import *

In [2]:
p_stemmer = PorterStemmer()

In [3]:
words = ['run', 'runner', 'running', 'ran', 'easily', 'fairly']

In [4]:
#Note how the stemmer recognizes "runner" as a noun, not a verb form or participle. Also, the adverbs "easily" and "fairly" are stemmed to the unusual root "easili" and "fairli"
for word in words:
    print(word+' ---> '+p_stemmer.stem(word))

run ---> run
runner ---> runner
running ---> run
ran ---> ran
easily ---> easili
fairly ---> fairli


In [5]:
from nltk.stem.snowball import SnowballStemmer

#The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')

In [6]:
words = ['run','runner','running','ran','runs','easily','fairly']

In [7]:
#In this case the stemmer performed the same as the Porter Stemmer, with the exception that it handled the stem of "fairly" more appropriately with "fair"

for word in words:
    print(word+' ---> '+s_stemmer.stem(word))

run ---> run
runner ---> runner
running ---> run
ran ---> ran
runs ---> run
easily ---> easili
fairly ---> fair


In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [9]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma)

I 	 PRON 	 561228191312463089 	 561228191312463089
am 	 AUX 	 10382539506755952630 	 10382539506755952630
a 	 DET 	 11901859001352538922 	 11901859001352538922
runner 	 NOUN 	 12640964157389618806 	 12640964157389618806
running 	 VERB 	 12767647472892411841 	 12767647472892411841
in 	 ADP 	 3002984154512732771 	 3002984154512732771
a 	 DET 	 11901859001352538922 	 11901859001352538922
race 	 NOUN 	 8048469955494714898 	 8048469955494714898
because 	 SCONJ 	 16950148841647037698 	 16950148841647037698
I 	 PRON 	 561228191312463089 	 561228191312463089
love 	 VERB 	 3702023516439754181 	 3702023516439754181
to 	 PART 	 3791531372978436496 	 3791531372978436496
run 	 VERB 	 12767647472892411841 	 12767647472892411841
since 	 SCONJ 	 10066841407251338481 	 10066841407251338481
I 	 PRON 	 561228191312463089 	 561228191312463089
ran 	 VERB 	 12767647472892411841 	 12767647472892411841
today 	 NOUN 	 11042482332948150395 	 11042482332948150395


In [None]:
In the above sentence, running, run and ran all point to the same lemma run 

In [11]:
#Here we're using an f-string to format the printed text by setting minimum field widths and adding a left-align to the lemma hash value.

def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [12]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [14]:
doc3 = nlp(u"I am meeting him tomorrow at the meeting")

show_lemmas(doc3)

I            PRON   561228191312463089     -PRON-
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   561228191312463089     -PRON-
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting


In [15]:
doc4 = nlp(u"That's an enormous automobile")

show_lemmas(doc4)

That         DET    4380130941430378203    that
's           AUX    10382539506755952630   be
an           DET    15099054000809333061   an
enormous     ADJ    17917224542039855524   enormous
automobile   NOUN   7211811266693931283    automobile


In [16]:
# Print the set of spaCy's default stop words (remember that sets are unordered):

print(nlp.Defaults.stop_words)

{'back', 'most', 'nothing', 'sometimes', 'namely', 'whoever', '‘s', 'anyway', 'several', 'seems', 'together', 'her', 'may', "'ll", 'him', 'keep', 'us', 'do', 'between', 'too', 'whereby', 'last', 'however', 'empty', 'is', 'thence', 'anyhow', 'ever', 'does', 'my', 'should', 'fifteen', 'thereby', '’m', 'even', 'full', 'per', 'onto', 'such', 'the', 'across', 'he', 'or', 'whole', 'though', 'they', 'until', 'quite', 'neither', 'whatever', 'nor', 'n’t', 'both', 'seem', 'as', 'could', 'go', 'became', 'hence', 'why', 'get', 'within', 'can', 'hundred', 'at', '’s', 'down', 'yours', 'that', 'over', '‘re', 'moreover', 'where', 'forty', 'besides', 'hereupon', 'somewhere', 'third', 'seemed', 'all', 'therein', 'six', 'no', 'twenty', 'whom', 'only', 'bottom', 'off', '‘ve', 'when', 'show', 'everywhere', 'which', 'yet', 'least', 'beforehand', '’ve', 'towards', 'due', 'some', 'wherever', 'fifty', 'mostly', 'below', 'whether', 're', 'was', 'less', 'thru', 'your', 'please', 'using', 'whose', 'more', 'either

In [17]:
len(nlp.Defaults.stop_words)

326

In [18]:
nlp.vocab['myself'].is_stop

True

In [19]:
nlp.vocab['mystery'].is_stop

False

In [20]:
# Add the word to the set of stop words. Use lowercase!

nlp.Defaults.stop_words.add('btw')

# Set the stop_word tag on the lexeme
nlp.vocab['btw'].is_stop = True

In [21]:
len(nlp.Defaults.stop_words)

327

In [22]:
nlp.vocab['btw'].is_stop

True

In [23]:
# Remove the word from the set of stop words
nlp.Defaults.stop_words.remove('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

In [24]:
len(nlp.Defaults.stop_words)

326

In [25]:
nlp.vocab['beyond'].is_stop

False