In [None]:
# Stemming
# --------
#  When searching a text for a word such as 'boat', we often also want its
#  variations: 'boats', 'boating', 'boater', and so on.
#  Here 'boat' is the stem of ['boats', 'boating', 'boater'].

#  - Stemming is a method for cataloging related words.
#  - Stemming basically chops off the ends of words.
#  - spaCy doesn't include a stemmer; it relies entirely on lemmatization.

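# Why stemming? It is applied before running an analysis to reduce words to a common root form,
#  so that variants of the same word are treated as one term.
#  Example: 'runs' is reduced to 'run'.
#  Note: irregular forms are not handled by chopping off endings ('ran' stays 'ran' in the output below);
#  grouping words like am, are, is under the verb <be> requires lemmatization instead.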

In [13]:
import nltk

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# The two NLTK stemmers whose results are compared below.
p_stemmer = PorterStemmer()
s_stemmer = SnowballStemmer(language='english')

# Sample vocabulary: forms of "run", two "-ly" adverbs, and the "gener-" family.
words = [
    'run', 'runner', 'ran', 'runs',
    'easily', 'fairly',
    'generous', 'generation', 'generously', 'generate',
]

# Stem each word with both algorithms and print the results side by side.
for word in words:
    porter_stem = p_stemmer.stem(word)
    snowball_stem = s_stemmer.stem(word)
    print(f"Word: {word}\t\t-PorterStemmer: {porter_stem}\t-SnowballStemmer: {snowball_stem}")

Word: run		-PorterStemmer: run	-SnowballStemmer: run
Word: runner		-PorterStemmer: runner	-SnowballStemmer: runner
Word: ran		-PorterStemmer: ran	-SnowballStemmer: ran
Word: runs		-PorterStemmer: run	-SnowballStemmer: run
Word: easily		-PorterStemmer: easili	-SnowballStemmer: easili
Word: fairly		-PorterStemmer: fairli	-SnowballStemmer: fair
Word: generous		-PorterStemmer: gener	-SnowballStemmer: generous
Word: generation		-PorterStemmer: gener	-SnowballStemmer: generat
Word: generously		-PorterStemmer: gener	-SnowballStemmer: generous
Word: generate		-PorterStemmer: gener	-SnowballStemmer: generat


In [19]:
# Stem a whole sentence: split it on whitespace, stem every token with the
# Snowball stemmer created in the earlier cell, then join the stems back together.
phrase = "I went to the lawyer's office to discuss my health insurance"
stemms = list(map(s_stemmer.stem, phrase.split()))
stemmed_phrase = ' '.join(stemms)

print(f"phrase: {phrase}")
print(f"Stemmed phrase: {stemmed_phrase}")


phrase: I went to the lawyer's office to discuss my health insurance
Stemmed phrase: i went to the lawyer offic to discuss my health insur
