#### NLTK - stemming
Start by defining some words:

In [1]:
# Four inflected forms that share the stem "game".
words = ["game", "gaming", "gamed", "games"]

In [2]:
from nltk.stem import PorterStemmer

In [3]:
# Create one Porter stemmer instance; the cells below call ps.stem() on each word.
ps = PorterStemmer()

In [4]:
# Stem every entry of `words` and show the results one per line.
for word in words:
    stemmed = ps.stem(word)
    print(stemmed)


game
game
game
game


#### Example 2

In [5]:
# Sample sentence for the second example; it contains punctuation and several forms of "game".
sentence = "gaming, the gamers play games"

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [7]:
# Tokenize the sentence; this rebinds `words`, replacing the list from the first example.
words = word_tokenize(sentence)

In [8]:
# Display the tokens; the comma comes out as a token of its own.
print(words)

['gaming', ',', 'the', 'gamers', 'play', 'games']


In [9]:
from nltk.stem import PorterStemmer

In [10]:
# Re-create the stemmer for this example (`ps` was already bound to a PorterStemmer earlier).
ps = PorterStemmer()

In [11]:
# Stem a single token from the tokenized sentence. The list is indexed
# explicitly instead of reading a `word` variable left over from an earlier
# loop, so the cell depends only on `words` and `ps`.
ps.stem(words[0])

'game'

In [13]:
# Show each token next to its stem in the form "token:stem".
for word in words:
    print("{}:{}".format(word, ps.stem(word)))

gaming:game
,:,
the:the
gamers:gamer
play:play
games:game


#### Stemming with NLTK
There are other stemming algorithms, but Porter (`PorterStemmer`) is the most popular.

#### Example 3

In [12]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [14]:
# Sample sentence for the third example; it ends with a full stop.
text = "A quick brown fox jumps over the lazy dog."

In [15]:
# Lower-case the whole sentence before tokenizing; this overwrites the original-cased `text`.
text = text.lower()

In [16]:
# Break the text into word and punctuation tokens, then show them.
words = word_tokenize(text)
print(words)

['a', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [17]:
# Porter stemmer instance used by the stemming cells that follow.
stemmer = PorterStemmer()


In [18]:
# Build the list of stems one token at a time.
words_stem = []
for word in words:
    stem = stemmer.stem(word)
    words_stem.append(stem)

In [19]:
# Display the stemmed tokens collected in `words_stem`.
print(words_stem)

['a', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']


In [20]:
# A shorter version of the stemming loop above, written as a list comprehension

In [21]:
# Stem every token in `words` with a single list comprehension.
words_stem = [stemmer.stem(token) for token in words]

#### Using split() function

#### Note: Tokenizing sentences into words is useful because it separates punctuation from the words. In the example below, which uses `split()` instead of a tokenizer, the last word is taken as `dog.` (with the full stop at the end); the punctuation mark is not separated from the word.

In [22]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
text = "A quick brown fox jumps over the lazy dog."

# str.split() breaks only on whitespace, so punctuation stays attached to
# the neighbouring word (the last piece is "dog.").
pieces = text.split()
stemmed_pieces = [stemmer.stem(piece) for piece in pieces]
text_stem = " ".join(stemmed_pieces)
print(text_stem)

A quick brown fox jump over the lazi dog.


#### Stemming Non-English Words

NLTK also provides other stemmers, such as:
    
* SnowballStemmer, LancasterStemmer, ISRIStemmer, RSLPStemmer, RegexpStemmer.

Stemming Spanish Words using SnowballStemmer

Let’s stem some Spanish words.

Here’s the English translation of the Spanish words used below: *trabajando* = working, *trabajos* = works (jobs), *trabajó* = worked.

In [23]:
# Python 3 source code and str objects are Unicode by default, so no encoding
# declaration or manual decoding is needed to work with Spanish text.
from nltk.stem import SnowballStemmer

stemmer_spanish = SnowballStemmer('spanish')

print(stemmer_spanish.stem('trabajando'))  # output: trabaj
print(stemmer_spanish.stem('trabajos'))  # output: trabaj

# Accented forms such as 'trabajó' can be passed to stem() directly as a str;
# the Python 2 era .decode('utf-8') call is not needed (str has no decode()).

trabaj
trabaj


#### Stemming English Words using SnowballStemmer

In [24]:
# Stem three forms of the same verb with the English Snowball stemmer.
stemmer_english = SnowballStemmer('english')
for form in ('working', 'works', 'worked'):
    print(stemmer_english.stem(form))  # output: work

work
work
work
