In [1]:
### Text Preprocessing: Stemming Using NLTK

In [2]:
from nltk.stem import PorterStemmer

# Porter stemmer instance; later cells reuse this `porter` object.
porter = PorterStemmer()

# Inflected forms grouped by root so the resulting stems can be compared.
words = ["eating", "eats", "eaten", "writing", "writes", "programming", "programs", "history", "finally", "finalized"]

# Print each word next to its Porter stem, one "word -> stem" pair per line.
# (Fix: the closing parenthesis that used to sit inside the f-string literal
# was printed verbatim after every stem; it has been removed.)
for word in words:
    print(f"{word} -> {porter.stem(word)}")

eating -> eat)
eats -> eat)
eaten -> eaten)
writing -> write)
writes -> write)
programming -> program)
programs -> program)
history -> histori)
finally -> final)
finalized -> final)


In [3]:
### Additional Examples Highlighting Stemming Disadvantages
# Each stem is printed on its own line, in the order listed here.
# Relies on the `porter` stemmer created in an earlier cell.
for sample in ("congratulations", "sitting"):
    print(porter.stem(sample))

congratul
sit


In [4]:
### Regex Stemmer: A Customizable Stemming Approach
from nltk.stem import RegexpStemmer

# The pattern is anchored with `$`, so it can only match one of the listed
# suffixes ("ing", "s", "e", "able") at the very end of a word.
suffix_pattern = '(ing|s|e|able)$'
regexp_stemmer = RegexpStemmer(suffix_pattern)

words = ["eating", "writes", "disable", "finalize"]

# Emit one "word -> stem" line per input word.
print(*(f"{w} -> {regexp_stemmer.stem(w)}" for w in words), sep="\n")

eating -> eat
writes -> write
disable -> dis
finalize -> finaliz


In [5]:
### Snowball Stemmer: An Improved Stemming Technique
from nltk.stem import SnowballStemmer

# English Snowball stemmer; later cells reuse this `snowball` object.
snowball = SnowballStemmer("english")

# Two adverbs followed by the same ten words used in the Porter cell.
words = [
    "fairly", "sportingly",
    "eating", "eats", "eaten",
    "writing", "writes",
    "programming", "programs",
    "history", "finally", "finalized",
]

# Print each word next to its Snowball stem.
for word in words:
    stemmed = snowball.stem(word)
    print(f"{word} -> {stemmed}")

fairly -> fair
sportingly -> sport
eating -> eat
eats -> eat
eaten -> eaten
writing -> write
writes -> write
programming -> program
programs -> program
history -> histori
finally -> final
finalized -> final


In [6]:
### Porter vs. Snowball on the same adverbs
# Porter results are printed first, then Snowball, each for "fairly" and
# "sportingly" in that order. Uses the `porter` and `snowball` stemmers
# created in earlier cells.
for stemmer in (porter, snowball):
    for adverb in ("fairly", "sportingly"):
        print(stemmer.stem(adverb))

fairli
sportingli
fair
sport


In [7]:
### Limitations of Stemming
# Stem "goes" with Snowball first, then Porter, one result per line.
# Uses the `snowball` and `porter` stemmers created in earlier cells.
for stemmer in (snowball, porter):
    print(stemmer.stem("goes"))

goe
goe


In [8]:
# Narrative note kept as a bare string literal: because it is the last
# expression in the cell, the notebook echoes its repr as the cell output.
# NOTE(review): the "fairly" -> "fair" lemmatization example is not
# demonstrated anywhere in this notebook — confirm it against the lemmatizer
# you intend to use.
'''When to Use Lemmatization To overcome stemming's disadvantages, 
lemmatization is used. Lemmatization uses a dictionary of root words and returns grammatically correct base forms. 
For example, "goes" becomes "go", and "fairly" becomes "fair". 
Lemmatization is preferred in applications requiring accurate word forms.'''

'When to Use Lemmatization To overcome stemming\'s disadvantages, \nlemmatization is used. Lemmatization uses a dictionary of root words and returns grammatically correct base forms. \nFor example, "goes" becomes "go", and "fairly" becomes "fair". \nLemmatization is preferred in applications requiring accurate word forms.'