In [15]:
# Imports NLTK and fetches the data packages this notebook lists as prerequisites:
# 'punkt' (tokenizer models) plus 'wordnet' and 'omw-1.4' (WordNet data for lemmatization).
import nltk

NLTK_PACKAGES = ('punkt', 'wordnet', 'omw-1.4')

# nltk.download() signals failure by returning False rather than raising, so keep every
# status. A list (not a generator) guarantees each package is attempted even if one fails.
download_status = [nltk.download(package) for package in NLTK_PACKAGES]

# Cell result: True only when every download succeeded. Echoing just the last call's
# return value would hide a failed download of an earlier package.
all(download_status)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [26]:
# Sample sentence shared by the tokenizer cells below. It deliberately contains a
# free-standing hyphen and a trailing period so the tokenizers' handling of
# punctuation can be compared side by side.
text = "I am Yashraj from CSE - AI."

In [27]:
# Whitespace tokenization: the text is cut only at whitespace, so punctuation stays
# glued to its neighbouring word (the final token of the sample is 'AI.').
from nltk.tokenize import WhitespaceTokenizer

wt = WhitespaceTokenizer()
print("Whitespace Tokenization:")
whitespace_tokens = wt.tokenize(text)
print(whitespace_tokens)


Whitespace Tokenization:
['I', 'am', 'Yashraj', 'from', 'CSE', '-', 'AI.']


In [28]:
# Word/punctuation tokenization: runs of word characters and runs of punctuation
# become separate tokens, so the trailing '.' is detached from 'AI'.
from nltk.tokenize import wordpunct_tokenize

print("\nPunctuation-based Tokenization:")
wordpunct_tokens = wordpunct_tokenize(text)
print(wordpunct_tokens)



Punctuation-based Tokenization:
['I', 'am', 'Yashraj', 'from', 'CSE', '-', 'AI', '.']


In [21]:
# Treebank tokenization: applies NLTK's Treebank word-tokenizer rules to the sample
# sentence; sentence-final punctuation is emitted as its own token.
from nltk.tokenize import TreebankWordTokenizer

twt = TreebankWordTokenizer()
print("\nTreebank Tokenization:")
treebank_tokens = twt.tokenize(text)
print(treebank_tokens)



Treebank Tokenization:
['I', 'am', 'Yashraj', 'from', 'CSE', '-', 'AI', '.']


In [22]:
# Tweet tokenization: a tokenizer aimed at social-media text (hashtags, mentions,
# emojis). The sample sentence contains none of those, so the tokens it yields here
# are the same as the Treebank tokenizer's.
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()
print("\nTweet Tokenization:")
tweet_tokens = tweet_tokenizer.tokenize(text)
print(tweet_tokens)



Tweet Tokenization:
['I', 'am', 'Yashraj', 'from', 'CSE', '-', 'AI', '.']


In [23]:
# Multi-word-expression tokenization: registered word sequences are merged into a
# single token joined by the chosen separator ('_').
from nltk.tokenize import MWETokenizer

multiword_expressions = [('machine', 'learning'), ('artificial', 'intelligence')]
mwe = MWETokenizer(multiword_expressions, separator='_')
sentence = "Machine learning and artificial intelligence are related fields"

# MWETokenizer works on an already-split token list. The sentence is lower-cased first
# so that 'Machine' lines up with the lowercase 'machine' in the registered expressions.
lowercase_tokens = sentence.lower().split()

print("\nMWE Tokenization:")
print(mwe.tokenize(lowercase_tokens))



MWE Tokenization:
['machine_learning', 'and', 'artificial_intelligence', 'are', 'related', 'fields']


**Stemming**

In [24]:
# Porter stemming: strips suffixes by rule, so results need not be dictionary words
# (e.g. 'easily' becomes 'easili') and irregular forms such as 'ran' are left as-is.
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Also read by the Snowball stemmer cell that follows.
words = ["running", "runner", "ran", "easily", "fairly"]

print("\nPorter Stemmer:")
for word in words:
    porter_stem = ps.stem(word)
    print(word, "→", porter_stem)



Porter Stemmer:
running → run
runner → runner
ran → ran
easily → easili
fairly → fairli


In [25]:
# Snowball stemming for English, run over the `words` list defined in the Porter
# stemmer cell so the two stemmers can be compared directly
# (e.g. 'fairly' → 'fair' here versus 'fairli' with Porter).
from nltk.stem import SnowballStemmer

ss = SnowballStemmer("english")

print("\nSnowball Stemmer:")
for word in words:
    snowball_stem = ss.stem(word)
    print(word, "→", snowball_stem)



Snowball Stemmer:
running → run
runner → runner
ran → ran
easily → easili
fairly → fair


**Lemmatization (Using WordNet Lemmatizer)**

In [14]:
# Lemmatization: maps each word to its dictionary base form via WordNet. Unlike the
# stemmers, it resolves irregular forms ('better' → 'good', 'went' → 'go').
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Each word is paired with the part-of-speech tag handed to lemmatize():
# 'v' = verb, 'a' = adjective, 'n' = noun. lemmatize() assumes a noun when no tag is
# given, so the tag is what lets verb and adjective forms be reduced correctly.
lemma_inputs = [("running", "v"), ("better", "a"), ("cars", "n"), ("went", "v")]

# Same list this cell has always defined; now derived from the pairs so that the
# list and the printed demo cannot drift apart.
words = [word for word, _ in lemma_inputs]

print("\nLemmatization:")
for word, pos in lemma_inputs:
    print(word, "→", lemmatizer.lemmatize(word, pos=pos))



Lemmatization:
running → run
better → good
cars → car
went → go
