In [2]:
# Install NLTK with the %pip line magic so the package lands in the
# environment this notebook's kernel is using.
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [4]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re


In [8]:
# Step 1: Download the NLTK data packages this notebook uses
# The packages are requested one after another in a fixed order; as the cell
# output shows, a package that is already present is reported as up-to-date.
for package_name in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(package_name)

# Left as the cell's final expression so its return value is still displayed.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [10]:
# Step 2: Initialize text
# Two-sentence sample paragraph shared by the tokenization and stop-word cells.
# Adjacent string literals are joined by Python into one single-line string.
text = (
    "Tokenization is the first step in text analytics. "
    "The process of breaking down a text paragraph into smaller chunks "
    "such as words or sentences is called Tokenization."
)

In [14]:
# Step 3: Perform Tokenization
print("\n--- Tokenization ---")

# Split the same text at two granularities: whole sentences and word-level
# tokens (the recorded output shows each '.' kept as a token of its own).
tokenized_sentences = sent_tokenize(text)
tokenized_words = word_tokenize(text)

# Print each result next to its label.
for label, tokens in (
    ("Sentences:", tokenized_sentences),
    ("Words:", tokenized_words),
):
    print(label, tokens)


--- Tokenization ---
Sentences: ['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']
Words: ['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


In [16]:
# Step 4: Removing Punctuation & Stop Words
# Load NLTK's English stop-word list, then store it as a set, which is what
# the filtering cell tests each token against.
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)

In [18]:
# Lowercase the text first, then delete every character that is neither a
# word character (\w) nor whitespace (\s) -- i.e. strip the punctuation.
lowercased_text = text.lower()
clean_text = re.sub(r'[^\w\s]', '', lowercased_text)

In [20]:
# Tokenize the cleaned text and keep, in order, only the tokens that are not
# in the stop-word set.
filtered_words = []
for token in word_tokenize(clean_text):
    if token not in stop_words:
        filtered_words.append(token)

In [22]:
# Report the tokens that survived punctuation and stop-word removal.
stopword_header = "\n--- Stopword Removal ---"
print(stopword_header)
print("Filtered Words:", filtered_words)


--- Stopword Removal ---
Filtered Words: ['tokenization', 'first', 'step', 'text', 'analytics', 'process', 'breaking', 'text', 'paragraph', 'smaller', 'chunks', 'words', 'sentences', 'called', 'tokenization']


In [24]:
# Step 5: Perform Stemming
# Inflected forms of "wait" used as input for the stemming demo.
sample_words = [
    "wait",
    "waiting",
    "waited",
    "waits",
]

# Porter stemmer instance that the next cell applies to each sample word.
ps = PorterStemmer()

In [26]:
# Stem every sample word with the Porter stemmer and print the resulting list.
print("\n--- Stemming ---")
stemmed_words = list(map(ps.stem, sample_words))
print(stemmed_words)


--- Stemming ---
['wait', 'wait', 'wait', 'wait']


In [28]:
# Step 6: Perform Lemmatization
# Sample words (forms of "study" and "cry") used as input for the lemmatizer demo.
lem_words = [
    "studies",
    "studying",
    "cries",
    "cry",
]

# WordNet-based lemmatizer that the next cell applies to each sample word.
lemmatizer = WordNetLemmatizer()

In [30]:
# Lemmatize each sample word and print the resulting list.
# NOTE: lemmatize() is called without a part-of-speech argument, so NLTK
# falls back to its default (noun); that is why "studying" appears unchanged
# in the recorded output.
print("\n--- Lemmatization ---")
lemmatized_words = []
for entry in lem_words:
    lemmatized_words.append(lemmatizer.lemmatize(entry))
print(lemmatized_words)


--- Lemmatization ---
['study', 'studying', 'cry', 'cry']


In [32]:
# Step 7: Apply POS Tagging
# Example sentence for the part-of-speech demo.
data = "The pink sweater fit her perfectly"
# Split the sentence into word tokens; the next cell passes this list to the tagger.
words = word_tokenize(data)

In [34]:
# Tag each token in `words` with a part-of-speech label and print the
# resulting list of (token, tag) pairs.
print("\n--- POS Tagging ---")
tagged_words = nltk.pos_tag(words)
print(tagged_words)


--- POS Tagging ---
[('The', 'DT'), ('pink', 'NN'), ('sweater', 'NN'), ('fit', 'VBP'), ('her', 'PRP$'), ('perfectly', 'RB')]
