1. Write a program for pre-processing of a text document such as stop word removal, stemming.

In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize


In [15]:
# Download NLTK resources if not already done
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yogesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yogesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yogesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
text = """
Natural Language Processing is a subfield of Artificial Intelligence (AI). 
It helps computers understand and process human language!! Numbers like 2023, and symbols like %$, are often present.
Stemming, lemmatization, stop word removal—these are basic preprocessing steps.
Contact us: example@email.com or visit https://nlp.org
"""


In [17]:
# 1. Lowercasing
text = text.lower()


In [18]:
# 2. Remove URLs and emails
text = re.sub(r"https?://\S+|www\.\S+", "", text)
text = re.sub(r"\S+@\S+", "", text)


In [19]:
# 3. Tokenization
words = word_tokenize(text)
sent = sent_tokenize(text)


In [20]:
# 4. Remove digits and punctuation
text = re.sub(r"\d+", "", text)
text = re.sub(r"[^\w\s]", "", text)


In [21]:
# 5. Remove stopwords
stop_words = set(stopwords.words("english"))
words_nostop = [w for w in words if w not in stop_words]


In [22]:
# 6. Remove short words (single letters, etc.)
words_noshort = [w for w in words_nostop if len(w) > 2]

In [23]:
# 7. Stemming
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words_noshort]

In [24]:
# 8. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in words_noshort]

In [25]:
print("Original tokens:\n", words)
print("Sentence tokens:\n", sent)

Original tokens:
 ['natural', 'language', 'processing', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', '(', 'ai', ')', '.', 'it', 'helps', 'computers', 'understand', 'and', 'process', 'human', 'language', '!', '!', 'numbers', 'like', '2023', ',', 'and', 'symbols', 'like', '%', '$', ',', 'are', 'often', 'present', '.', 'stemming', ',', 'lemmatization', ',', 'stop', 'word', 'removal—these', 'are', 'basic', 'preprocessing', 'steps', '.', 'contact', 'us', ':', 'or', 'visit']
Sentence tokens:
 ['\nnatural language processing is a subfield of artificial intelligence (ai).', 'it helps computers understand and process human language!!', 'numbers like 2023, and symbols like %$, are often present.', 'stemming, lemmatization, stop word removal—these are basic preprocessing steps.', 'contact us:  or visit']


In [26]:
print("\nTokens after stopword & short word removal:\n", words_noshort)
print("\nAfter stemming:\n", stemmed)
print("\nAfter lemmatization:\n", lemmatized)


Tokens after stopword & short word removal:
 ['natural', 'language', 'processing', 'subfield', 'artificial', 'intelligence', 'helps', 'computers', 'understand', 'process', 'human', 'language', 'numbers', 'like', '2023', 'symbols', 'like', 'often', 'present', 'stemming', 'lemmatization', 'stop', 'word', 'removal—these', 'basic', 'preprocessing', 'steps', 'contact', 'visit']

After stemming:
 ['natur', 'languag', 'process', 'subfield', 'artifici', 'intellig', 'help', 'comput', 'understand', 'process', 'human', 'languag', 'number', 'like', '2023', 'symbol', 'like', 'often', 'present', 'stem', 'lemmat', 'stop', 'word', 'removal—thes', 'basic', 'preprocess', 'step', 'contact', 'visit']

After lemmatization:
 ['natural', 'language', 'processing', 'subfield', 'artificial', 'intelligence', 'help', 'computer', 'understand', 'process', 'human', 'language', 'number', 'like', '2023', 'symbol', 'like', 'often', 'present', 'stemming', 'lemmatization', 'stop', 'word', 'removal—these', 'basic', 'prep