In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

# Make sure the NLTK data packages used below are available locally.
# nltk.download() is called once per package, in the same order as before.
for _pkg in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_pkg)

# Sample paragraph used to demonstrate each preprocessing step.
text = """
Natural Language Processing is a subfield of Artificial Intelligence (AI). 
It helps computers understand and process human language!! Numbers like 2023, and symbols like %$, are often present.
Stemming, lemmatization, stop word removal—these are basic preprocessing steps.
Contact us: example@email.com or visit https://nlp.org
"""

# 1. Lowercasing
text = text.lower()

# 2. Remove URLs and emails
text = re.sub(r'https?://\S+|www\.\S+', '', text)
text = re.sub(r'\S+@\S+', '', text)

# 3. Tokenization of the lightly cleaned text.
# These tokens still contain punctuation and digits; they are kept only for
# the "Original tokens" / "Sentence tokens" printouts below.
words = word_tokenize(text)
sent = sent_tokenize(text)

# 4. Remove digits and punctuation
text = re.sub(r'\d+', '', text)
# Replace punctuation with a space (not an empty string) so that words joined
# by a symbol, e.g. "removal—these", are split instead of fused together.
text = re.sub(r'[^\w\s]', ' ', text)
# Re-tokenize the cleaned text: the tokens produced in step 3 were created
# before digits/punctuation were stripped, so filtering them would let
# entries such as '2023' leak into the later steps.
clean_words = word_tokenize(text)

# 5. Remove stopwords (a set gives O(1) membership tests)
stop_words = set(stopwords.words('english'))
words_nostop = [w for w in clean_words if w not in stop_words]

# 6. Remove short words (tokens of one or two characters)
words_noshort = [w for w in words_nostop if len(w) > 2]

# 7. Stemming
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words_noshort]

# 8. Lemmatization (lemmatize() is called with its default part of speech)
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in words_noshort]

print("Original tokens:\n", words)
print("Sentence tokens:\n", sent)

print("\nTokens after stopword & short word removal:\n", words_noshort)
print("\nAfter stemming:\n", stemmed)
print("\nAfter lemmatization:\n", lemmatized)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rv401\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rv401\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rv401\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original tokens:
 ['natural', 'language', 'processing', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', '(', 'ai', ')', '.', 'it', 'helps', 'computers', 'understand', 'and', 'process', 'human', 'language', '!', '!', 'numbers', 'like', '2023', ',', 'and', 'symbols', 'like', '%', '$', ',', 'are', 'often', 'present', '.', 'stemming', ',', 'lemmatization', ',', 'stop', 'word', 'removal—these', 'are', 'basic', 'preprocessing', 'steps', '.', 'contact', 'us', ':', 'or', 'visit']
Sentence tokens:
 ['\nnatural language processing is a subfield of artificial intelligence (ai).', 'it helps computers understand and process human language!!', 'numbers like 2023, and symbols like %$, are often present.', 'stemming, lemmatization, stop word removal—these are basic preprocessing steps.', 'contact us:  or visit']

Tokens after stopword & short word removal:
 ['natural', 'language', 'processing', 'subfield', 'artificial', 'intelligence', 'helps', 'computers', 'understand', 'process', 'human',