## **Text Preprocessing in NLP**

In [2]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [3]:
# Import libraries
import nltk
import re
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [4]:
# Download the NLTK resources used by this notebook (needed on a fresh
# environment). 'punkt_tab' is fetched here as well because the tokenization
# cells need it in addition to 'punkt' on recent NLTK releases.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## **1. Sample Raw Text**

In [5]:
# Sample sentence used by every later cell. It mixes upper/lower case,
# repeated punctuation, an emoji, a currency symbol and a number.
text = "The cats ARE playing in the Garden!!! NLP makes machines 🤖 intelligent. Price = $100."
print("Original Text:\n", text)

Original Text:
 The cats ARE playing in the Garden!!! NLP makes machines 🤖 intelligent. Price = $100.


## **2. Lowercasing**

In [7]:
# Lowercase the whole string into a new variable; `text` itself is not modified.
lower_text = text.lower()
print("Lowercased Text:\n", lower_text)

Lowercased Text:
 the cats are playing in the garden!!! nlp makes machines 🤖 intelligent. price = $100.


## **3. Tokenization**

In [9]:
# Fetch the Punkt tokenizer tables ('punkt_tab') before the tokenization cells
# below call sent_tokenize / word_tokenize.
# `nltk` is already imported in the imports cell at the top of the notebook.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
# Sentence tokenization: split the raw text into a list of sentence strings.
sent_tokens = sent_tokenize(text)
print("Sentence Tokens:\n", sent_tokens)

Sentence Tokens:
 ['The cats ARE playing in the Garden!!!', 'NLP makes machines 🤖 intelligent.', 'Price = $100.']


In [11]:
# Word tokenization: split the raw text into word-level tokens.
# As the printed output shows, punctuation marks, symbols and the emoji come
# out as separate tokens, and the original casing is kept.
word_tokens = word_tokenize(text)
print("\nWord Tokens:\n", word_tokens)


Word Tokens:
 ['The', 'cats', 'ARE', 'playing', 'in', 'the', 'Garden', '!', '!', '!', 'NLP', 'makes', 'machines', '🤖', 'intelligent', '.', 'Price', '=', '$', '100', '.']


## **4. Stopword Removal**

In [12]:
# English stopwords, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))
# Keep a token only when it is purely alphabetic (this discards punctuation,
# digits, symbols and the emoji) and its lowercase form is not a stopword.
# Surviving tokens keep their original casing.
filtered_tokens = list(
    filter(
        lambda token: token.isalpha() and token.lower() not in stop_words,
        word_tokens,
    )
)
print("After Stopword Removal:\n", filtered_tokens)

After Stopword Removal:
 ['cats', 'playing', 'Garden', 'NLP', 'makes', 'machines', 'intelligent', 'Price']


## **5. Stemming**

In [13]:
# Porter stemming: run the stemmer over every filtered token.
# Stems are lowercased and are not necessarily dictionary words
# (e.g. 'machin', 'intellig' in the output below).
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_tokens))
print("After Stemming:\n", stemmed)

After Stemming:
 ['cat', 'play', 'garden', 'nlp', 'make', 'machin', 'intellig', 'price']


## **6. Lemmatization**

In [14]:
# WordNet lemmatization of the filtered tokens.
# Every token is looked up as a verb (pos='v'), regardless of its actual part
# of speech; the original casing of each token is preserved.
lemmatizer = WordNetLemmatizer()
lemmatized = list(
    map(lambda token: lemmatizer.lemmatize(token, pos='v'), filtered_tokens)
)
print("After Lemmatization:\n", lemmatized)

After Lemmatization:
 ['cat', 'play', 'Garden', 'NLP', 'make', 'machine', 'intelligent', 'Price']


## **7. Removing Punctuation & Numbers**

In [15]:
# Delete every character that is neither an ASCII letter (a-z, A-Z) nor
# whitespace: punctuation, digits, symbols and the emoji are all removed.
# Whitespace is kept as-is, so gaps left by removed characters remain.
clean_text = re.compile(r'[^a-zA-Z\s]').sub('', text)
print("Without Punctuation & Numbers:\n", clean_text)

Without Punctuation & Numbers:
 The cats ARE playing in the Garden NLP makes machines  intelligent Price  


## **8. Handling Emojis & Special Characters**

In [16]:
# Extract emojis.
# emoji.emoji_list() matches whole emoji, so emoji made of several code points
# (flags, skin-tone variants, ZWJ sequences such as family emoji) come back as
# one item each instead of being split apart or missed by a per-character test.
emojis = [match['emoji'] for match in emoji.emoji_list(text)]
print("Extracted Emojis:", emojis)

Extracted Emojis: ['🤖']


In [17]:
# Remove emojis from text by replacing each one with an empty string.
# The spaces on either side of a removed emoji are kept, so a double space is
# left where the emoji used to be (visible in the output below).
no_emoji_text = emoji.replace_emoji(text, replace='')
print("Text without Emojis:\n", no_emoji_text)

Text without Emojis:
 The cats ARE playing in the Garden!!! NLP makes machines  intelligent. Price = $100.


## **9. Full Preprocessing Function**

In [18]:
def preprocess(text):
    """Turn raw text into a list of clean, lemmatized tokens.

    Steps, in order: lowercase the text, replace every character that is not
    a-z or whitespace with a space, tokenize with ``word_tokenize``, drop
    English stopwords, and lemmatize each remaining token as a verb
    (``pos='v'``).

    Non-letter characters are replaced by a space rather than deleted, so
    words separated only by punctuation stay separate ("hello,world" ->
    "hello", "world") and contractions are split into their parts
    ("don't" -> "don", "t") instead of being fused into a new token
    ("dont") before the stopword filter runs.

    Relies on the notebook-level ``stop_words`` set and ``lemmatizer``
    instance defined in earlier cells.

    Args:
        text: Raw input string.

    Returns:
        A list of lowercase, lemmatized tokens with stopwords removed.
    """
    text = text.lower()
    # Substitute a space (not '') so removed punctuation still separates words.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    return [
        lemmatizer.lemmatize(token, pos='v')
        for token in tokens
        if token not in stop_words
    ]

In [19]:
# Run the full preprocessing function on the sample text and show the tokens.
print("Final Clean Tokens:\n", preprocess(text))

Final Clean Tokens:
 ['cat', 'play', 'garden', 'nlp', 'make', 'machine', 'intelligent', 'price']
