In [1]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob


# 1. Lowercasing
Converting text to lowercase ensures uniformity (e.g., "Apple" and "apple" are treated as the same word).

In [2]:
text = "Natural Language Processing is FUN!"
text_lower = text.lower()
print("Lowercased Text:", text_lower)


Lowercased Text: natural language processing is fun!


# 2. Removing Numbers
If numbers are not meaningful in your dataset, you can remove them.

In [3]:
text = "I have 2 apples and 3 bananas."
text_no_numbers = re.sub(r'\d+', '', text)
print("Text without Numbers:", text_no_numbers)


Text without Numbers: I have  apples and  bananas.


# 3. Removing Punctuation
Stripping punctuation marks to focus on words.

In [4]:
text = "Hello! How's it going? Great, isn't it?"
text_no_punctuation = text.translate(str.maketrans('', '', string.punctuation))
print("Text without Punctuation:", text_no_punctuation)


Text without Punctuation: Hello Hows it going Great isnt it


# 4. Tokenization
Splitting text into individual words or sentences.

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Tokenization splits text into meaningful chunks. Let's tokenize it!"
sentence_tokens = sent_tokenize(text)
word_tokens = word_tokenize(text)
print("Sentence Tokens:", sentence_tokens)
print("Word Tokens:", word_tokens)


Sentence Tokens: ['Tokenization splits text into meaningful chunks.', "Let's tokenize it!"]
Word Tokens: ['Tokenization', 'splits', 'text', 'into', 'meaningful', 'chunks', '.', 'Let', "'s", 'tokenize', 'it', '!']


# 5. Removing Stopwords
Stopwords are common words (e.g., "is", "the", "and") that don't add significant meaning.

In [6]:
from nltk.corpus import stopwords

text = "This is an example sentence to demonstrate stopword removal."
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
tokens_no_stopwords = [word for word in tokens if word.lower() not in stop_words]
print("Tokens without Stopwords:", tokens_no_stopwords)


Tokens without Stopwords: ['example', 'sentence', 'demonstrate', 'stopword', 'removal', '.']


# 6. Removing Short Words
Filter out words with fewer than 3 characters.

In [7]:
tokens = ["I", "am", "working", "on", "NLP"]
tokens_no_short = [word for word in tokens if len(word) > 2]
print("Tokens without Short Words:", tokens_no_short)


Tokens without Short Words: ['working', 'NLP']


# 7. Expanding Contractions
Convert contractions to their full forms for better context understanding.

In [8]:
!pip install contractions





In [9]:
import re

# Dictionary for contraction mapping
contractions_dict = {
    "can't": "cannot",
    "won't": "will not",
    "it's": "it is",
    "I'm": "I am",
    "you're": "you are",
    "we're": "we are",
    "they're": "they are",
    "didn't": "did not",
    "don't": "do not",
    "doesn't": "does not",
    # Add more contractions as needed
}

# Function to expand contractions
def expand_contractions(text):
    contractions_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')
    return contractions_pattern.sub(lambda x: contractions_dict[x.group()], text)

# Example text
text = "I can't go because it's raining."
expanded_text = expand_contractions(text)
print("Expanded Text:", expanded_text)


Expanded Text: I cannot go because it is raining.


# 10. Spelling Correction
Fix spelling mistakes in the text.

In [10]:
from textblob import TextBlob

text = "This sentnce has speling erors."
corrected_text = str(TextBlob(text).correct())
print("Corrected Text:", corrected_text)


Corrected Text: His sentence has spelling errors.


# 11. Removing URLs and Email Addresses



In [11]:
text = "Contact us at example@example.com or visit https://example.com."
text_no_urls_emails = re.sub(r'http\S+|www\S+|@\S+', '', text)
print("Text without URLs and Emails:", text_no_urls_emails)


Text without URLs and Emails: Contact us at example or visit 


# 12. Handling Frequent or Rare Words
For rare words, define a frequency threshold.

In [12]:
from collections import Counter

tokens = ["apple", "banana", "apple", "orange", "banana", "apple", "grape"]
freq = Counter(tokens)
threshold = 2
filtered_tokens = [word for word in tokens if freq[word] > threshold]
print("Filtered Tokens (Frequent Words):", filtered_tokens)


Filtered Tokens (Frequent Words): ['apple', 'apple', 'apple']


# 13. Removing Non-ASCII Characters
Remove characters that are not standard English.



In [13]:
text = "Thís tèxt hás nön-ÁSCII cháráctèrs."
text_ascii = text.encode('ascii', 'ignore').decode()
print("Text without Non-ASCII Characters:", text_ascii)


Text without Non-ASCII Characters: Ths txt hs nn-SCII chrctrs.


# 14. Removing HTML Tags

In [14]:
from bs4 import BeautifulSoup

html_text = "<p>This is <b>bold</b> and <i>italic</i> text.</p>"
text_no_html = BeautifulSoup(html_text, "html.parser").get_text()
print("Text without HTML Tags:", text_no_html  )


Text without HTML Tags: This is bold and italic text.
