<a href="https://colab.research.google.com/github/varunchandra10/NLP/blob/main/NLP_Text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('all')

!pip install emoji spacy

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r



TEXT PREPROCESSING IN NLP USING THE NLTK LIBRARY

i) Lowercasing

ii) Removing Punctuation

iii) Stopwords removal

iv) Handling special characters and emojis

v) Normalization

vi) Tokenization and its types

vii) Stemming and its types

viii) Lemmatization

ix) Parts of speech tagging

x) Named entity recognition (NER) tagging

In [None]:
# 1) Lowercasing
# Lower-case the whole string so later comparisons ignore capitalisation.

text = "This IS a SAMPLE Text!"
lowercase_txt = str.lower(text)  # unbound-method form of text.lower()
print(lowercase_txt)

this is a sample text!


In [None]:
# 2) Removing punctuation
#
# Keep every character that is not listed in string.punctuation
# (the ASCII punctuation characters).

import string

text = "Hello, World! How are you?"
punctuation_chars = set(string.punctuation)
no_punc = "".join(ch for ch in text if ch not in punctuation_chars)
print(no_punc)

# Equivalent one-liner using a translation table:
#   text.translate(str.maketrans("", "", string.punctuation))
#
# str.maketrans(x, y, z):
#   x -> characters to replace
#   y -> characters to replace them with (same length as x)
#   z -> characters to delete
# With x = y = "" nothing is remapped and only the punctuation passed as z
# is removed.

Hello World How are you


In [None]:
# 3) Stopword removal
#
# Split the text on whitespace and keep only the words whose lower-cased form
# is not in NLTK's English stopword list.

from nltk.corpus import stopwords

text = "this is a sample text with some words"
stop_words = set(stopwords.words('english'))  # a set gives fast membership tests
words = text.split()  # ["this", "is", "a", "sample", "text", "with", "some", "words"]
filtered_words = [word for word in words if word.lower() not in stop_words]
filtered_text = " ".join(filtered_words)
print(filtered_text)

sample text words


In [None]:
# 4) Handling special characters and emojis

import re
import emoji

text = "Hello 😊! How are you? #special_chars"

# Option A: delete everything that is not an ASCII letter, a digit or
# whitespace. The emoji, the punctuation, '#' and '_' all disappear.
non_alnum_pattern = re.compile(r'[^a-zA-Z0-9\s]')
no_spec_chars = non_alnum_pattern.sub('', text)
print(no_spec_chars)

# Option B: keep the meaning an emoji carries by replacing it with its
# textual name (":name:"); all other characters are left untouched.
text_with_emojis = emoji.demojize(text)
print(text_with_emojis)

Hello  How are you specialchars
Hello :smiling_face_with_smiling_eyes:! How are you? #special_chars


In [None]:
# 5) Normalization
# Map case variants of the same word ("running", "RUNNING") to a single form.
# Here normalization is just lower-casing.

text = "running RUNNING Run"
normalized_text = str.lower(text)  # unbound-method form of text.lower()
print(normalized_text)

running running run


In [None]:
# 6) Tokenization
# Relies on `nltk` and `re` having been imported by earlier cells.

# Word tokenization: punctuation is emitted as a separate token.
text_1 = "This is a sample sentence."
word_tokens = nltk.word_tokenize(text_1)
print("Word tokens:", word_tokens)

# Sentence tokenization: one string per sentence.
text_2 = "This is a sample. Here is another one!"
sentence_tokens = nltk.sent_tokenize(text_2)
print("Sentence Tokens:", sentence_tokens)

# Custom regex tokenization: runs of word characters only, so punctuation
# is dropped entirely.
word_pattern = re.compile(r'\w+')
custom_tokens = word_pattern.findall(text_2)
print("Custom tokens:", custom_tokens)

Word tokens: ['This', 'is', 'a', 'sample', 'sentence', '.']
Sentence Tokens: ['This is a sample.', 'Here is another one!']
Custom tokens: ['This', 'is', 'a', 'sample', 'Here', 'is', 'another', 'one']


In [None]:
# 7) Stemming
# Run the same tokens through three NLTK stemmers to compare their output.

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

text = "running runs runner"
words = nltk.word_tokenize(text)

# Porter stemmer
porter = PorterStemmer()
porter_stemmed = list(map(porter.stem, words))
print("Porter Stemming:", porter_stemmed)

# Snowball stemmer (needs the language name)
snowball = SnowballStemmer('english')
snowball_stemmed = list(map(snowball.stem, words))
print("Snowball Stemming:", snowball_stemmed)

# Lancaster stemmer (more aggressive: it also reduces "runner" to "run")
lancaster = LancasterStemmer()
lancaster_stemmed = list(map(lancaster.stem, words))
print("Lancaster Stemming:", lancaster_stemmed)

Porter Stemming: ['run', 'run', 'runner']
Snowball Stemming: ['run', 'run', 'runner']
Lancaster Stemming: ['run', 'run', 'run']


In [None]:
# 8) Lemmatization
# Unlike stemming, lemmatization maps a word to a dictionary form ("geese" -> "goose").

from nltk.stem import WordNetLemmatizer

text = "running runs geese"
words = nltk.word_tokenize(text)
lemmatizer = WordNetLemmatizer()

# No part of speech given: the lemmatizer uses its default (noun), so
# "running" is left unchanged.
lemmatized = list(map(lemmatizer.lemmatize, words))
print("Lemmatized:", lemmatized)

# Passing a part of speech gives better results; here every token is assumed
# to be a verb (pos='v'), so "running" becomes "run" but "geese" is untouched.
lemmatized_verbs = [lemmatizer.lemmatize(token, pos='v') for token in words]
print("Lemmatized (verbs):", lemmatized_verbs)

Lemmatized: ['running', 'run', 'goose']
Lemmatized (verbs): ['run', 'run', 'geese']


In [None]:
# 9) Part-of-speech tagging
# Tokenize the sentence, then label each token with its part of speech.
# Relies on `nltk` having been imported by an earlier cell.

text = "The cat is running fast"
tokens = nltk.word_tokenize(text)
# pos_tag returns a list of (token, tag) pairs, e.g. ('cat', 'NN') in the
# saved output below.
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('running', 'VBG'), ('fast', 'RB')]


In [None]:
# 10) Named entity recognition (NER) tagging
# Pipeline: tokenize -> POS-tag -> chunk the tagged tokens into named entities.

text = "Apple is opening a store in New York on January 15"

tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)
ner_tags = nltk.ne_chunk(pos_tags)

print("NER Tags:")

for chunk in ner_tags:
    if not hasattr(chunk, 'label'):
        # Not part of any entity: a plain (token, POS tag) tuple.
        print(chunk)
        continue
    # Entity subtree: its label is the entity type and its children are the
    # (token, POS tag) pairs that make up the entity.
    entity_words = [leaf[0] for leaf in chunk]
    print(f"{chunk.label()}: {' '.join(entity_words)}")

NER Tags:
GPE: Apple
('is', 'VBZ')
('opening', 'VBG')
('a', 'DT')
('store', 'NN')
('in', 'IN')
GPE: New York
('on', 'IN')
('January', 'NNP')
('15', 'CD')


TEXT PREPROCESSING IN NLP USING THE SPACY LIBRARY

i) Lowercasing

ii) Removing Punctuation

iii) Stopwords removal

iv) Handling special characters and emojis

v) Normalization

vi) Tokenization and its types

vii) Stemming and its types

viii) Lemmatization

ix) Parts of speech tagging

x) Named entity recognition (NER) tagging

In [None]:
import spacy
import re
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer  # For stemming
import emoji

# Load the spaCy pipeline once; the helper functions below all reuse `nlp`.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

# Demo input mixing punctuation, an emoji, a hashtag, dates and place /
# organisation names so that every preprocessing step has something to act on.
sample_text = (
    "Hello, World! 😊 Running in New York on January 15th, 2023... "
    "#NLP is FUN! and Apple is opening a store in New York on January 15"
)

# i) Lowercasing
def lowercase_text(text):
    """Return a lower-cased copy of ``text``.

    Args:
        text: The string to convert.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

# ii) Removing Punctuation
def remove_punctuation(text):
    """Drop punctuation tokens from ``text`` using spaCy.

    The text is tokenized with the module-level ``nlp`` pipeline and every
    token flagged ``is_punct`` is discarded. The remaining tokens are joined
    with single spaces, so the original spacing is not preserved.

    Args:
        text: The string to clean.

    Returns:
        The non-punctuation tokens joined by ``" "``.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.is_punct:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)

# iii) Stopwords Removal
def remove_stopwords(text):
    """Drop stopword tokens from ``text`` using spaCy.

    The text is tokenized with the module-level ``nlp`` pipeline and every
    token flagged ``is_stop`` is discarded. Punctuation is not removed; it
    comes back as separate space-delimited tokens because the survivors are
    joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The non-stopword tokens joined by ``" "``.
    """
    kept_tokens = []
    for token in nlp(text):
        if not token.is_stop:
            kept_tokens.append(token.text)
    return " ".join(kept_tokens)

# iv) Handling Special Characters and Emojis
def handle_special_chars_and_emojis(text):
    """Replace emojis with their names, then strip special characters.

    Two steps are applied in order:

    1. ``emoji.demojize`` turns each emoji into its textual name.
    2. Every character that is not an ASCII letter, a digit or whitespace is
       deleted.

    Note that step 2 also deletes the ``:`` and ``_`` characters of the emoji
    name, so the name ends up as one run-together word (for example
    ``smilingfacewithsmilingeyes``).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    demojized = emoji.demojize(text)
    return re.sub(r'[^a-zA-Z0-9\s]', '', demojized)

# v) Normalization (basic example)
def normalize_text(text):
    """Apply basic normalization: lower-case, then trim surrounding whitespace.

    Further rules (e.g. expanding contractions) could be chained here.

    Args:
        text: The string to normalize.

    Returns:
        The lower-cased string without leading or trailing whitespace.
    """
    return text.lower().strip()

# vi) Tokenization and its Types
def tokenize_text(text):
    """Split ``text`` into word tokens and sentences using spaCy.

    Args:
        text: The string to tokenize.

    Returns:
        A dict with two keys:

        * ``"word_tokens"``: the text of every token in the parsed document.
        * ``"sentence_tokens"``: the text of every sentence in ``doc.sents``.
    """
    doc = nlp(text)
    words = [tok.text for tok in doc]
    sentences = [span.text for span in doc.sents]
    return {"word_tokens": words, "sentence_tokens": sentences}

# vii) Stemming and its Types (using NLTK since spaCy doesn't support stemming)
def stem_text(text, stemmer_type="porter"):
    """Stem every spaCy token of ``text`` with an NLTK stemmer.

    spaCy provides the tokenization and NLTK provides the stemming.

    Args:
        text: The string to stem.
        stemmer_type: ``"porter"`` (default), ``"snowball"`` (English) or
            ``"lancaster"``.

    Returns:
        The stemmed tokens joined by single spaces.

    Raises:
        ValueError: If ``stemmer_type`` is not one of the supported names.
            The check runs after the text has been tokenized.
    """
    token_texts = [tok.text for tok in nlp(text)]

    # (name, factory) pairs; only the selected stemmer is ever instantiated.
    stemmer_factories = (
        ("porter", PorterStemmer),
        ("snowball", lambda: SnowballStemmer("english")),
        ("lancaster", LancasterStemmer),
    )
    for name, make_stemmer in stemmer_factories:
        if stemmer_type == name:
            stemmer = make_stemmer()
            break
    else:
        raise ValueError("Unsupported stemmer type")

    return " ".join(stemmer.stem(tok) for tok in token_texts)

# viii) Lemmatization
def lemmatize_text(text):
    """Replace every token of ``text`` with its spaCy lemma.

    Args:
        text: The string to lemmatize.

    Returns:
        The ``lemma_`` of each token, joined by single spaces.
    """
    lemmas = (tok.lemma_ for tok in nlp(text))
    return " ".join(lemmas)

# ix) Parts of Speech Tagging
def pos_tagging(text):
    """Tag every token of ``text`` with its part of speech using spaCy.

    Args:
        text: The string to tag.

    Returns:
        A list of ``(token_text, pos)`` tuples in document order, where
        ``pos`` is the token's ``pos_`` attribute.
    """
    tagged = []
    for tok in nlp(text):
        tagged.append((tok.text, tok.pos_))
    return tagged

# x) Named Entity Recognition (NER) Tagging
def ner_tagging(text):
    """Extract the named entities spaCy finds in ``text``.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry of
        ``doc.ents``.
    """
    entities = []
    for entity in nlp(text).ents:
        entities.append((entity.text, entity.label_))
    return entities

# Main function to run all preprocessing steps
def preprocess_text(text):
    """Run every preprocessing step on ``text`` and print each result.

    Each step is applied independently to the original ``text``; the steps
    are not chained. Every result is printed after its label and followed by
    a blank line.

    Args:
        text: The raw string to demonstrate the steps on.

    Returns:
        None. The function only prints.
    """

    def report(label, value):
        # Print one labelled result followed by an empty separator line.
        print(label, value)
        print()

    print("Original Text:", text)
    print("\n--- Preprocessing Steps ---")

    report("i) Lowercased:", lowercase_text(text))
    report("ii) No Punctuation:", remove_punctuation(text))
    report("iii) No Stopwords:", remove_stopwords(text))
    report("iv) Special Chars & Emojis Handled:", handle_special_chars_and_emojis(text))
    report("v) Normalized:", normalize_text(text))

    # Tokenization yields two lists, so it gets its own layout.
    tokenized = tokenize_text(text)
    print("vi) Tokenization:")
    print("   Word Tokens:", tokenized["word_tokens"])
    print("   Sentence Tokens:", tokenized["sentence_tokens"])
    print()

    # Porter is used as the example stemmer.
    report("vii) Stemmed (Porter):", stem_text(text, stemmer_type="porter"))
    report("viii) Lemmatized:", lemmatize_text(text))
    report("ix) POS Tags:", pos_tagging(text))
    report("x) NER Tags:", ner_tagging(text))

# Run the full preprocessing demo on the sample text.
# The guard only fires when this code runs as the main program, so the call
# would be skipped if the code were imported as a module.
if __name__ == "__main__":
    preprocess_text(sample_text)

Original Text: Hello, World! 😊 Running in New York on January 15th, 2023... #NLP is FUN! and Apple is opening a store in New York on January 15

--- Preprocessing Steps ---
i) Lowercased: hello, world! 😊 running in new york on january 15th, 2023... #nlp is fun! and apple is opening a store in new york on january 15

ii) No Punctuation: Hello World 😊 Running in New York on January 15th 2023 NLP is FUN and Apple is opening a store in New York on January 15

iii) No Stopwords: Hello , World ! 😊 Running New York January 15th , 2023 ... # NLP FUN ! Apple opening store New York January 15

iv) Special Chars & Emojis Handled: Hello World smilingfacewithsmilingeyes Running in New York on January 15th 2023 NLP is FUN and Apple is opening a store in New York on January 15

v) Normalized: hello, world! 😊 running in new york on january 15th, 2023... #nlp is fun! and apple is opening a store in new york on january 15

vi) Tokenization:
   Word Tokens: ['Hello', ',', 'World', '!', '😊', 'Running', 'i