<a href="https://colab.research.google.com/github/vishalkandukuri97/AI-ML-Projects/blob/main/NLP_Pre_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install nltk scikit-learn



In [4]:
import nltk

In [5]:
import re

In [6]:
import string

In [7]:
from nltk.corpus import stopwords

In [8]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [9]:
# Fetch the NLTK resources named 'punkt' and 'punkt_tab'.
# Presumably these back word_tokenize / sent_tokenize — confirm for the NLTK version in use.
nltk.download('punkt')
# Last expression of the cell, so its return value is what the cell displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
# Fetch the 'stopwords' and 'wordnet' NLTK resources
# (presumably needed by stopwords.words(...) and WordNetLemmatizer — confirm).
nltk.download("stopwords")
nltk.download("wordnet")

# Sample passage consumed by the preprocessing and sentence-tokenization cells.
# Adjacent literals are concatenated by the parser into one string.
text = (
    "NLP is amazing! "
    "It helps computers understand language. "
    "Don't you love it?"
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [11]:
def preprocess_text(text: str) -> str:
    """Normalize raw text to lowercase letters separated by single spaces.

    Steps:
      1. Lowercase the input.
      2. Drop every character that is neither a letter nor whitespace
         (digits, punctuation, underscores, symbols). Such characters are
         deleted, not replaced by a space, so "Don't" becomes "dont".
      3. Collapse each whitespace run to one space and trim both ends.

    Letters are recognized with ``str.isalpha``, so accented and other
    non-ASCII letters are kept. An ASCII-only a-z filter would silently
    truncate a word such as "café" to "caf". For ordinary ASCII text the
    result is the same as with an a-z filter.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if the input contains no letters.
    """
    lowered = text.lower()
    # Keep letters (any script) and whitespace; everything else is removed.
    letters_and_spaces = "".join(
        ch for ch in lowered if ch.isalpha() or ch.isspace()
    )
    # split() with no argument splits on whitespace runs and ignores leading
    # and trailing whitespace, so re-joining with " " normalizes the spacing.
    return " ".join(letters_and_spaces.split())


In [12]:
# Run the sample passage through the normalizer and show the cleaned string.
clean_text = preprocess_text(text)
print(f"Preprocessed Text: {clean_text}")

Preprocessed Text: nlp is amazing it helps computers understand language dont you love it


In [13]:
# Word-level tokens are taken from the already-cleaned text, not the raw passage.
word_tokens = word_tokenize(clean_text)
print(f"Word Tokens: {word_tokens}")

Word Tokens: ['nlp', 'is', 'amazing', 'it', 'helps', 'computers', 'understand', 'language', 'dont', 'you', 'love', 'it']


In [14]:
# Sentence splitting is applied to the raw `text` rather than `clean_text`
# (presumably because the cleaned string no longer contains punctuation).
sent_tokens = sent_tokenize(text)
print(f"Sentence Tokens: {sent_tokens}")

Sentence Tokens: ['NLP is amazing!', 'It helps computers understand language.', "Don't you love it?"]


In [15]:
# One instance of each normalizer, reused for every token.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Apply each normalizer to every word token; lemmatize() is called with only
# the word, so all of its other arguments stay at their defaults.
stems = list(map(stemmer.stem, word_tokens))
lemmas = list(map(lemmatizer.lemmatize, word_tokens))

print(f"Stemming: {stems}")
print(f"Lemmatization: {lemmas}")


Stemming: ['nlp', 'is', 'amaz', 'it', 'help', 'comput', 'understand', 'languag', 'dont', 'you', 'love', 'it']
Lemmatization: ['nlp', 'is', 'amazing', 'it', 'help', 'computer', 'understand', 'language', 'dont', 'you', 'love', 'it']


In [16]:
# A set gives constant-time membership checks in the filter.
stop_words = set(stopwords.words("english"))

# Keep only the tokens that are not in the English stop-word set.
# NOTE(review): tokens come from text whose apostrophes were stripped, so a
# contraction like "dont" is kept — presumably the list stores it with its
# apostrophe; confirm before relying on this filter for contractions.
filtered_tokens = [
    token
    for token in word_tokens
    if token not in stop_words
]
print(f"After Stop Word Removal: {filtered_tokens}")

After Stop Word Removal: ['nlp', 'amazing', 'helps', 'computers', 'understand', 'language', 'dont', 'love']


In [17]:
# Toy corpus of three short sentences; the CountVectorizer and TfidfVectorizer
# cells both fit on this list.
docs = [
    "The cat is sitting on the mat.",
    "NLP helps computers understand human language.",
    "The dog is playing with the cat."
]

In [18]:
# Fit a count-based bag-of-words model on the toy corpus and encode it.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(docs)

# Learned vocabulary, and the encoded corpus converted with toarray() for printing.
bow_vocabulary = vectorizer.get_feature_names_out()
bow_dense = bow.toarray()

print("\nBag of Words Vocabulary:", bow_vocabulary)
print("BoW Matrix:\n", bow_dense)



Bag of Words Vocabulary: ['cat' 'computers' 'dog' 'helps' 'human' 'is' 'language' 'mat' 'nlp' 'on'
 'playing' 'sitting' 'the' 'understand' 'with']
BoW Matrix:
 [[1 0 0 0 0 1 0 1 0 1 0 1 2 0 0]
 [0 1 0 1 1 0 1 0 1 0 0 0 0 1 0]
 [1 0 1 0 0 1 0 0 0 0 1 0 2 0 1]]


In [19]:
# Fit a TF-IDF model on the same toy corpus and encode it.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(docs)

# Learned vocabulary, and the weighted matrix converted with toarray() for printing.
tfidf_vocabulary = tfidf.get_feature_names_out()
tfidf_dense = tfidf_matrix.toarray()

print("\nTF-IDF Vocabulary:", tfidf_vocabulary)
print("TF-IDF Matrix:\n", tfidf_dense)


TF-IDF Vocabulary: ['cat' 'computers' 'dog' 'helps' 'human' 'is' 'language' 'mat' 'nlp' 'on'
 'playing' 'sitting' 'the' 'understand' 'with']
TF-IDF Matrix:
 [[0.29898437 0.         0.         0.         0.         0.29898437
  0.         0.39312851 0.         0.39312851 0.         0.39312851
  0.59796874 0.         0.        ]
 [0.         0.40824829 0.         0.40824829 0.40824829 0.
  0.40824829 0.         0.40824829 0.         0.         0.
  0.         0.40824829 0.        ]
 [0.29898437 0.         0.39312851 0.         0.         0.29898437
  0.         0.         0.         0.         0.39312851 0.
  0.59796874 0.         0.39312851]]
