In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Download only the NLTK data this notebook actually uses instead of the whole
# 'all' collection, which is very large and slow to fetch on a fresh machine.
# Both the legacy and the newer resource ids are listed so the cell works
# across NLTK releases. Extend the tuple if later cells need more corpora.
NLTK_RESOURCES = (
    "punkt",                           # word_tokenize (older NLTK releases)
    "punkt_tab",                       # word_tokenize (newer NLTK releases)
    "averaged_perceptron_tagger",      # nltk.pos_tag (older NLTK releases)
    "averaged_perceptron_tagger_eng",  # nltk.pos_tag (newer NLTK releases)
    "stopwords",                       # stopwords.words('english')
    "wordnet",                         # WordNetLemmatizer
    "omw-1.4",                         # multilingual WordNet data some releases load with wordnet
)

# nltk.download returns True on success; a list (not a generator) is used so
# every resource is attempted even if an earlier one fails. The cell displays
# True only when all downloads succeeded.
all([nltk.download(resource, quiet=True) for resource in NLTK_RESOURCES])

True

In [26]:
# Sample input: two short sentences that the following cells process.
text = (
    "The sky is blue and beautiful. "
    "The sun is bright and shining."
)

In [27]:
# Tokenization
# Split the raw text into word and punctuation tokens; "english" is the
# tokenizer's default language, spelled out here for clarity.
tokens = word_tokenize(text, language="english")
print("Tokens:", tokens)

Tokens: ['The', 'sky', 'is', 'blue', 'and', 'beautiful', '.', 'The', 'sun', 'is', 'bright', 'and', 'shining', '.']


In [28]:
# POS Tagging
# Label every token with its part-of-speech tag, producing a list of
# (token, tag) pairs. tagset=None and lang="eng" are the library defaults,
# written out explicitly.
pos_tags = nltk.pos_tag(tokens, tagset=None, lang="eng")
print("POS Tags:", pos_tags)

POS Tags: [('The', 'DT'), ('sky', 'NN'), ('is', 'VBZ'), ('blue', 'JJ'), ('and', 'CC'), ('beautiful', 'JJ'), ('.', '.'), ('The', 'DT'), ('sun', 'NN'), ('is', 'VBZ'), ('bright', 'JJ'), ('and', 'CC'), ('shining', 'NN'), ('.', '.')]


In [29]:

# Stop Words Removal
# Build the English stop-word set once so each membership test is a set
# lookup, then keep only the tokens whose lowercase form is not in that set.
# Punctuation tokens such as "." are not stop words and pass through.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)
print("After Stopwords Removal:", filtered_tokens)

After Stopwords Removal: ['sky', 'blue', 'beautiful', '.', 'sun', 'bright', 'shining', '.']


In [30]:
# Stemming
# Apply the Porter stemmer to every remaining token. Stems are truncated
# forms and need not be dictionary words (e.g. "beautiful" -> "beauti").
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("After Stemming:", stemmed_tokens)

After Stemming: ['sky', 'blue', 'beauti', '.', 'sun', 'bright', 'shine', '.']


In [31]:
# Lemmatization
# WordNetLemmatizer.lemmatize() treats every word as a noun unless a POS is
# passed, so verbs, adjectives and adverbs would come back unchanged. Reuse
# the Penn Treebank tags from the POS-tagging cell (computed on the full
# sentence, before stop words were dropped) and translate each tag's first
# letter to the matching WordNet POS code.
PENN_TO_WORDNET_POS = {"J": "a", "V": "v", "R": "r"}  # any other tag -> noun ("n")

lemmatizer = WordNetLemmatizer()

# Apply the same stop-word filter to the (token, tag) pairs so that each
# surviving token keeps the tag it received in context.
tagged_filtered_tokens = [
    (word, tag) for word, tag in pos_tags if word.lower() not in stop_words
]
# Guard against the tagged list drifting out of sync with filtered_tokens
# (for example if an earlier cell was edited but not re-run).
assert [word for word, _ in tagged_filtered_tokens] == filtered_tokens, \
    "pos_tags and filtered_tokens are out of sync; re-run the earlier cells"

lemmatized_tokens = [
    lemmatizer.lemmatize(word, pos=PENN_TO_WORDNET_POS.get(tag[:1], "n"))
    for word, tag in tagged_filtered_tokens
]
print("After Lemmatization:", lemmatized_tokens)

After Lemmatization: ['sky', 'blue', 'beautiful', '.', 'sun', 'bright', 'shining', '.']


In [32]:
# TF-IDF
# Join the lemmatized tokens back into one string and vectorize it as a
# one-document corpus.
# NOTE(review): with a single document every term gets the same IDF, so the
# weights reduce to L2-normalized term counts; fit on several documents to
# obtain discriminative weights.
documents = [" ".join(lemmatized_tokens)]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)
print("TF-IDF Features:", vectorizer.get_feature_names_out())
print("TF-IDF Matrix:", X.toarray())

TF-IDF Features: ['beautiful' 'blue' 'bright' 'shining' 'sky' 'sun']
TF-IDF Matrix: [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
