In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Fetch the Punkt sentence-tokenizer data used by word_tokenize / sent_tokenize.
# Newer NLTK releases (3.9+) look up the 'punkt_tab' resource rather than the
# pickled 'punkt' models, so both are downloaded to keep the tokenization cell
# working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Tokenization using NLTK 

In [4]:
# Two-sentence sample passage that the tokenizers are demonstrated on.
text = "Natural Language Processing (NLP) is an exciting field of AI. It enables computers to understand human language."

# Break the passage into word-level tokens (punctuation becomes its own token)
# and into sentence-level tokens.
word_tokens = word_tokenize(text)
sent_tokens = sent_tokenize(text)

# Show the word tokens, a blank separator line, then the sentence tokens.
print("Word Tokenization : ", word_tokens, end="\n\n")
print("Sentence Tokenization:", sent_tokens)

Word Tokenization :  ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'an', 'exciting', 'field', 'of', 'AI', '.', 'It', 'enables', 'computers', 'to', 'understand', 'human', 'language', '.']

Sentence Tokenization: ['Natural Language Processing (NLP) is an exciting field of AI.', 'It enables computers to understand human language.']


#### Removing Stopwords

In [5]:
from nltk.corpus import stopwords

# Make sure the stopword lists are available locally before reading them.
nltk.download('stopwords')

# A set keeps the membership checks in the filter below fast.
stop_words = set(stopwords.words('english'))

# Keep every token whose lowercase form is not an English stopword.
# Punctuation tokens are not stopwords, so they are retained as-is.
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, word_tokens)
)

print("Filtered Words (After Stopword Removal):", filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...


Filtered Words (After Stopword Removal): ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'exciting', 'field', 'AI', '.', 'enables', 'computers', 'understand', 'human', 'language', '.']


[nltk_data]   Unzipping corpora\stopwords.zip.


#### Stemming vs. Lemmatization (only stemming is demonstrated below)

In [6]:
# Stemming with the Porter algorithm: each stopword-filtered token is reduced
# to its stem. Stems come back lowercased and are not necessarily dictionary
# words (e.g. 'Natural' -> 'natur').
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))

print("Stemmed Words:", stemmed_words)

Stemmed Words: ['natur', 'languag', 'process', '(', 'nlp', ')', 'excit', 'field', 'ai', '.', 'enabl', 'comput', 'understand', 'human', 'languag', '.']


#### Feature Extraction: Bag of Words (BoW)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents; this corpus is reused by the TF-IDF cell further down.
corpus = [
    "NLP is fun.",
    "NLP is a subfield of AI.",
    "AI and NLP power modern chatbots.",
]

# Learn the vocabulary and build the document-term count matrix in one pass.
# With the default settings the terms are lowercased, and the one-letter word
# "a" does not end up in the vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Column labels (vocabulary terms) and the dense per-document counts.
bow_feature_names = vectorizer.get_feature_names_out()
bow_dense = X.toarray()

print("Feature Names:", bow_feature_names)
print("Bag of Words Matrix:\n", bow_dense)


Feature Names: ['ai' 'and' 'chatbots' 'fun' 'is' 'modern' 'nlp' 'of' 'power' 'subfield']
Bag of Words Matrix:
 [[0 0 0 1 1 0 1 0 0 0]
 [1 0 0 0 1 0 1 1 0 1]
 [1 1 1 0 0 1 1 0 1 0]]


#### Feature Extraction: TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a default-configured TF-IDF model on the same `corpus` list that the
# Bag of Words cell defined; rows are documents, columns are vocabulary terms.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Column labels (vocabulary terms) and the dense TF-IDF weights.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_dense = X_tfidf.toarray()

print("TF-IDF Feature Names:", tfidf_feature_names)
print("TF-IDF Matrix:\n", tfidf_dense)


TF-IDF Feature Names: ['ai' 'and' 'chatbots' 'fun' 'is' 'modern' 'nlp' 'of' 'power' 'subfield']
TF-IDF Matrix:
 [[0.         0.         0.         0.72033345 0.54783215 0.
  0.42544054 0.         0.         0.        ]
 [0.40619178 0.         0.         0.         0.40619178 0.
  0.31544415 0.53409337 0.         0.53409337]
 [0.34261996 0.45050407 0.45050407 0.         0.         0.45050407
  0.26607496 0.         0.45050407 0.        ]]
