## Text Processing with NLTK

In [1]:
# Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

In [14]:
# Fetch the NLTK data packages used below: tokenizer models, the stopword
# list and POS-tagger models. Re-running is harmless; packages that are
# already on disk are simply reported as up-to-date.
nltk_packages = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for package_name in nltk_packages:
    last_download_ok = nltk.download(package_name)

# Bare expression so the cell still displays the final download's status.
last_download_ok

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\devar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\devar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\devar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [15]:
# Sample passage (two sentences of cricket reporting) used as input for
# every later step. Adjacent string literals are concatenated into a single
# line of text with no embedded newlines.
text = (
    "The enthralling 2-2 draw for the Anderson-Tendulkar trophy between "
    "England and India provided a dramatic start to the new World Test "
    "Championship cycle. It was an epic contest, each of the five Tests "
    "going into the final day, four in fact into the final session, "
    "providing some of the best individual and collective performances "
    "the five-day format has seen in recent years."
)

In [16]:
# Split the passage into word and punctuation tokens using the English
# tokenizer (the default language, spelled out here for clarity).
tokens = word_tokenize(text, language="english")

In [17]:
# Keep only purely alphabetic tokens that are not English stopwords.
# Besides stopwords this also discards punctuation, numbers and hyphenated
# tokens, because str.isalpha() is False for all of them.
stop_words = set(stopwords.words('english'))


def _is_content_word(token):
    """Return True for an alphabetic token whose lowercase form is not a stopword."""
    return token.isalpha() and token.lower() not in stop_words


filtered_tokens = list(filter(_is_content_word, tokens))

In [18]:
# POS tagging
# Tag the complete token sequence first and discard unwanted tokens afterwards.
# The tagger uses neighbouring words as context, so running it on a list that
# has already lost its stopwords and punctuation gives it broken sentences and
# less reliable tags.
# The condition below is the same one used to build `filtered_tokens`, so
# `pos_tags` still holds exactly one (word, tag) pair per filtered token,
# in the same order.
pos_tags = [
    (word, tag)
    for word, tag in nltk.pos_tag(tokens)
    if word.lower() not in stop_words and word.isalpha()
]

In [19]:
# Tally how often each POS tag occurs, then roll the fine-grained tags up
# into three coarse classes by tag prefix:
# NN* -> nouns, VB* -> verbs, JJ* -> adjectives.
pos_counts = Counter(tag for _, tag in pos_tags)


def _total_for_prefix(prefix):
    """Sum the counts of every tag in `pos_counts` that starts with `prefix`."""
    return sum(
        occurrences
        for tag, occurrences in pos_counts.items()
        if tag.startswith(prefix)
    )


num_nouns = _total_for_prefix('NN')
num_verbs = _total_for_prefix('VB')
num_adjectives = _total_for_prefix('JJ')

In [20]:
# Report every intermediate result followed by the three summary counts,
# one item per line.
report_lines = (
    f"Original Tokens: {tokens}",
    f"Filtered Tokens (No Stopwords): {filtered_tokens}",
    f"POS Tags: {pos_tags}",
    f"Number of Nouns: {num_nouns}",
    f"Number of Verbs: {num_verbs}",
    f"Number of Adjectives: {num_adjectives}",
)
for report_line in report_lines:
    print(report_line)

Original Tokens: ['The', 'enthralling', '2-2', 'draw', 'for', 'the', 'Anderson-Tendulkar', 'trophy', 'between', 'England', 'and', 'India', 'provided', 'a', 'dramatic', 'start', 'to', 'the', 'new', 'World', 'Test', 'Championship', 'cycle', '.', 'It', 'was', 'an', 'epic', 'contest', ',', 'each', 'of', 'the', 'five', 'Tests', 'going', 'into', 'the', 'final', 'day', ',', 'four', 'in', 'fact', 'into', 'the', 'final', 'session', ',', 'providing', 'some', 'of', 'the', 'best', 'individual', 'and', 'collective', 'performances', 'the', 'five-day', 'format', 'has', 'seen', 'in', 'recent', 'years', '.']
Filtered Tokens (No Stopwords): ['enthralling', 'draw', 'trophy', 'England', 'India', 'provided', 'dramatic', 'start', 'new', 'World', 'Test', 'Championship', 'cycle', 'epic', 'contest', 'five', 'Tests', 'going', 'final', 'day', 'four', 'fact', 'final', 'session', 'providing', 'best', 'individual', 'collective', 'performances', 'format', 'seen', 'recent', 'years']
POS Tags: [('enthralling', 'VBG'),