In [6]:
# Install required libraries (if not already installed)
!pip install --quiet nltk wordcloud requests beautifulsoup4 scikit-learn


In [32]:
import nltk

# NLTK data packages used by this notebook, fetched in this exact order.
NLTK_PACKAGES = (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'vader_lexicon',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
)
download_results = [nltk.download(package) for package in NLTK_PACKAGES]

# Show the status returned by the final download as the cell's value.
download_results[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package 

True

In [24]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import requests
from io import StringIO

# 20 Newsgroups dataset (subset)
# The training split is loaded with headers, footers and quotes removed.
newsgroups = fetch_20newsgroups(subset='train', remove=('headers','footers','quotes'))
texts = newsgroups.data[:3]  # small sample

# SMS Spam dataset
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
# Bound the wait and fail loudly on an HTTP error: without these, a hung
# connection blocks the notebook and an error page would be parsed as TSV data.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
# Tab-separated payload; column names are supplied explicitly.
sms_df = pd.read_csv(StringIO(resp.text), sep='\t', names=['label', 'message'])

print("Newsgroups sample:", texts[0][:300], "\n")
print("SMS sample:\n", sms_df.head())


Newsgroups sample: I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I k 

SMS sample:
   label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [25]:
import nltk
import os

# Report whether the punkt_tab English tokenizer data can be located.
try:
    punkt_tab_path = nltk.data.find('tokenizers/punkt_tab/english/')
except LookupError:
    print("punkt_tab resource not found in NLTK data paths.")
else:
    print(f"punkt_tab resource found at: {punkt_tab_path}")

# List every directory NLTK searches for data, one per line.
print("\nNLTK data paths:")
for data_dir in nltk.data.path:
    print(data_dir)

punkt_tab resource found at: /root/nltk_data/tokenizers/punkt_tab/english

NLTK data paths:
/root/nltk_data
/usr/nltk_data
/usr/share/nltk_data
/usr/lib/nltk_data
/usr/share/nltk_data
/usr/local/share/nltk_data
/usr/lib/nltk_data
/usr/local/lib/nltk_data


In [26]:
import re, string

text = "Hello, Usha! NLP is FUN, isn't it?"

# Step 1: fold everything to lowercase.
lower = text.lower()

# Step 2: delete every character listed in string.punctuation.
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
clean = lower.translate(punctuation_table)

# Show each stage of the cleaning next to its label.
for label, value in (
    ("Original:", text),
    ("Lowercased:", lower),
    ("Without punctuation:", clean),
):
    print(label, value)


Original: Hello, Usha! NLP is FUN, isn't it?
Lowercased: hello, usha! nlp is fun, isn't it?
Without punctuation: hello usha nlp is fun isnt it


In [27]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# Download punkt_tab only when a lookup shows it is missing.
try:
    nltk.data.find('tokenizers/punkt_tab/english/')
except LookupError:
    nltk.download('punkt_tab')


text = "Usha is learning NLP. She loves tokenization!"

# Apply the sentence tokenizer first, then the word tokenizer, printing each result.
for label, tokenize in (
    ("Sentence tokens:", sent_tokenize),
    ("Word tokens:", word_tokenize),
):
    print(label, tokenize(text))

Sentence tokens: ['Usha is learning NLP.', 'She loves tokenization!']
Word tokens: ['Usha', 'is', 'learning', 'NLP', '.', 'She', 'loves', 'tokenization', '!']


In [28]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

text = "Usha is learning Natural Language Processing in Python."
tokens = word_tokenize(text)

# Keep purely alphabetic tokens whose lowercase form is not a stopword.
filtered = []
for token in tokens:
    if token.isalpha() and token.lower() not in stop_words:
        filtered.append(token)

print("Original tokens:", tokens)
print("After stopword removal:", filtered)


Original tokens: ['Usha', 'is', 'learning', 'Natural', 'Language', 'Processing', 'in', 'Python', '.']
After stopword removal: ['Usha', 'learning', 'Natural', 'Language', 'Processing', 'Python']


In [29]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ['running', 'runs', 'runner', 'better', 'easily']

# Run the same word list through the stemmer, then through the lemmatizer
# (called without a POS argument), printing "word -> result" under each heading.
for heading, normalise in (
    ("Stemming:", ps.stem),
    ("\nLemmatization (noun default):", lemmatizer.lemmatize),
):
    print(heading)
    for word in words:
        print(word, "->", normalise(word))


Stemming:
running -> run
runs -> run
runner -> runner
better -> better
easily -> easili

Lemmatization (noun default):
running -> running
runs -> run
runner -> runner
better -> better
easily -> easily


In [30]:
# Part-of-speech tagging demo: tokenize one sentence, tag the tokens with
# nltk.pos_tag, and print the resulting (token, tag) pairs.
# `word_tokenize` and `nltk` come from earlier cells.
text = "Usha is learning Natural Language Processing."
tokens = word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)


[('Usha', 'NNP'), ('is', 'VBZ'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('.', '.')]


In [33]:
from nltk import ne_chunk

text = "Apple is buying a startup in London for $1 billion."
tokens = word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)
tree = ne_chunk(pos_tags)

# Children of the chunk tree that expose a `label` attribute are treated as
# entities: their leaf tokens are joined with spaces and paired with the label.
entities = [
    (" ".join(token for token, _tag in chunk.leaves()), chunk.label())
    for chunk in tree
    if hasattr(chunk, 'label')
]

print("Entities:", entities)


Entities: [('Apple', 'GPE'), ('London', 'GPE')]


In [34]:
from nltk.wsd import lesk

sent1 = "I went to the bank to deposit money"
sent2 = "The river bank was full of fishers"

# Disambiguate 'bank' in each sentence, using its whitespace-split words as context.
for label, sentence in (("Sentence1:", sent1), ("Sentence2:", sent2)):
    print(label, lesk(sentence.split(), 'bank'))


Sentence1: Synset('savings_bank.n.02')
Sentence2: Synset('deposit.v.02')


In [35]:
# NOTE: PorterStemmer is already imported and `ps` already created in an
# earlier cell; both are repeated here, so `ps` is rebound to a new stemmer.
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def preprocess(text):
    """Normalize a raw message and return its tokens, stems and lemmas.

    Steps: lowercase the text, replace each `string.punctuation` character
    with a space, tokenize with `word_tokenize`, keep only alphabetic tokens
    that are not in `stop_words`, then stem and lemmatize the kept tokens.

    Relies on names defined in other cells of this notebook: `string`,
    `word_tokenize`, `stop_words`, `ps` and `lemmatizer`.

    Args:
        text: The raw message string.

    Returns:
        A tuple `(tokens, stems, lemmas)` of three equal-length lists, where
        `stems[i]` and `lemmas[i]` are derived from `tokens[i]`.
    """
    # Punctuation becomes a space instead of being deleted. Deleting it fuses
    # words that are separated only by punctuation, e.g. "question(std" would
    # turn into the single bogus token "questionstd".
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    t = text.lower().translate(punct_to_space)
    tokens = [w for w in word_tokenize(t) if w.isalpha() and w not in stop_words]
    stems = [ps.stem(w) for w in tokens]
    lemmas = [lemmatizer.lemmatize(w) for w in tokens]
    return tokens, stems, lemmas

# Run the pipeline on the first three SMS messages, printing each message
# followed by its (tokens, stems, lemmas) result and a blank separator line.
for message in sms_df['message'].head(3):
    print("Message:", message)
    print("Tokens, Stems, Lemmas:", preprocess(message))
    print()


Message: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Tokens, Stems, Lemmas: (['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat'], ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat'], ['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat'])

Message: Ok lar... Joking wif u oni...
Tokens, Stems, Lemmas: (['ok', 'lar', 'joking', 'wif', 'u', 'oni'], ['ok', 'lar', 'joke', 'wif', 'u', 'oni'], ['ok', 'lar', 'joking', 'wif', 'u', 'oni'])

Message: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
Tokens, Stems, Lemmas: (['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'may', 