In [1]:
from nltk.tokenize import regexp_tokenize

# Regex tokenization demo: r"\w+" keeps only runs of word characters, so
# punctuation is dropped and the contraction "don't" is split at the
# apostrophe into "don" and "t".
sentence = "Hello, world! NLP is amazing—don't you agree?"
tokens = regexp_tokenize(text=sentence, pattern=r"\w+")

print("Tokens:", tokens)


Tokens: ['Hello', 'world', 'NLP', 'is', 'amazing', 'don', 't', 'you', 'agree']


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download everything this cell needs so it runs on a fresh kernel:
#   'punkt'     - Punkt tokenizer data looked up by older NLTK releases
#   'punkt_tab' - Punkt tokenizer tables that recent NLTK releases look up
#                 instead; without it word_tokenize raises LookupError there
#   'stopwords' - the stop-word lists (English is used below)
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words('english'))

sentence = "The new AI model is generating highly accurate results in seconds!"
tokens = word_tokenize(sentence)

# Compare lowercased tokens so capitalised stop words such as "The" are
# removed too. Punctuation tokens (e.g. "!") are not stop words and are kept.
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

print("Filtered tokens:", filtered_tokens)


Filtered tokens: ['new', 'AI', 'model', 'generating', 'highly', 'accurate', 'results', 'seconds', '!']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import nltk

# Fetch the 'punkt_tab' tokenizer tables. The call is left as the cell's last
# expression so the notebook displays download()'s return value.
nltk.download(info_or_id='punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

# The WordNet corpus backs WordNetLemmatizer, so fetch it before lemmatizing.
nltk.download('wordnet')

words = ["running", "flies", "better", "happily", "driving", "swimming", "writing"]

# Stemming: rule-based suffix stripping; results need not be dictionary words.
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, words))
print("Stemming:", stems)

# Lemmatization: every word is looked up as a verb (pos='v'), so entries that
# are not verbs (e.g. "better", "happily") come back unchanged.
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token, pos='v') for token in words]
print("Lemmatization:", lemmas)


[nltk_data] Downloading package wordnet to /root/nltk_data...


Stemming: ['run', 'fli', 'better', 'happili', 'drive', 'swim', 'write']
Lemmatization: ['run', 'fly', 'better', 'happily', 'drive', 'swim', 'write']


In [7]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Download everything this cell needs so it does not rely on another cell
# having been run first:
#   'punkt'                          - Punkt tokenizer data for older NLTK
#   'punkt_tab'                      - Punkt tokenizer tables that recent NLTK
#                                      releases load in word_tokenize
#   'averaged_perceptron_tagger_eng' - the English model used by pos_tag
for resource in ('punkt', 'punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# "book" and "duck" can each be a noun or a verb, which makes this sentence a
# useful probe of how the tagger uses context.
sentence = "I will book a flight, but I saw a duck near the lake."
tokens = word_tokenize(sentence)

# pos_tag returns a list of (token, tag) pairs.
tagged = pos_tag(tokens)

print("POS Tags:", tagged)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


POS Tags: [('I', 'PRP'), ('will', 'MD'), ('book', 'NN'), ('a', 'DT'), ('flight', 'NN'), (',', ','), ('but', 'CC'), ('I', 'PRP'), ('saw', 'VBD'), ('a', 'DT'), ('duck', 'NN'), ('near', 'IN'), ('the', 'DT'), ('lake', 'NN'), ('.', '.')]


In [8]:
import spacy

# Named-entity recognition with spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# The sample sentence mentions a date, a person, a monetary amount and an
# organization.
doc = nlp("On March 14, 2024, Elon Musk announced a $2 billion investment in OpenAI.")

# Emit one "<entity text> - <label>" line per detected entity.
for entity in doc.ents:
    print(entity.text, "-", entity.label_)


March 14, 2024 - DATE
Elon Musk - PERSON
$2 billion - MONEY
OpenAI - GPE
