# Install Libraries and Dependencies

In [1]:
# Install necessary libraries
!pip install nltk spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Download NLTK Data

In [2]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Tokenization using Regex

In [3]:
from nltk.tokenize import regexp_tokenize

sentence = "Wow! NLP, or Natural Language Processing, is really fun to learn; isn't it?"
tokens = regexp_tokenize(sentence, pattern=r"\w+")

print("Tokens:", tokens)


Tokens: ['Wow', 'NLP', 'or', 'Natural', 'Language', 'Processing', 'is', 'really', 'fun', 'to', 'learn', 'isn', 't', 'it']


# Stop Words Removal

In [5]:
import nltk

# Download necessary NLTK data packages
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab') # This line downloads the missing 'punkt_tab' resource

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Create a set of English stop words
stop_words = set(stopwords.words('english'))

# The sentence to be tokenized and filtered
sentence = "I love learning NLP, but sometimes understanding stop words removal can be tricky!"

# Tokenize the sentence
tokens = word_tokenize(sentence)

# Filter out the stop words
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Print the filtered tokens
print("Filtered Tokens:", filtered_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


Filtered Tokens: ['love', 'learning', 'NLP', ',', 'sometimes', 'understanding', 'stop', 'words', 'removal', 'tricky', '!']


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Stemming and Lemmatization

In [6]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ["running", "flies", "better", "happily", "arguing", "studied", "swimming"]

print("Stemming:", [stemmer.stem(word) for word in words])
print("Lemmatization:", [lemmatizer.lemmatize(word, pos='v') for word in words])


Stemming: ['run', 'fli', 'better', 'happili', 'argu', 'studi', 'swim']
Lemmatization: ['run', 'fly', 'better', 'happily', 'argue', 'study', 'swim']


# POS Tagging

In [8]:
import nltk

# Download the missing 'averaged_perceptron_tagger_eng' resource
nltk.download('averaged_perceptron_tagger_eng')

from nltk import pos_tag
from nltk.tokenize import word_tokenize

sentence = "She can duck quickly when she sees a duck."
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)

print("POS Tags:", tagged)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


POS Tags: [('She', 'PRP'), ('can', 'MD'), ('duck', 'VB'), ('quickly', 'RB'), ('when', 'WRB'), ('she', 'PRP'), ('sees', 'VBZ'), ('a', 'DT'), ('duck', 'NN'), ('.', '.')]


# Named Entity Recognition (NER)

In [9]:
import spacy

nlp = spacy.load("en_core_web_sm")

sentence = "Microsoft acquired Activision Blizzard for $68.7 billion on January 18, 2022."
doc = nlp(sentence)

print("Named Entities:")
for ent in doc.ents:
    print(ent.text, "-", ent.label_)


Named Entities:
Microsoft - ORG
Activision Blizzard - PERSON
$68.7 billion - MONEY
January 18, 2022 - DATE
