In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
# Load spaCy model
import spacy

In [2]:
nltk.download('punkt')  # Tokenization
nltk.download('stopwords')  # Tokenization
nltk.download('averaged_perceptron_tagger')  # POS Tagging

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
text = "This is a great good morning example sentence but too short for demonstrating this feature so I should've remove it or revising it."
phrases = ["good morning", "good afternoon", "good night", "how are you", "i am fine", "are you okay", "are you happy", "are you sad", "I understand", "I don't know"]

# Function to handle tokenization and preserve phrases
def tokenize_and_preserve_phrases(text, phrases):
    for phrase in phrases:
        text = text.replace(phrase, phrase.replace(" ", "_"))
    
    tokens = word_tokenize(text)
    
    # Normalize and replace underscores back to spaces
    tokens = [word.lower().replace("_", " ") for word in tokens if word.isalpha() or "_" in word]
    
    return tokens

# Tokenization with phrase preservation
tokens = tokenize_and_preserve_phrases(text, phrases)

# Stop words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

print("Filtered Tokens:", filtered_tokens)

Filtered Tokens: ['great', 'good morning', 'example', 'sentence', 'short', 'demonstrating', 'feature', 'remove', 'revising']


In [4]:
# POS Tagging
pos_tags = nltk.pos_tag(filtered_tokens)
print(pos_tags)

[('great', 'JJ'), ('good morning', 'VBG'), ('example', 'NN'), ('sentence', 'NN'), ('short', 'JJ'), ('demonstrating', 'NN'), ('feature', 'NN'), ('remove', 'VB'), ('revising', 'NN')]


In [5]:
nlp = spacy.load('en_core_web_sm')
# Process text
doc = nlp(text)
# NER
entities = [(entity.text, entity.label_) for entity in doc.ents]
print(entities)

# Lemmatization
lemmas = [token.lemma_ for token in doc]
print(lemmas)


[('morning', 'TIME')]
['this', 'be', 'a', 'great', 'good', 'morning', 'example', 'sentence', 'but', 'too', 'short', 'for', 'demonstrate', 'this', 'feature', 'so', 'I', 'should', "'ve", 'remove', 'it', 'or', 'revise', 'it', '.']
