In [1]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kokar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kokar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
from nltk.tokenize import word_tokenize,regexp_tokenize,TweetTokenizer,MWETokenizer ,TreebankWordTokenizer
sentence = "NLTK is a powerful library for Natural Language Processing! Let's tokenize and stem."
# Whitespace tokenization splits the sentence based on spaces (whitespace characters).
# It’s a simple technique where each word or sequence of characters separated by whitespace is treated as a separate token.
# Here's how the output will look for whitespace tokenization using the sample sentence:
print("Whitespace tokenization")
Whitespace_tokenization=sentence.split()
print(Whitespace_tokenization)

Whitespace tokenization
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'Natural', 'Language', 'Processing!', "Let's", 'tokenize', 'and', 'stem.']


In [9]:
# Punctuation-based tokenization splits tokens based on whitespace and punctuation marks
# (e.g., commas, periods, exclamation marks, etc.). In this approach, punctuation is treated as separate tokens. 
# Here’s how the output looks when we use a regular expression to tokenize the sentence by whitespace or punctuation
print("Punctuation-based tokenization")
toke=regexp_tokenize(sentence,pattern=r'\s|[\.,?!;"]')
print(toke)

Punctuation-based tokenization
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '!', ' ', ' ', ' ', ' ', '.']


In [13]:
# Treebank Tokenization is a method of tokenizing text based on rules that are specifically 
# designed for processing English text as it appears in the Penn Treebank corpus. 
# It is part of the nltk.tokenize module and handles more complex tokenization cases, such as contractions, 
# possessive forms, punctuation, and special characters, with greater accuracy than simpler methods 
# like whitespace-based or punctuation-based tokenization.

# Key Features:
# Handles contractions: It splits contractions (e.g., "don't" into "do" and "n't").

# Handles possessives: It separates possessive forms (e.g., "John's" becomes "John" and "'s").

# Accurate punctuation handling: It separates punctuation like periods and 
# commas as individual tokens, while still associating them with the words they follow.
print("Treebank Tokenization")
tokenation=TreebankWordTokenizer()
tokenation.tokenize(sentence)

Treebank Tokenization


['NLTK',
 'is',
 'a',
 'powerful',
 'library',
 'for',
 'Natural',
 'Language',
 'Processing',
 '!',
 'Let',
 "'s",
 'tokenize',
 'and',
 'stem',
 '.']

In [15]:
# weet Tokenization is a method of tokenizing text specifically designed to 
# handle the unique characteristics of social media text, particularly tweets. 
# Tweets often contain hashtags, mentions, emoticons, and informal language, making them different from regular text.
# The TweetTokenizer in the nltk.tokenize module is tailored to handle these elements properly.

# Features of Tweet Tokenization:
# Handles Hashtags: It keeps hashtags intact as a single token (e.g., #NLTK stays as #NLTK).

# Handles Mentions: Mentions (e.g., @user) are treated as individual tokens.

# Handles Emojis and Emoticons: It keeps emoticons and emojis as separate tokens.

# Handles URLs: URLs are separated as tokens (e.g., http://example.com becomes a single token).

# Handles Retweets: "RT" is treated as a token in retweets.
print("\nTweet Tokenization:")
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(sentence)
print(tweet_tokens)


Tweet Tokenization:
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'Natural', 'Language', 'Processing', '!', "Let's", 'tokenize', 'and', 'stem', '.']


In [23]:
# MWE (Multi-Word Expression) Tokenization refers to tokenizing text in such a way that multi-word expressions (MWEs) 
# are treated as single tokens. MWEs are sequences of words that form a single unit of meaning, such as:

# Phrasal verbs: "take off", "give up"

# Idiomatic expressions: "kick the bucket", "break the ice"

# Named entities: "New York", "United States"

# Collocations: "strong coffee", "fast food"
print("(Multi-Word Expression) Tokenization ")
tokenazation=MWETokenizer([("Natural","Language")])
a=tokenazation.tokenize(sentence.split())
print(a)

(Multi-Word Expression) Tokenization 
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'Natural_Language', 'Processing!', "Let's", 'tokenize', 'and', 'stem.']


In [None]:
# Algorithm: The Snowball Stemmer, also known as the English Snowball Stemmer, is an improved version of the Porter Stemmer. It follows a more flexible, generalized approach and has been expanded to cover more languages.

# Language Coverage: The Snowball Stemmer supports multiple languages, including English, French, German, Italian, Dutch, and others.

In [31]:
from nltk.stem import PorterStemmer , SnowballStemmer ,WordNetLemmatizer
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")

In [33]:
poter_steam=[porter_stemmer.stem(word) for word in word_tokenize(sentence)]

In [35]:
poter_steam

['nltk',
 'is',
 'a',
 'power',
 'librari',
 'for',
 'natur',
 'languag',
 'process',
 '!',
 'let',
 "'s",
 'token',
 'and',
 'stem',
 '.']

In [41]:
lemmitation=WordNetLemmatizer()
lemmatized_words = [lemmitation.lemmatize(word) for word in word_tokenize(sentence)]

In [43]:
lemmatized_words

['NLTK',
 'is',
 'a',
 'powerful',
 'library',
 'for',
 'Natural',
 'Language',
 'Processing',
 '!',
 'Let',
 "'s",
 'tokenize',
 'and',
 'stem',
 '.']