Perform tokenization (Whitespace, Punctuation-based, Treebank, Tweet, MWE) using the NLTK
library.

Use porter stemmer and snowball stemmer for stemming. 

Use any technique for lemmatization.

Input / Dataset – use any sample sentence

In [32]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
import nltk

# Fetch the NLTK data packages this notebook relies on. Each call logs its
# download status; the notebook echoes the value of the last call in the cell.
# NOTE(review): 'punkt' presumably backs word_tokenize -- confirm.
nltk.download('punkt')
# NOTE(review): no cell in view calls a POS tagger, so this package looks
# unused here -- confirm before removing.
nltk.download('averaged_perceptron_tagger')
# NOTE(review): presumably the corpus behind WordNetLemmatizer -- confirm.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Tokenization

In [None]:
from nltk.tokenize import word_tokenize,TreebankWordTokenizer,TweetTokenizer,MWETokenizer

In [34]:
# Whitespace tokenization: cut the sentence at runs of whitespace only
# (sep=None), so punctuation stays attached to its word -- the last token
# keeps its trailing period.
text = "This is a sample sentence for whitespace tokenization."
tokens = text.split(sep=None)
print(tokens)

['This', 'is', 'a', 'sample', 'sentence', 'for', 'whitespace', 'tokenization.']


In [35]:
# Punctuation-based tokenization with word_tokenize: for this sentence the
# final period becomes its own token, while the hyphenated
# "punctuation-based" stays a single token.
# `text` is rebound here; later cells reuse it without redefining it.
text = "This is a sample sentence for punctuation-based tokenization."
tokens = word_tokenize(text)
print(tokens)

['This', 'is', 'a', 'sample', 'sentence', 'for', 'punctuation-based', 'tokenization', '.']


In [36]:
# Treebank tokenization: run TreebankWordTokenizer over the `text` bound by an
# earlier cell (it is not redefined here, so this cell depends on run order).
# For this sentence the tokens match the word_tokenize result.
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)
print(tokens)

['This', 'is', 'a', 'sample', 'sentence', 'for', 'punctuation-based', 'tokenization', '.']


In [37]:
# Tweet tokenization: run TweetTokenizer (default options) over the `text`
# bound by an earlier cell. The sample sentence contains no handles, hashtags
# or emoticons, so the tokens come out the same as in the previous cells.
tokenizer = TweetTokenizer()
tokens = tokenizer.tokenize(text)
print(tokens)

['This', 'is', 'a', 'sample', 'sentence', 'for', 'punctuation-based', 'tokenization', '.']


In [38]:
# MWE tokenization (multi-word expressions): MWETokenizer post-processes an
# already-tokenized sentence and merges each registered word sequence into a
# single token joined with "_".
# The sample sentence has to contain the registered expressions; otherwise the
# result is indistinguishable from plain word_tokenize output.
# Dedicated names (mwe_text / mwe_tokens) leave `text` and `tokens` exactly as
# the earlier cells bound them, because the stemming and lemmatization cells
# consume `tokens`.
tokenizer = MWETokenizer([('hot', 'dog'), ('New', 'York', 'City')])
mwe_text = "I ate a hot dog in New York City."
mwe_tokens = tokenizer.tokenize(word_tokenize(mwe_text))
print(mwe_tokens)

['I', 'ate', 'a', 'hot_dog', 'in', 'New_York_City', '.']


## Stemming

In [39]:
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

In [40]:
# Porter stemming: reduce every token currently bound to `tokens` (left by the
# tokenization cells) to its Porter stem and show the result.
porter_stemmer = PorterStemmer()
stemmed_words = list(map(porter_stemmer.stem, tokens))
print("Porter Stemmed words:", stemmed_words)

Porter Stemmed words: ['thi', 'is', 'a', 'sampl', 'sentenc', 'for', 'punctuation-bas', 'token', '.']


In [41]:
# Snowball stemming: same token list as the Porter cell, stemmed with the
# English Snowball stemmer so the two outputs can be compared side by side.
snowball_stemmer = SnowballStemmer('english')
snowball_stemmed_words = list(map(snowball_stemmer.stem, tokens))
print("Snowball Stemmed words:", snowball_stemmed_words)

Snowball Stemmed words: ['this', 'is', 'a', 'sampl', 'sentenc', 'for', 'punctuation-bas', 'token', '.']


## Lemmatization

In [42]:
# Lemmatization: look up each token in `tokens` with WordNetLemmatizer.
# Every token is passed on its own, with no part-of-speech argument.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
print("Lemmatized words:", lemmatized_words)

Lemmatized words: ['This', 'is', 'a', 'sample', 'sentence', 'for', 'punctuation-based', 'tokenization', '.']
