# MGTA 415 - Session 1 Demo: Utilize Python NLTK Package for Text Preprocessing
1. Tokenization
2. Stemming
3. Lemmatization
4. Text normalization

In [4]:
# install necessary packages (only necessary if you run the notebook outside the container)
# !pip install nltk

In [3]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Example text
text = "Hello, world! I'm learning NLP. I've got 2 apples. U.S.A. is great!"

# Lowercasing
text = text.lower()

# Removing punctuation
text = re.sub(r'[^\w\s]', '', text)

# Handling contractions (using a simple dictionary for this example)
contractions = {
    "i'm": "i am",
    "don't": "do not",
    "i've": "i have",
    "u.s.a": "usa"
}
tokens = word_tokenize(text)
normalized_tokens = [contractions.get(word, word) for word in tokens]

# Standardizing numerical values (example of converting digits to words)
# You can use the inflect library for a more comprehensive approach
normalized_tokens = [re.sub(r'\b2\b', 'two', word) for word in normalized_tokens]

# Removing special characters (already handled in punctuation removal, but this is an additional step for completeness)
normalized_tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', word) for word in normalized_tokens]

# Whitespace normalization
normalized_text = ' '.join(normalized_tokens).strip()

# Removing stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in normalized_tokens if word not in stop_words]

# Define custom stopwords
custom_stop_words = ['hello', 'great']
stop_words.update(['hello', 'great'])  # Adding custom stopwords to the stop words list

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word, wordnet.VERB) for word in filtered_tokens]

# Generate n-grams from the lemmatized tokens
bigrams = list(nltk.ngrams(lemmatized_tokens, 2))
trigrams = list(nltk.ngrams(lemmatized_tokens, 3))

print("Original Text:", text)
print("Normalized Text:", normalized_text)
print("Filtered Tokens:", filtered_tokens)
print("Stemmed Tokens:", stemmed_tokens)
print("Lemmatized Tokens:", lemmatized_tokens)
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Text: hello world im learning nlp ive got 2 apples usa is great
Normalized Text: hello world im learning nlp ive got two apples usa is great
Filtered Tokens: ['hello', 'world', 'im', 'learning', 'nlp', 'ive', 'got', 'two', 'apples', 'usa', 'great']
Stemmed Tokens: ['hello', 'world', 'im', 'learn', 'nlp', 'ive', 'got', 'two', 'appl', 'usa', 'great']
Lemmatized Tokens: ['hello', 'world', 'im', 'learn', 'nlp', 'ive', 'get', 'two', 'apples', 'usa', 'great']
Bigrams: [('hello', 'world'), ('world', 'im'), ('im', 'learn'), ('learn', 'nlp'), ('nlp', 'ive'), ('ive', 'get'), ('get', 'two'), ('two', 'apples'), ('apples', 'usa'), ('usa', 'great')]
Trigrams: [('hello', 'world', 'im'), ('world', 'im', 'learn'), ('im', 'learn', 'nlp'), ('learn', 'nlp', 'ive'), ('nlp', 'ive', 'get'), ('ive', 'get', 'two'), ('get', 'two', 'apples'), ('two', 'apples', 'usa'), ('apples', 'usa', 'great')]


# Things for you to try...
1. Can you add additional custom stop words? How would you test it?
2. Try a few different input to test the differences between Stemming and Lemmatization
3. Try different stemming algorithms: See NLTK documentation