# 3.2 Stopwords and Lemmatization

In [1]:
import numpy as np
import pandas as pd
import re

## Review: Tokenization with NLTK

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# nltk functions and methods for tokenization
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

In [4]:
chat_gpt = "ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response. We are excited to introduce ChatGPT to get users’ feedback and learn about its strengths and weaknesses. During the research preview, usage of ChatGPT is free. Try it now at chat.openai.com. #ChatGPT #InstructGPT  #OpenAI #ChatGPTisFree"

In [5]:
# from nltk.tokenize import sent_tokenize
chatgpt_sents = sent_tokenize(chat_gpt)
for line in chatgpt_sents:
    print(line,"\n")

ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response. 

We are excited to introduce ChatGPT to get users’ feedback and learn about its strengths and weaknesses. 

During the research preview, usage of ChatGPT is free. 

Try it now at chat.openai.com. 

#ChatGPT #InstructGPT  #OpenAI #ChatGPTisFree 



In [6]:
# from nltk.tokenize import word_tokenize
chatgpt_words_v1 = word_tokenize(chat_gpt)
print(chatgpt_words_v1)

['ChatGPT', 'is', 'a', 'sibling', 'model', 'to', 'InstructGPT', ',', 'which', 'is', 'trained', 'to', 'follow', 'an', 'instruction', 'in', 'a', 'prompt', 'and', 'provide', 'a', 'detailed', 'response', '.', 'We', 'are', 'excited', 'to', 'introduce', 'ChatGPT', 'to', 'get', 'users', '’', 'feedback', 'and', 'learn', 'about', 'its', 'strengths', 'and', 'weaknesses', '.', 'During', 'the', 'research', 'preview', ',', 'usage', 'of', 'ChatGPT', 'is', 'free', '.', 'Try', 'it', 'now', 'at', 'chat.openai.com', '.', '#', 'ChatGPT', '#', 'InstructGPT', '#', 'OpenAI', '#', 'ChatGPTisFree']


In [7]:
#from nltk.tokenize import regexp_tokenize

chatgpt_words_v2 = regexp_tokenize(chat_gpt, r"([\@|#]\w+|\w+)")
print(chatgpt_words_v2)


['ChatGPT', 'is', 'a', 'sibling', 'model', 'to', 'InstructGPT', 'which', 'is', 'trained', 'to', 'follow', 'an', 'instruction', 'in', 'a', 'prompt', 'and', 'provide', 'a', 'detailed', 'response', 'We', 'are', 'excited', 'to', 'introduce', 'ChatGPT', 'to', 'get', 'users', 'feedback', 'and', 'learn', 'about', 'its', 'strengths', 'and', 'weaknesses', 'During', 'the', 'research', 'preview', 'usage', 'of', 'ChatGPT', 'is', 'free', 'Try', 'it', 'now', 'at', 'chat', 'openai', 'com', '#ChatGPT', '#InstructGPT', '#OpenAI', '#ChatGPTisFree']


In [8]:
# Initialize TweetTokenizer
tknzr = TweetTokenizer()
chatgpt_words_v3 = tknzr.tokenize(chat_gpt)
print(chatgpt_words_v3)

['ChatGPT', 'is', 'a', 'sibling', 'model', 'to', 'InstructGPT', ',', 'which', 'is', 'trained', 'to', 'follow', 'an', 'instruction', 'in', 'a', 'prompt', 'and', 'provide', 'a', 'detailed', 'response', '.', 'We', 'are', 'excited', 'to', 'introduce', 'ChatGPT', 'to', 'get', 'users', '’', 'feedback', 'and', 'learn', 'about', 'its', 'strengths', 'and', 'weaknesses', '.', 'During', 'the', 'research', 'preview', ',', 'usage', 'of', 'ChatGPT', 'is', 'free', '.', 'Try', 'it', 'now', 'at', 'chat.openai.com', '.', '#ChatGPT', '#InstructGPT', '#OpenAI', '#ChatGPTisFree']


### Tokenizing with n-grams

`N-gram` can be defined as the contiguous sequence of `n` items from a given sample of text or speech. The items can be letters, words, or base pairs according to the application.

First, we tokenize into a sentence in a document.

In [11]:
from nltk.util import ngrams

chatgpt_words_in_sent = [word_tokenize(sentence) for sentence in chatgpt_sents]
chatgpt_bigrams = []
for tokens in chatgpt_words_in_sent:
    bigram = ngrams(tokens, 2)
    chatgpt_bigrams.extend(list(bigram))

unzipped_bigrams = list(zip(*chatgpt_bigrams))
print(unzipped_bigrams)

[('ChatGPT', 'is', 'a', 'sibling', 'model', 'to', 'InstructGPT', ',', 'which', 'is', 'trained', 'to', 'follow', 'an', 'instruction', 'in', 'a', 'prompt', 'and', 'provide', 'a', 'detailed', 'response', 'We', 'are', 'excited', 'to', 'introduce', 'ChatGPT', 'to', 'get', 'users', '’', 'feedback', 'and', 'learn', 'about', 'its', 'strengths', 'and', 'weaknesses', 'During', 'the', 'research', 'preview', ',', 'usage', 'of', 'ChatGPT', 'is', 'free', 'Try', 'it', 'now', 'at', 'chat.openai.com', '#', 'ChatGPT', '#', 'InstructGPT', '#', 'OpenAI', '#'), ('is', 'a', 'sibling', 'model', 'to', 'InstructGPT', ',', 'which', 'is', 'trained', 'to', 'follow', 'an', 'instruction', 'in', 'a', 'prompt', 'and', 'provide', 'a', 'detailed', 'response', '.', 'are', 'excited', 'to', 'introduce', 'ChatGPT', 'to', 'get', 'users', '’', 'feedback', 'and', 'learn', 'about', 'its', 'strengths', 'and', 'weaknesses', '.', 'the', 'research', 'preview', ',', 'usage', 'of', 'ChatGPT', 'is', 'free', '.', 'it', 'now', 'at', 'c

In [14]:
# import nltk
from nltk.util import ngrams

text = "Hi How are you? I am fine and you"
tokens = nltk.word_tokenize(text)

# Tokenizing with bigrams
bigrams = list(ngrams(tokens, 2))

# print the bigrams of the text
unzipped_bigrams = list(zip(*bigrams))
for i in range(len(bigrams)):
    print(unzipped_bigrams[0][i],unzipped_bigrams[1][i])

Hi How
How are
are you
you ?
? I
I am
am fine
fine and
and you


In [15]:
# Tokenizing with trigrams
trigrams = list(ngrams(tokens, 3))

# print the trigrams of the text
unzipped_trigrams = list(zip(*trigrams))
for i in range(len(trigrams)):
    print(unzipped_trigrams[0][i],unzipped_trigrams[1][i], unzipped_trigrams[2][i])

Hi How are
How are you
are you ?
you ? I
? I am
I am fine
am fine and
fine and you


### Extracting words with 1st letter capitalized

In [16]:
for line in chatgpt_sents:
    print(regexp_tokenize(line, r"[A-Z]\w+"))

['ChatGPT', 'InstructGPT']
['We', 'ChatGPT']
['During', 'ChatGPT']
['Try']
['ChatGPT', 'InstructGPT', 'OpenAI', 'ChatGPTisFree']


### Extracting hashtags and emoji

In [17]:
# Ernest Obiena's facebook page. Posted by Djundi Biñas
message2 = """Congratulations Ernest Obiena EJ Obiena - Ernest Obiena for Winning the 1st Gold 🥇Medal for the Philippines 🇵🇭 in the 19th Asian Games and breaking the Championship Record! 💪🏼💯 Boom! 💥Salamat EJ 
#ParaSaBayan 
#PoleVault 
#PhilippinePoleVault 
#TheEJEffectCongratulations Ernest Obiena EJ Obiena - Ernest Obiena for Winning the 1st Gold 🥇Medal for the Philippines 🇵🇭 in the 19th Asian Games and breaking the Championship Record! 💪🏼💯 Boom! 💥Salamat EJ 🫰🏼🙌🏼
#ParaSaBayan 
#PoleVault 
#PhilippinePoleVault 
#TheEJEffect"""
# message2

In [18]:
# Tokenizing all hashtags
regexp_tokenize(message2, r"#\w+")

['#ParaSaBayan',
 '#PoleVault',
 '#PhilippinePoleVault',
 '#TheEJEffectCongratulations',
 '#ParaSaBayan',
 '#PoleVault',
 '#PhilippinePoleVault',
 '#TheEJEffect']

In [19]:
# Tokenize emojis  using NLTK
emoji_pattern = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U0001FAB0-\U0001FABF\U0001FAC0-\U0001FAFF\U00002600-\U000026FF]+'

emojis = regexp_tokenize(message2, emoji_pattern)

for emoji in emojis:
    print(emoji, end='')

🥇💪🏼💯💥🥇💪🏼💯💥🫰🏼🙌🏼

## Processing from raw text to cleaned text

"clean text" or "cleaned text" typically refers to text data that has been processed to remove any elements that are considered irrelevant or noise for a specific NLP task. 

The cleaning process can include several steps, such as:

*   **Lowercasing**: Converting all text to lowercase to ensure consistency and reduce the dimensionality of the data.

*   **Tokenization**: Breaking the text into individual words or tokens.

*   **Stopword Removal**: Eliminating common words (e.g., "and," "the," "is") that may not carry significant meaning for the task at hand.

*   **Punctuation Removal**: Stripping punctuation marks from the text.

*   **Lemmatization** or **Stemming**: Reducing words to their base or root forms. Lemmatization typically produces real words, while stemming may result in non-real words but is computationally less intensive.

*   **Special Character Removal**: Removing special characters, numbers, or symbols that are not relevant to the analysis.

*   **Spell Correction**: Fixing common spelling errors in the text.

*   **Noise Reduction**: Handling additional data-specific noise, such as HTML tags in web text or emojis in social media text.

### Removing punctuation from a string with regex starting from the raw text data

In [20]:
no_punc = re.sub(r'[^\w\s]', '', chat_gpt)
no_punc

'ChatGPT is a sibling model to InstructGPT which is trained to follow an instruction in a prompt and provide a detailed response We are excited to introduce ChatGPT to get users feedback and learn about its strengths and weaknesses During the research preview usage of ChatGPT is free Try it now at chatopenaicom ChatGPT InstructGPT  OpenAI ChatGPTisFree'

### Lowercasing from a string after punctuation removal

In [21]:
lowercase_no_punc = no_punc.lower()
lowercase_no_punc

'chatgpt is a sibling model to instructgpt which is trained to follow an instruction in a prompt and provide a detailed response we are excited to introduce chatgpt to get users feedback and learn about its strengths and weaknesses during the research preview usage of chatgpt is free try it now at chatopenaicom chatgpt instructgpt  openai chatgptisfree'

In [22]:
print(word_tokenize(lowercase_no_punc))

['chatgpt', 'is', 'a', 'sibling', 'model', 'to', 'instructgpt', 'which', 'is', 'trained', 'to', 'follow', 'an', 'instruction', 'in', 'a', 'prompt', 'and', 'provide', 'a', 'detailed', 'response', 'we', 'are', 'excited', 'to', 'introduce', 'chatgpt', 'to', 'get', 'users', 'feedback', 'and', 'learn', 'about', 'its', 'strengths', 'and', 'weaknesses', 'during', 'the', 'research', 'preview', 'usage', 'of', 'chatgpt', 'is', 'free', 'try', 'it', 'now', 'at', 'chatopenaicom', 'chatgpt', 'instructgpt', 'openai', 'chatgptisfree']


In [23]:
print(regexp_tokenize(lowercase_no_punc, r"([\@|#]\w+|\w+)"))

['chatgpt', 'is', 'a', 'sibling', 'model', 'to', 'instructgpt', 'which', 'is', 'trained', 'to', 'follow', 'an', 'instruction', 'in', 'a', 'prompt', 'and', 'provide', 'a', 'detailed', 'response', 'we', 'are', 'excited', 'to', 'introduce', 'chatgpt', 'to', 'get', 'users', 'feedback', 'and', 'learn', 'about', 'its', 'strengths', 'and', 'weaknesses', 'during', 'the', 'research', 'preview', 'usage', 'of', 'chatgpt', 'is', 'free', 'try', 'it', 'now', 'at', 'chatopenaicom', 'chatgpt', 'instructgpt', 'openai', 'chatgptisfree']


In [24]:
print(tknzr.tokenize(lowercase_no_punc))

['chatgpt', 'is', 'a', 'sibling', 'model', 'to', 'instructgpt', 'which', 'is', 'trained', 'to', 'follow', 'an', 'instruction', 'in', 'a', 'prompt', 'and', 'provide', 'a', 'detailed', 'response', 'we', 'are', 'excited', 'to', 'introduce', 'chatgpt', 'to', 'get', 'users', 'feedback', 'and', 'learn', 'about', 'its', 'strengths', 'and', 'weaknesses', 'during', 'the', 'research', 'preview', 'usage', 'of', 'chatgpt', 'is', 'free', 'try', 'it', 'now', 'at', 'chatopenaicom', 'chatgpt', 'instructgpt', 'openai', 'chatgptisfree']


### Removing stopwords

In [30]:
from nltk.corpus import stopwords
 
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /home/repl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
def set_clean(raw_text):
    set_stop_words = set(stopwords.words('english'))
    
    no_punc = re.sub(r'[^\w\s]', '', raw_text)
    lowercase_no_punc = no_punc.lower()
    tokenized_text= word_tokenize(lowercase_no_punc)
    return [w for w in tokenized_text if w not in set_stop_words]

In [33]:
clean_tokenized_text = set_clean(chat_gpt)
print(clean_tokenized_text)

['chatgpt', 'sibling', 'model', 'instructgpt', 'trained', 'follow', 'instruction', 'prompt', 'provide', 'detailed', 'response', 'excited', 'introduce', 'chatgpt', 'get', 'users', 'feedback', 'learn', 'strengths', 'weaknesses', 'research', 'preview', 'usage', 'chatgpt', 'free', 'try', 'chatopenaicom', 'chatgpt', 'instructgpt', 'openai', 'chatgptisfree']


In [34]:
print(len(chatgpt_words_v1), len(chatgpt_words_v2), len(chatgpt_words_v3), len(clean_tokenized_text))

68 59 64 31


### On stemming and lemmatization

**Stemming** is the process of producing morphological variants of a root/base word. 

*  stemming programs are commonly referred to as stemming algorithms or stemmers. 
*  Example: `boat` would be the stem for `boat`, `boater`, `boating`, `boats`.

**Lemmatization** looks beyond word reduction and considers a language’s full vocabulary to apply a morphological analysis to words. *  Example: The lemma of`‘was` is `be` and the lemma of `mice` is `mouse`.

In [41]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print("stem for boating: ", stemmer.stem("boating"))
print("stem for running: ", stemmer.stem("running"))
print("stem for better: ", stemmer.stem("better"))
print("stem for mice: ", stemmer.stem("mice"))

stem for boating:  boat
stem for running:  run
stem for better:  better
stem for mice:  mice


In [36]:
#import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/repl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/repl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [37]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

print("lemma for boating: ", lemmatizer.lemmatize("boating", pos="v"))
print("lemma for running: ", lemmatizer.lemmatize("running", pos="v"))
print("lemma for better: ", lemmatizer.lemmatize("better", pos="a"))
print("lemma for mice: ", lemmatizer.lemmatize("mice", pos="n"))

lemma for boating:  boat
lemma for running:  run
lemma for better:  good
lemma for mice:  mouse


In [40]:
print("lemma for matrices: ", lemmatizer.lemmatize("matrices", pos="n"))
print("stem for matrices: ", stemmer.stem("matrices"))

lemma for matrices:  matrix
stem for matrices:  matric


In [45]:
with open("datasets/wiki_text_debugging.txt", "r") as file:
    wiki_text = file.read()

wiki_text[:500]

"'''Debugging''' is the process of finding and resolving of defects that prevent correct operation of computer software or a system.  \n\nNumerous books have been written about debugging (see below: #Further reading|Further reading), as it involves numerous aspects, including interactive debugging, control flow, integration testing, Logfile|log files, monitoring (Application monitoring|application, System Monitoring|system), memory dumps, Profiling (computer programming)|profiling, Statistical Proc"

In [43]:
clean_wiki_text = set_clean(wiki_text)
print(clean_wiki_text[:50])

['debugging', 'process', 'finding', 'resolving', 'defects', 'prevent', 'correct', 'operation', 'computer', 'software', 'system', 'numerous', 'books', 'written', 'debugging', 'see', 'readingfurther', 'reading', 'involves', 'numerous', 'aspects', 'including', 'interactive', 'debugging', 'control', 'flow', 'integration', 'testing', 'logfilelog', 'files', 'monitoring', 'application', 'monitoringapplication', 'system', 'monitoringsystem', 'memory', 'dumps', 'profiling', 'computer', 'programmingprofiling', 'statistical', 'process', 'control', 'special', 'design', 'tactics', 'improve', 'detection', 'simplifying', 'changes']


In [44]:
lc_wiki_text = [lemmatizer.lemmatize(word, pos="v") for word in clean_wiki_text]
lc_wiki_text = [lemmatizer.lemmatize(word, pos="n") for word in lc_wiki_text]
lc_wiki_text = [lemmatizer.lemmatize(word, pos="a") for word in lc_wiki_text]
lc_wiki_text = [lemmatizer.lemmatize(word, pos="r") for word in lc_wiki_text]
lc_wiki_text = [lemmatizer.lemmatize(word, pos="s") for word in lc_wiki_text]
print(lc_wiki_text[:50])

['debug', 'process', 'find', 'resolve', 'defect', 'prevent', 'correct', 'operation', 'computer', 'software', 'system', 'numerous', 'book', 'write', 'debug', 'see', 'readingfurther', 'read', 'involve', 'numerous', 'aspect', 'include', 'interactive', 'debug', 'control', 'flow', 'integration', 'test', 'logfilelog', 'file', 'monitor', 'application', 'monitoringapplication', 'system', 'monitoringsystem', 'memory', 'dump', 'profile', 'computer', 'programmingprofiling', 'statistical', 'process', 'control', 'special', 'design', 'tactic', 'improve', 'detection', 'simplify', 'change']


** Possible values in the `pos` parameter:**

*   `n` for noun files,
*   `v` for verb files,
*   `a` for adjective files,
*   `s` for adjective (satellite) files,
*   `r` for adverb files.