<a href="https://colab.research.google.com/github/vinay2k2/DataScienceToolBox/blob/main/StopWordsAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLTK

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this cell relies on: the stopword corpus and
# the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

# Load the English stopword list exactly once. It is a plain list (see the
# printed output), so a set is built next to it: the filter below then does
# a constant-time lookup per token instead of re-loading the list and
# scanning it linearly for every word.
english_stopwords = stopwords.words('english')
english_stopword_set = set(english_stopwords)

print(f"Stopwords in NLTK are {english_stopwords}")
print(f"Length of NLTK Stopwords are {len(english_stopwords)}")
# Example text
text = "This is an example sentence with some stopwords."

# Tokenize the text into words
words = word_tokenize(text)

# Remove stopwords. Tokens are lower-cased before the lookup so that
# capitalised words such as "This" are matched as well.
filtered_words_nltk = [word for word in words if word.lower() not in english_stopword_set]

# Print the result
print("Original words:", words)
print("Filtered words (NLTK):", filtered_words_nltk)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Stopwords in NLTK are ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', '

# SPACY

In [3]:
import spacy

# English pipeline; it provides the tokenizer and the per-token stop-word
# flag used further down.
nlp = spacy.load("en_core_web_sm")

# Sentence to run through the pipeline
text = "This is an example sentence with some stopwords."

# Show the pipeline's default stop-word collection and its size
print(f"Stopwords in SPACY are {nlp.Defaults.stop_words}")
print(f"Length of SPACY Stopwords are {len(nlp.Defaults.stop_words)}")

# Tokenize / annotate the sentence
doc = nlp(text)

# Keep the text of every token that is not flagged as a stop word
content_tokens = filter(lambda tok: not tok.is_stop, doc)
filtered_words_spacy = [tok.text for tok in content_tokens]

# Report the token texts before and after filtering
all_token_texts = [tok.text for tok in doc]
print("Original words:", all_token_texts)
print("Filtered words (spaCy):", filtered_words_spacy)

Stopwords in SPACY are {'thence', 'did', 'twenty', 'side', 'when', 'something', 'again', 'seems', 'whole', 'about', 'very', 'eight', 'please', 'it', 'might', 'both', 'rather', 'beyond', 'those', 'anyway', '’ll', 'any', 'everyone', 'other', 'nowhere', 'which', 'they', 'seemed', 'by', 'eleven', 'been', 'others', 'via', 'mostly', 'using', 'as', 'will', 'due', 'in', 'besides', 'out', 'somewhere', 'n’t', 'no', 'throughout', 'somehow', 'alone', 'its', 'there', 'why', 'done', 'say', 'more', 'thereupon', 'here', 'hereupon', "n't", '’s', 're', 'should', 'twelve', 'such', 'of', 'were', 'less', 'she', 'formerly', 'sometime', 'on', 'bottom', 'except', 'former', 'my', 'where', 'i', 'whereupon', 'thereby', 'still', 'how', 'that', 'four', 'anyone', 'hereby', 'never', 'make', 'an', 'nine', 'me', 'n‘t', 'beside', 'whither', 'yourself', 'many', 'none', 'unless', 'anyhow', 'sixty', 'these', 'thus', 'ever', 'our', 'keep', 'even', 'name', 'six', 'without', '’d', 'together', 'you', 'may', 'seem', 'onto', 'f

# TextBlob

In [4]:
from nltk.corpus import stopwords
from textblob import TextBlob

# NOTE: the stopword list used in this cell is NLTK's English list, not one
# supplied by TextBlob; TextBlob is only used here to split the text into
# words. `stopwords` is imported explicitly so the cell does not depend on
# the NLTK cell having been executed first (the NLTK 'stopwords' data must
# still have been downloaded).

# Example text
text = "This is an example sentence with some stopwords."

# Load the list once; keep a set next to it for constant-time lookups in the
# filter below instead of re-loading and scanning the list for every word.
english_stopwords = stopwords.words('english')
english_stopword_set = set(english_stopwords)

print(f"Stopwords in Textblob are {english_stopwords}")
print(f"Length of Textblob Stopwords are {len(english_stopwords)}")

# Create a TextBlob object
blob = TextBlob(text)

# Remove stopwords (case-insensitive: words are lower-cased before lookup)
filtered_words_textblob = [word for word in blob.words if word.lower() not in english_stopword_set]

# Print the result
print("Original words:", blob.words)
print("Filtered words (TextBlob):", filtered_words_textblob)

Stopwords in Textblob are ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own

# Gensim

In [6]:
from gensim.parsing.preprocessing import STOPWORDS


def _is_gensim_stopword(token):
    """Return True when the lower-cased token is in Gensim's STOPWORDS."""
    return token.lower() in STOPWORDS


# Sentence used for the demonstration
text = "This is an example sentence with some stopwords."

# Show Gensim's stopword collection and its size
print(f"Stopwords in Gensim are {STOPWORDS}")
print(f"Length of Gensim Stopwords are {len(STOPWORDS)}")

# Whitespace tokenization: punctuation stays attached to the neighbouring
# word (e.g. "stopwords.").
words = text.split()

# Drop every token recognised as a stopword
filtered_words_gensim = [token for token in words if not _is_gensim_stopword(token)]

# Report the tokens before and after filtering
print("Original words:", words)
print("Filtered words (Gensim):", filtered_words_gensim)

Stopwords in Gensim are frozenset({'thence', 'did', 'twenty', 'side', 'when', 'something', 'seems', 'again', 'whole', 'about', 'very', 'eight', 'please', 'ie', 'it', 'might', 'both', 'rather', 'beyond', 'those', 'anyway', 'any', 'didn', 'everyone', 'other', 'find', 'nowhere', 'which', 'they', 'seemed', 'by', 'eleven', 'been', 'others', 'via', 'mostly', 'using', 'co', 'as', 'will', 'due', 'in', 'besides', 'out', 'bill', 'somewhere', 'no', 'throughout', 'somehow', 'alone', 'its', 'there', 'why', 'done', 'say', 'more', 'thereupon', 'interest', 'here', 'hereupon', 're', 'should', 'twelve', 'such', 'of', 'doesn', 'were', 'less', 'inc', 'she', 'formerly', 'sometime', 'con', 'on', 'bottom', 'except', 'former', 'my', 'where', 'i', 'whereupon', 'thereby', 'still', 'how', 'that', 'four', 'anyone', 'hereby', 'never', 'make', 'an', 'nine', 'me', 'beside', 'whither', 'yourself', 'many', 'kg', 'none', 'unless', 'anyhow', 'sixty', 'these', 'thus', 'ever', 'our', 'keep', 'even', 'name', 'six', 'withou

# SKLEARN


In [7]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Sentence used for the demonstration
text = "This is an example sentence with some stopwords."

# Show scikit-learn's English stop-word collection and its size
print(f"Stopwords in sklearn are {ENGLISH_STOP_WORDS}")
print(f"Length of sklearn Stopwords are {len(ENGLISH_STOP_WORDS)}")

# Whitespace tokenization: punctuation stays attached to the neighbouring
# word (e.g. "stopwords.").
words = text.split()

# Keep only the tokens whose lower-cased form is not a stop word
filtered_words_sklearn = list(
    filter(lambda token: token.lower() not in ENGLISH_STOP_WORDS, words)
)

# Report the tokens before and after filtering
print("Original words:", words)
print("Filtered words (scikit-learn):", filtered_words_sklearn)

Stopwords in sklearn are frozenset({'thence', 'twenty', 'side', 'when', 'something', 'again', 'seems', 'whole', 'about', 'very', 'eight', 'please', 'ie', 'it', 'might', 'both', 'rather', 'beyond', 'those', 'anyway', 'any', 'everyone', 'other', 'find', 'nowhere', 'which', 'they', 'seemed', 'by', 'eleven', 'been', 'others', 'via', 'mostly', 'co', 'as', 'will', 'due', 'in', 'besides', 'out', 'bill', 'somewhere', 'no', 'throughout', 'somehow', 'alone', 'its', 'there', 'why', 'done', 'more', 'thereupon', 'interest', 'here', 'hereupon', 're', 'should', 'twelve', 'such', 'of', 'were', 'less', 'inc', 'she', 'formerly', 'sometime', 'con', 'on', 'bottom', 'except', 'former', 'my', 'where', 'i', 'whereupon', 'thereby', 'still', 'how', 'that', 'four', 'anyone', 'hereby', 'never', 'an', 'nine', 'me', 'beside', 'whither', 'yourself', 'many', 'none', 'anyhow', 'sixty', 'these', 'thus', 'ever', 'our', 'keep', 'even', 'name', 'six', 'without', 'together', 'you', 'may', 'seem', 'onto', 'full', 'same', '

TensorFlow and PyTorch are deep learning frameworks and do not ship predefined stopword lists the way NLTK, spaCy, Gensim, and scikit-learn do (the TextBlob example above actually reuses NLTK's list). You can still use the stopwords from one of those libraries, or define your own list, when preparing text data for these frameworks.

In [9]:
# Using NLTK with TensorFlow:
# `nltk` itself is imported here because this cell calls nltk.download();
# without it the cell only works if an earlier cell already imported nltk.
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this cell relies on
nltk.download('stopwords')
nltk.download('punkt')

# Example text
text = "This is an example sentence with some stopwords."

# Tokenize the text into words
words = word_tokenize(text)

# Build the stopword set once so each token costs a single constant-time
# lookup, rather than re-loading the list and scanning it for every word.
english_stopword_set = set(stopwords.words('english'))

# Remove stopwords using NLTK (case-insensitive via lower-casing)
filtered_words_nltk = [word for word in words if word.lower() not in english_stopword_set]

# Convert to TensorFlow tensor
tensor_text = tf.constant(filtered_words_nltk)

# Print the result. `.numpy()` yields byte strings, which is why the
# filtered words are displayed with a b'...' prefix.
print("Original words:", words)
print("Filtered words (NLTK with TensorFlow):", tensor_text.numpy())


Original words: ['This', 'is', 'an', 'example', 'sentence', 'with', 'some', 'stopwords', '.']
Filtered words (NLTK with TensorFlow): [b'example' b'sentence' b'stopwords' b'.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
