## Stop-word removal using spaCy

In [1]:
import spacy

In [2]:
# Load the spaCy pipeline named "en_core_web_sm"; the spaCy cells below all reuse it.
spacy_nlp = spacy.load("en_core_web_sm")

In [3]:
# Display all of spaCy's default stop words.
# They are stored as a set, so the printed order is arbitrary.

print(spacy_nlp.Defaults.stop_words)

{'be', 'or', 'via', 'nobody', 'how', 'when', 'whereas', 'upon', 'get', 'among', 'six', 'where', 'beyond', 'using', 'becoming', 'done', 'doing', 'beforehand', 'so', 'almost', 'hereby', 'while', 'make', '‘ve', 'has', 'him', 'her', 'several', 'somewhere', 'bottom', 'whereafter', 'hence', 'quite', 'before', 'become', 'first', 'can', 'further', 'who', 'without', 'does', 'his', 'very', 'hereafter', 'just', 'in', '‘m', 'namely', 'anywhere', 'nine', 'few', 'serious', 'why', 'indeed', 'been', 'at', 'an', 'himself', 'full', 'ca', 'noone', 'one', 'being', 'other', 'put', 'hereupon', 'next', 'rather', 'thus', 'am', 'after', 'besides', 'but', 'last', 'least', 'give', 'my', 'should', 'also', 'latter', 'part', 'every', 'much', 'side', 'out', 'really', 'off', 'were', 'whither', 'ours', 'between', 'whom', 'onto', 'those', 'themselves', 'otherwise', 'this', 'yourself', '‘re', 'mine', 'into', 'although', 'on', 'else', 'below', 'have', 'whenever', 'ourselves', 'whose', "n't", 'own', 'even', 'mostly', 'wha

In [11]:
# Check whether a given word is a stop word.

lexeme = spacy_nlp.vocab["the"]  # indexing the vocab by string returns a lexeme object
print("Its a stop word" if lexeme.is_stop else "Not a stop word")

Its a stop word


In [7]:
# Sample sentence used for the spaCy stop-word filtering below.
text = "All the glitters are not gold"

In [8]:
# Run the pipeline on the sample sentence; iterating over `doc` yields its tokens.
doc = spacy_nlp(text)

In [12]:
# Keep only the tokens that are not stop words.
# Every token already exposes the stop-word flag of its vocab entry through
# `token.is_stop`, so the token text does not need to be looked up in
# `spacy_nlp.vocab` a second time.
output_list = [token.text for token in doc if not token.is_stop]

In [13]:
print(output_list)

['glitters', 'gold']


In [14]:
# On a fresh kernel "bcs" is not a stop word yet, so this cell prints nothing.
if spacy_nlp.vocab["bcs"].is_stop:
    print("Yes")

In [15]:
# Register "bcs" as a custom stop word (the check in the previous cell showed
# that it is not one by default).
# Two places are updated: the set of default stop words, and the is_stop flag
# on the vocab entry for the word, which is what the is_stop checks read.
custom_stop_word = "bcs"
spacy_nlp.Defaults.stop_words.add(custom_stop_word)
spacy_nlp.vocab[custom_stop_word].is_stop = True

In [16]:
# Repeat the check: with the is_stop flag set on its vocab entry, "bcs" is now a stop word.
if spacy_nlp.vocab["bcs"].is_stop:
    print("Yes")

Yes


## Stop words removal using NLTK

In [17]:
import nltk
from nltk.corpus import stopwords

In [18]:
# Display the NLTK English stop words, wrapped in a set (so the printed order is arbitrary).
print(set(stopwords.words('english')))


{'be', 'when', 'wouldn', 'how', "couldn't", 'where', 'will', 'o', 'doing', 'while', 'so', 'him', 'has', 'her', 'before', 'hadn', 'who', 'further', 'can', "it's", 'ain', 'doesn', 'y', 'his', 'does', 'very', 'just', "wasn't", 'in', "that'll", 'why', 'few', 'don', 'll', "won't", 'been', 'at', 'an', 'yours', 'himself', 'same', 'being', 'other', "needn't", "mustn't", 'am', 'my', 'theirs', 'but', 'after', 'should', 'were', 'out', 'off', 'ours', 'mightn', 'between', 'whom', 'those', 'themselves', "shan't", 'yourself', 'this', "you'd", 'into', 'on', 'have', 'below', 'ourselves', 'own', 'what', "you'll", 'he', 'by', 'about', "hasn't", 'only', 'both', 'during', 'nor', 'too', 'won', 'over', "hadn't", 'up', 'are', "isn't", 'yourselves', 'they', 'each', 'all', 'more', 'as', "don't", "weren't", 'had', 'its', "you've", 'that', 'me', 'them', 'is', 'which', 'than', 'itself', 'there', 'under', 'if', 'any', 'did', 'once', 'the', "didn't", 'having', 'not', 'from', 'hasn', 'here', 'our', 'was', 'd', 'do', 

In [19]:
from nltk.tokenize import word_tokenize

In [20]:
# Sample text for the NLTK section (this rebinds `text` from the spaCy section above).
text = "It's late in the evening; she's wondering what clothes to wear She puts on her make-up and brushes her long blonde hair And then she asks me, Do I look all right?"

In [22]:
tokens = word_tokenize(text) # Split the sample text into tokens (words and punctuation marks)

In [23]:
stop_words = set(stopwords.words('english')) # Set of all NLTK English stop words, used for the membership tests below

In [24]:
# Keep only the tokens that are not stop words.
# The NLTK English stop words are all lowercase, so each token is lowercased for
# the membership test; otherwise capitalised stop words such as "It" or "She"
# slip through. The kept tokens retain their original casing.
output = [token for token in tokens if token.lower() not in stop_words]
print(output)

["'s", 'late', 'evening', ';', "'s", 'wondering', 'clothes', 'wear', 'puts', 'make-up', 'brushes', 'long', 'blonde', 'hair', 'asks', ',', 'look', 'right', '?']


In [25]:
# Adding a new word to the set of stop words.
# This only changes the `stop_words` set built in this notebook.
stop_words.add("bcs")