# Bag of Words Pipeline

    -> Get The Data / Corpus
    -> Tokenisation, Stopword Removal
    -> Stemming
    -> Building A Vocab
    -> Vectorization
    -> Classification

In [1]:
# Three-sentence sample text; used below to demonstrate sentence tokenisation.
document = """It was a very pleasant day. The weather was cool and there were light showers.
I went to the market to buy some fruits."""

# Single sentence containing digits, a comma-separated list and an e-mail
# address; used below to compare word-level tokenisers.
sentence = "Send all the 50 documents related to chapters 1,2,3 at sahil@gmail.com"

## Tokenisation

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
# Split the sample document into sentences, then show them and their count.
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']
3


In [4]:
# Baseline: plain whitespace splitting keeps "1,2,3" and the e-mail address
# as single tokens.
print(sentence.split())

['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'sahil@gmail.com']


In [5]:
# NLTK word tokeniser on the same sentence. Unlike str.split(), it breaks the
# e-mail address into 'sahil', '@' and 'gmail.com' (see the printed output).
words = word_tokenize(sentence)
print(words)
print(len(words))

['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'sahil', '@', 'gmail.com']
13


## Stopword Removal

In [6]:
from nltk.corpus import stopwords

In [7]:
# Collect NLTK's English stopword list into a set (used for membership tests
# when filtering tokens) and print it.
sw = set(stopwords.words('english'))
print(sw)

{'you', 'how', 'other', 'weren', 'ma', "you'll", 'my', 'being', 'that', 'the', 'with', 'against', 'such', 'own', 'your', 'me', 'of', 'these', 'isn', 'll', "you're", 'couldn', 'then', 'out', 'shan', 'be', 'but', 're', 'hers', 'themselves', 'same', 'yourself', 'about', 'from', 'who', 'having', 'it', 'have', 'or', 'am', 'on', 'he', 'both', 'over', "haven't", "that'll", 'mightn', 'had', 'any', 'can', 'him', 'again', 'only', 'hasn', "doesn't", 'ours', 'and', 'if', 'up', 'what', "shouldn't", 'above', 'an', "weren't", 'mustn', 'o', 'will', 'should', "should've", 'at', 'don', "don't", 'by', 'when', 'i', 'before', 'down', 'been', "aren't", 'wasn', 'all', "wasn't", 'do', 'into', 'she', 'as', 'our', 'ourselves', "needn't", 'once', 'haven', 'below', 'y', 'has', 'were', 'this', "wouldn't", 'ain', 'in', 'herself', 'there', 'so', 'yours', 'was', 'won', 'during', 'because', 'them', 'for', 'more', 've', 'needn', 'some', 'not', 'until', 'didn', 'himself', 'itself', 'now', 'm', 'is', 'did', "you've", "ha

In [8]:
def remove_stopwords(text, stopwords, *, ignore_case=False):
    """Return the tokens of *text* that are not stopwords.

    Args:
        text: Iterable of string tokens (e.g. the result of ``str.split()``
            or a tokeniser). Order is preserved in the result.
        stopwords: Container of words to drop; anything supporting ``in``
            works, a ``set`` is the usual choice.
        ignore_case: When False (the default) a token is dropped only if it
            matches a stopword exactly, so a capitalised token such as
            ``'I'`` is kept when the container holds only lowercase ``'i'``.
            When True, each token is lowercased before the lookup; the
            container is then expected to hold lowercase entries.

    Returns:
        A new list with the remaining tokens, in their original casing.
    """
    if ignore_case:
        # Only the lookup key is lowercased; the token itself is returned as-is.
        return [w for w in text if w.lower() not in stopwords]
    return [w for w in text if w not in stopwords]

In [9]:
# Demo: split on whitespace, then filter with the English stopword set.
# 'I' survives in the output because the lookup is an exact match and the
# stopword set holds lowercase 'i'.
text = "I am not bothered about her very much".split()
useful_text = remove_stopwords(text, sw)
print(useful_text)

['I', 'bothered', 'much']


In [10]:
# The negation word "not" is itself in the stopword set, so stopword removal
# also strips negation from a sentence.
"not" in sw

True

## Tokenisation Using Regular Expressions

In [11]:
# Same sample sentence as in the first cell, re-assigned here.
sentence = "Send all the 50 documents related to chapters 1,2,3 at sahil@gmail.com"

In [12]:
from nltk.tokenize import RegexpTokenizer

In [13]:
# The pattern matches maximal runs of ASCII letters, '@' and '.'. Digits,
# commas and spaces are not in the character class, so they act as
# separators: '50' and '1,2,3' disappear while the e-mail address stays a
# single token. A '.' that directly follows a word would stay attached to it.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)
print(useful_text)

['Send', 'all', 'the', 'documents', 'related', 'to', 'chapters', 'at', 'sahil@gmail.com']
