In [4]:
document = '''It was a very pleasant day. The weather was so cool and there were light showers. I went to the market to buy some fruits'''

sentence = "Send all the 50 documents related to chapters 1,2,3 at utkarsh@gmail.com"

#### Tokenize

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [10]:
# Tokenizes sentences
sents = sent_tokenize(document)
sents

['It was a very pleasant day.',
 'The weather was so cool and there were light showers.',
 'I went to the market to buy some fruits']

In [11]:
# Tokenizes words
words = word_tokenize(sentence)
words

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'utkarsh',
 '@',
 'gmail.com']

#### Stop words removal

In [14]:
# Fetching stopwords from nltk
from nltk.corpus import stopwords
print(set(stopwords.words("english")))

{"didn't", 'his', "she's", 've', 'between', 'down', 'mustn', 're', 'm', 'do', 'the', 'some', 'who', 'off', 'but', 'will', "isn't", 'by', 'wasn', 'each', 'himself', "hasn't", 'hasn', 'then', 'own', 'here', 'weren', "wasn't", 'doing', 'once', 'being', 'than', "weren't", 'during', 'couldn', 'you', 'up', "don't", 'y', "you'd", 'yourselves', 'or', 'll', 'our', 'have', 't', "you're", 'through', 'while', 'yourself', 'did', 'shan', 'their', 'myself', 'itself', 'until', "couldn't", "wouldn't", 'which', 'aren', "shouldn't", 'further', 'more', 'mightn', 'of', 'a', 'why', 'so', 's', "shan't", "mustn't", 'before', 'was', 'when', 'be', 'don', 'against', "aren't", 'theirs', 'at', 'ourselves', 'nor', 'again', 'd', 'her', 'yours', 'and', 'this', 'those', 'having', 'as', 'been', 'with', 'no', 'can', 'they', 'only', 'me', "won't", 'ain', 'from', 'it', 'them', 'hadn', 'herself', 'whom', 'had', 'all', 'both', "it's", "hadn't", 'him', "that'll", 'over', 'are', 'above', 'hers', 'such', 'too', 'its', 'themsel

These are all the stop words we have

In [20]:
def getUsefulWords(text, stopwords):
    """Return the words of ``text`` that are not stop words.

    ``text`` is split on whitespace. Each word is lower-cased only for the
    membership check against ``stopwords``; the words that are kept retain
    their original casing and order.
    """
    useful = []
    for word in text.split():
        if word.lower() in stopwords:
            continue
        useful.append(word)
    return useful

In [21]:
text = "I am not bothered about her very much"
sw = set(stopwords.words("english"))

getUsefulWords(text, sw)

['bothered', 'much']

**IMP:** Notice that the negation "not" was removed as well. Whether this is acceptable depends on the use case: for tasks such as sentiment analysis, dropping "not" changes the meaning of the sentence.

#### Stemming
- Breaking words into their crude form

In [22]:
text = """Foxes love to make jumps. The quick brown fox can be seen jumping over the lovely dog from a 6ft high wall"""

In [23]:
# Importing stemmers (Snowball, Porter, Lancaster)
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

# these are just different stemmers for the same thing

In [24]:
ps = PorterStemmer()

In [25]:
ps.stem("jumps")

'jump'

In [26]:
ps.stem("jumping")

'jump'

In [29]:
ps.stem("lovely")

'love'

In [30]:
ps.stem("loving")

'love'

We can see how any word we give is converted to the basic stem

In [33]:
# Snowball Stemmer is multilingual!
ss = SnowballStemmer("english")

In [34]:
ss.stem("jumping")

'jump'

Lemmatization is similar to stemming, but it maps a word to its dictionary form (lemma) instead of just stripping suffixes. We use `wordnet` for that

In [35]:
from nltk.stem.wordnet import WordNetLemmatizer

In [36]:
wn = WordNetLemmatizer()

In [39]:
wn.lemmatize("jumps")

'jump'

We get a similar result

#### Constructing Vocabulary
- Maintain a unique dictionary using the whole corpus
- Assign frequency to the index for each document
- This gives us `vectorized corpus`

In [40]:
# Making a corpus with 4 documents
corpus = [
    "Indian cricket team will win World Cup, says Capt. Virat Kohli. World cup will be held in Sri Lanka",
    "We will win the next Lok Sabha Elections, says confident Indian PM",
    "The nobel laurate won the hearts of the people.",
    "The movie Raazi is an exciting Indian Spy thriller based upon a real story."
]

Using `SKLearn` for a Count Vectorizer

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
cv = CountVectorizer()

In [43]:
vectorized_corpus = cv.fit_transform(corpus)
vectorized_corpus

<4x41 sparse matrix of type '<class 'numpy.int64'>'
	with 48 stored elements in Compressed Sparse Row format>

`fit_transform` returns a sparse matrix, not a dense array. To get a regular NumPy array we call its `toarray` method

In [44]:
vectorized_corpus = vectorized_corpus.toarray()
vectorized_corpus

array([[0, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 1, 0, 2],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0]])

We can see the frequencies at different index as required

In [46]:
# To see the complete dictionary mapping
cv.vocabulary_

{'indian': 12,
 'cricket': 5,
 'team': 31,
 'will': 37,
 'win': 38,
 'world': 40,
 'cup': 6,
 'says': 27,
 'capt': 3,
 'virat': 35,
 'kohli': 14,
 'be': 2,
 'held': 10,
 'in': 11,
 'sri': 29,
 'lanka': 15,
 'we': 36,
 'the': 32,
 'next': 19,
 'lok': 17,
 'sabha': 26,
 'elections': 7,
 'confident': 4,
 'pm': 23,
 'nobel': 20,
 'laurate': 16,
 'won': 39,
 'hearts': 9,
 'of': 21,
 'people': 22,
 'movie': 18,
 'raazi': 24,
 'is': 13,
 'an': 0,
 'exciting': 8,
 'spy': 28,
 'thriller': 33,
 'based': 1,
 'upon': 34,
 'real': 25,
 'story': 30}

This is the generated dictionary for us

To get the words from the vector, we `reverse map`

In [47]:
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0])

In [50]:
cv.inverse_transform(numbers.reshape(1, -1))

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
       dtype='<U9')]

As this is a bag of words model, **order is not saved**, and we get only the **unique words**

#### Vectorization with stop words removal
- We can pass a tokenizer in the `CountVectorizer`

In [52]:
# Making a custom tokenizer
from nltk.tokenize import RegexpTokenizer

In [54]:
# Tokens are the maximal runs of ASCII letters, '@' and '.', so an e-mail
# address stays in one piece while digits and commas are dropped.
# Because '.' is part of the character class, a sentence-ending period stays
# attached to the preceding word.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')

In [57]:
def removeStopWords(text, stopWords):
    """Return a list of the items in ``text`` that are not in ``stopWords``.

    ``text`` is iterated as-is (no splitting or case folding happens here),
    and the original order of the surviving items is preserved.
    """
    return [token for token in text if token not in stopWords]

In [58]:
def myTokenizer(document):
    """Tokenize ``document`` and drop stop words.

    The text is lower-cased, split with the module-level ``tokenizer``, and
    then filtered through ``removeStopWords`` using the module-level stop-word
    collection ``sw``. Returns the remaining tokens as a list.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)

    # Stop words removal
    return removeStopWords(tokens, sw)

In [59]:
# Testing
myTokenizer(sentence)

['send', 'documents', 'related', 'chapters', 'utkarsh@gmail.com']

In [60]:
# Custom Vectorizer
cv = CountVectorizer(tokenizer=myTokenizer)

In [61]:
vectorized_corpus = cv.fit_transform(corpus)
vectorized_corpus = vectorized_corpus.toarray()

In [63]:
len(vectorized_corpus[0])

32

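The vocabulary size (length of each document vector) went from 41 to 32 (the 48 seen earlier was the number of stored non-zero entries in the sparse matrix), so this representation is more compact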

In [64]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka',
        'says', 'sri', 'team', 'virat', 'win', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

This is our vectorized corpus after stopword removal

### Ways to create features
- Unigram
  - Every word is a feature
- Bigram
  - Club 2 consecutive words to make a feature
- Trigram
  - Club 3 consecutive words to make a feature
- n-gram
- TF - IDF Normalisation

#### Why the need?
- In sentences like *This movie is good* and *This movie is **not good***
- Both contain "good"; if we use only "good" as a feature, both sentences are classified the same way
- We need to have __not good__ as a feature, hence bigram

In [65]:
new_corpus = [
    "This is a good movie",
    "This is good movie, but actor not present",
    "This is not a good movie"
]

In [70]:
cv = CountVectorizer(ngram_range=(2,2))
vectorized_corpus = cv.fit_transform(new_corpus).toarray()
vectorized_corpus

array([[0, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 1, 1, 1, 0, 1, 0, 1, 1],
       [0, 0, 1, 0, 1, 0, 1, 0, 1]])

In [71]:
cv.vocabulary_

{'this is': 8,
 'is good': 3,
 'good movie': 2,
 'movie but': 5,
 'but actor': 1,
 'actor not': 0,
 'not present': 7,
 'is not': 4,
 'not good': 6}

Here we have features of 2 words clubbed

For multiple grams

In [72]:
cv = CountVectorizer(ngram_range=(1,3))
vectorized_corpus = cv.fit_transform(new_corpus).toarray()
vectorized_corpus

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
        1, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
        1, 1, 0, 1]])

In [73]:
cv.vocabulary_

{'this': 22,
 'is': 9,
 'good': 6,
 'movie': 14,
 'this is': 23,
 'is good': 10,
 'good movie': 7,
 'this is good': 24,
 'is good movie': 11,
 'but': 3,
 'actor': 0,
 'not': 17,
 'present': 21,
 'movie but': 15,
 'but actor': 4,
 'actor not': 1,
 'not present': 20,
 'good movie but': 8,
 'movie but actor': 16,
 'but actor not': 5,
 'actor not present': 2,
 'is not': 12,
 'not good': 18,
 'this is not': 25,
 'is not good': 13,
 'not good movie': 19}

Now we get all the features, one word, 2 words and 3 words

### TF - IDF Normalization
- Avoid features that occur very often
- Information decreases as the number of occurrences increases **across documents**
- We define `term frequency - inverse document frequency` (TF-IDF), which associates a weight with each term

$tf(t,d)$ gives the term frequency, that is the frequency of the term in the given document

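$$idf(t, D) = \log\left( \frac{N}{1 + count(t, D)} \right)$$
where,\
$N$ is the total number of documents in the corpus $D$\
$count(t,D)$ is the number of documents in $D$ that contain the term $t$

Note: scikit-learn's `TfidfVectorizer` (used below) by default applies a smoothed variant, $idf(t) = \ln\left(\frac{1 + N}{1 + count(t, D)}\right) + 1$, and then L2-normalises each row, so its output differs from this textbook formula.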

Finally the matrix is computed by $tf \cdot idf$

In [74]:
corpus = [
    "this is good movie",
    "this was good movie",
    "this is not good movie"
]

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [76]:
tfidf = TfidfVectorizer()

In [77]:
vc = tfidf.fit_transform(corpus).toarray()


In [78]:
vc

array([[0.46333427, 0.59662724, 0.46333427, 0.        , 0.46333427,
        0.        ],
       [0.41285857, 0.        , 0.41285857, 0.        , 0.41285857,
        0.69903033],
       [0.3645444 , 0.46941728, 0.3645444 , 0.61722732, 0.3645444 ,
        0.        ]])

In [79]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}

We can see that `not` gets the highest weight in the third sentence, which is useful because negation can now be picked up. `good`, on the other hand, gets a lower weight because it is present in all the sentences